Example #1
    def tfidf_model(self):
        print('Logging Info - Get Tf-idf model...')
        tfidf_model_path = os.path.join(FEATURE_DIR,
                                        '{}_tfidf.model'.format(self.genre))
        dict_path = os.path.join(FEATURE_DIR,
                                 '{}_tfidf.dict'.format(self.genre))
        if os.path.exists(tfidf_model_path):
            dictionary = pickle_load(dict_path)
            tfidf_model = TfidfModel.load(tfidf_model_path)
        else:
            corpus = [
                text.split() for text in self.train_data['premise'] +
                self.train_data['hypothesis'] + self.dev_data['premise'] +
                self.dev_data['hypothesis'] + self.test_data['premise'] +
                self.test_data['hypothesis']
            ]
            dictionary = corpora.Dictionary(corpus)
            corpus = [dictionary.doc2bow(text) for text in corpus]
            tfidf_model = TfidfModel(corpus)

            del corpus
            tfidf_model.save(tfidf_model_path)
            pickle_dump(dict_path, dictionary)

        return dictionary, tfidf_model
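A minimal sketch of applying the objects returned above to a new sentence; the variable names come from Example #1 and the sample text is invented:

tokens = 'a new premise sentence'.split()
bow = dictionary.doc2bow(tokens)    # sparse bag-of-words: [(token_id, count), ...]
weights = tfidf_model[bow]          # re-weighted vector: [(token_id, tf-idf weight), ...]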
Example #2
class TFIDF(object):
    def __init__(self, max_workers):
        self.max_workers = max_workers
        self.log = logging.getLogger('tfidf_model')
        self.model, self.dictionary = None, None

    # @timed
    def train(self, preprocessed_docs):
        self.log.info('Building dictionary')
        self.dictionary = Dictionary(preprocessed_docs)

        self.log.info('Dictionary built with %d words. Building corpus',
                      len(self.dictionary))
        corpus = [self.dictionary.doc2bow(line)
                  for line in preprocessed_docs]  # convert dataset to BoW format

        self.log.info('Built corpus')
        self.model = TfidfModel(corpus)

    def save_model(self, model_path):
        self.log.info('Saving TFIDF model to file: %s', model_path)
        self.model.save(model_path)

    def save_dictionary(self, dict_path):
        self.log.info('Saving dictionary to file: %s', dict_path)
        self.dictionary.save(dict_path)

    def build_corpus(self, doc_list, dictionary):
        return list(map(dictionary.doc2bow, doc_list))

    @staticmethod
    def with_url_handling(max_workers):
        return TFIDF(max_workers)
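A rough sketch of driving the TFIDF class above end to end; the toy documents and file paths are placeholders, not part of the original snippet:

docs = [['hello', 'tfidf', 'world'], ['hello', 'gensim']]
vectorizer = TFIDF(max_workers=1)
vectorizer.train(docs)                        # builds vectorizer.dictionary and vectorizer.model
vectorizer.save_model('tfidf_all.model')      # placeholder path
vectorizer.save_dictionary('tfidf_all.dict')  # placeholder path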
Example #3
def build_tfid_model(dictionary, corpus, should_rebuild):
    tfid = list()

    # DEBUG
    should_rebuild = True

    if not should_rebuild:
        try:
            print('Loading TFID Model backup...')
            tfid_file = utils.get_file_path(cfg.TFID_BACKUP)
            print('TFID file = {}'.format(tfid_file))

            tfid = TfidfModel.load(tfid_file)

        except Exception as exc:
            utils.print_exception_details('Building TFID Model', exc)

    else:
        print('Building TFID Model...')
        tfid = TfidfModel(corpus)
        print('Done!')
        # Save Model Structures
        TFID_FILE = utils.get_file_path(cfg.TFID_BACKUP)
        tfid.save(TFID_FILE)

    return tfid
def make_tfidf(target_posts):
	df_data = pd.read_csv(os.path.join(CONFIG.DATASET_PATH, target_posts, 'posts.csv'), index_col=0, header=None, encoding='utf-8-sig')
	with open(os.path.join(CONFIG.DATASET_PATH, target_posts, 'word_idx.json'), "r", encoding='utf-8') as f:
		word_idx = json.load(f)
	with open(os.path.join(CONFIG.DATASET_PATH, target_posts, 'corpus.txt'), 'r', encoding='utf-8') as f:
		data = f.read()
		text_data = [data.split()]
	print("making documents...")
	dct = Dictionary(text_data)
	documents = [dct.doc2bow(value[1].split()) for index, value in df_data.iterrows()]
	print("embedding started")
	embedding_model = TfidfModel(documents, id2word=word_idx[0])
	model_name = "TFIDF_"+ target_posts + ".model"
	embedding_model.save(os.path.join(CONFIG.EMBEDDING_PATH, model_name))
	print("embedding completed")

	corpus_tfidf = embedding_model[documents]
	d = {}
	for doc in corpus_tfidf:
		for id, value in doc:
			word = word_idx[0][str(id)]
			d[word] = value
	sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
	print(sorted_d[:10])
	dictionary_list = []
	sorted_d = sorted_d[:3000]
	for word, value in sorted_d:
		dictionary_list.append(word)

	with open(os.path.join(CONFIG.DATASET_PATH, target_posts, 'dictionary_list.p'), 'wb') as f:
		cPickle.dump(dictionary_list, f)
Example #5
def create_tfidf_from_papers(
    path_to_jsonl_index: Path = BIOPAPERS_JSON_PATH,
    path_to_bow: Path = BOW_PATH,
    outfile: Path = TFIDF_VECTORIZER,
) -> TfidfModel:
    """
    Creates TFIDF model from BOW corpora.

    Parameters
    ----------
    path_to_jsonl_index: Path
        Path to json lines index
    path_to_bow: Path
        Path to Bag of Words Dictionary
    outfile: Path
        Path to TFIDF vectorizer

    Returns
    -------
    tfidf_model: TfidfModel
        Gensim TFIDF Model
    """
    # Load dictionary
    dictionary = Dictionary.load(str(path_to_bow))
    # Load corpus generator
    corpus = BiopapersCorpus(dictionary, path_to_jsonl_index)
    # Train TFIDF
    tfidf_model = TfidfModel(corpus)
    # Save TFIDF model to file:
    tfidf_model.save(str(outfile))

    return tfidf_model
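Loading the saved artifacts back and weighting a single document might look like the following sketch; it reuses the `path_to_bow` and `outfile` paths from the signature above, and the token list is invented:

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

dictionary = Dictionary.load(str(path_to_bow))
tfidf_model = TfidfModel.load(str(outfile))
doc_bow = dictionary.doc2bow(['protein', 'folding', 'structure'])
doc_tfidf = tfidf_model[doc_bow]    # [(token_id, weight), ...]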
Example #6
def get_corpus(docs):
    print("Building corpus ...")
    tfidf_model = None

    # load corpus from disk 
    if ARGS.load_corpus: 
        corpus = MmCorpus(ARGS.path_corpus)

    else:
        corpus = [dictionary.doc2bow(doc) for doc in docs]

        # serialize corpus to disk to prevent memory problems if corpus gets too large
        MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_bow.mm', corpus)  
        corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_bow.mm')

        if ARGS.corpus_type == "TFIDF": 
            tfidf_model = TfidfModel(corpus)

            tfidf_model.save(ARGS.save_dir + "/models/tfidf_model.mm")
            corpus = tfidf_model[corpus]

            # serialize corpus to disk to prevent memory problems if corpus gets too large
            MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_tfidf.mm', corpus)  
            corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_tfidf.mm')
    return corpus, tfidf_model
Example #7
def tfidf(dataframe, max_words=None):
    """Returns a tf-idf model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or generated then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    max_words : int, optional
        The maximum number of words stored by the model; if omitted, a full dictionary is used.

    Returns
    -------
    model : Gensim TfidfModel
        tf-idf model for documents stored in the DataFrame.
    """
    suffix = '_{}'.format(max_words) if max_words else ''
    filename = 'caches/models/tfidf{}.model'.format(suffix)

    if not os.path.isfile(filename):
        if max_words:
            dictionary = hashdictionary_corpus(dataframe, id_range=max_words)
        else:
            dictionary = dictionary_corpus(dataframe)
        tfidf_model = TfidfModel(dictionary=dictionary)
        tfidf_model.save(filename)
    else:
        tfidf_model = TfidfModel.load(filename)

    return tfidf_model
Example #8
def train_lda():
	"""
	Usage: python Wechat_LDA.py wechat.csv
	"""
	with open(sys.argv[1], 'r') as wx:
		for f in wx:
			seg = jieba.cut(f)
			seg = [word for word in seg if word not in stopwords]
			with codecs.open('wechat_seg.txt', encoding='utf-8', mode='ab') as wx_seg:
				wx_seg.write(' '.join(seg))

	# pass the file path so LineSentence can re-read the file on each pass
	dictionary = corpora.Dictionary(LineSentence('wechat_seg.txt'))
	corpus = [dictionary.doc2bow(text) for text in LineSentence('wechat_seg.txt')]
	tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
	tfidf_model.save('wechat_seg.txt.tfidf_model')
	# corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])
	lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=cpu_count()-1)
	lda_model.save('wechat_lda_model.pkl')

	topics = []
	for doc in corpus:
		topics.append(lda_model[doc])

	counts = np.zeros(100)
	for top_doc in topics:
		for ti, _ in top_doc:
			counts[ti] += 1

	words = lda_model.show_topic(counts.argmax(), 64)
	with open('top_words.txt', 'w') as tw:
		writer = UnicodeWriter(tw)
		for w in words:
			writer.writerow((w[0], int(float(w[1])*1000)))
Example #9
def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000, )
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #10
def tfidf_w2v_top5w(all_docs_prepro):
    # TFIDF MODEL
    exists = os.path.isfile('embedding/models/tfidf_all.model')
    if exists:
        print('Tfidf embedding model already existing')
    else:
        dct = Dictionary(all_docs_prepro)  # fit dictionary
        corpus = [dct.doc2bow(line)
                  for line in all_docs_prepro]  # convert corpus to BoW format
        model_tfidf = TfidfModel(corpus)
        word_path = 'embedding/models/tfidf_all.model'
        model_tfidf.save(word_path)

    # WORD2VEC MODEL
    exists = os.path.isfile('embedding/models/word2vec_all.model')
    if exists:
        print('Word2vec embedding model already existing')
    else:
        print('Training word2vec on all answers')
        word_path = "embedding/models/word2vec_all.model"
        word_tempfile = get_tmpfile(word_path)
        word_model = Word2Vec(all_docs_prepro,
                              size=128,
                              window=5,
                              min_count=1,
                              workers=4)
        word_model.save(word_path)
Example #11
def train(file=DATA_FILE, type=JSON):
    delete_previous_models()

    faq_df = get_dataframe(os.path.join(DATA_DIR, file), type=type)
    faq_df = clean_data(faq_df)
    faq_df[PROCESSED_QUESTION] = faq_df[CLEAN_QUESTION].apply(preprocess)
    faq_df[PROCESSED_ANSWER] = faq_df[CLEAN_ANSWER].apply(preprocess)
    print('Preprocessing Done')
    if DEBUG:
        print(faq_df.head())

    for mode in modes:
        model = modes[mode]
        dictionary = corpora.Dictionary(faq_df[model.column])
        dictionary.save(os.path.join(MODEL_DIR, model.dictionary))
        corpus = faq_df[model.column].map(dictionary.doc2bow)
        if DEBUG:
            print(f'{model.corpus} generated')
            print(corpus.head())
        corpora.MmCorpus.serialize(os.path.join(MODEL_DIR, model.corpus),
                                   corpus)
        tfidf_model = TfidfModel(corpus)
        if DEBUG:
            print(f'{model.tfidf} generated')
        tfidf_model.save(os.path.join(MODEL_DIR, model.tfidf))
        tfidf = tfidf_model[corpus]
        lda_model = LdaMulticore(corpus=tfidf,
                                 id2word=dictionary,
                                 num_topics=30)
        lda_model.save(os.path.join(MODEL_DIR, model.model))
        if DEBUG:
            print(f'{model.model} generated')
            print(lda_model.print_topics(5))
    print('Training completed')
Example #12
def main(argv):

    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print('training_model.py -i <inputfile> -o <outputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('test.py -i <inputfile> -o <outputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    # Read messages from log file
    messages = [line for line in open(inputfile)]

    print('Log file contains {} lines'.format(len(messages)))

    tokens = Tokens(messages)
    tokenized = Tokens.clean_tokenized(
        tokens.pyonmttok(Tokens.TOKENIZER, messages))

    dct = Dictionary(tokenized)
    corpus = [dct.doc2bow(line) for line in tokenized]
    tfidf = TfidfModel(corpus, normalize=True)

    tfidf.save(outputfile)
def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace, n_proc=n_proc_sublex,
                                      create_dictionary=False)

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")

            start = time.clock()
            vocab = Dictionary(corpus.get_texts())
            end = time.clock()
            logging.info("Vocabulary created in %d seconds" % (end - start))

            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab

        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("creating TF-IDF model")
            tfidf = TfidfModel(corpus)

            if tfidf_fn:
                logging.info("Saving TFF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        bow_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)

        model.save(model_fn)
Example #14
def train_tfidf(inpath=config.path_train_cut):
    train_df = pd.read_csv(inpath,
                           sep="\t",
                           header=None,
                           names=["id", "s1", "s2", "label"],
                           encoding="utf-8")
    tfidf_txt = train_df["s1"].tolist() + train_df["s2"].tolist()
    texts = [tokenize(text) for text in tfidf_txt]

    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    documents = [[token for token in text if frequency[token] > 1]
                 for text in texts]

    dictionary = Dictionary(documents)
    dictionary.save_as_text("./model/words.dic")

    # dictionary = Dictionary.load_from_text("./model/words.dic")

    class MyCorpus(object):
        def __iter__(self):
            for doc in documents:
                yield dictionary.doc2bow(doc)

    corpus = MyCorpus()
    MmCorpus.serialize("./model/corpus.mm", corpus)
    # corpus = MmCorpus("./model/corpus.mm")
    tfidf = TfidfModel(corpus)
    tfidf.save("./model/tf_idf.model")
Example #15
 def convertBowTfidf(self, folderName, dataBow, dictionary):
     print("start building tfidf...")
     tfidf = TfidfModel(dataBow, id2word=dictionary, normalize=True)
     tfidf.save("data.tfidfModel")
     corpora.MmCorpus.serialize("dataTfidf.mm",
                                tfidf[dataBow],
                                progress_cnt=10000)
     print("data.tfidfModel and dataTfidf.mm have been saved.")
Example #16
 def trainModelM2(self, sampleUtterances_tokens, outpath):
     dictionary = corpora.Dictionary.load(
         os.path.join(outpath, "dictionary.dict"))
     corpus = [dictionary.doc2bow(line) for line in sampleUtterances_tokens]
     model = TfidfModel(corpus)
     index = similarities.MatrixSimilarity(model[corpus])
     model.save(os.path.join(outpath, "m2.model"))
     index.save(os.path.join(outpath, "m2.index"))
Example #17
def create_model_tfidf_model(documents, model_name, matrix_name, dic_name):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidfmodel = TfidfModel(corpus)
    index = MatrixSimilarity(tfidfmodel[corpus], num_features=len(dictionary))
    index.save(matrix_name)
    tfidfmodel.save(model_name)
    dictionary.save(dic_name)
    return tfidfmodel, index, dictionary
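Querying the MatrixSimilarity index returned above could look roughly like this; the query text is invented:

query_bow = dictionary.doc2bow('some query tokens'.split())
sims = index[tfidfmodel[query_bow]]    # cosine similarity against every indexed document
top5 = sims.argsort()[::-1][:5]        # positions of the five closest documents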
Example #18
class EmbedReplace(object):
    def __init__(self, sample_path, wv_path):
        self.samples = read_sample(sample_path)
        self.refs = [
            sample.split('<sep>')[1].split() for sample in self.samples
        ]
        self.wv = KeyedVectors.load_word2vec_format(wv_path, binary=False)

        if os.path.exists('saved/tfidf.model'):
            self.tfidf_model = TfidfModel.load('saved/tfidf.model')
            self.dct = Dictionary.load('saved/tfidf.dict')
            self.corpus = [self.dct.doc2bow(doc) for doc in self.refs]
        else:
            self.dct = Dictionary(self.refs)
            self.corpus = [self.dct.doc2bow(doc) for doc in self.refs]
            self.tfidf_model = TfidfModel(self.corpus)
            self.dct.save('saved/tfidf.dict')
            self.tfidf_model.save('saved/tfidf.model')
        self.vocab_size = len(self.dct.token2id)

    def vectorize(self, docs, vocab_size):
        return matutils.corpus2dense(docs, vocab_size)

    def extract_keywords(self, dct, tfidf, threshold=0.2, topk=5):
        tfidf = sorted(tfidf, key=lambda x: x[1], reverse=True)
        return list(
            islice([dct[w] for w, score in tfidf if score > threshold], topk))

    def replace(self, token_list, doc):
        keywords = self.extract_keywords(self.dct, self.tfidf_model[doc])
        num = int(len(token_list) * 0.3)
        new_tokens = token_list.copy()
        while num == int(len(token_list) * 0.3):
            indexes = np.random.choice(len(token_list), num)
            for index in indexes:
                token = token_list[index]
                if isChinese(
                        token) and token not in keywords and token in self.wv:
                    new_tokens[index] = self.wv.most_similar(token,
                                                             topn=1)[0][0]
            num -= 1
        return ' '.join(new_tokens)

    def generate_samples(self, write_path):
        replaced = []
        count = 0
        for sample, token_list, doc in zip(self.samples, self.refs,
                                           self.corpus):
            count += 1
            if count % 100 == 0:
                print(count)
                write_samples(replaced, write_path)
                replaced = []
            replaced.append(
                sample.split('<sep>')[0] + '<sep>' +
                self.replace(token_list, doc))
Example #19
 def get_tfidf(self, path):
     path = path + '.tfidf'
     if not os.path.exists(path):
         tfidf_model = TfidfModel(self.corpus, smartirs='ntc')
         tfidf_model.save(path)
         # reweight the corpus with the tf-idf model
         self.corpus = tfidf_model[self.corpus]
     else:
         tfidf_model = TfidfModel.load(path)
     return tfidf_model
Example #20
def lsa_twitter(cased_tokens):
    """ Latent Sentiment Analyis on random sampling of twitter search results for words listed in cased_tokens """

    # Only 5 of these tokens are saved for a no_below=2 filter:
    #   PyCons NLPS #PyCon2016 #NaturalLanguageProcessing #naturallanguageprocessing
    if cased_tokens is None:
        cased_tokens = ('PyConOpenSpaces PyCon PyCon2017 PyCon2018 PyCon2016 PyCon2015 OpenSpace PyconTutorial ' +
                        'NLP NaturalLanguageProcessing NLPInAction NaturalLanguageProcessingInAction NLPIA Twote Twip'
                        ).split()
        cased_tokens += [s + 's' for s in cased_tokens]

        cased_tokens += 'TotalGood TotalGoods HobsonLane Hob Hobs TotalGood.com ' \
                        'www.TotalGood.com http://www.TotalGood.com https://www.TotalGood.com'.split()

    allcase_tokens = cased_tokens + [s.lower() for s in cased_tokens]
    allcase_tokens += [s.title() for s in cased_tokens]
    allcase_tokens += [s.upper() for s in cased_tokens]
    KEEP_TOKENS = allcase_tokens + ['#' + s for s in allcase_tokens]

    # takes 15 minutes and 10GB of RAM for 500k tweets if you keep all 20M unique tokens/names URLs
    vocab_path = os.path.join(BIGDATA_PATH, 'vocab939370.pkl')
    if os.path.isfile(vocab_path):
        print('Loading vocab: {} ...'.format(vocab_path))
        vocab = Dictionary.load(vocab_path)
        print(' len(vocab) loaded: {}'.format(len(vocab.dfs)))
    else:
        tweets_path = os.path.join(BIGDATA_PATH, 'tweets.csv.gz')
        print('Loading tweets: {} ...'.format(tweets_path))
        tweets = read_csv(tweets_path)
        tweets = np.array(tweets.text.str.split())
        with gzip.open(os.path.join(BIGDATA_PATH, 'tweets.txt.gz'), 'w') as f:
            for tokens in tweets:
                f.write((' '.join(tokens) + '\n').encode('utf-8'))
        # tweets['text'] = tweets.text.apply(lambda s: eval(s).decode('utf-8'))
        # tweets['user'] = tweets.user.apply(lambda s: eval(s).decode('utf-8'))
        # tweets.to_csv('tweets.csv.gz', compression='gzip')
        print('Computing vocab from {} tweets...'.format(len(tweets)))
        # Dictionary() does not take filtering kwargs; filter_extremes below applies them
        vocab = Dictionary(tweets)

    vocab.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N, keep_tokens=set(KEEP_TOKENS))
    print(' len(vocab) after filtering: {}'.format(len(vocab.dfs)))

    # no time at all, just a bookkeeping step, doesn't actually compute anything
    tfidf = TfidfModel(id2word=vocab, dictionary=vocab)
    tfidf.save(os.path.join(BIGDATA_PATH, 'tfidf{}.pkl'.format(len(vocab.dfs))))

    tweets = [vocab.doc2bow(tw) for tw in tweets]
    json.dump(tweets, gzip.open(os.path.join(BIGDATA_PATH, 'tweet_bows.json.gz'), 'w'))

    gc.collect()

    # LSA is a more useful name than LSI
    lsa = LsiModel(tfidf[tweets], num_topics=200, id2word=vocab, extra_samples=100, power_iters=2)

    return lsa
Example #21
def make_corpus():
    corpus = MyCorpus()
    tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    num_terms = 400
    lsi_model = LsiModel(corpus_idf, id2word=corpus.dictionary, num_topics=num_terms)
    # corpora.MmCorpus.serialize('wiki_en_corpus.mm', corpus) # store to disk, for later use
    corpus.dictionary.save(os.path.join(HERE, "sogou.dict"))  # store the dictionary, for future reference
    tfidf_model.save(os.path.join(HERE, "sogou.model"))
    lsi_model.save(os.path.join(HERE, "sogou.lsi"))
    print "save dictionary and tfidf model"
    """    
def trainModelM2(tests, detector_tokens, sampleUtterances_tokens, productid,
                 outpath):
    dictionary = corpora.Dictionary.load(
        os.path.join(outpath, "dictionary.dict"))
    corpus = [dictionary.doc2bow(line) for line in sampleUtterances_tokens]
    model = TfidfModel(corpus)
    index = similarities.MatrixSimilarity(model[corpus])
    for test in tests:
        if not testModelForSearch(model, dictionary, index, test):
            return
    model.save(os.path.join(outpath, "m2.model"))
    index.save(os.path.join(outpath, "m2.index"))
Example #23
def _tfidf(corpus, dictionary):
    tfidf_file_name = get_tfidf_file_name(CORPUS_FILES["label"])
    try:
        tfidf = TfidfModel.load(tfidf_file_name)
    except FileNotFoundError:
        corpus_numeric = [dictionary.doc2bow(document) for document in corpus]
        tfidf = TfidfModel(corpus=corpus_numeric)
        print("File does not exist - creating the tfidf model")

        create_file_and_folders_if_not_exist(tfidf_file_name)
        tfidf.save(tfidf_file_name)

    return tfidf
Example #24
def make_corpus():
    corpus = MyCorpus()
    tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    num_terms = 400
    lsi_model = LsiModel(corpus_idf,
                         id2word=corpus.dictionary,
                         num_topics=num_terms)
    #corpora.MmCorpus.serialize('wiki_en_corpus.mm', corpus) # store to disk, for later use
    corpus.dictionary.save(os.path.join(
        HERE, 'sogou.dict'))  # store the dictionary, for future reference
    tfidf_model.save(os.path.join(HERE, 'sogou.model'))
    lsi_model.save(os.path.join(HERE, 'sogou.lsi'))
    print('save dictionary and tfidf model')
Example #25
class Cos():
    def __init__(self):
        self.tfidf = {}
        self.dict = Dictionary()

    def init(self, traindata, dict_path, tfidf_path):
        self.dict = Dictionary(traindata)  # fit dictionary
        corpus = [self.dict.doc2bow(line) for line in traindata]  # convert corpus to BoW format
        self.tfidf = TfidfModel(corpus)  # fit model
        self.dict.save(dict_path)
        self.tfidf.save(tfidf_path)


    def load(self, dict_path, tfidf_path):
        self.dict = Dictionary.load(dict_path)
        self.tfidf = TfidfModel.load(tfidf_path)
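Given the class name, a plausible follow-up is cosine similarity between two tokenized texts; this comparison step is an assumption and not part of the original snippet:

from gensim import matutils

traindata = [['first', 'text', 'tokens'], ['second', 'text', 'tokens']]
cos = Cos()
cos.init(traindata, 'cos.dict', 'cos.tfidf')    # placeholder paths
v1 = cos.tfidf[cos.dict.doc2bow(traindata[0])]
v2 = cos.tfidf[cos.dict.doc2bow(traindata[1])]
score = matutils.cossim(v1, v2)                 # cosine similarity of the tf-idf vectors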
Example #26
def main(dataset_path):
    if not os.path.exists('../data/retriever/paragraph-ids.txt'):
        print('Writing paragraph ID to file...')
        with open('../data/retriever/paragraph-ids.txt', 'w') as f:
            for paragraph_id in load_ids(dataset_path):
                f.write(paragraph_id + '\n')

    dictionary_path = '../data/retriever/dct.pkl'
    if not os.path.exists(dictionary_path):
        print('Creating dictionary...')
        st = time.time()
        dct = Dictionary(load_paragraphs(dataset_path), prune_at=5000000)
        dct.save(dictionary_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating dictionary in {et - st}s.')
    else:
        print('Loading dictionary...')
        dct = Dictionary.load(dictionary_path)
        print('Dictionary loaded.')

    tfidf_path = '../data/retriever/tfidf.pkl'
    if not os.path.exists(tfidf_path):
        print('Creating model...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        model = TfidfModel(corpus)
        model.save(tfidf_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating model in {et - st}s.')
    else:
        print('Loading model...')
        model = TfidfModel.load(tfidf_path)
        print('Model loaded.')

    index_path = '../data/retriever/indexes/master-index'
    if not os.path.exists(index_path):
        print('Creating index...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        index = Similarity('../data/retriever/indexes/index', model[corpus],
                           len(dct))
        index.save(index_path)
        et = time.time()
        print(f'\rFinished creating index in {et - st}s.')
        print('Done')
    else:
        print('Nothing to do. Exiting...')
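Retrieval against the saved index might then look like the sketch below; `num_best` and the query string are assumptions:

from gensim.similarities import Similarity

index = Similarity.load(index_path)
index.num_best = 5                      # only return the five closest paragraphs
query = model[dct.doc2bow('question tokens here'.split())]
for paragraph_idx, score in index[query]:
    print(paragraph_idx, score)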
Example #27
def build_dict_model(path):
    allFiles = glob.glob(path + "/*.tsv")
    _list = [
        pd.read_csv(f,
                    header=None,
                    delimiter="\t",
                    quoting=csv.QUOTE_NONE,
                    encoding='utf-8') for f in allFiles
    ]
    frame = pd.concat(_list)
    myhash = str(random.getrandbits(8))
    tokens = [word_tokenize(str(row)) for row in frame.iloc[:, 3]]
    dictionary = Dictionary(tokens)
    dictionary.save("ressources/dictionary" + myhash)
    tfidf_model = TfidfModel([dictionary.doc2bow(t) for t in tokens],
                             id2word=dictionary)
    tfidf_model.save("ressources/tfidf_model" + myhash)
def build_tfidf(corpus_dir,model_filename):
    stemmer = nltk.stem.PorterStemmer()
    corpus = PlaintextCorpusReader(corpus_dir, '.*\.txt$')  # a memory-friendly iterator
    dictionary = corpora.Dictionary()

    bigram_transformer = Phrases(TextCorpus(corpus))

    for myfile in corpus.fileids():
        try:
            chunks = bigram_transformer[[word.lower() for word in corpus.words(myfile)]]
            dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])

        except Exception as e:
            print('Warning: error in file:', myfile)

    model = TfidfModel(BowCorpus(corpus,dictionary,bigram_transformer), id2word=dictionary)
    model.save(model_filename)
Example #29
 def loadTfidfModel(self, type='offline'):
     '''
     Load the Tfidf model; build it if it does not exist.
     '''
     filePath = self.cachePath + '%s_tfidf_%s.model' % (self.name, type)
     if os.path.isfile(filePath):
         tfidfModel = SaveLoad.load(filePath)
     else:
         startTime = datetime.now()
         if type not in self.dictionary:
             self.loadDictionary(type)
         tfidfModel = TfidfModel(dictionary=self.dictionary[type])
         # tfidfModel = makeTfidfModel(self.dictionary)
         tfidfModel.save(filePath)
         print('train tfidfModel time:', datetime.now() - startTime)
     self.tfidfModel[type] = tfidfModel
     return tfidfModel
Example #30
def process():
    # read all the text files in the directory and build a corpus
    corpus = TextDirectoryCorpus("C://Users//Kumar Abhijeet//Project/Preprocess_data//JD//")
    # save word-id dictionary
    #corpus.dictionary.save_as_text('wordids_JD2.txt')
    # save matrix market format vectors
    MmCorpus.serialize('JD_bow.mm', corpus)

    # load word-id dictionary
    id2word = Dictionary.load('foobar.txtdic')
    # load matrix market format vectors
    mm = MmCorpus('JD_bow.mm')

    # train tfidf
    tfidf = TfidfModel(mm, id2word=id2word, normalize=True)
    # save tfidf model
    tfidf.save('tfidf_JD.model')
    # save tfidf vectors in matrix market format
    MmCorpus.serialize('tfidf_JD.mm', tfidf[mm])
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute(
        'SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20,
                                    no_above=0.5,
                                    keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #32
def tfidf(model_name):
    """
    tf-idf
    """
    print("prepare data.")
    os.chdir("data")
    set_data(mode="doc")
    sentences = bow_read_docs(folder_name="tmp_file")

    dic = Dictionary(sentences)
    ## Exclude words that appear fewer than 20 times and words that appear in 30% or more of the documents
    ## dic.filter_extremes(no_below = 20, no_above = 0.3)
    bow_corpus = [dic.doc2bow(d) for d in sentences]

    print("train model.")
    model = TfidfModel(bow_corpus)

    print("save model.")
    os.chdir("..")
    model.save("model/%s" % model_name)
Example #33
class TFIDF(object):
    def __init__(self, max_workers, preprocessor=None):
        self.max_workers = max_workers
        self.log = logging.getLogger('tfidf_model')
        self.preprocessor = preprocessor if preprocessor is not None else Preprocessor(
            max_workers=max_workers)
        self.model, self.dictionary = None, None

    # @timed
    def train(self, doc_list):
        self.log.info(
            'TFIDF.train called. Starting preprocessing %d documents',
            len(doc_list))
        preprocessed_docs = self.preprocessor.process_docs(doc_list)

        self.log.info('Preprocessing ended. Building dictionary')
        self.dictionary = Dictionary(preprocessed_docs)

        self.log.info('Dictionary built with %d words. Building corpus',
                      len(self.dictionary))
        corpus = [self.dictionary.doc2bow(line)
                  for line in preprocessed_docs]  # convert dataset to BoW format

        self.log.info('Built corpus')
        self.model = TfidfModel(corpus)

    def save_model(self, model_path):
        self.log.info('Saving TFIDF model to file: %s', model_path)
        self.model.save(model_path)

    def save_dictionary(self, dict_path):
        self.log.info('Saving dictionary to file: %s', dict_path)
        self.dictionary.save(dict_path)

    def build_corpus(self, doc_list, dictionary):
        return list(map(dictionary.doc2bow, doc_list))

    @staticmethod
    def with_url_handling(max_workers):
        return TFIDF(max_workers,
                     preprocessor=WithUrlPreprocessor(max_workers=max_workers))
Example #34
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load back the id->word mapping directly from file
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm")
    mm = MmCorpus(fin)

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # save the TfidfModel instance to file
    fout = path.join(datadir, "reuters21578.tfidf.model")
    tfidf.save(fout)

    # save TF-IDF vectors in matrix market format
    fout = path.join(datadir, "reuters21578.tfidf.mm")
    MmCorpus.serialize(fout, tfidf[mm], progress_cnt=10000)
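The serialized tf-idf vectors can later be streamed back without retraining; a brief sketch reusing the paths above:

from os import path
from gensim.corpora import MmCorpus
from gensim.models import TfidfModel

tfidf = TfidfModel.load(path.join(datadir, "reuters21578.tfidf.model"))
tfidf_vectors = MmCorpus(path.join(datadir, "reuters21578.tfidf.mm"))
for doc in tfidf_vectors:    # each doc is a sparse [(term_id, weight), ...] list
    print(doc[:5])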
Example #35
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')
    # Now run LSI on TF-IDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #36
def build_tfidf_model(data_directory, corpus_path, wiki_text_output_path, model_output_path, multiwords=True, druid_cutoff_score=0.3):

    stemmer = nltk.stem.PorterStemmer()
    tokenid_dictionary = corpora.Dictionary()

    if not exists(wiki_text_output_path):
        logger.info('Converting ' + str(corpus_path) + ' into plain text file: ' + wiki_text_output_path)
        # Convert Wikipedia XML dump into .txt format
        wikidump2text.convert(corpus_path, wiki_text_output_path)
    else:
        logger.info('Found %s, not converting from the raw bz2 file.', wiki_text_output_path)

    # Load Multiword Expressions as Dictionary
    stopwords_path = join(data_directory, 'stopwords_en.txt')
    
    if multiwords:
        druid_path = join(data_directory, 'druid_en.bz2')
        druid_dict = druid.DruidDictionary(druid_path, stopwords_path, cutoff_score=druid_cutoff_score)
        logger.info('Loaded Druid with cutoff ' + str(druid_cutoff_score))
    else:
        druid_dict = None

    logger.info("Building tfidf model...")
    start_time = time.time()

    if multiwords:
        logger.info('Using druid_en.bz2 in  ' + data_directory + ' as multiword dictionary.')
        articles = TextCorpus(wiki_text_output_path, druid_dict, multiwords=True)  # a memory-friendly iterator
    else:
        logger.info('Using no multiword dictionary, just single words')
        articles = TextCorpus(wiki_text_output_path, None, multiwords=False)
    
    tokenid_dictionary.add_documents(articles)


    model = TfidfModel(BowCorpus(wiki_text_output_path, druid_dict, tokenid_dictionary, multiwords=multiwords), id2word=tokenid_dictionary)
    model.save(model_output_path)

    logger.info("Finished building tfidf model. Time needed: " + str(time.time() - start_time))
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
def data_directory():
    return os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')

corpus_dir = os.path.join(data_directory(), 'audio_transcripts')
model_filename = os.path.join(data_directory(), 'conversation.tfidf')

stemmer = nltk.stem.PorterStemmer()
corpus = PlaintextCorpusReader(corpus_dir, '.*\.txt$')  # a memory-friendly iterator
dictionary = corpora.Dictionary()

# Train bigram transformer
class TextCorpus(object):
    def __iter__(self):
        for file in corpus.fileids():
            yield [word.lower() for word in corpus.words(file)]

bigram_transformer = Phrases(TextCorpus())

for file in corpus.fileids():
    chunks = bigram_transformer[[word.lower() for word in corpus.words(file)]]
    dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])


class BowCorpus(object):
    def __iter__(self):
        for file in corpus.fileids():
            chunks = bigram_transformer[[word.lower() for word in corpus.words(file)]]
            yield dictionary.doc2bow([stemmer.stem(chunk) for chunk in chunks])

model = TfidfModel(BowCorpus(), id2word=dictionary)
model.save(model_filename)
Example #39
			corpus = TextCorpus.load(f_bow)
		else:
			corpus = TextCorpus(f_corpus)
#			corpus.save(f_bow)

	# filter dictionary
	corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
	corpus.dictionary.save(f_dict)
	corpus.save(f_bow)

	# tf-idf model
	if os.path.exists(f_tfidf):
		tfidf = TfidfModel.load(f_tfidf)
	else:
		tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
		tfidf.save(f_tfidf)

	# TRAINING

	# lsa model
	if not os.path.exists(f_lsa):
		lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim)
		lsa.save(f_lsa)

	# word2vec model
	class MyCorpus():
		def __iter__(self):
			for d in corpus.get_texts():
				yield [w for w in d if w in corpus.dictionary.token2id]
	if not os.path.exists(f_w2v):
		w2v = Word2Vec(MyCorpus(), size=w2v_dim, min_count=1, window=5)
Example #40
class TfidfVectorizer():
    """
    Transform text to tf-idf representation
    """

    def __init__(self):

        self.base_path = os.path.dirname(__file__)
        self.dictionary_path = os.path.join(self.base_path, "dictionary")
        self.tf_idf_model_path = os.path.join(self.base_path, "tfidf")

        self.stemmer = NepStemmer()
        self.tf_idf_model = None

    def get_tokens(self, document):
        if not self.stemmer:
            raise Exception("Stemmer not available")

        return self.stemmer.get_stems(document)

    def construct_model(self, documents):
        logging.basicConfig(
            format='%(asctime)s:%(levelname)s:%(message)s',
            level=logging.INFO
        )

        logging.info("Obtaining word tokens")
        tokens = [self.get_tokens(document) for document in documents]
        # self.tf_idf_model = TfidfModel(tokens)

        logging.info("Constructing dictionary")
        self.dictionary = Dictionary(tokens)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
        self.dictionary.compactify()
        self.dictionary.save(self.dictionary_path)

        logging.info("Constructing TF-IDF model")
        self.tf_idf_model = TfidfModel(dictionary=self.dictionary)
        self.tf_idf_model.save(self.tf_idf_model_path)

    def load_data(self):

        if not self.tf_idf_model:
            if not os.path.exists(self.tf_idf_model_path):
                raise Exception('TF-IDF model file not found')

            self.dictionary = Dictionary.load(self.dictionary_path)
            self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)

    def doc2vector(self, document):
        """ Returns the sparse tf-idf vector for given document """

        tokens = self.get_tokens(document)
        bag_of_words = self.dictionary.doc2bow(tokens)

        return (self.tf_idf_model[bag_of_words])

    def obtain_feature_vector(self, document):
        """
        Returns a single dense tf-idf vector for a given document
        """

        self.load_data()

        no_of_features = len(self.tf_idf_model.idfs)

        tf_idf_vector = matutils.sparse2full(
            self.doc2vector(document),
            no_of_features
        ).reshape(1, -1)

        return tf_idf_vector

    def obtain_feature_matrix(self, documents):
        """
        Returns the tf-idf dense matrix for the given documents
        """

        self.load_data()

        input_matrix_sparse = [
            self.doc2vector(x)
            for x in documents
        ]

        no_of_features = len(self.tf_idf_model.idfs)

        input_matrix = matutils.corpus2dense(
            input_matrix_sparse,
            no_of_features
        ).transpose()

        return input_matrix
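A hedged usage sketch for the vectorizer class above; the documents are placeholders and it assumes NepStemmer can tokenize them:

documents = ['पहिलो कागजात यहाँ छ', 'दोस्रो कागजात यहाँ छ']    # placeholder Nepali text
vectorizer = TfidfVectorizer()
vectorizer.construct_model(documents)                    # builds and saves dictionary + tf-idf model
matrix = vectorizer.obtain_feature_matrix(documents)     # dense (n_docs x n_features) matrix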
Example #41
def main(train, model, dic):
    logging.basicConfig(level=logging.INFO)
    corpus = SentenceDocCorpus(train)
    tfidf = TfidfModel(corpus)
    tfidf.save(model)
    corpus.dictionary.save(dic)
    elif not opts.scaling:
        scaling = None
    else:
        raise ValueError("Only tfidf scaling is supported")

    word_model = opts.word_model

    if word_model:
        logging.info("Building word model")
        corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
    else:
        corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

    voc = Dictionary(corpus)
    voc.filter_extremes(no_below=cutoff)
    voc.compactify()

    bow_corpus = (voc.doc2bow(art) for art in corpus)

    tfidf = None

    if scaling == 'tfidf':
        tfidf = TfidfModel(bow_corpus)
        bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

    model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
    model.save(model_fn)

    if tfidf:
        tfidf.save(model_fn + '.tfidf')
Example #43
    # Remove stop words (additional removal of common words used in spoken language)
    stop_ids = []
    with open(stop_words_file, 'r') as infile:
        for line in infile:
            try:
                stop_ids.append(wiki.dictionary.token2id[line.lower().strip()])
            except KeyError:
                continue
    wiki.dictionary.filter_tokens(bad_ids=stop_ids)

    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #44
# In[42]:

tfidf.num_docs


# In[43]:

tfidf.num_nnz


# In[44]:

tfidf.save(os.path.join(DATA_PATH, 'tfidf'))


# In[45]:

tfidf2 = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf'))


# In[46]:

tfidf2.num_nnz


class TextCorpus(object):
    def __init__(self, filename):
        self.corpus = codecs.open(filename, 'r', encoding='utf-8')

    def __iter__(self):
        # One line contains one wiki article.
        for line in self.corpus:
            ngrams = druid_dict.find_ngrams(line.lower().split())
            yield [stemmer.stem(token) for token in ngrams]

articles = TextCorpus(wiki_text_output_path)  # a memory-friendly iterator
dictionary.add_documents(articles)


class BowCorpus(object):
    def __init__(self, filename):
        self.corpus = codecs.open(filename, 'r', encoding='utf-8')

    def __iter__(self):
        for line in self.corpus:
            ngrams = druid_dict.find_ngrams(line.lower().split())
            stemmed_article = [stemmer.stem(token) for token in ngrams]
            yield dictionary.doc2bow(stemmed_article)

model = TfidfModel(BowCorpus(wiki_text_output_path), id2word=dictionary)
model.save(model_output_path)

logger.info("Finished building tfidf model. Time needed: " + str(time.time() - start_time))
Example #46
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program
    wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=20, keep_n=DEFAULT_DICT_SIZE)
    wiki.dictionary.save_as_text(outp + '_wordids.txt')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt')

    # build tfidf, ~50min
    tfidf = TfidfModel(wiki, normalize=True)
    tfidf.save('tfidf_all_words')



    logger.info("finished running %s" % program)
Example #47
        self.dictionary.save_as_text("wiki_en_wordids.txt")

    def __iter__(self):
        for tokens in iter_documents():
            yield self.dictionary.doc2bow(tokens)


corpus = MyCorpus()  # create a dictionary
corpora.MmCorpus.serialize("wiki_en_corpus.mm", corpus)  # store to disk, for later use

# for vector in corpus: # convert each document to a bag-of-word vector
#    print vector

print "Create models"
tfidf_model = TfidfModel(corpus)
tfidf_model.save("wiki_en_tfidf.model")

# lsi_model = LsiModel(corpus)

# topic_id = 0
# for topic in lsi_model.show_topics():
#    topic_id+=1
#    print "TOPIC (LSI) " + str(topic_id) + " : " + topic

# lsi_model.print_topic(20, topn=10)
# corpus_lsi = lsi_model[corpus]

corpus_tfidf = tfidf_model[corpus]

lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
# corpus_lsi_2 = lsi_model_2[corpus]