def main():
    news_df = pd.read_pickle("news_df.pkl")

    # Bag Of Words - Vocab 1
    dictionary: Dictionary = Dictionary.load('vocabulary1.gensim')
    bow_voc1_corpus = [
        dictionary.doc2bow(doc_tokens) for doc_tokens in news_df['DocTokens']
    ]
    pickle.dump(bow_voc1_corpus, open('bow_voc1_corpus.pkl', 'wb'))

    # Bag Of Words - Vocab 2
    dictionary2: Dictionary = Dictionary.load('vocabulary2.gensim')
    bow_voc2_corpus = [
        dictionary2.doc2bow(doc_tokens) for doc_tokens in news_df['DocTokens']
    ]
    pickle.dump(bow_voc2_corpus, open('bow_voc2_corpus.pkl', 'wb'))

    # TF-IDF - Vocab 1
    tfidf1 = TfidfModel(bow_voc1_corpus, smartirs='ntc')
    tfidf_voc1_corpus = tfidf1[bow_voc1_corpus]
    pickle.dump(tfidf_voc1_corpus, open('tfidf_voc1_corpus.pkl', 'wb'))

    # TF-IDF - Vocab 2
    tfidf2 = TfidfModel(bow_voc2_corpus, smartirs='ntc')
    tfidf_voc2_corpus = tfidf2[bow_voc2_corpus]
    pickle.dump(tfidf_voc2_corpus, open('tfidf_voc2_corpus.pkl', 'wb'))
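A minimal sketch of reading one of the pickled corpora back in a downstream step; the file names follow main() above, while the inspection code itself is illustrative.

import pickle

# Hypothetical downstream consumer of the artifacts written by main().
with open('bow_voc1_corpus.pkl', 'rb') as f:
    bow_voc1_corpus = pickle.load(f)

print(len(bow_voc1_corpus))      # number of documents
print(bow_voc1_corpus[0][:10])   # first document as (token_id, count) pairs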
def __init__(self, name, dirs="D:\\Stack Flow\\data\\", n=200, flag=False):
    '''
    name: keyword-document file name; loading it gives the context of the whole keyword document
    dirs: directory containing the data files
    n: bias used to extract the n smallest-IDF words (default 200)
    flag: if True, compute the IDF and TF statistics on construction
    '''
    self.n = n
    self.dir = dirs
    self.filename = name
    self.id = name.replace(".kdoc", "")
    d = Dictionary.load(dirs + "keyword.dict")
    self.tag = d[int(self.id)]
    context = open(dirs + "kdoc\\" + name, 'r').read()
    self.stopword = open(dirs + "stopword.txt", 'r').read().split('\n')
    doc = context.split("\n")
    self.raw = context
    self.context = [[
        w for w in word.split()
        if w not in self.stopword and w[0].isalpha()
    ] for word in doc if len(word) > 10]
    self.dict = Dictionary.load(dirs + "facebook.dict")
    self.cod_context = [self.dict.doc2bow(doc) for doc in self.context]
    self.model = TfidfModel(self.cod_context)
    self.tf = TfidfModel(dictionary=self.dict)
    self.mixture = {}
    self.flag = flag
    if flag:
        self.idfs()
        self.tfs()
def build_model(self):
    start = time.time()
    tickers = [
        i for i in os.listdir(dir_cleaned_news) if i.endswith(".csv")
    ]
    corpus = []
    for ticker in tickers:
        df = pd.read_csv(dir_cleaned_news + ticker, index_col=0)
        for tokenized_doc in tokenizer(df['content'], self.phraser):
            corpus += [self.dic.doc2bow(tokenized_doc)]
    if self.wlocal and self.wglobal:
        self.get_model = TfidfModel(corpus,
                                    dictionary=self.dic,
                                    wlocal=self.wlocal,
                                    wglobal=self.wglobal,
                                    smartirs=self.smartirs)
    else:
        self.get_model = TfidfModel(corpus,
                                    dictionary=self.dic,
                                    smartirs=self.smartirs)
    end = time.time()
    print(self.name, " finished ", end - start, " seconds")
def create_model(numTopics, docs, vmType, alpha):
    dct = Dictionary(docs)
    corpus = [dct.doc2bow(line) for line in docs]

    # Build Vector Models ######
    if vmType == 'B':
        # Binary Model
        model = TfidfModel(corpus, smartirs='bnn')  # binary term frequency weighting
    elif vmType == 'T':
        # TFIDF Model
        model = TfidfModel(corpus, smartirs='tfn')  # fit tfidf
    elif vmType == 't':
        # Term Frequency model
        model = TfidfModel(corpus, smartirs='tnn')  # term frequency only
    else:
        # An invalid parameter would leave `model` undefined below, so fail fast.
        raise ValueError("Invalid Vector Model parameter.")

    # Build LDA Model ##########
    corpus = model[corpus]
    lda = LdaModel(corpus=corpus, id2word=dct, num_topics=numTopics, alpha=alpha)
    return lda, corpus, dct
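A hedged usage sketch for create_model() above; the token lists are made up. The smartirs strings follow gensim's SMART notation (term weighting, document weighting, normalization): 'bnn' is binary counts only, 'tfn' is raw term frequency with idf and no normalization, and 'tnn' is raw counts only.

# Hypothetical call; `tokenized_docs` is an invented toy corpus.
tokenized_docs = [
    ["tasty", "pizza", "great", "service"],
    ["slow", "service", "cold", "pizza"],
    ["great", "atmosphere", "tasty", "dessert"],
]
lda, weighted_corpus, dct = create_model(numTopics=2, docs=tokenized_docs,
                                         vmType='T', alpha='auto')
print(lda.print_topics(num_topics=2, num_words=4))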
def generate_model(dictionary, bow_corpus, corpus_path):
    try:
        tfidf = TfidfModel.load(corpus_path + 'wiki-tfidf.model')
        print('tfidf model loaded')
    except:
        tfidf = TfidfModel(bow_corpus, dictionary)
        tfidf.save(corpus_path + 'wiki-tfidf.model')
    return tfidf
def create_tfidf(self):
    tfidf_model = TfidfModel(self.bag_of_words)
    self.bag_of_words = [
        tfidf_model[vector]
        for vector in tqdm(self.bag_of_words, desc="Creating tf-idf matrix")
    ]
def topic_extraction(reviews, title):
    nooftopics = 10
    # Join bigrams into each review to capture phrases like 'tasty pizza'.
    for id, review in enumerate(reviews):
        reviews[id] = review + ["_".join(w) for w in ngrams(review, 2)]
    # Create a dictionary of words from the overall reviews.
    dicionary = Dictionary(reviews)
    # Change reviews into a bag-of-words / unigram model; here the feature is term frequency.
    corpus = [dicionary.doc2bow(review) for review in reviews]
    # Transform from term frequency to a tf-idf matrix.
    tfidf = TfidfModel(corpus)
    tfidf_corpus = tfidf[corpus]
    # Make the LDA models.
    lda_tf = ldamodel.LdaModel(corpus, id2word=dicionary, alpha='auto',
                               num_topics=nooftopics, passes=5)
    lda_tfidf = ldamodel.LdaModel(tfidf_corpus, id2word=dicionary, alpha='auto',
                                  num_topics=nooftopics, passes=5)
    # with open("lda.pkl", "wb") as f:
    #     pickle.dump(lda, f)
    topic_list = lda_tf.print_topics(num_topics=10, num_words=10)
    topic_list_tfidf = lda_tfidf.print_topics(num_topics=10, num_words=10)
    with open("topic_list.pkl", "wb") as f:
        pickle.dump(topic_list, f)
    draw_graph_for_topics(topic_list, title, nooftopics)
    return True
def __init__(self, documents):
    print("Initializing GloVe")
    if isinstance(documents[0], list):
        print("It is a list")
        documents = [[" ".join(document)] for document in documents
                     if isinstance(document, list)]
        documents = [str(document) for document in documents]
    self.corpus = [
        preprocess(document) for document in documents
        if type(document) is str
    ]
    self.documents = documents
    '''
    Then we create a similarity matrix, that contains the similarity between each
    pair of words, weighted using the term frequency:
    '''
    # Load the model: this is a big file, can take a while to download and open
    glove = api.load("glove-wiki-gigaword-50")
    print("Document loaded")
    self.similarity_index = WordEmbeddingSimilarityIndex(glove)
    self.dictionary = Dictionary(self.corpus)
    self.tfidf = TfidfModel(dictionary=self.dictionary)
    print("Model is running")
    # Create the term similarity matrix.
    self.similarity_matrix = SparseTermSimilarityMatrix(
        self.similarity_index, self.dictionary, self.tfidf)
    print("Everything has been initialized")
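A hedged usage sketch for the initializer above; GloveRecommender is a stand-in name for whatever class owns this __init__, preprocess is the same helper the class already uses, and SoftCosineSimilarity is gensim's query index over the term similarity matrix.

from gensim.similarities import SoftCosineSimilarity

# Hypothetical usage; the class name is invented.
model = GloveRecommender(["the cat sat on the mat", "dogs are loyal pets"])
index = SoftCosineSimilarity(
    [model.tfidf[model.dictionary.doc2bow(doc)] for doc in model.corpus],
    model.similarity_matrix)
query = model.tfidf[model.dictionary.doc2bow(preprocess("a kitten on a rug"))]
print(index[query])  # soft cosine similarity of the query against each document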
def train_tfidf_model():
    corpus, dictionary, titles = retrieve_data()
    # first, construct tfidf
    print("tfidf")
    tfidf = TfidfModel(corpus)  # initialize model
    corpus_tfidf = tfidf[corpus]
    return corpus_tfidf, dictionary, titles
def train(self):
    if not os.path.exists(os.path.join(DATA_ANSWER_PATH, 'tfidf.model')):
        traindata = p.load(open(CORPUS_PATH, 'rb'))
        for qid in self.trainset:
            duplicates = self.trainset[qid]['duplicates']
            for duplicate in duplicates:
                question = duplicate['rel_question']['tokens']
                traindata.append(question)

                rel_comments = duplicate['rel_comments']
                for rel_comment in rel_comments:
                    q2 = rel_comment['tokens']
                    traindata.append(q2)

        self.dict = Dictionary(traindata)  # fit dictionary
        corpus = [self.dict.doc2bow(line) for line in traindata]  # convert corpus to BoW format
        self.tfidf = TfidfModel(corpus)  # fit model
        self.dict.save(os.path.join(DATA_ANSWER_PATH, 'dict.model'))
        self.tfidf.save(os.path.join(DATA_ANSWER_PATH, 'tfidf.model'))
    else:
        self.dict = Dictionary.load(
            os.path.join(DATA_ANSWER_PATH, 'dict.model'))
        self.tfidf = TfidfModel.load(
            os.path.join(DATA_ANSWER_PATH, 'tfidf.model'))
def walid_similarity_query(self, answer: str, key: str):
    if len(answer) == 0 or len(key) == 0:
        return False
    if self.model_ready:
        documents = [answer, key]
        if self.verbose:
            print(f'{len(documents)} documents loaded and ready to preprocess')
        corpus = [self.preprocess(document) for document in documents]
        if self.verbose:
            print(f'{len(corpus)} documents loaded into corpus')
        dictionary = Dictionary(corpus)
        tfidf = TfidfModel(dictionary=dictionary)
        similarity_matrix = SparseTermSimilarityMatrix(
            self.similarity_index, dictionary, tfidf)
        answer_bow = dictionary.doc2bow(self.preprocess(answer))
        key_bow = dictionary.doc2bow(self.preprocess(key))
        # Measure soft cosine similarity
        scores = similarity_matrix.inner_product(answer_bow, key_bow,
                                                 normalized=True)
        return scores
    else:
        raise NotReadyError('Word embedding model is not ready.')
def _build_vocab(self, max_vocab_cnt):
    all_words = []
    for data in self.valid + self.non_valid:
        all_words.append(data["title"] + data["content"])
    vocab = Dictionary(all_words)
    raw_vocab_size = len(vocab)

    vocab.filter_extremes(no_below=5)
    vocab.filter_extremes(keep_n=max_vocab_cnt)
    len_1_words = list(
        filter(
            lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and
            w not in ["a", "i"] and True or False, vocab.values()))
    vocab.filter_tokens(list(map(vocab.token2id.get, len_1_words)))
    if self.config.use_dict == "seq" and self.config.enable_pad:
        vocab.token2id[PAD] = len(vocab)
    vocab.compactify()
    self.pad_wid = vocab.token2id.get(PAD)
    self.vocab_seq = vocab  # seq dictionary

    # build bow dictionary
    self.vocab_bow = copy.deepcopy(vocab)
    self.vocab_bow.filter_tokens(
        map(self.vocab_bow.token2id.get, STOPWORDS))  # filter stop words
    self.vocab_bow.compactify()
    if self.config.tfidf:
        tfidf_corpus = [self.vocab_bow.doc2bow(line) for line in all_words]
        self.tfidf_model = TfidfModel(tfidf_corpus)

    print("Load corpus with non_valid size %d, valid size %d, "
          "raw vocab size %d seq vocab size %d, bow vocab size %d" %
          (len(self.non_valid), len(self.valid), raw_vocab_size,
           len(self.vocab_seq), len(self.vocab_bow)))
def train(self, classdict, nb_topics, *args, **kwargs):
    """ Train the topic modeler.

    :param classdict: training data
    :param nb_topics: number of latent topics
    :param args: arguments to pass to the `train` method for gensim topic models
    :param kwargs: arguments to pass to the `train` method for gensim topic models
    :return: None
    :type classdict: dict
    :type nb_topics: int
    """
    self.nb_topics = nb_topics
    self.generate_corpus(classdict)
    if self.toweigh:
        self.tfidf = TfidfModel(self.corpus)
        normcorpus = self.tfidf[self.corpus]
    else:
        self.tfidf = None
        normcorpus = self.corpus
    self.topicmodel = gensim_topic_model_dict[self.algorithm](
        normcorpus, num_topics=self.nb_topics, *args, **kwargs)
    self.matsim = MatrixSimilarity(self.topicmodel[normcorpus])
    # change the flag
    self.trained = True
def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print("Gathering sentences and removing stopwords")
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

        # remove stop words and tokenize
        document = [
            word for word in TreebankWordTokenizer().tokenize(line.lower())
            if word not in stopwords
        ]
        documents.append(document)
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")

    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)

    # print(documents)
    print(len(documents), "documents read")
    print(len(self.dictionary), "unique tokens", self.dictionary)
def create_tfidf_from_papers(
    path_to_jsonl_index: Path = BIOPAPERS_JSON_PATH,
    path_to_bow: Path = BOW_PATH,
    outfile: Path = TFIDF_VECTORIZER,
) -> TfidfModel:
    """
    Creates TFIDF model from BOW corpora.

    Parameters
    ----------
    path_to_jsonl_index: Path
        Path to json lines index
    path_to_bow: Path
        Path to Bag of Words Dictionary
    outfile: Path
        Path to TFIDF vectorizer

    Returns
    -------
    tfidf_model: TfidfModel
        Gensim TFIDF Model
    """
    # Load dictionary
    dictionary = Dictionary.load(str(path_to_bow))
    # Load corpus generator
    corpus = BiopapersCorpus(dictionary, path_to_jsonl_index)
    # Train TFIDF
    tfidf_model = TfidfModel(corpus)
    # Save TFIDF model to file:
    tfidf_model.save(str(outfile))
    return tfidf_model
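A hedged usage sketch for create_tfidf_from_papers(); the paths are illustrative and assume the JSONL index and the saved BOW dictionary already exist.

# Hypothetical paths; adjust to wherever the index and dictionary actually live.
tfidf_model = create_tfidf_from_papers(
    path_to_jsonl_index=Path("data/biopapers.jsonl"),
    path_to_bow=Path("data/biopapers_bow.dict"),
    outfile=Path("models/biopapers_tfidf.model"),
)
dictionary = Dictionary.load("data/biopapers_bow.dict")
query_bow = dictionary.doc2bow("crispr cas9 gene editing".split())
print(tfidf_model[query_bow])  # (token_id, tf-idf weight) pairs for the query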
def tfidf_filter(dataset, threshold):
    tokens = []
    #print('tokenizing documents...')
    for doc in dataset:
        #doc = clean_text(doc)
        tokenize = regTokenize(doc)
        tokens.append(tokenize)
    #print('creating dictionary...')
    dct = Dictionary(tokens)
    corpus = [dct.doc2bow(line) for line in tokens]
    #print(len(corpus))
    #print('creating tf-idf model...')
    model = TfidfModel(corpus, id2word=dct)
    low_value_words = []
    for bow in corpus:
        low_value_words += [
            id for id, value in model[bow] if (value < threshold)
        ]  # and dct[id] != "reforma_tributaria")]
    #print("low_value_words:", len(low_value_words))
    dct.filter_tokens(bad_ids=low_value_words)
    new_corpus = [dct.doc2bow(doc) for doc in tokens]
    #print(len(new_corpus))
    corp = []
    for doc in new_corpus:
        corp.append([dct[id] for id, value in doc])
    return corp
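A hedged usage sketch for tfidf_filter(); the documents and threshold are arbitrary, and regTokenize is assumed to be the project's own word tokenizer.

# Hypothetical call with a toy dataset; tokens whose TF-IDF weight falls below
# the threshold are removed from the dictionary and from every document.
docs = [
    "the tax reform was approved today",
    "the senate debated the tax reform",
    "the weather was sunny today",
]
filtered_docs = tfidf_filter(docs, threshold=0.3)
print(filtered_docs)  # documents reduced to their higher-weight tokens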
def summarize(self, text):
    self.sentences = self.factory.text2sentences(text)
    self.num_sentences = len(self.sentences)
    self.corpus = SentenceCorpus(self.sentences, self.max_dictionary_size)
    self.model = TfidfModel(self.corpus.bows,
                            id2word=self.corpus.dictionary,
                            normalize=True)
    self._inject_tfidfs()
    self._build_matrix()
def build_tfid_model(dictionary, corpus, should_rebuild):
    tfid = list()

    # DEBUG
    should_rebuild = True

    if not should_rebuild:
        try:
            print('Loading TFID Model backup...')
            tfid_file = utils.get_file_path(cfg.TFID_BACKUP)
            print('TFID file = {}'.format(tfid_file))
            tfid = TfidfModel.load(tfid_file)
        except Exception as exc:
            utils.print_exception_details('Building TFID Model', exc)
    else:
        print('Building TFID Model...')
        tfid = TfidfModel(corpus)
        print('Done!')
        # Save Model Structures
        TFID_FILE = utils.get_file_path(cfg.TFID_BACKUP)
        tfid.save(TFID_FILE)

    return tfid
def tfidf_similarity(corpus, dictionary, categories, seed_article_title):
    mm, metadata, index = corpus

    # Create tfidf model
    tfidf = TfidfModel(dictionary=dictionary)

    # Get offset of seed article
    seed_article_offset = None
    for article_index, offset in enumerate(index):
        article_id, article_title = metadata[article_index]
        if article_title == seed_article_title:
            seed_article_offset = offset

    # Load seed article
    if seed_article_offset is None:
        logging.error('Seed article "%s" not found', seed_article_title)
    else:
        logging.info('Loading seed article "%s"', seed_article_title)
        seed_article = dict(mm.docbyoffset(seed_article_offset))

    def tfidf_similarity_query(title, content):
        tokens = wikicorpus.tokenize(wikicorpus.filter_wiki(content))
        vector = dict(tfidf[dictionary.doc2bow(tokens)])
        return cosine_similarity(seed_article, vector)

    return SearchQuery(tfidf_similarity_query)
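cosine_similarity() is not shown in the snippet above; a possible shape for it, assuming both arguments are {token_id: weight} dicts built from TF-IDF vectors, is sketched below. This is a hedged stand-in, not the original implementation.

import math

# Hedged sketch of the helper assumed by tfidf_similarity().
def cosine_similarity(vec_a, vec_b):
    dot = sum(weight * vec_b.get(token_id, 0.0)
              for token_id, weight in vec_a.items())
    norm_a = math.sqrt(sum(weight * weight for weight in vec_a.values()))
    norm_b = math.sqrt(sum(weight * weight for weight in vec_b.values()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)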
def tfidf_model(self):
    print('Logging Info - Get Tf-idf model...')
    tfidf_model_path = os.path.join(FEATURE_DIR, '{}_tfidf.model').format(self.genre)
    dict_path = os.path.join(FEATURE_DIR, '{}_tfidf.dict').format(self.genre)
    if os.path.exists(tfidf_model_path):
        dictionary = pickle_load(dict_path)
        tfidf_model = TfidfModel.load(tfidf_model_path)
    else:
        corpus = [
            text.split() for text in self.train_data['premise'] +
            self.train_data['hypothesis'] + self.dev_data['premise'] +
            self.dev_data['hypothesis'] + self.test_data['premise'] +
            self.test_data['hypothesis']
        ]
        dictionary = corpora.Dictionary(corpus)
        corpus = [dictionary.doc2bow(text) for text in corpus]
        tfidf_model = TfidfModel(corpus)
        del corpus
        tfidf_model.save(tfidf_model_path)
        pickle_dump(dict_path, dictionary)
    return dictionary, tfidf_model
def train(file=DATA_FILE, type=JSON):
    delete_previous_models()
    faq_df = get_dataframe(os.path.join(DATA_DIR, file), type=type)
    faq_df = clean_data(faq_df)
    faq_df[PROCESSED_QUESTION] = faq_df[CLEAN_QUESTION].apply(preprocess)
    faq_df[PROCESSED_ANSWER] = faq_df[CLEAN_ANSWER].apply(preprocess)
    print('Preprocessing Done')
    if DEBUG:
        print(faq_df.head())
    for mode in modes:
        model = modes[mode]
        dictionary = corpora.Dictionary(faq_df[model.column])
        dictionary.save(os.path.join(MODEL_DIR, model.dictionary))
        corpus = faq_df[model.column].map(dictionary.doc2bow)
        if DEBUG:
            print(f'{model.corpus} generated')
            print(corpus.head())
        corpora.MmCorpus.serialize(os.path.join(MODEL_DIR, model.corpus), corpus)
        tfidf_model = TfidfModel(corpus)
        if DEBUG:
            print(f'{model.tfidf} generated')
        tfidf_model.save(os.path.join(MODEL_DIR, model.tfidf))
        tfidf = tfidf_model[corpus]
        lda_model = LdaMulticore(corpus=tfidf, id2word=dictionary, num_topics=30)
        lda_model.save(os.path.join(MODEL_DIR, model.model))
        if DEBUG:
            print(f'{model.model} generated')
            print(lda_model.print_topics(5))
    print('Training completed')
def get_corpus(docs):
    print("Building corpus ...")
    tfidf_model = None

    # load corpus from disk
    if ARGS.load_corpus:
        corpus = MmCorpus(ARGS.path_corpus)
    else:
        corpus = [dictionary.doc2bow(doc) for doc in docs]
        # serialize corpus to disk to prevent memory problems if corpus gets too large
        MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_bow.mm', corpus)
        corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_bow.mm')

    if ARGS.corpus_type == "TFIDF":
        tfidf_model = TfidfModel(corpus)
        tfidf_model.save(ARGS.save_dir + "/models/tfidf_model.mm")
        corpus = tfidf_model[corpus]
        # serialize corpus to disk to prevent memory problems if corpus gets too large
        MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_tfidf.mm', corpus)
        corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_tfidf.mm')

    return corpus, tfidf_model
def recommend(self, article):
    article.tokenize().remove_stop_words().lemmatize()
    n_grams = article.get_n_grams(2)
    vocab = Dictionary([n_grams])
    corpus = [vocab.doc2bow(n_grams)]  # convert corpus to BoW format
    model = TfidfModel(corpus)
    vector = 0
    for n in model[corpus[0]]:
        vector += n[1]
    distances = []
    for c in self.new_centroids:
        distance = math.fabs(np.linalg.norm(vector - c))
        distances.append(distance)
    min = np.array(distances).argmin()
    all_recommended = []
    i = 0
    for c in self.clusters:
        if c == min:
            all_recommended.append(i)
        i += 1
    print(all_recommended)
    recommended_article_ids = []
    for i in range(0, 3):
        random_article = random.choice(all_recommended)
        recommended_article_ids.append(random_article)
    return recommended_article_ids
def getSparseMatrixSimilarity(keyword, texts):
    # 1. Tokenize the text collection into lists of words
    texts = [jieba.lcut(text) for text in texts]

    # 2. Build a dictionary from the text collection and get the number of features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)

    # 3.1 Using the dictionary, convert the tokenized texts into sparse vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # 3.2 Likewise, convert the search keyword into a sparse vector with the dictionary
    kw_vector = dictionary.doc2bow(jieba.lcut(keyword))

    # 4. Create a TF-IDF model and train it on the corpus
    tfidf = TfidfModel(corpus)

    # 5. Apply the trained TF-IDF model to the searched texts and the keyword
    tf_texts = tfidf[corpus]  # the corpus serves as the searched texts
    tf_kw = tfidf[kw_vector]

    # 6. Compute similarities
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    for e, s in enumerate(similarities, 1):
        print('Similarity between kw and text%d: %.2f' % (e, s))
    print(sparse_matrix)
    print(similarities)
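A hedged usage sketch for getSparseMatrixSimilarity(); the texts and keyword are arbitrary examples, and jieba handles the tokenization inside the function.

# Hypothetical call: plain strings in, per-text similarity scores printed by the function.
texts = ['今天天气很好', '今天天气不好', '股票市场上涨了']
getSparseMatrixSimilarity('天气', texts)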
def samilarRate(texts, keyword):
    # Takes texts and keyword: the text collection and the search term
    # 1. Tokenize the text collection into lists of words
    texts = [lcut(text) for text in texts]
    # 2. Build a dictionary from the text collection and get the number of features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)
    # 3.1 Using the dictionary, convert the tokenized texts into sparse vectors (the corpus)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # 3.2 Likewise, convert the search keyword into a sparse vector with the dictionary
    kw_vector = dictionary.doc2bow(lcut(keyword))
    # 4. Create a TF-IDF model and train it on the corpus
    tfidf = TfidfModel(corpus)
    # 5. Apply the trained TF-IDF model to the searched texts and the keyword
    tf_texts = tfidf[corpus]  # the corpus serves as the searched texts
    tf_kw = tfidf[kw_vector]
    # 6. Compute similarities
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    result = []
    sorft = []
    for e, s in enumerate(similarities, 1):
        result.append('Similarity between kw and text%d: %.2f' % (e, s))
        sorft.append(s)
    return result, sorft
def tfidf_w2v_top5w(all_docs_prepro):
    # TFIDF MODEL
    exists = os.path.isfile('embedding/models/tfidf_all.model')
    if exists:
        print('Tfidf embedding model already existing')
    else:
        dct = Dictionary(all_docs_prepro)  # fit dictionary
        corpus = [dct.doc2bow(line) for line in all_docs_prepro]  # convert corpus to BoW format
        model_tfidf = TfidfModel(corpus)
        word_path = 'embedding/models/tfidf_all.model'
        model_tfidf.save(word_path)

    # WORD2VEC MODEL
    exists = os.path.isfile('embedding/models/word2vec_all.model')
    if exists:
        print('Word2vec embedding model already existing')
    else:
        print('Training word2vec on all answers')
        word_path = "embedding/models/word2vec_all.model"
        word_tempfile = get_tmpfile(word_path)
        word_model = Word2Vec(all_docs_prepro, size=128, window=5,
                              min_count=1, workers=4)
        word_model.save(word_path)
def train(self):
    # `corpus` and `dictionary` are assumed to be defined in an enclosing or module scope.
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda = LdaMulticore(corpus=corpus_tfidf, id2word=dictionary, num_topics=100)
    lda.save('lda.model')
def cluster_data(state):
    # Bigram model
    data_words_bigrams = make_bigrams(state)
    INPUT = data_words_bigrams
    # Create Dictionary
    id2word = corpora.Dictionary(INPUT)
    # Create Corpus
    texts = INPUT
    # Filter out words that occur in fewer or more documents than the thresholds
    id2word.filter_extremes(no_below=state.no_below, no_above=state.no_above)
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    TOPICS_LIST = range(1, state.lda_topics + 1)
    lda_models = []
    coherence_scores = []
    for TOPICS in TOPICS_LIST:
        lda_model = run_LDA_model(corpus, id2word, TOPICS)
        lda_models.append(lda_model)
        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=data_words_bigrams,
                                             dictionary=id2word,
                                             coherence='c_v')
        score = coherence_model_lda.get_coherence()
        coherence_scores.append(score)
    return coherence_scores, lda_models, corpus
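A hedged follow-up sketch for cluster_data(): picking the topic count with the highest c_v coherence from the returned lists; `state` is whatever configuration object the function already expects.

# Hypothetical model selection on top of cluster_data()'s return values.
coherence_scores, lda_models, corpus = cluster_data(state)
best_idx = max(range(len(coherence_scores)), key=coherence_scores.__getitem__)
best_model = lda_models[best_idx]
print(f"Best number of topics: {best_idx + 1} (c_v = {coherence_scores[best_idx]:.3f})")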
def make_cor_dict(tf_idf=True, is_reply=IS_REPLY):
    print("Building corpus and dictionary...")
    videos = get_data()
    corpus = []
    dictionary = corpora.Dictionary()
    for video in videos:
        for comment in video['comments']:
            if is_reply:
                for reply in comment['replies']:
                    dictionary.add_documents([reply['tokens']])
                    corpus += [reply['tokens']]
            dictionary.add_documents([comment['tokens']])
            corpus += [comment['tokens']]
        dictionary.add_documents([video['tokens']])
        corpus += [video['tokens']]

    # Final dictionary filtering by frequency and length => worth reconsidering,
    # because the number of tokens per comment is very small.
    dictionary.filter_extremes(no_below=MIN_COUNT)
    print("Corpus: ", len(corpus))

    # Encode every token as an integer based on the dictionary
    corpus = [dictionary.doc2bow(tokens) for tokens in corpus]

    # Apply TF-IDF weighting to the corpus
    if tf_idf:
        print(":::: Applying TF-IDF...")
        tfidf = TfidfModel(corpus)
        corpus = tfidf[corpus]

    return corpus, dictionary
def create_corpus_and_dict(documents):
    """ Retrieve all the necessary data to train the LSI model. """
    if not os.path.exists('./tmp/dictionary.dict'):
        print("Starting construction dictionary now")
        dictionary = corpora.Dictionary(documents)
        dictionary.save('./tmp/dictionary.dict')
    else:
        print("Dictionary already constructed, loading now...")
        dictionary = corpora.Dictionary()
        dictionary = dictionary.load('./tmp/dictionary.dict')

    # construct BOW corpus
    if not os.path.exists('./tmp/bow_corpus.mm'):
        print("Starting construction bow corpus now")
        bow_corpus = [dictionary.doc2bow(text) for text in documents]
        corpora.MmCorpus.serialize('./tmp/bow_corpus.mm', bow_corpus)
    else:
        print('BOW corpus already created, loading now...')
        bow_corpus = corpora.MmCorpus('./tmp/bow_corpus.mm')

    # construct TFIDF corpus
    if not os.path.exists('./tmp/tfidf_corpus.mm'):
        print("Starting construction TFIDF corpus now")
        corpus = [dictionary.doc2bow(text) for text in documents]
        model_tfidf = TfidfModel(corpus)
        tfidf_corpus = model_tfidf[corpus]
        corpora.MmCorpus.serialize('./tmp/tfidf_corpus.mm', tfidf_corpus)
    else:
        print('TFIDF corpus already created, loading now...')
        tfidf_corpus = corpora.MmCorpus('./tmp/tfidf_corpus.mm')

    return dictionary, bow_corpus, tfidf_corpus