def testRandomState(self):
    model_1 = nmf.Nmf(
        common_corpus,
        id2word=common_dictionary,
        num_topics=2,
        passes=100,
        random_state=42,
    )
    model_2 = nmf.Nmf(
        common_corpus,
        id2word=common_dictionary,
        num_topics=2,
        passes=100,
        random_state=0,
    )
    self.assertTrue(np.allclose(self.model.get_topics(), model_1.get_topics()))
    self.assertFalse(np.allclose(self.model.get_topics(), model_2.get_topics()))
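A minimal standalone version of the same reproducibility check, runnable outside a unittest harness; it assumes only gensim's bundled toy corpus:

import numpy as np
from gensim.models import nmf
from gensim.test.utils import common_corpus, common_dictionary

# Two models trained with the same seed should yield identical topic matrices.
m_a = nmf.Nmf(common_corpus, id2word=common_dictionary, num_topics=2, passes=100, random_state=42)
m_b = nmf.Nmf(common_corpus, id2word=common_dictionary, num_topics=2, passes=100, random_state=42)
assert np.allclose(m_a.get_topics(), m_b.get_topics())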
def make_nmf_model():
    tfidf_model = TfidfModel.load((output_dir / 'tfidf_model.pkl').as_posix())
    nmf_model = nmf.Nmf(
        nmf_iterator(CONTENT_FILES, Dict.load((output_dir / 'dict.pkl').as_posix()), tfidf_model),
        num_topics=TOPIC_NUM,
    )
    nmf_model.save((output_dir / 'nmf_model.pkl').as_posix())
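A short sketch of reading the persisted model back; Nmf inherits save()/load() from gensim's SaveLoad, and `output_dir`/`TOPIC_NUM` are assumed to be the same module-level names used above:

from gensim.models import nmf

# Reload the model saved by make_nmf_model() and inspect its topics.
nmf_model = nmf.Nmf.load((output_dir / 'nmf_model.pkl').as_posix())
for topic_id, topic in nmf_model.show_topics(num_topics=TOPIC_NUM, formatted=True):
    print(topic_id, topic)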
def setUp(self):
    self.model = nmf.Nmf(
        common_corpus,
        id2word=common_dictionary,
        chunksize=1,
        num_topics=2,
        passes=100,
        random_state=42,
    )
def testGenerator(self):
    model_1 = nmf.Nmf(
        iter(common_corpus * 100),
        id2word=common_dictionary,
        chunksize=1,
        num_topics=2,
        passes=1,
        random_state=42,
    )
    model_2 = nmf.Nmf(
        common_corpus * 100,
        id2word=common_dictionary,
        chunksize=1,
        num_topics=2,
        passes=1,
        random_state=42,
    )
    self.assertTrue(np.allclose(model_1.get_topics(), model_2.get_topics()))
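The generator case above only works with passes=1 because a plain iterator is exhausted after a single sweep; multi-pass training needs a restartable corpus such as a list. A minimal sketch:

from gensim.models import nmf
from gensim.test.utils import common_corpus, common_dictionary

docs = common_corpus * 100

# An iterator supports a single pass only; it is empty after one sweep.
single_pass = nmf.Nmf(iter(docs), id2word=common_dictionary, num_topics=2, passes=1, random_state=42)

# The list itself can be re-iterated, so passes > 1 is safe here.
multi_pass = nmf.Nmf(docs, id2word=common_dictionary, num_topics=2, passes=5, random_state=42)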
def get_best_model(token_list, min_topic_num=3, max_topic_num=14,
                   coherence_metric="c_v", model_type="lsi"):
    model_list = []
    coherence_values = []
    # Create the corpus for the model
    corpus, tfidf_vect = create_corpus_and_vectorizer(token_list)
    for topics_num in range(min_topic_num, max_topic_num + 1):
        # Create the models with an increasing number of topics.
        # `dataset` is assumed to be a module-level gensim Dictionary.
        if model_type == "nmf":
            model = nmf.Nmf(tfidf_vect[corpus], id2word=dataset, num_topics=topics_num)
        else:
            model = LsiModel(tfidf_vect[corpus], id2word=dataset, num_topics=topics_num)
        model_list.append(model)
        topics_model = [
            [word for word, prob in topic]
            for topicid, topic in model.show_topics(formatted=False)
        ]
        # Create the CoherenceModel and evaluate its score
        coherence_model = CoherenceModel(
            topics=topics_model,
            texts=token_list,
            dictionary=dataset,
            coherence=coherence_metric,
            window_size=30,
        )
        coherence_values.append(coherence_model.get_coherence())
    try:
        index_value = coherence_values.index(max(coherence_values))
    except ValueError:  # empty list if the topic range produced no models
        index_value = 0
    best_model = model_list[index_value]
    return best_model, corpus
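A hypothetical call, assuming `docs_tokens` is a list of tokenized documents and the module-level `dataset` dictionary has already been built:

# docs_tokens = [["liver", "bladder"], ["hemophilia", "study"], ...]  # hypothetical input
best_model, corpus = get_best_model(docs_tokens, min_topic_num=3, max_topic_num=10, model_type="nmf")
for topic_id, topic in best_model.show_topics(formatted=True):
    print(topic_id, topic)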
def run_all(data, model_type, n_topics=10, coherence='all'):
    topics = None
    texts, dictionary, corpus = data

    if model_type == 'LDA':
        lda = LdaMulticore(corpus=corpus, num_topics=n_topics, id2word=dictionary, passes=5)
        topics = get_gensim_topics(lda, n_topics)
    elif model_type == 'FA':
        tf_vectorizer = CountVectorizer()
        tftexts = [' '.join(text) for text in texts]
        tf = tf_vectorizer.fit_transform(tftexts)
        tf_feature_names = tf_vectorizer.get_feature_names()
        tf = tf.toarray()
        famodel = FARotate(n_components=n_topics, rotation='varimax')
        famodel.fit(tf)
        topics = get_sklearn_topics(famodel, n_topics, tf_feature_names)
    elif model_type == 'NMF':
        nmfmodel = nmf.Nmf(
            corpus=corpus,
            num_topics=n_topics,
            id2word=dictionary,
            chunksize=2000,
            passes=5,
            random_state=42,
        )
        topics = get_gensim_topics(nmfmodel, n_topics)

    def coherence_scores(coherence, topics):
        cm = NewCoherence(topics=topics, corpus=corpus, dictionary=dictionary, coherence=coherence)
        #model_score = cm.get_coherence()
        topic_coherences = cm.get_all_coherences_per_topic()
        return topic_coherences

    coherences = coherence_scores(coherence, topics)

    topics = [{"Topic": " ".join(topic)} for topic in topics]
    topicsdf = pd.DataFrame(data=topics)
    coherencesdf = pd.DataFrame(data=coherences)
    both = pd.concat([topicsdf, coherencesdf.round(4)], axis=1)

    pd.set_option('display.max_colwidth', 200)
    #display(both)
    col_options = {'width': 70}
    col_defs = {'Topic': {'width': 560}}
    show = qgrid.show_grid(
        both,
        column_options=col_options,
        column_definitions=col_defs,
        grid_options={'forceFitColumns': False, 'maxVisibleRows': 100},
    )
    display(show)
    return topics, coherences, both
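A hypothetical invocation, assuming `texts`, `dictionary`, and `corpus` were prepared as in the other snippets here, and that the notebook has qgrid available (NewCoherence is the custom class used above):

data = (texts, dictionary, corpus)  # hypothetical, prepared elsewhere
topics, coherences, both = run_all(data, 'NMF', n_topics=10, coherence='c_v')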
def most_similar_texts(self, X, num_examples, text_column_name, num_topics=None):
    """
    Uses NMF clustering to create n topics based on adjusted word frequencies

    Parameters
    --------
    X: DataFrame
    num_examples: int
    text_column_name: str
    num_topics: int
        Optional - if None, the algorithm will determine the best number

    Returns
    --------
    topic_words_df: DataFrame
        Top words/phrases per topic
    combined_df: DataFrame
        Original text with topic number assigned to each
    """
    # Drop rows with missing or placeholder text.
    X = X[~X[text_column_name].isna()]
    X = X[X[text_column_name] != ""]
    X = X[X[text_column_name] != " "]
    X = X[X[text_column_name] != "NA"]
    X = X[X[text_column_name] != "n/a"]
    X = X[X[text_column_name] != "N/A"]
    X = X[X[text_column_name] != "na"]

    all_stop_words = (
        set(ENGLISH_STOP_WORDS) | set(["-PRON-"]) | set(string.punctuation) | set([" "])
    )

    ct = CleanText()
    vectorizer = TfidfVectorizer(
        tokenizer=ct.lematize,
        ngram_range=(1, 3),
        stop_words=all_stop_words,
        min_df=5,
        max_df=0.4,
    )
    vectors = vectorizer.fit_transform(X[text_column_name]).todense()

    # Adding words/phrases used in text data frequencies back into the dataset
    # (so we can see feature importances later)
    vocab = vectorizer.get_feature_names()
    vector_df = pd.DataFrame(vectors, columns=vocab, index=X.index)

    if X.shape[0] < 20:
        return "Too few examples to categorize."

    if not num_topics:
        # In case 1, add 1 to get at least 2
        # The rest are based on eyeballing numbers
        min_topics = ceil(X.shape[0] * 0.01) + 1
        max_topics = ceil(X.shape[0] * 0.2)
        step = ceil((max_topics - min_topics) / 5)
        topic_nums = list(np.arange(min_topics, max_topics, step))

        texts = X[text_column_name].apply(ct.lematize)

        # In gensim a dictionary is a mapping between words and their integer id
        dictionary = Dictionary(texts)

        # Filter out extremes to limit the number of features
        dictionary.filter_extremes(no_below=2, no_above=0.85, keep_n=5000)

        # Create the bag-of-words format (list of (token_id, token_count))
        corpus = [dictionary.doc2bow(text) for text in texts]

        coherence_scores = []
        for num in topic_nums:
            model = nmf.Nmf(
                corpus=corpus,
                num_topics=num,
                id2word=dictionary,
                chunksize=2000,
                passes=5,
                kappa=0.1,
                minimum_probability=0.01,
                w_max_iter=300,
                w_stop_condition=0.0001,
                h_max_iter=100,
                h_stop_condition=0.001,
                eval_every=10,
                normalize=True,
                random_state=42,
            )
            cm = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence="u_mass")
            coherence_scores.append(round(cm.get_coherence(), 5))

        scores = list(zip(topic_nums, coherence_scores))
        chosen_num_topics = sorted(scores, key=itemgetter(1), reverse=True)[0][0]
    else:
        chosen_num_topics = num_topics

    model = NMF(n_components=chosen_num_topics, random_state=42)
    model.fit(vectors)
    component_loadings = model.transform(vectors)

    top_topics = pd.DataFrame(np.argmax(component_loadings, axis=1), columns=["top_topic_num"])
    top_topic_loading = pd.DataFrame(np.max(component_loadings, axis=1), columns=["top_topic_loading"])

    X.reset_index(inplace=True, drop=False)
    vector_df.reset_index(inplace=True, drop=True)

    # Fix for duplicate text_column_name
    vector_df.columns = [x + "_vector" for x in vector_df.columns]

    combined_df = pd.concat([X, vector_df, top_topics, top_topic_loading], axis=1)
    # Sort so the strongest examples of each topic come first. (The original
    # re-concatenated the frame here, which silently discarded this ordering.)
    combined_df.sort_values(by="top_topic_loading", ascending=False, inplace=True)

    topic_words = {}
    sample_texts_lst = []
    for topic, comp in enumerate(model.components_):
        word_idx = np.argsort(comp)[::-1][:num_examples]
        topic_words[topic] = [vocab[i] for i in word_idx]
        sample_texts_lst.append(
            list(combined_df[combined_df["top_topic_num"] == topic][text_column_name].values[:num_examples])
        )

    topic_words_df = pd.DataFrame(columns=[
        "topic_num",
        "num_in_category",
        "top_words_and_phrases",
        "sample_texts",
    ])

    topic_words_df["topic_num"] = [k for k, _ in topic_words.items()]
    topic_words_df["num_in_category"] = combined_df.groupby("top_topic_num").count().iloc[:, 0]
    topic_words_df["top_words_and_phrases"] = [x for x in topic_words.values()]
    topic_words_df["sample_texts"] = sample_texts_lst

    topic_words_explode = pd.DataFrame(
        topic_words_df["sample_texts"].tolist(),
        index=topic_words_df.index,
    )
    topic_words_explode.columns = [
        "example{}".format(num) for num in range(len(topic_words_explode.columns))
    ]

    concated_topics = pd.concat(
        [
            topic_words_df[["topic_num", "num_in_category", "top_words_and_phrases"]],
            topic_words_explode,
        ],
        axis=1,
    )

    print("Topics created with top words & example texts:")
    print(concated_topics)

    return (
        concated_topics,
        combined_df[["index", text_column_name, "top_topic_num"]],
    )
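A hypothetical call, assuming the method lives on an instantiated helper object (here called `categorizer`) and `df` has a free-text column named "notes":

# `categorizer`, `df`, and "notes" are all hypothetical names.
topic_words_df, assignments = categorizer.most_similar_texts(
    df, num_examples=5, text_column_name="notes", num_topics=None,
)
print(topic_words_df)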
doc = "liver bladder desctruction" vec_bow = dictionary.doc2bow(doc.lower().split()) f = open("output.txt", "a") """ for i in range(0, lsi.num_topics): print(lsi.print_topic(i,10)) vec_lsi1 = lsi[vec_bow] sims = index[vec_lsi1] sims = sorted(enumerate(sims), key=lambda item: -item[1]) cm1 = CoherenceModel(model=lsi, corpus=corpus, coherence='u_mass') coherence = cm1.get_coherence() print('#####################################################') print(coherence) """ #print(sims) nmfmodel = nmf.Nmf(corpus, num_topics=43, id2word=dictionary, normalize=True) for i in range(0, 43): print(nmfmodel.print_topic(i, 10)) print('#########################') print("DOCUMENT TOPICS OF MEDICOSOCIAL STUDIES OF HEMOPHILIA") print(nmfmodel.get_document_topics(vec_bow)) #print(nmfmodel._W) #print(nmfmodel._h) print(np.array(nmfmodel._W).shape) print(np.array(nmfmodel._h).shape) #print(nmfmodel._w_max_iter) #vec_lsi2 = lsi[vec_bow] #sims = index[vec_lsi2] #sims = sorted(enumerate(sims), key=lambda item: -item[1]) #cm2 = CoherenceModel(model=nmfmodel, corpus=corpus, coherence='u_mass') #coherence = cm2.get_coherence() #print('#####################################################')
def NMF(request):
    query = ""
    query_response = None
    file_list = None
    file_list_dictionary = None
    search_result_dictionary = None

    # Read the 1033 documents of the collection.
    documents = []
    for counter in range(1033):
        temp = open("IR/" + str(counter + 1) + ".txt", 'r')
        documents.append(temp.read())
        temp.close()

    stop_words = stopwords.words('english')
    texts = [
        [word for word in document.lower().split() if word not in stop_words]
        for document in documents
    ]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('/tmp/ir.mm', corpus)

    nmfmodel = nmf.Nmf(corpus, num_topics=43, id2word=dictionary, normalize=True)

    if request.method == "POST":
        form = SearchForm(request.POST)
        if form.is_valid():
            query_response = list()
            user_query = form.save()
            user_query.save()
            query = user_query.query
            doc = user_query.query

            index = similarities.MatrixSimilarity(nmfmodel[corpus])
            vec_bow = dictionary.doc2bow(doc.split())
            vec_nmf = nmfmodel[vec_bow]
            sims = index[vec_nmf]
            sims = sorted(enumerate(sims, 1), key=lambda item: -item[1])

            file_list = list()
            for element in sims[0:5]:
                file_list.append(element[0])

            for text in file_list:
                temp = open("IR/" + str(text) + ".txt", 'r')
                query_response.append(temp.read())
                temp.close()
            #print(query_response)

            file_list_dictionary = {
                i: file_list[i - 1] for i in range(1, len(file_list) + 1)
            }
            search_result_dictionary = {
                i: query_response[i - 1] for i in range(1, len(query_response) + 1)
            }
    else:
        form = SearchForm()

    return render(
        request, "nmf.html", {
            'form': form,
            'query': query,
            'answer': file_list,
            'search_results': query_response,
            'file_dictionary': file_list_dictionary,
            'search_result_dictionary': search_result_dictionary,
        })
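A hypothetical URLconf entry wiring the view into a Django project (the route and module names are assumptions):

from django.urls import path
from . import views

urlpatterns = [
    path('nmf/', views.NMF, name='nmf'),  # hypothetical route
]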
def train_model(self, dataset, hyperparameters=None, top_words=10):
    """
    Train the model and return output

    Parameters
    ----------
    dataset : dataset to use to build the model
    hyperparameters : hyperparameters to build the model
    top_words : if greater than 0, returns the most significant words for
                each topic in the output (Default 10)

    Returns
    -------
    result : dictionary with up to 3 entries,
             'topics', 'topic-word-matrix' and 'topic-document-matrix'
    """
    if hyperparameters is None:
        hyperparameters = {}

    if self.use_partitions:
        partition = dataset.get_partitioned_corpus(use_validation=False)
    else:
        partition = [dataset.get_corpus(), []]

    if self.id2word is None:
        self.id2word = corpora.Dictionary(dataset.get_corpus())

    if self.id_corpus is None:
        self.id_corpus = [self.id2word.doc2bow(document) for document in partition[0]]

    hyperparameters["corpus"] = self.id_corpus
    hyperparameters["id2word"] = self.id2word
    self.hyperparameters.update(hyperparameters)

    self.trained_model = nmf.Nmf(**self.hyperparameters)

    result = {}
    result["topic-word-matrix"] = self.trained_model.get_topics()

    if top_words > 0:
        topics_output = []
        for topic in result["topic-word-matrix"]:
            top_k = np.argsort(topic)[-top_words:]
            top_k_words = list(reversed([self.id2word[i] for i in top_k]))
            topics_output.append(top_k_words)
        result["topics"] = topics_output

    result["topic-document-matrix"] = self._get_topic_document_matrix()

    if self.use_partitions:
        new_corpus = [self.id2word.doc2bow(document) for document in partition[1]]
        if self.update_with_test:
            self.trained_model.update(new_corpus)
            self.id_corpus.extend(new_corpus)

            result["test-topic-word-matrix"] = self.trained_model.get_topics()

            if top_words > 0:
                topics_output = []
                for topic in result["test-topic-word-matrix"]:
                    top_k = np.argsort(topic)[-top_words:]
                    top_k_words = list(reversed([self.id2word[i] for i in top_k]))
                    topics_output.append(top_k_words)
                result["test-topics"] = topics_output

            result["test-topic-document-matrix"] = self._get_topic_document_matrix()
        else:
            result["test-topic-document-matrix"] = self._get_topic_document_matrix(new_corpus)

    return result
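A hypothetical usage, assuming `wrapper` is an instance of the class defining train_model and `dataset` is an object exposing get_corpus()/get_partitioned_corpus() as the method expects:

# `wrapper` and `dataset` are hypothetical names; the hyperparameters are
# forwarded to nmf.Nmf(**...) unchanged.
output = wrapper.train_model(dataset, hyperparameters={"num_topics": 10, "random_state": 42})
for topic in output["topics"]:
    print(" ".join(topic))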
texts.append(words)

# create dictionary
print('--- creating dictionary ---')
id2word = corpora.Dictionary(texts)
id2word.save('./{}_tmp/dfid2word'.format(substr))

# create corpus
print('--- creating corpus for topic modeling ---')
corpus = [id2word.doc2bow(text) for text in texts]

# topic model: train NMF when requested, LDA otherwise (the original trained
# LDA unconditionally and then overwrote it, wasting a full training run)
print('--- training topic model ---')
if args.topic == 2:
    topic_model = nmf.Nmf(corpus=corpus, id2word=id2word, num_topics=num_topics,
                          random_state=100, passes=1)
else:
    topic_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,
                                                  num_topics=num_topics,
                                                  random_state=100, passes=1)
topic_model.save('./{}_tmp/topic_model'.format(substr))

# reload with the class that matches the saved model
if args.topic == 2:
    topic_model = nmf.Nmf.load('./{}_tmp/topic_model'.format(substr))
else:
    topic_model = gensim.models.LdaModel.load('./{}_tmp/topic_model'.format(substr))
id2word = gensim.corpora.Dictionary.load('./{}_tmp/dfid2word'.format(substr))

# split words based on topics
print('--- creating topic and all corpus ---')
for texts in df['text']:
    words = texts.split(' ')
    bow = id2word.doc2bow(words)
    topic_probs = topic_model[bow]
    topic = max(topic_probs, key=itemgetter(1))[0]
    with open('./{}_tmp/datacorpus_'.format(substr) + str(topic) + '.txt', 'a') as f:
        f.write(' '.join(words) + '\n')
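One caveat worth guarding in the assignment loop: Nmf (like LDA with a minimum_probability threshold) can return an empty topic list for a short or out-of-vocabulary document, in which case max() raises ValueError. A defensive variant of the loop body, falling back to topic 0 (an arbitrary choice made for illustration):

bow = id2word.doc2bow(words)
topic_probs = topic_model[bow]
# Fall back to topic 0 when no topic clears the probability threshold.
topic = max(topic_probs, key=itemgetter(1))[0] if topic_probs else 0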