Example #1
    def testRandomState(self):
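        # self.model is trained in setUp (Example #3) with random_state=42;
        # the same seed must reproduce its topics, a different seed must not.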
        model_1 = nmf.Nmf(common_corpus,
                          id2word=common_dictionary,
                          num_topics=2,
                          passes=100,
                          random_state=42)
        model_2 = nmf.Nmf(common_corpus,
                          id2word=common_dictionary,
                          num_topics=2,
                          passes=100,
                          random_state=0)

        self.assertTrue(
            np.allclose(self.model.get_topics(), model_1.get_topics()))
        self.assertFalse(
            np.allclose(self.model.get_topics(), model_2.get_topics()))
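Examples #1, #3, and #4 are unit-test methods. A minimal header for running
them, assuming gensim's bundled test fixtures, would be:

import numpy as np
from gensim.models import nmf
from gensim.test.utils import common_corpus, common_dictionary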
Example #2
def make_nmf_model():
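    # Assumed project-level names from the original script: `nmf_iterator`
    # streams tf-idf-weighted bags-of-words from CONTENT_FILES, `Dict` is the
    # project's dictionary class (loaded from dict.pkl), and `output_dir` and
    # TOPIC_NUM are configuration constants defined elsewhere.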
    tfidf_model = TfidfModel.load((output_dir / 'tfidf_model.pkl').as_posix())
    dictionary = Dict.load((output_dir / 'dict.pkl').as_posix())
    nmf_model = nmf.Nmf(nmf_iterator(CONTENT_FILES, dictionary, tfidf_model),
                        num_topics=TOPIC_NUM)
    nmf_model.save((output_dir / 'nmf_model.pkl').as_posix())
Example #3
    def setUp(self):
        self.model = nmf.Nmf(
            common_corpus,
            id2word=common_dictionary,
            chunksize=1,
            num_topics=2,
            passes=100,
            random_state=42,
        )
Example #4
    def testGenerator(self):
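        # A generator can only be consumed once, hence passes=1 here; the test
        # checks that a one-shot iterator trains the same model as a list.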
        model_1 = nmf.Nmf(
            iter(common_corpus * 100),
            id2word=common_dictionary,
            chunksize=1,
            num_topics=2,
            passes=1,
            random_state=42,
        )

        model_2 = nmf.Nmf(
            common_corpus * 100,
            id2word=common_dictionary,
            chunksize=1,
            num_topics=2,
            passes=1,
            random_state=42,
        )

        self.assertTrue(np.allclose(model_1.get_topics(), model_2.get_topics()))
Example #5
def get_best_model(token_list,
                   min_topic_num=3,
                   max_topic_num=14,
                   coherence_metric="c_v",
                   model_type="lsi"):
    model_list = []
    coherence_values = []
    # Create the corpus and the tf-idf vectorizer for the models
    corpus, tfidf_vect = create_corpus_and_vectorizer(token_list)
    for topics_num in range(min_topic_num, max_topic_num + 1):
        # Build an NMF or LSI model for each candidate number of topics;
        # `dataset` is a module-level gensim Dictionary in the original script
        if model_type == "nmf":
            model = nmf.Nmf(tfidf_vect[corpus],
                            id2word=dataset,
                            num_topics=topics_num)
        else:
            model = LsiModel(tfidf_vect[corpus],
                             id2word=dataset,
                             num_topics=topics_num)
        model_list.append(model)

        topics_model = [[
            word for word, prob in topic
        ] for topicid, topic in model.show_topics(formatted=False)]
        #Create the CoherenceModel and evaluate its score
        coherence_model = CoherenceModel(topics=topics_model,
                                         texts=token_list,
                                         dictionary=dataset,
                                         coherence=coherence_metric,
                                         window_size=30)
        coherence_values.append(coherence_model.get_coherence())
    try:
        index_value = coherence_values.index(max(coherence_values))
    except ValueError:
        # max() raises ValueError if coherence_values is empty
        index_value = 0
    best_model = model_list[index_value]
    return best_model, corpus
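
A hypothetical call, assuming the module-level `dataset` Dictionary and the `create_corpus_and_vectorizer` helper are in scope:

best_model, corpus = get_best_model(token_list, model_type="nmf")
for topic_id, words in best_model.show_topics(formatted=False):
    print(topic_id, [word for word, _ in words])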
Example #6
def run_all(data, model_type, n_topics=10, coherence='all'):
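    # `get_gensim_topics`, `get_sklearn_topics`, `FARotate`, `NewCoherence`,
    # `qgrid`, and `display` are imports/helpers defined elsewhere in the
    # original notebook.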
    topics = None
    texts, dictionary, corpus = data
    if model_type == 'LDA':
        lda = LdaMulticore(corpus=corpus,
                           num_topics=n_topics,
                           id2word=dictionary,
                           passes=5)
        topics = get_gensim_topics(lda, n_topics)

    elif model_type == 'FA':
        tf_vectorizer = CountVectorizer()
        tftexts = [' '.join(text) for text in texts]
        tf = tf_vectorizer.fit_transform(tftexts)
        tf_feature_names = tf_vectorizer.get_feature_names()
        tf = tf.toarray()
        famodel = FARotate(n_components=n_topics, rotation='varimax')
        famodel.fit(tf)
        topics = get_sklearn_topics(famodel, n_topics, tf_feature_names)
    elif model_type == 'NMF':
        nmfmodel = nmf.Nmf(
            corpus=corpus,
            num_topics=n_topics,
            id2word=dictionary,
            chunksize=2000,
            passes=5,
            random_state=42,
        )
        topics = get_gensim_topics(nmfmodel, n_topics)

    def coherence_scores(coherence, topics):
        cm = NewCoherence(topics=topics,
                          corpus=corpus,
                          dictionary=dictionary,
                          coherence=coherence)
        #model_score = cm.get_coherence()
        topic_coherences = cm.get_all_coherences_per_topic()
        return topic_coherences

    coherences = coherence_scores(coherence, topics)
    topics = [{"Topic": " ".join(topic)} for topic in topics]
    topicsdf = pd.DataFrame(data=topics)
    coherencesdf = pd.DataFrame(data=coherences)
    both = pd.concat([topicsdf, coherencesdf.round(4)], axis=1)
    pd.set_option('display.max_colwidth', 200)
    #display(both)
    col_options = {
        'width': 70,
    }
    col_defs = {
        'Topic': {
            'width': 560,
        }
    }

    show = qgrid.show_grid(both,
                           column_options=col_options,
                           column_definitions=col_defs,
                           grid_options={
                               'forceFitColumns': False,
                               'maxVisibleRows': 100
                           })
    display(show)
    return topics, coherences, both
Example #7
    def most_similar_texts(self,
                           X,
                           num_examples,
                           text_column_name,
                           num_topics=None):
        """
        Uses NMF clustering to create n topics based on adjusted word frequencies

        Parameters
        --------
        X: DataFrame
        num_examples: int
        text_column_name: str
        num_topics: int
            Optional - if None, the algorithm will determine the best number

        Returns
        --------
        topic_words_df: DataFrame
            Top num_examples words/phrases per topic
        combined_df: DataFrame
            Original text with topic number assigned to each

        """
        X = X[~X[text_column_name].isna()]
        X = X[~X[text_column_name].isin(["", " ", "NA", "n/a", "N/A", "na"])]

        all_stop_words = (set(ENGLISH_STOP_WORDS)
                          | set(["-PRON-"])
                          | set(string.punctuation)
                          | set([" "]))

        ct = CleanText()
        vectorizer = TfidfVectorizer(
            tokenizer=ct.lematize,
            ngram_range=(1, 3),
            stop_words=all_stop_words,
            min_df=5,
            max_df=0.4,
        )
        vectors = vectorizer.fit_transform(X[text_column_name]).todense()

        # Adding words/phrases used in text data frequencies back into the dataset (so we can see feature importances later)
        vocab = vectorizer.get_feature_names()
        vector_df = pd.DataFrame(vectors, columns=vocab, index=X.index)

        if X.shape[0] < 20:
            return "Too few examples to categorize."

        if not num_topics:

            # In case 1, add 1 to get at least 2
            # The rest are based on eyeballing numbers
            min_topics = ceil(X.shape[0] * 0.01) + 1
            max_topics = ceil(X.shape[0] * 0.2)
            step = ceil((max_topics - min_topics) / 5)

            topic_nums = list(np.arange(min_topics, max_topics, step))

            texts = X[text_column_name].apply(ct.lematize)

            # In gensim a dictionary is a mapping between words and their integer id
            dictionary = Dictionary(texts)

            # Filter out extremes to limit the number of features
            dictionary.filter_extremes(no_below=2, no_above=0.85, keep_n=5000)

            # Create the bag-of-words format (list of (token_id, token_count))
            corpus = [dictionary.doc2bow(text) for text in texts]

            coherence_scores = []

            for num in topic_nums:
                model = nmf.Nmf(
                    corpus=corpus,
                    num_topics=num,
                    id2word=dictionary,
                    chunksize=2000,
                    passes=5,
                    kappa=0.1,
                    minimum_probability=0.01,
                    w_max_iter=300,
                    w_stop_condition=0.0001,
                    h_max_iter=100,
                    h_stop_condition=0.001,
                    eval_every=10,
                    normalize=True,
                    random_state=42,
                )

                cm = CoherenceModel(model=model,
                                    texts=texts,
                                    dictionary=dictionary,
                                    coherence="u_mass")

                coherence_scores.append(round(cm.get_coherence(), 5))

            scores = list(zip(topic_nums, coherence_scores))
            chosen_num_topics = sorted(scores, key=itemgetter(1),
                                       reverse=True)[0][0]
        else:
            chosen_num_topics = num_topics

        model = NMF(n_components=chosen_num_topics, random_state=42)
        model.fit(vectors)
        component_loadings = model.transform(vectors)

        top_topics = pd.DataFrame(np.argmax(component_loadings, axis=1),
                                  columns=["top_topic_num"])

        top_topic_loading = pd.DataFrame(np.max(component_loadings, axis=1),
                                         columns=["top_topic_loading"])

        X.reset_index(inplace=True, drop=False)
        vector_df.reset_index(inplace=True, drop=True)

        # Fix for duplicate text_column_name
        vector_df.columns = [x + "_vector" for x in vector_df.columns]

        combined_df = pd.concat([X, vector_df, top_topics, top_topic_loading],
                                axis=1)

        combined_df.sort_values(by="top_topic_loading",
                                ascending=False,
                                inplace=True)

        # Drop the loading column now that rows are sorted by it
        combined_df = combined_df.drop(columns=["top_topic_loading"])

        topic_words = {}
        sample_texts_lst = []
        for topic, comp in enumerate(model.components_):
            word_idx = np.argsort(comp)[::-1][:num_examples]
            topic_words[topic] = [vocab[i] for i in word_idx]
            sample_texts_lst.append(
                list(combined_df[combined_df["top_topic_num"] == topic]
                     [text_column_name].values[:num_examples]))

        topic_words_df = pd.DataFrame(columns=[
            "topic_num",
            "num_in_category",
            "top_words_and_phrases",
            "sample_texts",
        ])

        topic_words_df["topic_num"] = [k for k, _ in topic_words.items()]
        topic_words_df["num_in_category"] = (
            combined_df.groupby("top_topic_num").count().iloc[:, 0])
        topic_words_df["top_words_and_phrases"] = [
            x for x in topic_words.values()
        ]
        topic_words_df["sample_texts"] = sample_texts_lst

        topic_words_explode = pd.DataFrame(
            topic_words_df["sample_texts"].tolist(),
            index=topic_words_df.index,
        )

        topic_words_explode.columns = [
            "example{}".format(num)
            for num in range(len(topic_words_explode.columns))
        ]

        concated_topics = pd.concat(
            [
                topic_words_df[[
                    "topic_num", "num_in_category", "top_words_and_phrases"
                ]],
                topic_words_explode,
            ],
            axis=1,
        )

        print("Topics created with top words & example texts:")
        print(concated_topics)

        return (
            concated_topics,
            combined_df[["index", text_column_name, "top_topic_num"]],
        )
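
A hypothetical call, assuming the containing class is instantiated as `clusterer` and `df` holds a free-text column named "notes":

topic_words_df, text_topics_df = clusterer.most_similar_texts(
    df, num_examples=5, text_column_name="notes")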
Example #8
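# `dictionary`, `corpus`, and the `lsi`/`index` objects referenced in the
# commented-out block are built earlier in the original script.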
doc = "liver bladder desctruction"
vec_bow = dictionary.doc2bow(doc.lower().split())
f = open("output.txt", "a")
"""
for i in range(0, lsi.num_topics):
   print(lsi.print_topic(i,10))
vec_lsi1 = lsi[vec_bow]
sims = index[vec_lsi1]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
cm1 = CoherenceModel(model=lsi, corpus=corpus, coherence='u_mass')
coherence = cm1.get_coherence()
print('#####################################################')
print(coherence)     """

#print(sims)
nmfmodel = nmf.Nmf(corpus, num_topics=43, id2word=dictionary, normalize=True)
for i in range(0, 43):
    print(nmfmodel.print_topic(i, 10))
    print('#########################')
print("DOCUMENT TOPICS OF MEDICOSOCIAL STUDIES OF HEMOPHILIA")
print(nmfmodel.get_document_topics(vec_bow))
#print(nmfmodel._W)
#print(nmfmodel._h)
print(np.array(nmfmodel._W).shape)
print(np.array(nmfmodel._h).shape)
#print(nmfmodel._w_max_iter) #vec_lsi2 = lsi[vec_bow]
#sims = index[vec_lsi2]
#sims = sorted(enumerate(sims), key=lambda item: -item[1])
#cm2 = CoherenceModel(model=nmfmodel, corpus=corpus, coherence='u_mass')
#coherence = cm2.get_coherence()
#print('#####################################################')
Example #9
def NMF(request):
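    # Note: the corpus and the NMF model are rebuilt from the 1033 files in
    # IR/ on every request; a production view would train once and cache them.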
    query = ""
    query_response = None
    file_list = None
    file_list_dictionary = None
    search_result_dictionary = None
    documents = []
    for counter in range(1033):
        with open("IR/" + str(counter + 1) + ".txt", 'r') as temp:
            documents.append(temp.read())
    stop_words = stopwords.words('english')
    texts = [[
        word for word in document.lower().split() if word not in stop_words
    ] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('/tmp/ir.mm', corpus)
    nmfmodel = nmf.Nmf(corpus,
                       num_topics=43,
                       id2word=dictionary,
                       normalize=True)
    if request.method == "POST":
        form = SearchForm(request.POST)
        if form.is_valid():
            query_response = list()
            user_query = form.save()
            user_query.save()
            query = user_query.query
            doc = user_query.query
            index = similarities.MatrixSimilarity(nmfmodel[corpus])
            vec_bow = dictionary.doc2bow(doc.split())
            vec_nmf = nmfmodel[vec_bow]
            sims = index[vec_nmf]
            sims = sorted(enumerate(sims, 1), key=lambda item: -item[1])
            file_list = list()
            for element in sims[0:5]:
                file_list.append(element[0])
            for text in file_list:
                with open("IR/" + str(text) + ".txt", 'r') as temp:
                    query_response.append(temp.read())
            #print(query_response)
            file_list_dictionary = {
                i: file_list[i - 1] for i in range(1, len(file_list) + 1)
            }
            search_result_dictionary = {
                i: query_response[i - 1] for i in range(1, len(query_response) + 1)
            }
    else:
        form = SearchForm()
    return render(
        request, "nmf.html", {
            'form': form,
            'query': query,
            'answer': file_list,
            'search_results': query_response,
            'file_dictionary': file_list_dictionary,
            'search_result_dictionary': search_result_dictionary
        })
Example #10
    def train_model(self, dataset, hyperparameters=None, top_words=10):
        """
        Train the model and return output

        Parameters
        ----------
        dataset : dataset to use to build the model
        hyperparameters : hyperparameters to build the model
        top_words : if greater than 0, returns the most significant words
                 for each topic in the output
                 Default: 10

        Returns
        -------
        result : dictionary with up to 3 entries,
                 'topics', 'topic-word-matrix' and
                 'topic-document-matrix'
        """
        if hyperparameters is None:
            hyperparameters = {}
        if self.use_partitions:
            partition = dataset.get_partitioned_corpus(use_validation=False)
        else:
            partition = [dataset.get_corpus(), []]

        if self.id2word is None:
            self.id2word = corpora.Dictionary(dataset.get_corpus())
        if self.id_corpus is None:
            self.id_corpus = [
                self.id2word.doc2bow(document) for document in partition[0]
            ]

        hyperparameters["corpus"] = self.id_corpus
        hyperparameters["id2word"] = self.id2word
        self.hyperparameters.update(hyperparameters)

        self.trained_model = nmf.Nmf(**self.hyperparameters)

        result = {}

        result["topic-word-matrix"] = self.trained_model.get_topics()

        if top_words > 0:
            topics_output = []
            for topic in result["topic-word-matrix"]:
                top_k = np.argsort(topic)[-top_words:]
                top_k_words = list(reversed([self.id2word[i] for i in top_k]))
                topics_output.append(top_k_words)
            result["topics"] = topics_output

        result["topic-document-matrix"] = self._get_topic_document_matrix()

        if self.use_partitions:
            new_corpus = [
                self.id2word.doc2bow(document) for document in partition[1]
            ]
            if self.update_with_test:
                self.trained_model.update(new_corpus)
                self.id_corpus.extend(new_corpus)

                result["test-topic-word-matrix"] = self.trained_model.get_topics()

                if top_words > 0:
                    topics_output = []
                    for topic in result["test-topic-word-matrix"]:
                        top_k = np.argsort(topic)[-top_words:]
                        top_k_words = list(
                            reversed([self.id2word[i] for i in top_k]))
                        topics_output.append(top_k_words)
                    result["test-topics"] = topics_output

                result["test-topic-document-matrix"] = self._get_topic_document_matrix()
            else:
                result["test-topic-document-matrix"] = self._get_topic_document_matrix(new_corpus)
        return result
Example #11
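# (the preceding loop, which tokenizes each document into `words` and
# accumulates `texts`, is elided)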
    texts.append(words)

# create dictionary
print('--- creating dictionary ---')
id2word = corpora.Dictionary(texts)
id2word.save('./{}_tmp/dfid2word'.format(substr))

# create corpus
print('--- creating corpus for topic modeling ---')
corpus = [id2word.doc2bow(text) for text in texts]

# topic model
print('--- training topic model ---')
if args.topic == 2:
    topic_model = nmf.Nmf(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=100, passes=1)
else:
    topic_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=100, passes=1)
topic_model.save('./{}_tmp/topic_model'.format(substr))

# reload via the generic SaveLoad so the call matches whichever model was saved
topic_model = gensim.utils.SaveLoad.load('./{}_tmp/topic_model'.format(substr))
id2word = gensim.corpora.Dictionary.load('./{}_tmp/dfid2word'.format(substr))

# split words based on topics
print('--- creating topic and all corpus ---')
for texts in df['text']:
    words = texts.split(' ')
    bow = id2word.doc2bow(words)
    topic_probs = topic_model[bow]
    topic = max(topic_probs, key=itemgetter(1))[0]
    with open('./{}_tmp/datacorpus_'.format(substr) + str(topic) + '.txt', 'a') as f:
        f.write(' '.join(words) + '\n')