Example #1
def __buildNMF(self, num_topics, chunksize, passes):
    self.__model = Nmf(self.__corpus,
                       id2word=self.__corpus.getDictionary(),
                       num_topics=num_topics,
                       chunksize=chunksize,
                       passes=passes,
                       eval_every=None,
                       random_state=10)
Example #2
def topic_modeling(method, num_topics, corpus, dictionary):

    if method == 'LDA':
        # Build LDA model
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=42,
                                                update_every=1,
                                                chunksize=10,
                                                passes=10,
                                                alpha='symmetric',
                                                iterations=100,
                                                per_word_topics=True)

    elif method == 'NMF':
        #Build NMF model
        model = Nmf(corpus=corpus,
                    num_topics=num_topics,
                    id2word=dictionary,
                    chunksize=10,
                    passes=10,
                    kappa=0.5,
                    w_max_iter=200,
                    h_max_iter=50,
                    eval_every=1,
                    normalize=True,
                    random_state=42)

    else:
        raise ValueError('method is invalid')

    return model
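A minimal usage sketch for topic_modeling follows; the toy documents and the imports below are assumptions, not part of the original snippet:

# Hypothetical usage of topic_modeling (sketch only)
import gensim
from gensim.corpora import Dictionary
from gensim.models.nmf import Nmf

docs = [["inflation", "rates", "policy"],
        ["football", "league", "match"],
        ["inflation", "economy", "growth"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
model = topic_modeling('NMF', num_topics=2, corpus=corpus, dictionary=dictionary)
print(model.show_topics())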
Example #3
    def load(self, model_name):
        self.__modelName = model_name

        if model_name == 'lda':
            self.__model = LdaMulticore.load(self.__modelFile)
        elif model_name == 'nmf':
            self.__model = Nmf.load(self.__modelFile)
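Example #4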
def create_nmf_model(id_dict, corpus, num_topics):
    nmf_model = Nmf(corpus=corpus,
                    id2word=id_dict,
                    num_topics=num_topics,
                    random_state=100,
                    chunksize=100,
                    passes=50)
    return nmf_model
Example #5
def __buildLDA(self, num_topics, chunksize, passes):
    self.__model = LdaMulticore(self.__corpus,
                                id2word=self.__corpus.getDictionary(),
                                num_topics=num_topics,
                                chunksize=chunksize,
                                passes=passes,
                                eval_every=None,
                                workers=40,
                                random_state=10)
Example #6
def find_cv():  # find the number of topics K with the best coherence (relies on module-level topic_nums, corpus, dictionary, texts and coherence_scores)
    for num in topic_nums:
        nmf = Nmf(corpus=corpus,
                  num_topics=num,
                  id2word=dictionary,
                  normalize=True)
        cm = CoherenceModel(model=nmf,
                            texts=texts,
                            dictionary=dictionary,
                            coherence='c_v')
        coherence_scores.append(round(cm.get_coherence(), 5))
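find_cv reads topic_nums, corpus, dictionary, texts and coherence_scores from module scope; a sketch of how that setup might look (the sample documents are hypothetical):

# Hypothetical module-level setup assumed by find_cv (sketch only)
from gensim.corpora import Dictionary
from gensim.models.nmf import Nmf
from gensim.models import CoherenceModel

texts = [["budget", "deficit", "spending"],
         ["championship", "team", "season"],
         ["budget", "taxes", "spending"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
topic_nums = [2, 3, 4]
coherence_scores = []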
Example #7
    def __create_model(self, algo, topic_qtt):
        model = None

        if (algo == TopicModelingAlgorithm.LDA):
            model = LdaModel(corpus=self.__corpus,
                             num_topics=topic_qtt,
                             id2word=self.__id2_words,
                             random_state=1)
        elif (algo == TopicModelingAlgorithm.LSA):
            model = LsiModel(corpus=self.__corpus,
                             num_topics=topic_qtt,
                             id2word=self.__id2_words)
        elif (algo == TopicModelingAlgorithm.NMF):
            model = Nmf(corpus=self.__corpus,
                        num_topics=topic_qtt,
                        random_state=1)

        return model
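TopicModelingAlgorithm is an enum that is not shown in this example; a plausible (hypothetical) definition would be:

# Hypothetical definition of the enum referenced above (sketch only)
from enum import Enum

class TopicModelingAlgorithm(Enum):
    LDA = 1
    LSA = 2
    NMF = 3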
Example #8
def nmf_search(texts, query, num_topics, passes=20, random_state=None):
    tfidf_model, dic, text_tfidf_weights = get_tfidfmodel_and_weights(texts)

    # Build the NMF model
    nmf_model = Nmf(corpus=text_tfidf_weights,
                    id2word=dic,
                    num_topics=num_topics,
                    passes=passes,
                    random_state=random_state)

    # Convert the TF-IDF document vectors into topic-based vectors
    nmf_weights = nmf_model[text_tfidf_weights]

    index = MatrixSimilarity(nmf_weights, num_features=len(dic))

    # Build the topic-based vector for the query
    query_bows = get_bows([query], dic)
    query_tfidf_weights = get_weights(query_bows, dic, tfidf_model)
    query_nmf_weights = nmf_model[query_tfidf_weights]

    # Rank documents by similarity to the query
    sims = index[query_nmf_weights[0]]
    return sorted(enumerate(sims), key=lambda x: x[1], reverse=True)
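The helpers get_tfidfmodel_and_weights, get_bows and get_weights are not part of this snippet; a minimal sketch of what they might look like, assuming texts and query are already tokenized:

# Hypothetical helper implementations for nmf_search (sketch only)
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

def get_bows(texts, dic):
    # list of token lists -> list of bag-of-words vectors
    return [dic.doc2bow(text) for text in texts]

def get_weights(bows, dic, tfidf_model):
    # apply the fitted TF-IDF model to each bag-of-words vector
    return [tfidf_model[bow] for bow in bows]

def get_tfidfmodel_and_weights(texts):
    dic = Dictionary(texts)
    bows = get_bows(texts, dic)
    tfidf_model = TfidfModel(bows)
    return tfidf_model, dic, get_weights(bows, dic, tfidf_model)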
Example #9
def nmf(filename):

    # tf-idf
    min_df = 10
    max_df = 0.60
    max_features = 5000
    # sklearn-model nmf

    # get data
    newspaper_input = pd.read_csv(filename, na_filter=False)

    newspaper_input['processed_text'] = newspaper_input['article'].apply(
        process_text)
    texts = newspaper_input['processed_text']

    # create dictionary to pass as input to gensim model
    dictionary = Dictionary(newspaper_input['processed_text'])

    # filter out words that are above or below the thresholds set
    dictionary.filter_extremes(no_below=10, no_above=0.60, keep_n=5000)

    # convert to bag of words (corpus) to pass to gensim nmf model
    # [[(word_id, # times word appears in document),...],...]
    corpus = [dictionary.doc2bow(text) for text in texts]

    # find optimal number of topics using gensim NMF https://radimrehurek.com/gensim/models/nmf.html
    # testing topic numbers 10,15,20...55 to find best number to fit the data
    topic_nums = list(np.arange(10, 56, 5))
    coherence_scores = []
    for num in topic_nums:
        # initialize NMF model
        nmf = Nmf(
            corpus=corpus,
            num_topics=num,
            id2word=dictionary,
            chunksize=500,  # number of documents used in each training chunk
            passes=10,  # number of full passes over the training corpus
            kappa=0.1,  # gradient descent step size
            minimum_probability=0.001,
            w_max_iter=300,  # maximum number of iterations to train W per batch
            w_stop_condition=0.0001,  # stop training W for the batch once the error difference drops below this
            h_max_iter=100,
            h_stop_condition=0.001,
            normalize=True,
            random_state=42)

        # initialize Coherence Model https://radimrehurek.com/gensim/models/coherencemodel.html
        # Calculate topic coherence for topic models
        cm = CoherenceModel(model=nmf,
                            texts=texts,
                            dictionary=dictionary,
                            coherence='c_v')

        coherence_scores.append(round(cm.get_coherence(), 5))

    # get list of different topic numbers and their respective scores
    scores = list(zip(topic_nums, coherence_scores))
    # sort scores by score (not topic_num)
    scores = sorted(scores, key=itemgetter(1), reverse=True)
    # get the best number of topics
    best_num_topics, best_coherence_score = scores[0]
    # best_coherence_score = scores[0][1]
    print('scores: ', scores)

    print('num_topics: ', str(best_num_topics))
    print('coherence: ', str(best_coherence_score))
    # print(df.head())

    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),  # unigrams and bigrams
        max_df=0.60,
        min_df=10,
        max_features=5000,
        preprocessor=' '.join)

    # fit+transform: returns document-term matrix (frequency of word i in document j)
    tfidf = tfidf_vectorizer.fit_transform(texts)
    # all the words we'll be looking at
    tfidf_fn = tfidf_vectorizer.get_feature_names()  # use get_feature_names_out() on scikit-learn >= 1.0

    # grid search for best alpha, l1_ratio combination
    # measured by lowest sum squared residual
    # l1_ratio: regularization mixing parameter (0 => l2 penalty, 1 => l1 penalty, (0,1) => mixture)
    # alpha: constant that multiplies the regularization terms (0 => no regularization)
    squared_residuals = []
    params = {}
    models = []
    sorted_articles_dfs = []
    complete_topics_dfs = []
    alphas = list(np.arange(0.0, 1.2, 0.2))
    l1_ratios = list(np.arange(0.0, 1.2, 0.2))
    count_params = 0
    successes = 0
    count_successes = {}
    for a in alphas:
        for b in l1_ratios:
            # print('alpha: {}, l1_ratio: {}'.format(a,b))

            # learn a model
            nmf = NMF(
                n_components=best_num_topics,
                init='nndsvd',  # non-negative double singular value decomposition
                max_iter=500,
                l1_ratio=b,
                solver='cd',  # coordinate descent
                alpha=a,  # 'alpha' was split into alpha_W/alpha_H in newer scikit-learn versions
                tol=0.0001,
                random_state=42).fit(tfidf)

            try:
                # transforms documents -> document-term matrix, transforms data according to model
                docweights = nmf.transform(tfidf)  # (articles x topics)

                # topic dataframe: (best_num_topics x 8)
                # (topic num : top 8 words that describe the topic)
                n_top_words = 8
                topic_df = topic_table(nmf, tfidf_fn, n_top_words).T

                # clean the topic words
                topic_df['topics'] = topic_df.apply(lambda x: [' '.join(x)],
                                                    axis=1)
                topic_df['topics'] = topic_df['topics'].str[0]
                topic_df['topics'] = topic_df['topics'].apply(
                    lambda x: whitespace_tokenizer(x))
                topic_df['topics'] = topic_df['topics'].apply(
                    lambda x: unique_words(x))
                topic_df['topics'] = topic_df['topics'].apply(
                    lambda x: [' '.join(x)])
                topic_df['topics'] = topic_df['topics'].str[0]

                # clean topic dataframe
                topic_df = topic_df['topics'].reset_index()
                topic_df.columns = ['topic_num', 'topics']

                topics = topic_df[['topic_num', 'topics']].copy()  # copy so columns can be added below without a SettingWithCopyWarning

                # assign topics to each article
                title = newspaper_input['title'].tolist()
                df_temp = pd.DataFrame({
                    'title': title,
                    'topic_num': docweights.argmax(axis=1)
                })
                merged_topic = df_temp.merge(topic_df,
                                             on='topic_num',
                                             how='left')
                complete_df = merged_topic.merge(newspaper_input,
                                                 on='title',
                                                 how='left')

                # complete_df = complete_df.drop('processed_text', axis=1)

                # maybe unnecessary?
                complete_df = complete_df.drop_duplicates(subset=['title'])
                sorted_articles = complete_df.sort_values(by=['topic_num'])

                # get num articles per topic
                num_articles_per_topic = []
                for topic in range(best_num_topics):
                    count = 0
                    for index, row in sorted_articles.iterrows():
                        if row['topic_num'] == topic:
                            count += 1
                    num_articles_per_topic.append(count)

                # keep track of how many articles are given each topic
                topics['num_articles'] = num_articles_per_topic

                # matrices from nmf (A = WH)
                mat_A = tfidf_vectorizer.transform(texts)
                mat_W = nmf.components_
                mat_H = nmf.transform(mat_A)

                # residuals: how well the topics approximate the data (observed value - predicted value)
                # 0 -> the topics perfectly reconstruct the document
                # per-document residual = Frobenius norm of A[row] - H[row].dot(W),
                # where A holds the tf-idf weights, H the document-topic weights and W the topic-term weights
                r = np.zeros(mat_A.shape[0])  # num articles
                for row in range(mat_A.shape[0]):
                    r[row] = np.linalg.norm(
                        mat_A[row, :] - mat_H[row, :].dot(mat_W), 'fro')

                sum_sqrt_res = round(sum(np.sqrt(r)), 3)
                squared_residuals.append(sum_sqrt_res)

                # add avg residual column to topics
                complete_df['resid'] = r
                sorted_articles = complete_df.sort_values(by=['topic_num'])
                resid_data = complete_df[[
                    'topic_num', 'resid'
                ]].groupby('topic_num').mean().sort_values(by='resid')
                complete_topics = topics.merge(resid_data,
                                               on='topic_num',
                                               how='left')

                # save results
                sorted_articles_dfs.append(sorted_articles)
                complete_topics_dfs.append(complete_topics)
                models.append(nmf)

                count_successes[count_params] = successes
                successes += 1

            except Exception as e:
                # print('test {}, error occurred'.format(count_params))
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)

            # print('test {} complete'.format(count_params))
            params[count_params] = (a, b)
            count_params += 1

    # find best params
    params_test = np.arange(len(alphas) * len(l1_ratios))  # one entry per (alpha, l1_ratio) combination
    resid_scores = list(zip(params_test, squared_residuals))
    resid_scores = sorted(resid_scores, key=itemgetter(1))
    best_params = resid_scores[0][0]
    print('test #{} had best residual score'.format(best_params))
    print('params: a={}, b={}'.format(params[best_params][0],
                                      params[best_params][1]))
    print('residual scores: {}'.format(resid_scores))

    best_articles = sorted_articles_dfs[count_successes[best_params]]
    best_topics = complete_topics_dfs[count_successes[best_params]]

    # call function that uses svc model to predict category based on topic words
    best_topics = predict_category(best_topics)

    # save best topics
    for idx, row in best_topics.iterrows():
        new_words = ''
        topics_itr = row['topics'].split()
        for word in topics_itr:
            new_words += get_unstemmed_word(word)
            new_words += ' '
        best_topics.at[idx, 'topics'] = new_words

    categories = []
    for idx, row in best_articles.iterrows():
        topic_num = row['topic_num']
        topics = best_topics.at[topic_num, 'topics']
        categories.append(best_topics.at[topic_num, 'predicted_category'])
        best_articles.at[idx, 'topics'] = topics
    best_articles['predicted_category'] = categories

    best_articles = best_articles.drop('processed_text', axis=1)
    best_articles = best_articles.drop('Unnamed: 0', axis=1)

    best_articles.to_csv('../output/topic/articles_with_nmf_topics.csv',
                         header=True,
                         index=False)
    best_topics.to_csv('../output/topic/nmf_generated_topics.csv',
                       header=True,
                       index=False)

    # save model
    with open('nmf_model.pickle', 'wb') as output:
        pickle.dump(models[best_params], output)

    with open('nmf_tfidf.pickle', 'wb') as output:
        pickle.dump(tfidf_vectorizer, output)
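Example #10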
# import argparse

from gensim.corpora import Dictionary, MmCorpus
from gensim.models.nmf import Nmf
from gensim.models import TfidfModel

from codebase.utils import TweetRawCorpusStream
from codebase.topic_utilities import export_dtm

if __name__ == "__main__":

    corpora_path = "./corpora/"
    model_path = "./models/"
    num_topics = 50
    model_suffix = "-{}topics".format(num_topics)
    modelTag = "Seventh-and-EighthWeek-Tweets-Rolling"

    nmf = Nmf.load("{}{}{}.model".format(model_path, modelTag, model_suffix))

    fileTag_list = ["First-and-SecondWeek-Tweets-Rolling"]
    for fileTag in fileTag_list:
        tfidf_corpus = MmCorpus('{}{}-tf-idf.mm'.format(corpora_path, fileTag))
        export_dtm(nmf=nmf, corpus=tfidf_corpus,\
            out_path="{}{}{}-dtm.csv".format(model_path, fileTag, model_suffix),\
            stop_at=None)
Example #11
common_dictionary = Dictionary(docs)
common_corpus = [common_dictionary.doc2bow(text) for text in docs]

# for k in range(4, 10):
#     nmf = Nmf(common_corpus, num_topics=k)
#     c_model = CoherenceModel(model=nmf, corpus=common_corpus, dictionary=common_dictionary, texts=docs, coherence='c_v')
#     print(k, c_model.get_coherence())
#     x = PrettyTable()
#     x.field_names = [''] + [ "t" + str(i+1) for i in range(0,10)]
#     for i in range(0,k):
#         x.add_row([i] + [ common_dictionary[term] for (term, w)  in nmf.get_topic_terms(i)])
#     print(x)

from gensim.matutils import jaccard
import random 
nmf = Nmf(common_corpus, num_topics=9)

texts = random.choices(docs, k=20)
texts = [docs[0], docs[20], docs[80], docs[90], docs[200], docs[210]] #[docs[i] for i in range(0, len(docs), 30)]

def get_most_likely_topic(doc):
    bow = common_dictionary.doc2bow(doc)
    topics, probabilities = zip(*nmf.get_document_topics(bow))
    max_p = max(probabilities)
    topic = topics[probabilities.index(max_p)]
    return topic

colors =  ["skyblue", "pink", "red", "green", "yellow", "cyan", "purple", "magenta", "orange", "blue"]
def get_node_color(i):
    return colors[get_most_likely_topic(texts[i])]
    # return 'skyblue' if get_most_likely_topic(texts[i]) == 0 else 'pink'
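This snippet relies on docs (a list of pre-tokenized documents) and on imports that are not shown; a hypothetical sketch of that setup:

# Hypothetical setup assumed by the snippet above (sketch only)
from gensim.corpora import Dictionary
from gensim.models.nmf import Nmf
from gensim.models import CoherenceModel

docs = [["markets", "rally", "stocks"],
        ["election", "votes", "turnout"],
        ["markets", "bonds", "yields"]]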
Example #12
    def compute_coherence_values(self,
                                 limit,
                                 start=2,
                                 step=3,
                                 model_type="lda",
                                 corpus_type="bow",
                                 show_details=False):
        """
        Compute c_v coherence for various number of topics

        Parameters:
        ----------
        dictionary : Gensim dictionary
        corpus : Gensim corpus
        texts : List of input texts
        limit : Max num of topics
        model_type : lda or mallet or nmf
        Returns:
        -------
        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        """
        self.get_term_doc_frequency()
        coherence_values = []
        model_list = []
        topics_num_arr = []
        # For Windows, make sure you put the unzipped Mallet folder under C:\
        os.environ['MALLET_HOME'] = 'C:\\Mallet\\'
        mallet_path = 'C:\\Mallet\\bin\\mallet'

        corpus_to_train = self.corpus
        if corpus_type == 'tfidf':
            print("training on TFIDF")
            tfidf = models.TfidfModel(self.corpus)
            corpus_to_train = tfidf[self.corpus]
        else:
            print("training on BOW")

        #mallet_path = 'C:...mallet_unzipped\\mallet-2.0.8\\bin\\mallet'
        for num_topics in range(start, limit, step):
            if model_type == "lda":
                model = gensim.models.ldamodel.LdaModel(corpus=corpus_to_train,
                                                        id2word=self.id2word,
                                                        num_topics=num_topics,
                                                        random_state=1,
                                                        update_every=1,
                                                        chunksize=10,
                                                        passes=10,
                                                        alpha='auto',
                                                        per_word_topics=True)
            elif model_type == "mallet":
                model = gensim.models.wrappers.LdaMallet(
                    mallet_path,
                    corpus=corpus_to_train,
                    num_topics=num_topics,
                    id2word=self.id2word,
                    random_seed=1)
            elif model_type == "nmf":
                model = Nmf(corpus=corpus_to_train,
                            num_topics=num_topics,
                            id2word=self.id2word,
                            random_state=1)

            else:
                print(
                    'model {} is not supported. the models are *lda*, *mallet* and *nmf*'
                    .format(model_type))
                continue
            model_list.append(model)
            topics_num_arr.append(num_topics)
            coherence_model = CoherenceModel(model=model,
                                             texts=self.data_lemmatized,
                                             dictionary=self.id2word,
                                             coherence='c_v')
            coherence_num = coherence_model.get_coherence()
            coherence_values.append(coherence_num)

            print(num_topics, ':    -    coherence:', coherence_num)
        optimal_idx = np.argmax(coherence_values)
        self.model = model_list[optimal_idx]
        self.num_topics = topics_num_arr[optimal_idx]
        print("optimal model has a coherence value of ",
              round(coherence_values[optimal_idx], 2), ' and # topics: ',
              (topics_num_arr[optimal_idx]))

        # Visualize
        topic_modeler.show_coherence_vals_graph(coherence_values,
                                                limit,
                                                start=start,
                                                step=step)

        if show_details:
            for m, cv in zip(topics_num_arr, coherence_values):
                print("Num Topics =", m, " has Coherence Value of",
                      round(cv, 4))
        return model_list, coherence_values
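Example #13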
def main(query,output_filename,window=50,topicn=50):
	print ('Training nmf model began')
	frame = inspect.currentframe()
	args, _, _, values = inspect.getargvalues(frame)
	query_parameters = [(i, values[i]) for i in args]
	document_collection_original=blacklab.search_blacklab(query,window=window,lemma=True, include_match=False)
	print ("Search finished")
	document_collection=[match['complete_match'].strip() for match in document_collection_original[0:100]]

	#Use the phraser model
	
	phraser_model = Phraser(Phrases.load(constants.OUTPUT_FOLDER+'phrase_model'))
	document_collection=[' '.join(phraser_model[match['complete_match'].strip().split()]) for match in document_collection_original]
	print ("Phraser model done")
	#get rid of stop words
	document_collection_filtered = document_collection
	'''
	for text in document_collection:
		new_text = []
		for word in text.split():
			if (word not in set(stopwords.words('english')) and (word[0] in string.ascii_uppercase + string.ascii_lowercase)):
				new_text.append(word)
		document_collection_filtered.append(' '.join(new_text))
	'''
	print ("Filtering done")
	
	#build the corpus
	preprocessed_corpus = []

	for i,text in enumerate(document_collection_filtered):
		if i==0:
			print (i)
			text = text.split()
			
			
			dct=gensim_utils.initialize_gensim_dictionary([text])
		else:
			print (i)
			text = text.split()
			gensim_utils.add_documents_to_gensim_dictionary(dct,[text])
	#Filter it here
	
	dct.filter_extremes(no_below=10, no_above=0.95)
	
	gensim_corpus = [dct.doc2bow(bag_of_word.split()) for bag_of_word in document_collection_filtered]
	
	#text = document_collection_filtered[0].split()
	nmf = Nmf(gensim_corpus, num_topics=50)
	words = list(dct.token2id.keys())

	topics =  nmf.print_topics(50)
	for topic in topics:

		topic_words = topic[1].split('+')
		print_topic = []
		for topic_word in topic_words:
			print_topic.append(dct[int(topic_word.split('*')[1][1:].strip()[:-1])])  # look the token id up in the dictionary
		print (' '.join(print_topic))

	#get topic of a given document: nmf.get_document_topics(gensim_corpus[0])
	#dct.token2id.keys()
	#nmf.show_topic(10)
	#nmf.get_document_topics(dct.doc2bow(preprocessed_corpus[0]))
	pdb.set_trace()
Example #14
class TopicModel(object):
    def __init__(self):
        self.__corpus = None
        self.__modelName = None

        self.__model = None
        self.__modelFile = 'results/model.bin'

        self.__coherenceModel = None

    def setCorpus(self, corpus):
        self.__corpus = corpus

    def getCoherence(self):
        return self.__coherenceModel.get_coherence()

    def getDocumentTopics(self, document, threshold=None):
        return self.__model.get_document_topics(document, threshold)

    def build(self, model_name, num_topics, chunksize, passes, corpus=None):
        self.__modelName = model_name
        # Update corpus if necessary
        if isinstance(corpus, Corpus):
            self.__corpus = corpus
        # Build topic model
        if model_name == 'lda':
            self.__buildLDA(num_topics, chunksize, passes)
        elif model_name == 'nmf':
            self.__buildNMF(num_topics, chunksize, passes)
        # Build coherence model
        self.__buildCoherenceModel()

    def __buildLDA(self, num_topics, chunksize, passes):
        self.__model = LdaMulticore(self.__corpus,
                                    id2word=self.__corpus.getDictionary(),
                                    num_topics=num_topics,
                                    chunksize=chunksize,
                                    passes=passes,
                                    eval_every=None,
                                    workers=40,
                                    random_state=10)

    def __buildNMF(self, num_topics, chunksize, passes):
        self.__model = Nmf(self.__corpus,
                           id2word=self.__corpus.getDictionary(),
                           num_topics=num_topics,
                           chunksize=chunksize,
                           passes=passes,
                           eval_every=None,
                           random_state=10)

    def __buildCoherenceModel(self):
        self.__coherenceModel = CoherenceModel(model=self.__model,
                                               texts=self.__corpus.getTexts(),
                                               coherence='c_v',
                                               processes=7)

    def __printTopics(self):
        print('  Topics')
        for idx, topic in self.__model.print_topics(-1):
            print('    {}: {}'.format(idx, topic))

    def save(self):
        self.__model.save(self.__modelFile)

    def load(self, model_name):
        self.__modelName = model_name

        if model_name == 'lda':
            self.__model = LdaMulticore.load(self.__modelFile)
        elif model_name == 'nmf':
            self.__model = Nmf.load(self.__modelFile)
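The Corpus class used by TopicModel is not shown in this example; below is a minimal sketch of the interface the class appears to expect (an iterable of bag-of-words vectors that also exposes getDictionary() and getTexts()):

# Hypothetical minimal Corpus implementation (sketch only, not part of the original example)
from gensim.corpora import Dictionary

class Corpus:
    def __init__(self, texts):
        self.__texts = texts  # list of token lists
        self.__dictionary = Dictionary(texts)

    def __iter__(self):
        for text in self.__texts:
            yield self.__dictionary.doc2bow(text)

    def getDictionary(self):
        return self.__dictionary

    def getTexts(self):
        return self.__texts

Example #15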
                        dest="preFileTag",
                        help='preFileTag used to select previous nmf model')
    args = parser.parse_args()

    num_topics = args.num_topics
    fileTag = args.fileTag
    preFileTag = args.preFileTag
    preDictTag = args.preDictTag
    corpora_path = "./corpora/"
    model_path = "./models/"
    model_suffix = "-{}topics".format(num_topics)

    #### Step 1, Load Corpus ####
    if (preFileTag == None) and (preDictTag == None):
        dct = Dictionary.load('{}{}.dict'.format(corpora_path, fileTag))
    # for rolling way first built model
    elif (preFileTag == None) and (preDictTag != None):
        dct = Dictionary.load('{}{}.dict'.format(corpora_path, preDictTag))

    tfidf_corpus = MmCorpus('{}{}-tf-idf.mm'.format(corpora_path, fileTag))

    #### Step 2, train NMF to extract topic patterns ####
    if preFileTag == None:
        nmf = Nmf(tfidf_corpus, id2word=dct, num_topics=num_topics)
    elif preFileTag != None:
        nmf = Nmf.load("{}{}{}.model".format(model_path, preFileTag,
                                             model_suffix))
        nmf.update(tfidf_corpus)

    #### Step 3, export model ####
    nmf.save("{}{}{}.model".format(model_path, fileTag, model_suffix))
Example #16
    def train(self,
              data=AbstractModel.ROOT + '/data/test.txt',
              num_topics=20,
              preprocessing=False,
              passes=1,
              kappa=1.0,
              minimum_probability=0.01,
              w_max_iter=200,
              w_stop_condition=0.0001,
              h_max_iter=50,
              h_stop_condition=0.001,
              eval_every=10,
              normalize=True,
              random_state=None):
        """
        Train the model and generate the results on the corpus
            :param data: The training corpus as path or list of strings
            :param int num_topics: The desired number of topics
            :param bool preprocessing: If true, apply preprocessing to the corpus
            :param int passes: Number of full passes over the training corpus. Leave at default passes=1 if your input is an iterator.
            :param float kappa: Gradient descent step size. Larger value makes the model train faster, but could lead to non-convergence if set too large.
            :param float minimum_probability: If normalize is True, topics with smaller probabilities are filtered out. If normalize is False, topics with smaller factors are filtered out. If set to None, a value of 1e-8 is used to prevent 0s.
            :param float w_max_iter: Maximum number of iterations to train W per each batch.
            :param float w_stop_condition: If error difference gets less than that, training of W stops for the current batch.
            :param float h_max_iter: Maximum number of iterations to train h per each batch.
            :param float h_stop_condition: If error difference gets less than that, training of h stops for the current batch.
            :param int eval_every: Number of batches after which l2 norm of (v - Wh) is computed. Decreases performance if set too low.
            :param bool normalize:  Whether to normalize the result. Allows for estimation of perplexity, coherence, e.t.c.
            :param int random_state: Seed for random generator. Needed for reproducibility.

        """
        frequency = defaultdict(int)
        data = input_to_list_string(data, preprocessing)
        for text in data:
            for token in text.split(' '):
                frequency[token] += 1

        if preprocessing:
            data = map(preprocess, data)

        texts = [[
            token for token in text.split(' ')
            if frequency[token] > 1 and len(token) > 0
        ] for text in data]

        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        nmf_model = Nmf(corpus,
                        id2word=dictionary,
                        num_topics=num_topics,
                        kappa=kappa,
                        minimum_probability=minimum_probability,
                        w_max_iter=w_max_iter,
                        w_stop_condition=w_stop_condition,
                        h_max_iter=h_max_iter,
                        h_stop_condition=h_stop_condition,
                        eval_every=eval_every,
                        normalize=normalize,
                        random_state=random_state)

        self.model = nmf_model
        self.dictionary = dictionary
        self.corpus_predictions = nmf_model[corpus]

        return 'success'
Example #17
# Create a list of the topic numbers we want to try
topic_nums = list(np.arange(5, 75 + 1, 5))

# Run the nmf model and calculate the coherence score
# for each number of topics
coherence_scores = []
for num in tqdm(topic_nums):
    nmf = Nmf(corpus=corpus,
              num_topics=num,
              id2word=dictionary,
              chunksize=2000,
              passes=5,
              kappa=.1,
              minimum_probability=0.01,
              w_max_iter=300,
              w_stop_condition=0.0001,
              h_max_iter=100,
              h_stop_condition=0.001,
              eval_every=10,
              normalize=True,
              random_state=42)

    # Run the coherence model to get the score
    cm = CoherenceModel(model=nmf,
                        texts=texts,
                        dictionary=dictionary,
                        coherence='c_v')

    coherence_scores.append(round(cm.get_coherence(), 5))
Example #18
# TODO Several useful descriptive insight methods in this article
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#15visualizethetopicskeywords
# https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/#14.-pyLDAVis

if not os.path.exists("img"):
    os.mkdir("img")

#####################################################################################
# Wordcloud by topic
#####################################################################################

# Load model(s)
# lda = LdaModel.load("models/lda")
dct = gensim.utils.SaveLoad.load("models/dct")
corpus = gensim.corpora.MmCorpus("models/corpus")
nmf = Nmf.load("models/nmf")

# Generate word cloud for final model
labels = {
    1: "Economic activity",
    2: "Policy action",
    3: "Economic outlook",
    4: " Employment",
    5: "Financial Markets",
    6: "Inflation"
}
for topic in range(0, NUM_TOPICS):
    termsnmf = nmf.show_topic(topic, topn=50)
    # Model returns list of tuples, wordcloud wants a dictionary instead
    wordcloudnmf = WordCloud(
        background_color="white").generate_from_frequencies(dict(termsnmf))
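The word-cloud snippet relies on imports and a NUM_TOPICS constant defined earlier in the full script; a sketch of that assumed setup (NUM_TOPICS = 6 is inferred from the labels dictionary above):

# Hypothetical setup assumed by the word-cloud snippet (sketch only)
import os
import gensim
from gensim.models.nmf import Nmf
from wordcloud import WordCloud

NUM_TOPICS = 6  # matches the six labelled topics above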
Example #19
def nmf_coherence_scores(text_l, min_df, max_df):
    '''
    Build Gensim NMF model, calculate coherence scores for various numbers of topics,
    and plot coherence scores against number of topics
    '''
    texts = [word_tokenize(text) for text in text_l]

    # Create a dictionary
    dictionary = corpora.Dictionary(texts)

    # Filter out extremes to limit the number of features
    dictionary.filter_extremes(
        no_below=min_df,
        no_above=max_df
    )

    # Create the bag-of-words format (list of (token_id, token_count))
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Create a list of the topic numbers we want to try
    topic_nums = list(np.arange(5, 75 + 1, 5))

    # Run NMF model and calculate coherence score for each number of topics
    coherence_scores = []

    for num in topic_nums:
        nmf = Nmf(
            corpus=corpus,
            num_topics=num,
            id2word=dictionary,
            chunksize=2000,
            passes=5,
            kappa=.1,
            minimum_probability=0.01,
            w_max_iter=300,
            w_stop_condition=0.0001,
            h_max_iter=100,
            h_stop_condition=0.001,
            eval_every=10,
            normalize=True,
            random_state=42
        )
        
        cm = CoherenceModel(
            model=nmf,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v'
        )
        
        coherence_scores.append(round(cm.get_coherence(), 5))

    # Get the number of topics with the highest coherence score
    scores = list(zip(topic_nums, coherence_scores))
    best_num_topics = sorted(scores, key=operator.itemgetter(1), reverse=True)[0][0]
    print(scores)
    print(best_num_topics)
    
    # Plot coherence scores
    plt.figure(figsize=(8,5))
    plt.plot(topic_nums, coherence_scores, color='r', linewidth=2)
    plt.title('NMF Model Optimization: Coherence Scores', fontsize=16)
    plt.xlabel('Number of topics', fontsize=14)
    plt.ylabel('Coherence score', fontsize=14)
    plt.xticks(np.arange(5,80,5), fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()