Example #1
    def top_topics(self, corpus, texts=None, dictionary=None, window_size=None,
                   coherence='u_mass', topn=20, processes=-1):
        """Get the topics sorted by coherence.

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).
        texts : list of list of str, optional
            Tokenized texts, needed for coherence models that use a sliding-window-based probability
            estimator (i.e. coherence=`c_something`).
        dictionary : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
            Dictionary mapping word ids to words, used to create the corpus.
            If `model.id2word` is present, this is not needed. If both are provided, the passed `dictionary` will be used.
        window_size : int, optional
            Size of the window to be used for coherence measures that use a boolean sliding window as their
            probability estimator. For 'u_mass' this doesn't matter.
            If None, the default window sizes are used: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10.
        coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
            Coherence measure to be used.
            The fastest method is 'u_mass'; 'c_uci' is also known as `c_pmi`.
            For 'u_mass', `corpus` should be provided; if `texts` is provided, it will be converted to a corpus
            using the dictionary. For 'c_v', 'c_uci' and 'c_npmi', `texts` should be provided (`corpus` isn't needed).
        topn : int, optional
            Integer corresponding to the number of top words to be extracted from each topic.
        processes : int, optional
            Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as
            num_cpus - 1.

        Returns
        -------
        list of (list of (float, str), float)
            Each element in the list is a pair of a topic representation and its coherence score. Topic representations
            are distributions of words, represented as a list of pairs of word probabilities and the words themselves.

        """
        cm = CoherenceModel(
            model=self, corpus=corpus, texts=texts, dictionary=dictionary,
            window_size=window_size, coherence=coherence, topn=topn,
            processes=processes
        )
        coherence_scores = cm.get_coherence_per_topic()

        str_topics = []
        for topic in self.get_topics():  # topic = array of vocab_size floats, one per term
            bestn = matutils.argsort(topic, topn=topn, reverse=True)  # top terms for topic
            beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]  # membership, token
            str_topics.append(beststr)  # list of topn (float membership, token) tuples

        scored_topics = zip(str_topics, coherence_scores)
        return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)
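A minimal usage sketch for the method above, assuming an already trained gensim LdaModel named `lda` and its training `corpus` (both placeholder names):

# Rank this model's topics by 'u_mass' coherence and peek at the best three.
ranked = lda.top_topics(corpus=corpus, coherence='u_mass', topn=10)
for topic_words, score in ranked[:3]:
    # topic_words is a list of (probability, token) pairs for the topic's top terms
    print(round(score, 4), [token for _, token in topic_words])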
Example #2
        #print(i,row)
        for j,(topic_num,prop_topic) in enumerate(row):
            w.write(str(prop_topic))
            w.write(',')
        
        w.write('\b\n')
print('\nNumber of topics : ', num_topic)
print('\nCoherence Score: ', np.round(coherence,4))
'''

# RUNNING MODEL WITH MALLET
mallet_path = 'mallet-2.0.8/bin/mallet'
model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=15, id2word=id2word, workers = 2, random_seed = 0)

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
print('--------------------3')

#optimal_model = model
#model_topics = optimal_model.show_topics(formatted=False)
ldamodel=model

with open ('am_LDA_wmallet_metadata.csv', 'w') as w: 
    for i,row in enumerate(ldamodel[corpus]):
        w.write(asin[i])
        w.write(',')
        #print(i,row)
        for j,(topic_num,prop_topic) in enumerate(row):
            w.write(str(prop_topic))
doc_lda = lda_model[corpus]

# In[44]:

# Model perplexity and topic coherence provide a convenient
# measure to judge how good a given topic model is.
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))
# a measure of how good the model is. lower the better.
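# Note: log_perplexity returns the per-word likelihood bound, not the perplexity itself.
# A small conversion sketch, assuming the same base-2 convention gensim uses in its own log output:
import numpy as np
per_word_bound = lda_model.log_perplexity(corpus)  # closer to zero is better
print('\nPerplexity estimate: ', np.exp2(-per_word_bound))  # conventional perplexity, lower is better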

# In[46]:

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=data_lemmatized,
                                     dictionary=id2word,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# In[48]:

# Visualize the topics
pyLDAvis.enable_notebook(sort=True)
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

# In[49]:

pyLDAvis.display(vis)

# In[109]:
Example #4
    # Check resulting topics.
    listOfTopics = ldaModel.print_topics(num_topics=numberOfTopics,
                                         num_words=15)
    for index, i in enumerate(listOfTopics):
        string = str(i[1])
        for c in "0123456789+*\".":
            string = string.replace(c, "")
        string = string.replace("  ", " ")
        print(string)
    # calculate & display perplexity
    print('\nPerplexity: ', ldaModel.log_perplexity(
        corpus))  # a measure of how good the model is. lower the better.

    # calculate & display coherence
    coherenceModel = CoherenceModel(model=ldaModel,
                                    texts=document,
                                    dictionary=dictionary,
                                    coherence='c_v')
    ldaCoherence = coherenceModel.get_coherence()
    print('\nCoherence Score: ', ldaCoherence)

    # assign a file name based on the loop number so that models aren't overridden during successive iterations.
    path = './models/both/nouns_only'
    if not os.path.exists(path):
        os.makedirs(path)
    ldaModel.save(f'./models/both/nouns_only/model1-{loopNum}.model')

# save the model dataframe for use in later sections.
modelDataframe.to_csv('./dataframes/model_df.csv')
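If the saved models need to be compared later, here is a hedged sketch of reloading one and re-scoring it, assuming `ldaModel` above is a gensim LdaModel and that `document`, `dictionary`, and a particular `loopNum` are still in scope:

from gensim.models.ldamodel import LdaModel

reloadedModel = LdaModel.load(f'./models/both/nouns_only/model1-{loopNum}.model')
reloadedCoherence = CoherenceModel(model=reloadedModel,
                                   texts=document,
                                   dictionary=dictionary,
                                   coherence='c_v').get_coherence()
print('Reloaded coherence: ', reloadedCoherence)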
Example #5
    def most_similar_texts(self,
                           X,
                           num_examples,
                           text_column_name,
                           num_topics=None):
        """
        Uses NMF clustering to create n topics based on adjusted word frequencies

        Parameters
        --------
        X: DataFrame
        num_examples: int
        text_column_name: str
        num_topics: int
            Optional - if None, the algorithm will determine the best number

        Returns
        --------
        topic_words_df: DataFrame
            Top 15 words/phrases per topic
        combined_df: DataFrame
            Original text with topic number assigned to each

        """
        X = X[~X[text_column_name].isna()]
        X = X[X[text_column_name] != ""]
        X = X[X[text_column_name] != " "]
        X = X[X[text_column_name] != "NA"]
        X = X[X[text_column_name] != "n/a"]
        X = X[X[text_column_name] != "N/A"]
        X = X[X[text_column_name] != "na"]

        all_stop_words = (set(ENGLISH_STOP_WORDS)
                          | set(["-PRON-"])
                          | set(string.punctuation)
                          | set([" "]))

        ct = CleanText()
        vectorizer = TfidfVectorizer(
            tokenizer=ct.lematize,
            ngram_range=(1, 3),
            stop_words=all_stop_words,
            min_df=5,
            max_df=0.4,
        )
        vectors = vectorizer.fit_transform(X[text_column_name]).todense()

        # Adding words/phrases used in text data frequencies back into the dataset (so we can see feature importances later)
        vocab = vectorizer.get_feature_names()
        vector_df = pd.DataFrame(vectors, columns=vocab, index=X.index)

        if X.shape[0] < 20:
            return "Too few examples to categorize."

        if not num_topics:

            # ceil of 1% of rows can be as low as 1, so add 1 to get at least 2 topics
            # The rest are based on eyeballing numbers
            min_topics = ceil(X.shape[0] * 0.01) + 1
            max_topics = ceil(X.shape[0] * 0.2)
            step = ceil((max_topics - min_topics) / 5)

            topic_nums = list(np.arange(min_topics, max_topics, step))

            texts = X[text_column_name].apply(ct.lematize)

            # In gensim a dictionary is a mapping between words and their integer id
            dictionary = Dictionary(texts)

            # Filter out extremes to limit the number of features
            dictionary.filter_extremes(no_below=2, no_above=0.85, keep_n=5000)

            # Create the bag-of-words format (list of (token_id, token_count))
            corpus = [dictionary.doc2bow(text) for text in texts]

            coherence_scores = []

            for num in topic_nums:
                model = nmf.Nmf(
                    corpus=corpus,
                    num_topics=num,
                    id2word=dictionary,
                    chunksize=2000,
                    passes=5,
                    kappa=0.1,
                    minimum_probability=0.01,
                    w_max_iter=300,
                    w_stop_condition=0.0001,
                    h_max_iter=100,
                    h_stop_condition=0.001,
                    eval_every=10,
                    normalize=True,
                    random_state=42,
                )

                cm = CoherenceModel(model=model,
                                    texts=texts,
                                    dictionary=dictionary,
                                    coherence="u_mass")

                coherence_scores.append(round(cm.get_coherence(), 5))

            scores = list(zip(topic_nums, coherence_scores))
            chosen_num_topics = sorted(scores, key=itemgetter(1),
                                       reverse=True)[0][0]
        else:
            chosen_num_topics = num_topics

        model = NMF(n_components=chosen_num_topics, random_state=42)
        model.fit(vectors)
        component_loadings = model.transform(vectors)

        top_topics = pd.DataFrame(np.argmax(component_loadings, axis=1),
                                  columns=["top_topic_num"])

        top_topic_loading = pd.DataFrame(np.max(component_loadings, axis=1),
                                         columns=["top_topic_loading"])

        X.reset_index(inplace=True, drop=False)
        vector_df.reset_index(inplace=True, drop=True)

        # Fix for duplicate text_column_name
        vector_df.columns = [x + "_vector" for x in vector_df.columns]

        combined_df = pd.concat([X, vector_df, top_topics, top_topic_loading],
                                axis=1)

        combined_df.sort_values(by="top_topic_loading",
                                ascending=False,
                                inplace=True)

        combined_df = pd.concat([X, vector_df, top_topics], axis=1)

        topic_words = {}
        sample_texts_lst = []
        for topic, comp in enumerate(model.components_):
            word_idx = np.argsort(comp)[::-1][:num_examples]
            topic_words[topic] = [vocab[i] for i in word_idx]
            sample_texts_lst.append(
                list(combined_df[combined_df["top_topic_num"] == topic]
                     [text_column_name].values[:num_examples]))

        topic_words_df = pd.DataFrame(columns=[
            "topic_num",
            "num_in_category",
            "top_words_and_phrases",
            "sample_texts",
        ])

        topic_words_df["topic_num"] = [k for k, _ in topic_words.items()]
        topic_words_df["num_in_category"] = (
            combined_df.groupby("top_topic_num").count().iloc[:, 0])
        topic_words_df["top_words_and_phrases"] = [
            x for x in topic_words.values()
        ]
        topic_words_df["sample_texts"] = sample_texts_lst

        topic_words_explode = pd.DataFrame(
            topic_words_df["sample_texts"].tolist(),
            index=topic_words_df.index,
        )

        topic_words_explode.columns = [
            "example{}".format(num)
            for num in range(len(topic_words_explode.columns))
        ]

        concated_topics = pd.concat(
            [
                topic_words_df[[
                    "topic_num", "num_in_category", "top_words_and_phrases"
                ]],
                topic_words_explode,
            ],
            axis=1,
        )

        print("Topics created with top words & example texts:")
        print(concated_topics)

        return (
            concated_topics,
            combined_df[["index", text_column_name, "top_topic_num"]],
        )
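A minimal usage sketch for the method above, assuming `df` is a DataFrame with a free-text column named "text" and `helper` is an instance of the class this method belongs to (both placeholder names):

# Cluster the texts into 8 topics and inspect the topic summary and per-document assignments.
topic_summary, topic_assignments = helper.most_similar_texts(
    df, num_examples=5, text_column_name="text", num_topics=8)
print(topic_summary.head())
print(topic_assignments.head())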
Example #6
# ### Evaluate - model #1

# In[91]:


# calculate perplexity metrics
perplexity = model_lda.log_perplexity(corpus_train)
perplexity


# In[92]:


# TODO (Lee) - confirm that filtered_data is indeed the correct dataset to pass to texts param
# calculate coherence metric
coherence = CoherenceModel(model=model_lda, texts=filtered_data, dictionary=id_to_word, coherence='c_v')
coherence_1 = coherence.get_coherence()
coherence_1


# In[94]:


# calculate coherence metric for each of the n topics
coherence_1 = coherence.get_coherence_per_topic()
coherence_1


# In[97]:

# Running and training the LDA model on the document-term matrix.

lda_model = Lda(doc_term_matrix,
                num_topics=num_topics,
                id2word=dictionary,
                random_state=500,
                passes=passes)
#pprint(lda_model.print_topics(num_words=30))

print("--------------- TOPICs Using LDA------------------------------")
for i, topic in lda_model.show_topics(formatted=True,
                                      num_topics=num_topics,
                                      num_words=30):
    print(str(i) + ": " + topic)
    print()

print(
    "----------------Perplexity and Coherence Score for LDA----------------------------"
)
print('\nPerplexity: ', lda_model.log_perplexity(
    doc_term_matrix))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=all_data,
                                     dictionary=dictionary,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
Example #8
                                   passes = 10,
                                   eval_every=1,
                                   workers = None)

lda_model.print_topics()

for idx, topic in lda_model.print_topics(-1):
    print("Category: {} \nWords: {}".format(idx, topic ))
    print("\n")

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus_dict)) 
#Perplexity:  -6.83

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dict_corpus, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
#Coherence Score:  0.4264283394676994

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics
Example #9
File: LDA.py, Project: AB19/TopicModeler
    def getCoherence(self):
        coherenceModel = CoherenceModel(model=self.model,
                                        texts=self.data,
                                        dictionary=self.id2word,
                                        coherence='c_v')
        return coherenceModel.get_coherence()
Example #10
        for n, topic in lm.show_topics(num_topics=-1, formatted=False):
            topic = [word for word, _ in topic]
            cm = CoherenceModel(topics=[topic], texts=texts, dictionary=dictionary, window_size=10)
            coherence_values[n] = cm.get_coherence()
        top_topics = sorted(coherence_values.items(), key=operator.itemgetter(1), reverse=True)
    return lm, top_topics

lm, top_topics = ret_top_model()

print(top_topics[:5])

pprint([lm.show_topic(topicid) for topicid, c_v in top_topics[:10]])

ldatopics = [[word for word, prob in topic] for topicid, topic in ldamodel.show_topics(formatted=False)]

lda_coherence = CoherenceModel(topics=ldatopics, texts=texts, dictionary=dictionary, window_size=10).get_coherence()

def evaluate_bar_graph(coherences, indices):
    """
    Function to plot bar graph.
    
    coherences: list of coherence values
    indices: Indices to be used to mark bars. Length of this and coherences should be equal.
    """
    assert len(coherences) == len(indices)
    n = len(coherences)
    x = np.arange(n)
    plt.bar(x, coherences, width=0.2, tick_label=indices, align='center')
    plt.xlabel('Models')
    plt.ylabel('Coherence Value')
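A small usage sketch for evaluate_bar_graph with the values computed above, assuming matplotlib.pyplot is already imported as plt (labels are placeholders):

# Compare the mean per-topic coherence of lm against the ldamodel baseline.
lm_mean = sum(score for _, score in top_topics) / len(top_topics)
evaluate_bar_graph([lm_mean, lda_coherence], ['lm (mean)', 'ldamodel'])
plt.show()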
Example #11
def main():

    # Get a list of all tweet texts from MongoDB
    #----------------------------------
    print('\nLoading from MongoDB..')
    cursor = collection.find({"tweet_date":"2020-12-08"})

    data = []
    for doc in cursor:
        data.append(doc["text_preprocessed"])
        #print(doc["text_preprocessed"])

    #print(text_data)
    #----------------------------------


    # Create a dictionary
    # ----------------------------------
    print('\nCreating dictionary..')
    data = [d.split() for d in data]
    dictionary = gensim.corpora.Dictionary(data)
    #print(len(id2word))

    dictionary.filter_extremes(no_below=2, no_above=.99) # Filtering Extremes
    #print(len(id2word))
    # ----------------------------------


    # Creating a corpus object
    # ----------------------------------
    print('\nCreating corpus..')
    corpus = [dictionary.doc2bow(d) for d in data]
    # ----------------------------------


    # LDA model
    # ----------------------------------
    print('\nBuilding LDA model..')
    LDA_model = gensim.models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=5)  # Instantiating a Base LDA model
    # ----------------------------------


    # Create Topics
    # ----------------------------------
    print('\nTopics:')
    words = [re.findall(r'"([^"]*)"', t[1]) for t in LDA_model.print_topics()]   # Filtering for words
    topics = [' '.join(t[0:10]) for t in words]

    for id, t in enumerate(topics): # Getting the topics
        print(f"------ Topic {id} ------")
        print(t, end="\n\n")
    # ----------------------------------


    # Print topics with probabilities
    # ----------------------------------
    print('\nTopics with probabilities:')
    for i in LDA_model.print_topics():
        for j in i: print(j)
    # ----------------------------------


    # Get most frequent words of each topic
    # ----------------------------------
    print('\nMost frequent words by topic:')
    topic_words = []
    for i in range(NUM_TOPICS):
        tt = LDA_model.get_topic_terms(i, 20)
        topic_words.append([dictionary[pair[0]] for pair in tt])

    # output
    for i in range(NUM_TOPICS):
        print(f"\n------ Topic {i} ------")
        print(topic_words[i])
    # ----------------------------------


    # Compute Coherence and Perplexity
    # ----------------------------------
    #Compute Perplexity, a measure of how good the model is. lower the better
    print('\nComputing Coherence and Perplexity..')
    base_perplexity = LDA_model.log_perplexity(corpus)
    print('\nPerplexity: ', base_perplexity)

    # Compute Coherence Score
    coherence_model = CoherenceModel(model=LDA_model, texts=data,
                                   dictionary=dictionary, coherence='c_v')
    coherence_lda_model_base = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model_base)
    # ----------------------------------


    # Creating Topic Distance Visualization
    # ----------------------------------
    print('\nCreating visualization..')
    visualisation = pyLDAvis.gensim.prepare(LDA_model, corpus, dictionary)
    pyLDAvis.save_html(visualisation, 'LDA_Visualization.html')
Example #12
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


def sent_to_words(sentences):
	for sentence in sentences:
		yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
	return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

bfinputdir = '/home/mike/QA_Project/lda_Data/Bug_Fix_sentiment_Data/'
biinputdir = '/home/mike/QA_Project/lda_Data/Bug_intro_sentiment_Data/'
bfout = '/home/mike/QA_Project/lda_Data/Bug_Fix_topic_output/'
biout = '/home/mike/QA_Project/lda_Data/Bug_intro_topic_output/'
bf_list= []
bi_list= []
total_list= []
for file in os.listdir(bfinputdir):
	file
	with open (bfinputdir+file, "r", encoding="utf8") as f:
		list1= []
		name= file.split('.')[0] 
		new_file = open(bfout+name+'_Lda1.txt','w')
		for line in f:	
			line=line.rstrip()			
			list1.append(line)
			bf_list.append(line)
			total_list.append(line)
		
		# Remove Emails
		list1 = [re.sub('\S*@\S*\s?', '', sent) for sent in list1]

		# Remove new line characters
		list1 = [re.sub('\s+', ' ', sent) for sent in list1]

		# Remove distracting single quotes
		list1 = [re.sub("\'", "", sent) for sent in list1]

		pprint(list1[:1])
		data_words = list(sent_to_words(list1))

		print(data_words[:1])
		# Build the bigram and trigram models
		bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
		trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

		# Faster way to get a sentence clubbed as a trigram/bigram
		bigram_mod = gensim.models.phrases.Phraser(bigram)
		trigram_mod = gensim.models.phrases.Phraser(trigram)

		# See trigram example
		print(trigram_mod[bigram_mod[data_words[0]]])
		# Remove Stop Words
		data_words_nostops = remove_stopwords(data_words)

		# Form Bigrams
		data_words_bigrams = make_bigrams(data_words_nostops)

		# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
		# python3 -m spacy download en
		nlp = spacy.load('en', disable=['parser', 'ner'])

		# Do lemmatization keeping only noun, adj, vb, adv
		data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

		print(data_lemmatized[:1])
		# Create Dictionary
		id2word = corpora.Dictionary(data_lemmatized)

		# Create Corpus
		texts = data_lemmatized

		# Term Document Frequency
		corpus = [id2word.doc2bow(text) for text in texts]

		# View
		print(corpus[:1])
		# Build LDA model
		lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
				                           id2word=id2word,
				                           num_topics=20, 
				                           random_state=100,
				                           update_every=1,
				                           chunksize=100,
				                           passes=10,
				                           alpha='auto',
				                           per_word_topics=True)
		# Print the Keyword in the 10 topics
		pprint(lda_model.print_topics())
		doc_lda = lda_model[corpus]
		# Compute Perplexity
		print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

		# Compute Coherence Score
		coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
		coherence_lda = coherence_model_lda.get_coherence()
		print('\nCoherence Score: ', coherence_lda)
		# Visualize the topics
		pyLDAvis.enable_notebook()
		vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
		vis
		df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=list1)

		# Format
		df_dominant_topic = df_topic_sents_keywords.reset_index()
		df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

		# Show
		df_dominant_topic.head(10)
		# Number of Documents for Each Topic
		topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

		# Percentage of Documents for Each Topic
		topic_contribution = round(topic_counts/topic_counts.sum(), 4)

		# Topic Number and Keywords
		topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

		# Concatenate Column wise
		df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

		# Change Column names
		df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

		# Show
		df_dominant_topics
Example #13
from model.util.file_parser import parse_dir_json

if __name__ == '__main__':
    init_logger()
    log = logging.getLogger('lda_model')

    config = LdaConfig(sys.argv[1], 'lda_model').get_current_config()

    _, docs = zip(*parse_dir_json(config['data_path']))

    preprocessed_docs = Preprocessor(
        max_workers=config['max_workers']).process_docs(docs)

    log.info("Loading model from %s", config['model_path'])
    lda_model = LdaMulticore.load(config['model_path'])
    log.info("Loading dictionary from %s", config['dict_path'])
    dictionary = Dictionary.load(config['dict_path'])

    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=preprocessed_docs,
                                         dictionary=dictionary,
                                         coherence='c_v')

    coherence_lda = coherence_model_lda.get_coherence()

    import csv

    with open(config['coherence_path'], "a") as csv_file:
        writer = csv.writer(csv_file, delimiter=';')
        writer.writerow([config['topics'], coherence_lda])
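A hedged sketch of reading the accumulated coherence CSV back later to pick the best topic count, assuming every row has the `topics;coherence` shape written above and reusing `config` and `log` from the snippet:

import csv

with open(config['coherence_path']) as csv_file:
    rows = [(topics, float(coherence)) for topics, coherence in csv.reader(csv_file, delimiter=';')]
best_topics, best_coherence = max(rows, key=lambda row: row[1])
log.info("Best c_v coherence %.4f at %s topics", best_coherence, best_topics)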
Example #14
def abandon():
    stopWords = set(stopwords.words('english'))

    for w in string.punctuation:
        stopWords.add(w)

    stops_words = [
        "rt", "…", "...", "URL", "http", "https", "“", "”", "‘", "’", "get",
        "2", "new", "one", "i'm", "make", "go", "good", "say", "says", "know",
        "day", "..", "take", "got", "1", "going", "4", "3", "two", "n", "like",
        "via", "u", "would", "still", "first", "really", "watch", "see",
        "even", "that's", "look", "way", "last", "said", "let", "twitter",
        "ever", "always", "another", "many", "things", "may", "big", "come",
        "keep", "5", "time", "much", "want", "think", "us", "love", "people",
        "need"
    ]

    for w in stops_words:
        stopWords.add(w)

    tokenizer = CustomTweetTokenizer(preserve_case=False,
                                     reduce_len=True,
                                     strip_handles=False,
                                     normalize_usernames=False,
                                     normalize_urls=True,
                                     keep_allupper=False)

    cnt = Counter()
    texts = []
    # comm = json.load(open("data/louvain_rst.json"))
    # users_comm = {str(u) for u in comm if comm[u] == 0}
    # print(len(users_comm))

    # loading data
    data = pd.read_csv("data/ira-tweets-ele.csv",
                       usecols=["tweet_text", "userid"])
    for i, row in tqdm(data.iterrows()):
        # if row["userid"] not in users_comm:
        #     continue
        words = tokenizer.tokenize(row["tweet_text"])
        words = [w for w in words if w not in stopWords and w]
        # if words[0] == "RT":
        #     continue
        for w in words:
            cnt[w] += 1
        texts.append(words)
    print(len(texts))
    json.dump(cnt.most_common(), open("data/word_cloud.json", "w"), indent=2)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]

    def average_distance(v_tops):
        _sum = 0
        _cnt = 0
        for i in range(len(v_tops)):
            for j in range(i + 1, len(v_tops)):
                _sum += scipy.spatial.distance.cosine(v_tops[i], v_tops[j])
                _cnt += 1
        return _sum / _cnt

    with open("data/IRA_topics.txt", "w") as f:
        for n in range(2, 12):
            print(f"N = {n}")
            lda = LdaModel(corpus, num_topics=n, random_state=42)
            v_topics = lda.get_topics()
            lda.save(f"model/lda-ira-{n}.mod")
            # pprint(lda.print_topics())

            f.write(f"Perplexity: {lda.log_perplexity(corpus)}"
                    )  # a measure of how good the model is. lower the better.

            # Compute Coherence Score
            coherence_model_lda = CoherenceModel(model=lda,
                                                 texts=texts,
                                                 dictionary=dictionary,
                                                 coherence='c_v')
            coherence_lda = coherence_model_lda.get_coherence()
            f.write(f"Coherence Score: {coherence_lda}")
            f.write(f"~Average distance: {average_distance(v_topics)}\n")
            # show
            x = lda.show_topics(num_topics=n, num_words=20, formatted=False)
            topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
            dictionary.id2token = {
                v: k
                for k, v in dictionary.token2id.items()
            }
            # Below Code Prints Topics and Words
            for topic, words in topics_words:
                f.write(
                    str(topic) + " :: " +
                    str([dictionary.id2token[int(w)] for w in words]) + "\n")
            f.write("\n")
import joblib
from gensim.models import CoherenceModel

#load the model
lda_model = joblib.load('62topiclda.pkl')

#load the dictionary
dictionary = joblib.load('dictionary.pkl')

#load the corpus
bow_corpus = joblib.load('bow_corpus.pkl')

if __name__ == "__main__":
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=bow_corpus.tolist(),
                                         dictionary=dictionary,
                                         coherence='c_v')

    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
        # MANUAL APPROACH, CoherenceModel below does the same, but only provides the aggregated values
        output = model.top_topics(corpus=transtfidf, texts=minutes, coherence='u_mass', topn=20)
        topicScores = [item[1] for item in output]
        avgScore = 0
        for score in topicScores:
            avgScore += score
        avgScoreArr.append(avgScore/num_topics)
        topicScoreArr.append(topicScores)
        print(avgScore, avgScore/num_topics)

        print("Starting to apply coherence model")

        cm = CoherenceModel(
            model=model,
            corpus=transtfidf,
            texts=minutes,
            dictionary=dct,
            coherence='u_mass'
        )
        coherenceScoreAlt.append(round(cm.get_coherence(), 5))

        # print("Finished using coherence model, next iteration")

    pickle.dump(topicScoreArr, open("coherenceDump", "wb"))
    pickle.dump(coherenceScoreAlt, open("coherenceDumpAlt", "wb"))
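    # The aggregated score above hides per-topic detail; get_coherence_per_topic() on the same
    # CoherenceModel returns one u_mass score per topic (a sketch using the last `cm` built above).
    per_topic_scores = cm.get_coherence_per_topic()
    print(len(per_topic_scores), 'topics, mean u_mass:', sum(per_topic_scores) / len(per_topic_scores))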

# exit()

coherenceScore = pickle.load(open("coherenceDump", "rb"))
coherenceScoreAlt = pickle.load(open("coherenceDumpAlt", "rb"))
# In[31]:

pprint(ldamodel.print_topics())
doc_lda = ldamodel[doc_term_matrix]

# In[32]:

# Compute Perplexity
print('\nPerplexity: ', ldamodel.log_perplexity(doc_term_matrix))
# a measure of how good the model is. lower the better.

# In[33]:

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=ldamodel,
                                     texts=doc_clean,
                                     dictionary=dictionary,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# ### 4.5 Visualize the topics
#
# ***

# In[35]:

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
vis

# In[36]:
Example #18
model_list = []
data_list = []
dict_list = []
with open(file_name, 'rb') as f:
    while True:
        try:
            iteration, model, time_arr, data, id2word, _ = pickle.load(f)
            model_list.append(model)
            data_list.append(data)
            dict_list.append(id2word)
        except EOFError:
            break

coherence_list = []
count = 0
for i in range(0, len(model_list)):
    model = model_list[i]
    data = data_list[i]
    id2word = dict_list[i]
    print(id2word)
    count += 1
    print('Iteration ' + str(count))
    coherencemodel = CoherenceModel(model=model,
                                    texts=data,
                                    dictionary=id2word,
                                    coherence='c_v')
    coherence_list.append(coherencemodel.get_coherence())

with open(output, 'wb') as f:
    pickle.dump((coherence_list, time_arr), f)
Example #19
File: lda_mp.py, Project: cvraut/Music_Econ
pickle_loc = lambda t: "lda_ml_pickles/lda_mp_{}_topics_{}_songs.pickle".format(
    t, len(texts))

min_topics = 3
max_topics = 100

topics_to_coherence = {}

for topics in range(min_topics, max_topics + 1):
    lda_model_dist = gensim.models.LdaMulticore(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=topics,
                                                random_state=100,
                                                chunksize=100,
                                                passes=10,
                                                alpha='symmetric',
                                                per_word_topics=True)
    coherence_model_lda_dist = CoherenceModel(model=lda_model_dist,
                                              texts=texts,
                                              dictionary=id2word,
                                              coherence='c_v')
    coherence_lda_dist = coherence_model_lda_dist.get_coherence()
    topics_to_coherence[topics] = coherence_lda_dist
    pickle.dump(lda_model_dist, open(pickle_loc(topics), 'wb'))
    pickle.dump(topics_to_coherence,
                open("lda_ml_pickles/topics_to_coherence.pickle", "wb"))
    print("Done with {} topics using {} song records!".format(
        topics, len(texts)))
print("\n\nwhew, all done! :)")
# Show
df3_dominant_topic.head(10)


# In[46]:


lda_model.log_perplexity(doc_term_matrix) #Perplexity, lower the better


# In[47]:


from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenised_corpus, dictionary=dictionary,coherence="c_v")
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


# In[48]:


def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
Example #21
def runlda(rawdata):
    # Convert to list
    data = []
    data.extend(tokenize.sent_tokenize(rawdata))
    print(data)
    def sent_to_words(sentences):
        for sentence in sentences:
            yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

    # Tokenize into words
    print('Tokenizing')
    data_words = list(sent_to_words(data))

    # Build the bigram and trigram models
    print('Creating bigrams and trigrams')
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    print('Building bigram and trigram models')
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Define functions for stopwords, bigrams, trigrams and lemmatization
    def remove_stopwords(texts):
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        """https://spacy.io/api/annotation"""
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent)) 
            texts_out.append([token.lemma_ for token in doc])# if token.pos_ in allowed_postags])
        return texts_out

    # Remove Stop Words
    print('Removing stopwords')
    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    print('Forming bigrams')
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    print('Lemmatizing')
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    print('Creating dictionary')
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    print('Creating corpus')
    texts = data_lemmatized

    # Term Document Frequency
    print('Creating term frequency list')
    corpus = [id2word.doc2bow(text) for text in texts]

    cwd = os.getcwd()
    mallet_path = cwd + '/mallet-2.0.8/bin/mallet' # update this path
    ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=10, id2word=id2word)
    
    def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
        """
        Compute c_v coherence for various number of topics

        Parameters:
        ----------
        dictionary : Gensim dictionary
        corpus : Gensim corpus
        texts : List of input texts
        limit : Max num of topics

        Returns:
        -------
        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        """
        coherence_values = []
        model_list = []
        for num_topics in range(start, limit, step):
            print('Calculating {}-topic model'.format(num_topics))
            model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())

        return model_list, coherence_values

    # Can take a long time to run.
    limit=5; start=4; step=1;
    model_list, coherence_values = compute_coherence_values(dictionary=id2word,
                                                            corpus=corpus,
                                                            texts=data_lemmatized,
                                                            start=start,
                                                            limit=limit,
                                                            step=step)
    
    # Print the coherence scores
    x = range(start, limit, step)
    for m, cv in zip(x, coherence_values):
        print("Num Topics =", m, " has Coherence Value of", round(cv, 6))
    
    # Select the model and print the topics
    index, value = max(enumerate(coherence_values), key=operator.itemgetter(1))
    print(index)
    optimal_model = model_list[index]
    model_topics = optimal_model.show_topics(num_topics=1000, formatted=False)

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10, 
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

    # Compute Perplexity
    print ('Perplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print ('Coherence Score: ', coherence_lda)

    def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
        # Init output
        sent_topics_df = pd.DataFrame()

        # Get main topic in each document
        for i, row in enumerate(ldamodel[corpus]):
            row = sorted(row, key=lambda x: (x[1]), reverse=True)
            # Get the Dominant topic, Perc Contribution and Keywords for each document
            for j, (topic_num, prop_topic) in enumerate(row):
                if j == 0:  # => dominant topic
                    wp = ldamodel.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in wp])
                    sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
                else:
                    break
        sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

        # Add original text to the end of the output
        contents = pd.Series(texts)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return(sent_topics_df)

    print('Verify topics')
    df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

    # Format
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    print('Format key words display')

    # Number of Documents for Each Topic
    topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()
    print('Topic count')

    # Percentage of Documents for Each Topic
    topic_contribution = round(topic_counts/topic_counts.sum(), 4)
    print('Topic contribution')

    # Group top 5 sentences under each topic
    sent_topics_sorteddf_mallet = pd.DataFrame()

    sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

    for i, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                                grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                                axis=0)

    # Reset Index    
    sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

    # Format
    sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

    # Show
    sent_topics_sorteddf_mallet

    # Topic Number and Keywords
    print('Add items')
    topic_num_keywords = sent_topics_sorteddf_mallet[['Topic_Num', 'Keywords']]
    print('Topic number and keywords')

    # Concatenate Column wise
    df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
    print('Concatenate column')

    # Change Column names
    df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Percent_Documents']
    print('Change column names')

    return df_dominant_topics.to_json()
    
                  max_words=5,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = ldamodel.show_topics(formatted=False)

fig, axes = plt.subplots(1, 2, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary=ldamodel.id2word, mds='mmds')
vis 

coherence_model_lda = CoherenceModel(model=ldamodel, texts=texts_lem, dictionary=dictionary1, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
Example #23
# In[18]:

lsitopics = [[word for word, prob in topic]
             for topicid, topic in lsimodel.show_topics(formatted=False)]

hdptopics = [[word for word, prob in topic]
             for topicid, topic in hdpmodel.show_topics(formatted=False)]

ldatopics = [[word for word, prob in topic]
             for topicid, topic in ldamodel.show_topics(formatted=False)]

# In[19]:

lsi_coherence = CoherenceModel(topics=lsitopics[:10],
                               texts=texts,
                               dictionary=dictionary,
                               window_size=10).get_coherence()

hdp_coherence = CoherenceModel(topics=hdptopics[:10],
                               texts=texts,
                               dictionary=dictionary,
                               window_size=10).get_coherence()

lda_coherence = CoherenceModel(topics=ldatopics,
                               texts=texts,
                               dictionary=dictionary,
                               window_size=10).get_coherence()

# In[20]:

Example #24
File: plsa_test.py, Project: ismglv/lda2vec
# View
print(corpus[:1])

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

tfidf = gensim.models.TfidfModel(corpus)  # step 1 -- initialize a model
tfidf_corpus = tfidf[corpus]

lsi_model = gensim.models.LsiModel(tfidf_corpus,
                                   id2word=id2word,
                                   num_topics=300)

coherence_model = CoherenceModel(model=lsi_model,
                                 texts=texts,
                                 dictionary=id2word,
                                 coherence='c_v')

coherence = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence)

df_topic_sents_keywords = model_visualization.format_topics_sentences(
    lsi_model, corpus, texts)

df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'
]

# to get doc topics df_dominant_topic['Keywords'][doc_num]
# to get doc topics dominant quality df_dominant_topic['Dominant_Topic'][doc_num]
Example #25
    def get_coherence(model, text, dictionary):
        coherence_model = CoherenceModel(model=model,
                                         texts=text,
                                         dictionary=dictionary,
                                         coherence='c_v')
        return coherence_model.get_coherence()
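A one-line usage sketch for the helper above, assuming a trained gensim model `lda_model`, tokenized documents `texts`, and their Dictionary `id2word` (all placeholder names):

print('Coherence Score: ', get_coherence(lda_model, texts, id2word))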
Example #26
texts = clean_text_list
corpus = [id2word.doc2bow(text) for text in texts]
best_coh_score = 0
best_topics = 0
for i in range(8, 30):
    lda_model_loop = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                     id2word=id2word,
                                                     num_topics=i,
                                                     random_state=100,
                                                     update_every=1,
                                                     chunksize=100,
                                                     passes=20,
                                                     alpha='auto',
                                                     per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model_loop,
                                         texts=texts,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    if coherence_lda > best_coh_score:
        best_coh_score = coherence_lda
        best_topics = i
    print('Topics:', i)
    print('Coherence score: ', coherence_lda)
print(best_coh_score)
print(best_topics)
lda_model_best = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                 id2word=id2word,
                                                 num_topics=best_topics,
                                                 random_state=100,
                                                 update_every=1,
                                                 chunksize=100,
Example #27
wandb.init(config=config, project="topical_language_generation_sweeps")

#data preparation
cached_dir = "/home/rohola/cached_models"
tokenizer = TransformerGPT2Tokenizer(cached_dir)
dataset = TopicalDataset(config.dataset_dir, tokenizer)

docs = [doc for doc in dataset]

dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=config.no_below, no_above=config.no_above)

corpus = [dictionary.doc2bow(doc) for doc in docs]
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi_model = LsiModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=config.num_topics,
)

#cm = CoherenceModel(model=lsi_model, corpus=corpus, coherence='u_mass')
cm = CoherenceModel(model=lsi_model,
                    texts=docs,
                    dictionary=dictionary,
                    coherence='c_w2v')
# coherence = cm.get_coherence()
# print("coherence: ", coherence)
wandb.log({"coherence": cm.get_coherence()})
Example #28
lda_model = LdaModel(doc_term_matrix,
                     num_topics=5,
                     id2word=dictionary,
                     iterations=10,
                     random_state=2)
# extract topics for headlines
topics = lda_model.print_topics(num_topics=5, num_words=10)
# pprint topics
print(topics)

# Code ends here

# --------------
# coherence score
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=clean_headlines,
                                     dictionary=dictionary,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()


# Function to calculate coherence values
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
Example #29
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 20);
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words];
    return pd.DataFrame(word_dict);

get_topics(lda, num_topics)

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, corpus, id2word)


lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)]
ldatopics = [[word for word, prob in topic] for topicid, topic in lda.show_topics(formatted=False)]

lsi_coherence = CoherenceModel(model=lsimodel,topics=lsitopics,dictionary=id2word, texts=train_headlines,window_size=10).get_coherence()
lda_coherence = CoherenceModel(model=lda,topics=ldatopics,dictionary=id2word,texts=train_headlines,window_size=10).get_coherence()

#lda_coherence =CoherenceModel(model=lsimodel, corpus=corpus, coherence='u_mass').get_coherence() 

def  evaluate_bar_graph(coherences, indices):
    """
    Function to plot bar graph.
    
    coherences: list of coherence values
    indices: Indices to be used to mark bars. Length of this and coherences should be equal.
    """
    assert len(coherences) == len(indices)
    n = len(coherences)
    print(coherences)
    x = np.arange(n)
    def compute_coherence_score():
        coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)



        def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
            """
            Compute c_v coherence for various number of topics

            Parameters:
            ----------
            dictionary : Gensim dictionary
            corpus : Gensim corpus
            texts : List of input texts
            limit : Max num of topics

            Returns:
            -------
            model_list : List of LDA topic models
            coherence_values : Coherence values corresponding to the LDA model with respective number of topics
            """
            coherence_values = []
            model_list = []
            for num_topics in range(start, limit, step):
                # Build LDA model
                model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                            id2word=id2word,
                                                            num_topics=num_topics,
                                                            random_state=100,
                                                            update_every=1,
                                                            chunksize=100,
                                                            passes=10,
                                                            alpha='auto',
                                                            per_word_topics=True)
                # model = gensim.models.wrappers.LdaMallet(lda_model, corpus=corpus, num_topics=num_topics, id2word=id2word)
                model_list.append(model)
                coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
                print(coherencemodel.get_coherence())
                coherence_values.append(coherencemodel.get_coherence())

            return model_list, coherence_values

        # Can take a long time to run.
        model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized,
                                                                start=2, limit=40, step=6)

        # Show graph
        limit = 40;
        start = 2;
        step = 6;
        x = range(start, limit, step)
        plt.plot(x, coherence_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        plt.legend(("coherence_values"), loc='best')
        plt.show()

        print("LDA Gensim Printing")
        # Print the coherence scores
        for m, cv in zip(x, coherence_values):
            print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
Example #31
    def trainlda(self, topics_n = 10):
        self.num_topics = topics_n
        
        alltexts = []
        for name,sentences in self.user_sentences.items():
            sentences = [item for sublist in sentences for item in sublist]
            alltexts.append(sentences)
        
        
#        if self.ngram_dictionary == None:
#            if self.ngram == 1:
#                self.ngram_dictionary = Dictionary(self.all_sentences)
#            elif self.ngram == 2:
#                self.ngram_dictionary = Dictionary(self.all_bigram_sentences)
#                
        if self.ngram_dictionary == None:
            if self.ngram == 1:
                self.ngram_dictionary = Dictionary(alltexts)
            elif self.ngram == 2:
                self.ngram_dictionary = Dictionary(alltexts)
                
            # filter tokens that are very rare or too common from
            # the dictionary (filter_extremes) and reassign integer ids (compactify)
            self.ngram_dictionary.filter_extremes(no_below=10, no_above=0.8)
            self.ngram_dictionary.compactify()


#        if self.ngram == 1:
#            sentences = self.all_sentences
#        elif self.ngram == 2:
#            sentences = self.all_bigram_sentences
            
#        ngram_bow_corpus = []
#        for sentence in sentences:
#            ngram_bow_corpus.append(self.ngram_dictionary.doc2bow(sentence))
#
#
#        self.lda = LdaMulticore(ngram_bow_corpus,
#                           num_topics = topics_n,
#                           id2word=self.ngram_dictionary,
#                           workers=3)
        

            
        ngram_bow_corpus = []
        for sentence in alltexts:
            ngram_bow_corpus.append(self.ngram_dictionary.doc2bow(sentence))


        self.lda = LdaMulticore(ngram_bow_corpus,
                           num_topics = topics_n,
                           id2word=self.ngram_dictionary,
                           workers=3)    
        
        
        # calculate the coherence of the topics
        topics=[]

        for i in range(self.lda.num_topics):
            terms = []
            for n in self.lda.show_topic(i):
                terms.append(n[0])
            topics.append(terms)
        
        cm_umass = CoherenceModel(topics=topics, corpus=ngram_bow_corpus, dictionary=self.ngram_dictionary, coherence='u_mass')
        cm_cv = CoherenceModel(topics=topics, texts=alltexts, dictionary=self.ngram_dictionary, coherence='c_v')
        cm_cuci = CoherenceModel(topics=topics, texts=alltexts, dictionary=self.ngram_dictionary, coherence='c_uci')
        cm_cnpmi = CoherenceModel(topics=topics, texts=alltexts, dictionary=self.ngram_dictionary, coherence='c_npmi')

        return topics_n, cm_umass.get_coherence(), cm_cv.get_coherence(),cm_cuci.get_coherence(),cm_cnpmi.get_coherence()
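A hedged sketch of sweeping topic counts with the method above, assuming `tm` is an instance of the enclosing class (placeholder name):

# Compare all four coherence measures across a few candidate topic counts.
results = [tm.trainlda(topics_n=k) for k in (5, 10, 15, 20)]
for k, umass, cv, cuci, cnpmi in results:
    print(k, round(umass, 4), round(cv, 4), round(cuci, 4), round(cnpmi, 4))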