def return_topic_model(corpus, numpasses, numtopics, doc_term_matrix=None, dictionary=None):
    if doc_term_matrix is None and dictionary is None:
        # Creating the term dictionary of our corpus, where every unique term is assigned an index.
        dictionary = corpora.Dictionary(corpus)
        # Converting the list of documents (corpus) into a document-term matrix using the dictionary prepared above.
        doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]

        # Prune the vocabulary and rebuild the document-term matrix from the surviving tokens.
        pruningstart = time.time()
        dictionary.filter_extremes(no_below=5, no_above=0.8, keep_n=10000)
        vals = set(dictionary.values())  # surviving tokens, as a set for fast membership tests
        doc_term_matrix = [
            dictionary.doc2bow([x for x in doc if x in vals]) for doc in corpus
        ]
        print "[INFO] Pruning to 10000 features took", time.time() - pruningstart, "Seconds"

        with open("matrix.json", "w") as df:
            json.dump(doc_term_matrix, df)
        dictionary.save("topics_dt_mtx/dict")

    # Running and training the LDA model on the document-term matrix.
    ldamodel = Lda(doc_term_matrix,
                   num_topics=numtopics,
                   id2word=dictionary,
                   passes=numpasses,
                   alpha=1.5)
    return ldamodel
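# For reference, a self-contained Python 3 sketch of the same gensim workflow the
# function above wraps (build a Dictionary, optionally prune it, convert documents
# to bag-of-words vectors, train an LDA model). The toy documents are illustrative
# and not part of the original corpus.
from gensim import corpora
from gensim.models.ldamodel import LdaModel

toy_docs = [
    ["topic", "models", "group", "documents", "by", "theme"],
    ["lda", "represents", "each", "document", "as", "a", "mixture", "of", "topics"],
    ["each", "topic", "is", "a", "distribution", "over", "words"],
]

toy_dictionary = corpora.Dictionary(toy_docs)
# On a real corpus you would prune rare and overly common tokens, e.g.:
# toy_dictionary.filter_extremes(no_below=5, no_above=0.8, keep_n=10000)
toy_bow = [toy_dictionary.doc2bow(doc) for doc in toy_docs]

toy_lda = LdaModel(toy_bow, num_topics=2, id2word=toy_dictionary, passes=10)
for topic_id, words in toy_lda.print_topics(num_topics=2, num_words=5):
    print(topic_id, words)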
def cluster(doc_term_matrix, num, word_dict):
    ldamodel = Lda(doc_term_matrix, num_topics=num, id2word=word_dict)
    doc_topics = ldamodel.get_document_topics(
        doc_term_matrix, minimum_probability=0.20)  # needs tuning
    result = [[] for i in range(num)]
    for k, topic in enumerate(doc_topics):
        # Some articles do not have a topic
        if topic:
            topic.sort(key=itemgetter(1), reverse=True)
            result[topic[0][0]].append(k)
    # Map the document indices collected for each topic back to their titles
    # (titles is an external list indexed by document position).
    return [[titles[x] for x in result[k]] for k in range(len(result))]
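# A hedged sketch of the dominant-topic lookup that cluster() performs, written for
# a single document. It assumes a trained gensim LDA model, its dictionary, and an
# already tokenized document; the argument names here are illustrative.
from operator import itemgetter

def dominant_topic(lda_model, word_dict, tokens, min_prob=0.20):
    bow = word_dict.doc2bow(tokens)
    topics = lda_model.get_document_topics(bow, minimum_probability=min_prob)
    if not topics:
        return None  # no topic cleared the probability threshold
    # Pick the highest-probability topic, mirroring topic.sort(...) above.
    return max(topics, key=itemgetter(1))[0]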
stoplist = set(
    'also use make people know many call include part find become like mean often different '
    'usually take wikt come give well get since type list say change see refer actually iii '
    'aisne kinds pas ask would way something need things want every str'.split())
stop_ids = [
    dictionary.token2id[stopword] for stopword in stoplist
    if stopword in dictionary.token2id
]
dictionary.filter_tokens(stop_ids)

# filter_n_most_frequent() prunes the 50 most frequent tokens in place and
# returns None, so there is nothing to unpack from it.
dictionary.filter_n_most_frequent(50)
print dictionary

# Converting the list of documents (corpus) into a document-term matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the object for the LDA model using gensim and training it on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=50, id2word=dictionary, passes=50, iterations=500)

ldafile = open('lda_model_sym_wiki.pkl', 'wb')
cPickle.dump(ldamodel, ldafile)
ldafile.close()

# Print all the 50 topics
for topic in ldamodel.print_topics(num_topics=50, num_words=10):
    print topic[0] + 1, " ", topic[1], "\n"
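# As an alternative to pickling by hand, gensim models can also be persisted with
# their native save()/load() methods, which restore the model together with its
# id2word mapping. A hedged sketch, assuming Lda above is gensim's LdaModel; the
# file name and the example tokens are illustrative only.
from gensim.models.ldamodel import LdaModel

ldamodel.save('lda_model_sym_wiki.gensim')
restored = LdaModel.load('lda_model_sym_wiki.gensim')

# Scoring an unseen, already tokenized document with the restored model:
unseen_tokens = ['list', 'type', 'change']  # illustrative tokens
unseen_bow = dictionary.doc2bow(unseen_tokens)
unseen_topics = restored.get_document_topics(unseen_bow, minimum_probability=0.2)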
def program_clusters(pgms, n_topics, awds, papers):
    # First we need to filter the data by program code. Some grants have multiple program
    # codes, so we first determine which cells contain the program code, then we replace
    # the existing program code(s) with the provided one. This ensures there is only one
    # code per award.
    papers['year'] = pd.to_datetime(papers['year'])
    papers['citations per year'] = papers['citations'].divide([
        ((datetime.datetime.today() - x).days) / 365.2422
        for x in papers['year']
    ])
    num_pubs = papers.groupby('award number')[['publication']].count().reset_index()
    cits_year_mean = papers.groupby('award number')[['citations per year']].mean().reset_index()

    # Program codes used in the original analysis; uncomment to override the pgms argument.
    # pgms = ['6878', '6880', '6882', '6883', '6884', '6885', '9101', '9102', '6881']
    awds = awds[awds['ProgramElementCode(s)'].str.contains('|'.join(pgms))]
    for x in pgms:
        awds['ProgramElementCode(s)'] = np.where(
            awds['ProgramElementCode(s)'].str.contains(x), x,
            awds['ProgramElementCode(s)'])

    awds['StartDate'] = pd.to_datetime(awds['StartDate'])
    awds['EndDate'] = pd.to_datetime(awds['EndDate'])
    awds['AwardedAmountToDate'] = [
        x.replace('$', '') for x in awds['AwardedAmountToDate']
    ]
    awds['AwardedAmountToDate'] = [
        x.replace(',', '') for x in awds['AwardedAmountToDate']
    ]
    awds['AwardedAmountToDate'] = pd.to_numeric(awds['AwardedAmountToDate'])

    awds = pd.merge(awds, num_pubs, left_on='AwardNumber',
                    right_on='award number', how='left')
    awds = pd.merge(awds, cits_year_mean, left_on='AwardNumber',
                    right_on='award number', how='left')
    awds.drop(columns=['award number_x', 'award number_y'], inplace=True)
    awds[['publication', 'citations per year']] = awds[[
        'publication', 'citations per year'
    ]].replace(np.nan, 0)
    awds['pubs per year'] = np.where(
        awds['EndDate'] > datetime.datetime.today(),
        awds['publication'].divide([
            ((datetime.datetime.today() - x).days) / 365.2422
            for x in awds['StartDate']
        ]),
        awds['publication'].divide(
            (awds['EndDate'] - awds['StartDate']).astype('timedelta64[D]') / 365.2422))

    abstracts = awds[[
        'ProgramElementCode(s)', 'AwardNumber', 'Abstract',
        'citations per year', 'pubs per year', 'AwardedAmountToDate'
    ]].copy()
    # This is a pretty clean data set, but there are some empty entries, so we
    # filter them out here.
    abstracts = abstracts.dropna()

    # The first step in the tokenization process is splitting the abstract text
    # into a list of words.
    abstracts['clean_abstracts'] = [
        doc.lower().split() for doc in abstracts['Abstract']
    ]

    # We want to account for possible bigrams and trigrams, which we search for here.
    bigram = Phrases(list(abstracts['clean_abstracts']), min_count=5, threshold=20)
    trigram = Phrases(bigram[list(abstracts['clean_abstracts'])], threshold=20)
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    # Now we start building our dictionary and creating the cleaned-up corpus.
    # We start by creating a list of stop words, punctuation, and other text to remove.
    # We also instantiate a lemmatizer.
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    boiler_plate = ("This award reflects NSF's statutory mission and has been deemed "
                    "worthy of support through evaluation using the Foundation's "
                    "intellectual merit and broader impacts review criteria")

    # This function applies the bigram and trigram models, lemmatizes the
    # abstracts, and only keeps words that are longer than 2 characters.
    def word_mod(doc):
        doc = re.sub('<.*?>', ' ', doc)
        doc = re.sub(boiler_plate, '', doc)
        punct_free = ''.join(ch for ch in doc if ch not in exclude)
        words = punct_free.lower().split()
        bigs = bigram_mod[words]
        tris = trigram_mod[bigs]
        stop_free = " ".join([i for i in tris if i not in stop])
        lemm = " ".join(lemma.lemmatize(word) for word in stop_free.split())
        word_list = lemm.split()
        # only take words which are longer than 2 characters
        cleaned = [word for word in word_list if len(word) > 2]
        return cleaned

    abstracts['clean_abstracts'] = [
        word_mod(doc) for doc in abstracts['Abstract']
    ]

    # Here we create the dictionary from the corpus of abstracts, where each unique term is assigned an index.
    dictionary = corpora.Dictionary(abstracts['clean_abstracts'])
    # Filter terms which occur in fewer than 4 abstracts or in more than 45% of the abstracts.
    dictionary.filter_extremes(no_below=4, no_above=0.45)
    # This creates a sparse matrix of word frequencies in each abstract.
    abstract_term_matrix = [
        dictionary.doc2bow(doc) for doc in abstracts['clean_abstracts']
    ]

    # Here we create and train the LDA model, passing in our term frequency matrix,
    # the number of topics/clusters to be created, and our dictionary.
    ldamodel = Lda(abstract_term_matrix,
                   num_topics=n_topics,
                   id2word=dictionary,
                   passes=50,
                   iterations=500)

    # Here we print out the top 10 words for each topic and their weight.
    for i, topic in enumerate(
            ldamodel.print_topics(num_topics=n_topics, num_words=10)):
        words = topic[1].split("+")
        print(words, "\n")

    # Next we want to know which topic each abstract belongs to. We pass each abstract
    # into the get_document_topics method, which returns the topics and the probability
    # of the abstract belonging to each of them. We take the one with the highest probability.
    def pred_topic(doc):
        doc_bow = ldamodel.id2word.doc2bow(doc)
        doc_topics = ldamodel.get_document_topics(doc_bow, minimum_probability=0.20)
        if doc_topics:
            doc_topics.sort(key=operator.itemgetter(1), reverse=True)
            theme = doc_topics[0][0]
        else:
            theme = np.nan
        return theme

    abstracts['predicted topic'] = [
        pred_topic(doc) for doc in abstracts['clean_abstracts']
    ]

    # Here we plot a histogram of how many abstracts/awards fall into each topic.
    ab_hist = abstracts.groupby(['predicted topic'])['AwardNumber'].count()
    cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
    cols = cols + cols + cols + cols
    f1, ax = plt.subplots()
    ab_hist.plot.bar(rot=0, color=cols)
    ax.set_xticklabels([x for x in ab_hist.index])
    ax.set_xlabel('Topic Number')
    ax.set_ylabel('Count of Awards in Topic')
    ax.set_title('Distribution of Awards in Derived Topic Areas')
    plt.show()

    # Here we create a word cloud for each topic from its top words. Their size
    # is indicative of their weight.
    cloud = WordCloud(stopwords=stopwords.words('english'),
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      colormap='tab10',
                      color_func=lambda *args, **kwargs: cols[i],
                      prefer_horizontal=1.0)
    topics = ldamodel.show_topics(formatted=False, num_topics=n_topics)
    fig, axes = plt.subplots(1, n_topics, figsize=(10, 10), sharex=True, sharey=True)
    for i, ax in enumerate(axes.flatten()):
        fig.add_subplot(ax)
        topic_words = dict(topics[i][1])
        cloud.generate_from_frequencies(topic_words, max_font_size=300)
        plt.gca().imshow(cloud)
        plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
        plt.gca().axis('off')
    plt.subplots_adjust(wspace=0, hspace=0)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.tight_layout()
    plt.show()

    # Next we'll do a t-SNE plot clustering the abstracts based on the topic
    # probabilities returned from the model. This creates an array where each
    # column is a topic, each row is an abstract, and each entry is the probability
    # that the abstract belongs to that topic.
    col_ns = range(0, n_topics)
    topic_weights = pd.DataFrame(columns=col_ns)
    for i in range(0, len(ldamodel[abstract_term_matrix])):
        weights = ldamodel[abstract_term_matrix][i]
        for j in range(0, len(weights)):
            entry = pd.DataFrame(columns=col_ns)
            idx = weights[j][0]
            entry.loc[0, idx] = weights[j][1]
            topic_weights = topic_weights.append(entry)
    topic_weights.reset_index(drop=True, inplace=True)

    # Replace any NaN entries (zero probability that the abstract belonged to
    # that topic) with zero.
    arr = pd.DataFrame(topic_weights).fillna(0).values

    # We can limit this to only well-separated abstracts as well:
    # arr = arr[np.amax(arr, axis=1) > 0.15]

    # This pulls out the highest-probability topic for each abstract. We'll
    # use this for the color scheme in the t-SNE plot.
    topic_num = np.argmax(arr, axis=1)

    # Here we initialize and fit our t-SNE model.
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, perplexity=50, init='pca')
    tsne_lda = tsne_model.fit_transform(arr)

    # Here we plot out the results of the t-SNE transformation.
    mycolors = np.array(cols)
    title = "t-SNE Clustering of {} LDA Topics".format(n_topics)
    f = plt.figure()
    plt.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1], color=mycolors[topic_num])
    plt.title(title)
    plt.show()

    fig = plt.figure(figsize=(12, 6))
    ax1 = fig.add_subplot(1, 3, 1)
    ax1.scatter(x=abstracts['AwardedAmountToDate'],
                y=abstracts['citations per year'],
                color=mycolors[abstracts['predicted topic']])
    ax1.set_ylabel('Average Citations per Year')
    ax1.set_xlabel('Award Size [$]')
    ax1.set_title('Average Citations per Year', fontsize=11)
    ax2 = fig.add_subplot(1, 3, 2)
    ax2.scatter(x=abstracts['AwardedAmountToDate'],
                y=abstracts['pubs per year'],
                color=mycolors[abstracts['predicted topic']])
    ax2.set_ylabel('Number of Publications per Year')
    ax2.set_xlabel('Award Size [$]')
    ax2.set_title('Number of Publications per Year', fontsize=11)
    ax3 = fig.add_subplot(1, 3, 3)
    ax3.scatter(x=abstracts['pubs per year'],
                y=abstracts['citations per year'],
                color=mycolors[abstracts['predicted topic']])
    ax3.set_xlabel('Number of Publications per Year')
    ax3.set_ylabel('Average Citations per Year')
    ax3.set_title('Number of Publications vs \nAverage Citation Count', fontsize=11)

    from matplotlib.legend_handler import HandlerPatch

    class HandlerEllipse(HandlerPatch):
        def create_artists(self, legend, orig_handle, xdescent, ydescent,
                           width, height, fontsize, trans):
            center = 0.5 * width - 0.5 * xdescent, 0.5 * height - 0.5 * ydescent
            p = mpatches.Ellipse(xy=center,
                                 width=height + xdescent,
                                 height=height + ydescent)
            self.update_prop(p, orig_handle, legend)
            p.set_transform(trans)
            return [p]

    handles = [
        mpatches.Circle((0.5, 0.5), radius=0.25,
                        facecolor=mycolors[i], edgecolor="none")
        for i in range(0, n_topics)
    ]
    handles = [
        mpatches.Circle((0.5, 0.5), radius=0.25, facecolor='w', edgecolor="none")
    ] + handles
    legend_labels = list(range(0, n_topics))
    legend_labels = ['Topic'] + legend_labels
    ax3.legend(handles,
               legend_labels,
               bbox_to_anchor=(1, .88),
               bbox_transform=fig.transFigure,
               handler_map={mpatches.Circle: HandlerEllipse()})
    plt.tight_layout()
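# A hedged usage sketch for program_clusters() above. It assumes two CSV exports with
# the columns referenced in the function ('ProgramElementCode(s)', 'AwardNumber',
# 'Abstract', 'StartDate', 'EndDate', 'AwardedAmountToDate' for awards; 'award number',
# 'publication', 'citations', 'year' for papers); the file names are illustrative.
import pandas as pd

awds_df = pd.read_csv('nsf_awards.csv')
papers_df = pd.read_csv('award_publications.csv')

program_clusters(pgms=['6878', '6880', '6881'],
                 n_topics=8,
                 awds=awds_df,
                 papers=papers_df)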
def program_clusters(pgms, n_topics, data):
    # First we need to filter the data by program code. Some grants have multiple program
    # codes, so we first determine which cells contain the program code, then we replace
    # the existing program code(s) with the provided one. This ensures there is only one
    # code per award.
    awds = data
    awds = awds[awds['ProgramElementCode(s)'].str.contains('|'.join(pgms))]
    for x in pgms:
        awds['ProgramElementCode(s)'] = np.where(
            awds['ProgramElementCode(s)'].str.contains(x), x,
            awds['ProgramElementCode(s)'])

    abstracts = awds[['ProgramElementCode(s)', 'AwardNumber', 'Abstract']].copy()
    # This is a pretty clean data set, but there are some empty entries, so we
    # filter them out here.
    abstracts = abstracts.dropna()

    # Here we start building our dictionary and creating the cleaned-up corpus.
    # We start by removing stop words and punctuation and stemming or lemmatizing
    # the abstract text.
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    stemmer2 = SnowballStemmer("english", ignore_stopwords=True)

    # Pass the article text in as the string "doc".
    # Here we use a small nested function to pass through each abstract individually.
    def clean(doc):
        # Here we clean up errant breaks like <br/>.
        doc = re.sub('<.*?>', ' ', doc)
        # This creates a long string of words while excluding stop words.
        stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
        # This goes through each character and removes punctuation.
        punct_free = ''.join(ch for ch in stop_free if ch not in exclude)
        words = punct_free.split()
        return words

    # Here is where we pass each abstract through the cleaning function.
    abstracts['clean_abstracts'] = [clean(doc) for doc in abstracts['Abstract']]

    # So we can use bigrams and trigrams, we create new models, running through our
    # cleaned abstracts.
    bigram = Phrases(list(abstracts['clean_abstracts']), min_count=5,
                     threshold=100)  # higher threshold, fewer phrases
    trigram = Phrases(bigram[list(abstracts['clean_abstracts'])], threshold=100)
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    # This function applies the bigram and trigram models, lemmatizes the
    # abstracts, and only keeps words that are longer than 2 characters.
    def word_mod(doc):
        bigs = bigram_mod[doc]
        tris = trigram_mod[bigs]
        lemm = " ".join(lemma.lemmatize(word) for word in tris)
        # stemm = " ".join(stemmer2.stem(word) for word in punct_free.split())
        words = lemm.split()
        # only take words which are greater than 2 characters
        cleaned = [word for word in words if len(word) > 2]
        return cleaned

    abstracts['clean_abstracts'] = [
        word_mod(doc) for doc in abstracts['clean_abstracts']
    ]

    # Here we create the dictionary from the corpus of abstracts, where each unique term is assigned an index.
    dictionary = corpora.Dictionary(abstracts['clean_abstracts'])
    # Filter terms which occur in fewer than 4 abstracts or in more than 40% of the abstracts.
    dictionary.filter_extremes(no_below=4, no_above=0.4)
    # This creates a sparse matrix of word frequencies in each abstract.
    abstract_term_matrix = [
        dictionary.doc2bow(doc) for doc in abstracts['clean_abstracts']
    ]

    # Here we create and train the LDA model, passing in our term frequency matrix,
    # the number of topics/clusters to be created, and our dictionary.
    ldamodel = Lda(abstract_term_matrix,
                   num_topics=n_topics,
                   id2word=dictionary,
                   passes=15,
                   iterations=500)

    # Here we print out the top 10 words for each topic and their weight.
    for i, topic in enumerate(ldamodel.print_topics(num_topics=10, num_words=10)):
        words = topic[1].split("+")
        print(words, "\n")

    # Next we want to know which topic each abstract belongs to. We pass each abstract
    # into the get_document_topics method, which returns the topics and the probability
    # of the abstract belonging to each of them. We take the one with the highest probability.
    def pred_topic(doc):
        doc_bow = ldamodel.id2word.doc2bow(doc)
        doc_topics = ldamodel.get_document_topics(doc_bow, minimum_probability=0.20)
        if doc_topics:
            doc_topics.sort(key=operator.itemgetter(1), reverse=True)
            theme = doc_topics[0][0]
        else:
            theme = np.nan
        return theme

    abstracts['predicted topic'] = [
        pred_topic(doc) for doc in abstracts['clean_abstracts']
    ]

    # Here we plot a histogram of how many abstracts/awards fall into each topic.
    ab_hist = abstracts.groupby(
        ['predicted topic', 'ProgramElementCode(s)'])['AwardNumber'].count()
    cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
    f1, ax = plt.subplots()
    ab_hist.plot.bar(rot=0, color=cols)
    ax.set_xticklabels([x[0] for x in ab_hist.index])
    ax.set_xlabel('Topic Number')
    ax.set_ylabel('Count of Awards in Topic')
    ax.set_title('Distribution of Awards in Derived Topic Areas')
    plt.show()

    # Here we create a word cloud for each topic from its top words. Their size
    # is indicative of their weight.
    cloud = WordCloud(stopwords=stopwords.words('english'),
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      colormap='tab10',
                      color_func=lambda *args, **kwargs: cols[i],
                      prefer_horizontal=1.0)
    topics = ldamodel.show_topics(formatted=False)
    fig, axes = plt.subplots(1, n_topics, figsize=(10, 10), sharex=True, sharey=True)
    for i, ax in enumerate(axes.flatten()):
        fig.add_subplot(ax)
        topic_words = dict(topics[i][1])
        cloud.generate_from_frequencies(topic_words, max_font_size=300)
        plt.gca().imshow(cloud)
        plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
        plt.gca().axis('off')
    plt.subplots_adjust(wspace=0, hspace=0)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.tight_layout()
    plt.show()

    # Next we'll do a t-SNE plot clustering the abstracts based on the topic
    # probabilities returned from the model. This creates an array where each
    # column is a topic, each row is an abstract, and each entry is the probability
    # that the abstract belongs to that topic.
    col_ns = range(0, n_topics)
    topic_weights = pd.DataFrame(columns=col_ns)
    for i in range(0, len(ldamodel[abstract_term_matrix])):
        weights = ldamodel[abstract_term_matrix][i]
        for j in range(0, len(weights)):
            entry = pd.DataFrame(columns=col_ns)
            idx = weights[j][0]
            entry.loc[0, idx] = weights[j][1]
            topic_weights = topic_weights.append(entry)
    topic_weights.reset_index(drop=True, inplace=True)

    # Replace any NaN entries (zero probability that the abstract belonged to
    # that topic) with zero.
    arr = pd.DataFrame(topic_weights).fillna(0).values

    # We can limit this to only well-separated abstracts as well:
    # arr = arr[np.amax(arr, axis=1) > 0.15]

    # This pulls out the highest-probability topic for each abstract. We'll
    # use this for the color scheme in the t-SNE plot.
    topic_num = np.argmax(arr, axis=1)

    # Here we initialize and fit our t-SNE model.
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
    tsne_lda = tsne_model.fit_transform(arr)

    # Here we plot out the results of the t-SNE transformation.
    mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
    title = "t-SNE Clustering of {} LDA Topics".format(n_topics)
    f = plt.figure()
    plt.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1], color=mycolors[topic_num])
    plt.title(title)
    plt.show()
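# A hedged alternative for building the document-by-topic probability array used as
# t-SNE input: asking get_document_topics for every topic (minimum_probability=0)
# fills a preallocated NumPy array directly and avoids the row-by-row DataFrame
# append loop above. It assumes a trained gensim model and bag-of-words corpus like
# ldamodel and abstract_term_matrix; the function name is illustrative.
import numpy as np

def topic_weight_matrix(lda_model, bow_corpus, n_topics):
    weights = np.zeros((len(bow_corpus), n_topics))
    for i, bow in enumerate(bow_corpus):
        for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0):
            weights[i, topic_id] = prob
    return weights

# arr = topic_weight_matrix(ldamodel, abstract_term_matrix, n_topics)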
# After printing the most frequent words of the dictionary, I found that a few words
# which are mostly content-neutral are also present in the dictionary. These words may
# lead to modeling of a "word distribution" (topic) which is neutral and does not
# capture any theme or content. I made a list of such words and filtered them all out.
stoplist = set('awesome cant though theyre yeah around try enough keep way start work busines isnt theyre didnt doesnt i\'ve you\'re that\'s what\'s let\'s i\'d you\'ll aren\'t \"the i\'ll we\'re wont 009 don\'t it\'s nbsp i\'m get make like would want dont\' use one need know good take thank say also see really could much something ive well give first even great things come thats sure help youre lot someone ask best many question etc better still put might actually let love may tell every maybe always never probably anything cant\' doesnt\' ill already able anyone since another theres everything without didn\'t isn\'t youll\' per else ive get would like want hey might may without also make want put etc actually else far definitely youll\' didnt\' isnt\' theres since able maybe without may suggestedsort never isredditmediadomain userreports far appreciate next think know need look please one null take dont dont\' want\' could able ask well best someone sure lot thank also anyone really something give years use make all ago people know many call include part find become'.split())
stop_ids = [
    dictionary.token2id[stopword] for stopword in stoplist
    if stopword in dictionary.token2id
]
dictionary.filter_tokens(stop_ids)

# Converting the list of documents (corpus) into a document-term matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in cleanPost]

print('Training...')
# Creating the object for the LDA model using gensim and training it on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=int(n_topics), id2word=dictionary,
               passes=20, iterations=1000)
print('Training finished.')


# Label a topic to each THREAD.
def label_comment(ldamodel, doc_term_matrix, dictionary):
    # Assigns the topics to the documents in the corpus.
    lda_corpus = ldamodel[doc_term_matrix]
    doc_topics = ldamodel.get_document_topics(doc_term_matrix)
    se = pd.Series(doc_topics)
    rawPostall['Topic/Probability'] = se.values
    main_topics = []
    main_probability = []
    for k, topics in enumerate(doc_topics):
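# The snippet above is cut off inside its loop, so here is a hedged, standalone sketch
# (not the original continuation) of one way to reduce each thread's topic list to a
# dominant topic and its probability; doc_topics and rawPostall are assumed to be as
# defined above, and the column names are illustrative.
import operator
import numpy as np

def dominant_topics(doc_topics):
    main_topics, main_probability = [], []
    for topics in doc_topics:
        if topics:
            top_id, top_prob = max(topics, key=operator.itemgetter(1))
            main_topics.append(top_id)
            main_probability.append(top_prob)
        else:
            main_topics.append(np.nan)
            main_probability.append(np.nan)
    return main_topics, main_probability

# rawPostall['Main Topic'], rawPostall['Main Probability'] = dominant_topics(doc_topics)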
os.remove(filename)
print("File Removed!")
with open(filename, 'wb') as handle:
    pickle.dump(dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs_clean]

if len(doc_term_matrix) > 0:
    # Creating the object for the LDA model using gensim and training it on the document-term matrix.
    # Scale the number of topics with the corpus size, clamped to the range [2, 30].
    num_topics = int(len(doc_term_matrix) / 20)
    if num_topics < 2:
        num_topics = 2
    if num_topics > 30:
        num_topics = 30
    # ldamodel = Lda(doc_term_matrix, num_topics=num_topics, id2word=dictionary)  # , passes=50, iterations=500)
    ldamodel = Lda(doc_term_matrix, num_topics=num_topics, id2word=dictionary, passes=3)

    print("Saving the LDA model")
    filename = model_path + pf + '_' + ccode + '_lda_model.pkl'  # /auto/vgapps-cstg02-vapps/analytics/csap/models/files/sr/
    if os.path.exists(filename):
        os.remove(filename)
        print("File Removed!")
    ldafile = open(filename, 'wb')
    cPickle.dump(ldamodel, ldafile)
    print("File Created!")
    ldafile.close()

    topics_words = ldamodel.print_topics(num_topics=num_topics, num_words=10)
    c = 0
    doc_topics = []
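# A hedged sketch of reloading the artifacts saved above for later scoring. The model
# path mirrors the snippet, dict_filename stands in for the elided dictionary path,
# and pickle/cPickle match how each object was written; new_doc_tokens is illustrative.
with open(dict_filename, 'rb') as handle:  # dict_filename: hypothetical path to the saved dictionary
    dictionary = pickle.load(handle)
with open(model_path + pf + '_' + ccode + '_lda_model.pkl', 'rb') as handle:
    ldamodel = cPickle.load(handle)

new_doc_tokens = ['example', 'tokens']  # illustrative, already cleaned tokens
new_doc_topics = ldamodel.get_document_topics(dictionary.doc2bow(new_doc_tokens),
                                              minimum_probability=0.2)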
stop_ids = [
    dictionary.token2id[stopword] for stopword in stoplist
    if stopword in dictionary.token2id
]
dictionary.filter_tokens(stop_ids)

# In[47]:

# Converting the list of documents (corpus) into a document-term matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in cleanPost]

# In[48]:

print('before training')
# Creating the object for the LDA model using gensim and training it on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=50, id2word=dictionary,
               passes=20, iterations=1000, eval_every=1)
# ldafile = open('lda_model_sym_wiki.pkl', 'wb')
# cPickle.dump(ldamodel, ldafile)
# ldafile.close()
print('after')

# In[49]:

# Print all the 50 topics
# for topic in ldamodel.print_topics(num_topics=50, num_words=10):
#     print(topic[0] + 1, " ", topic[1], "\n")

# In[50]:
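# A hedged sketch of checking model fit after training (the eval_every=1 argument above
# already logs this periodically during training): log_perplexity() returns the per-word
# likelihood bound on the given chunk, from which perplexity is 2**(-bound). It is
# evaluated on the training matrix here only because no held-out split appears in the snippet.
bound = ldamodel.log_perplexity(doc_term_matrix)
perplexity = 2 ** (-bound)
print('per-word bound:', bound, 'perplexity:', perplexity)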