def compute_values(self, kmin, kmax, kstep):
    """Vectorize self.docs, train a 20-topic online BTM, and print a topic summary.

    NOTE(review): kmin, kmax and kstep are accepted but never used — the
    topic count is hard-coded to 20. Confirm whether a k-sweep was intended.
    """
    # Build the document-term matrix from the raw documents.
    vectorizer = CountVectorizer()
    term_matrix = vectorizer.fit_transform(self.docs)

    # Vocabulary and the biterm (word-pair) representation required by BTM.
    vocabulary = np.array(vectorizer.get_feature_names())
    doc_biterms = vec_to_biterms(term_matrix)

    # Train the online Biterm Topic Model, then report the top-10 words per topic.
    btm = oBTM(num_topics=20, V=vocabulary)
    topics = btm.fit_transform(doc_biterms, iterations=100)
    topic_summuary(btm.phi_wz.T, term_matrix, vocabulary, 10)
def lightningBTM(num_top, vocabulary, b_terms, x1):
    """Train an online BTM quickly and print timing plus topic coherence.

    "Lightning" variant: feeds the model 100 documents' biterms per chunk
    and runs only 10 Gibbs iterations per chunk (instead of the usual 50).
    """
    btm = oBTM(num_topics=num_top, V=vocabulary)  # create the btm object
    start_time = time.time()
    # Online training: process the biterm lists in chunks of 100 documents.
    for chunk_start in range(0, len(b_terms), 100):
        chunk = b_terms[chunk_start:chunk_start + 100]
        btm.fit(chunk, iterations=10)
    topics = btm.transform(b_terms)
    run_time = time.time() - start_time
    print("For k = %s topics.." % num_top)
    print("BTM online took %s seconds to train" % run_time)
    # Examine topic coherence scores:
    print("\nTopic Coherence:")
    topic_summuary(btm.phi_wz.T, x1, vocabulary, 10)
def biterm_topic_model_topic_extraction():
    """
    Function performs topic extraction on Tweets using the Biterm Topic Model
    (online BTM) — the previous docstring incorrectly said "Gensim HDP model".

    Vectorizes the SLO feature corpus with raw term counts, trains an online
    BTM in chunks of 100 documents, prints the topic-coherence summary, and
    shows the dominant topic for a few example texts.

    :return: None.
    """
    # BTM, like LDA, is a probabilistic graphical model and requires raw term counts.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)
    tf_feature_names = tf_vectorizer.get_feature_names()

    log.info(f"\n.fit_transform - Learn the vocabulary dictionary and return term-document matrix.")
    log.info(f"{tf}\n")
    log.info(f"\n.get_feature_names - Array mapping from feature integer indices to feature name")
    log.info(f"{tf_feature_names}\n")

    # Convert corpus of documents (vectorized text) to numpy array.
    tf_array = tf.toarray()
    # Convert dictionary of words (vocabulary) to numpy array.
    tf_feature_names = np.array(tf_vectorizer.get_feature_names())

    # Derive the biterm (word-pair) representation of each document.
    biterms = vec_to_biterms(tf_array)

    # Train the online BTM, feeding it 100 documents' biterms per chunk.
    btm = oBTM(num_topics=20, V=tf_feature_names)
    print("\n\n Train Online BTM ..")
    for i in range(0, len(biterms), 100):  # process a chunk of 100 texts
        biterms_chunk = biterms[i:i + 100]
        btm.fit(biterms_chunk, iterations=50)
    topics = btm.transform(biterms)
    time.sleep(3)

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, tf_array, tf_feature_names, 10)

    print("\n\n Texts & Topics ..")
    for i in range(1, 10):
        print("{} (topic: {})".format(slo_feature_series[i], topics[i].argmax()))
if __name__ == "__main__":
    # Demo: train a 20-topic BTM on the first 50 Reuters headlines.
    # Use a context manager so the file handle is closed deterministically
    # (the original leaked it via a bare open().read()).
    with open('./data/reuters.titles') as title_file:
        texts = title_file.read().splitlines()[:50]

    # Vectorize the texts and recover the vocabulary.
    vec = CountVectorizer(stop_words='english')
    X = vec.fit_transform(texts).toarray()
    vocab = np.array(vec.get_feature_names())

    # Biterm (word-pair) representation of every document.
    biterms = vec_to_biterms(X)

    # Train the BTM on all biterms at once.
    btm = oBTM(num_topics=20, V=vocab)
    print("\n\n Train BTM ..")
    topics = btm.fit_transform(biterms, iterations=100)

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    # Show each headline with its dominant topic index.
    print("\n\n Texts & Topics ..")
    for i, text in enumerate(texts):
        print("{} (topic: {})".format(text, topics[i].argmax()))
def compute_topics(self):
    """Fit an online BTM on the prepared biterms and return the topic matrix.

    The fitted model and the per-document topic distributions are also kept
    on the instance (self.btm, self.topics) for later reuse.
    """
    self.btm = oBTM(num_topics=self.num_topics, V=self.vocab)
    self.topics = self.btm.fit_transform(self.biterms, iterations=10)
    # Print the top-5 words per topic as a quick coherence check.
    topic_summuary(self.btm.phi_wz.T, self.X, self.vocab, 5)
    return self.topics
def generate_topics(self, data):
    """Print a coherence summary (top-10 words per topic) for the fitted model."""
    topic_summuary(self.model.phi_wz.T, data, self.dictionary, 10)
def perform_BTM(fpath, num_top):
    """Run BTM topic modeling (num_top topics) over one company's tweets.

    Reads the tweet spreadsheet at fpath, cleans and lemmatizes the tweet
    text, drops non-English and too-short tweets, trains a Biterm Topic
    Model, prints timing and coherence scores, and writes the per-tweet
    topic assignments back out to an Excel file.
    """
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM modeling for %s" % company_name)
    print("This is using %s topics" % num_top)

    # Drop retweets: they aren't technically the company account's own tweets.
    retweet_mask = company_data["Content"].str.contains("^RT @")
    company_tweets = company_data[~retweet_mask].copy()

    # Normalize 'smart' apostrophes and quotation marks to keyboard equivalents.
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"’", "'")
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"‘", "'")
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"“", "\"")
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"”", "\"")

    # Strip possessives ("Disney's" -> "Disney"), then remaining apostrophes
    # ("I'm" -> "Im").
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'s", "")
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")

    # Standardize (links, mentions, casing, whitespace) then lemmatize the text.
    textual_tweets = standardize_text(company_tweets, "Content")
    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")

    # Keep only tweets originally in English that still have text left.
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]
    nonempty_mask = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[nonempty_mask]

    # Build the stopword set; add apostrophe-free variants because the tweet
    # text had its apostrophes removed above, plus a few manual entries.
    stop_words = set(stopwords.words("english"))
    stop_words_frame = pd.DataFrame(stop_words)
    stop_words_frame["Words"] = stop_words
    apostrophe_free = stop_words_frame["Words"].str.replace(r"'", "")
    for word in apostrophe_free:
        if word not in stop_words:
            stop_words.add(word)
    stop_words.add("wed")
    stop_words.add("us")
    # Lemmatization converts "us" to "u", so "u" must be a stopword too.
    stop_words.add("u")

    # The vectorizer filters the stopwords during fit_transform below.
    vec = CountVectorizer(stop_words=stop_words)

    # Tokenize and strip stopwords so too-short tweets can be filtered out.
    tokenizer = RegexpTokenizer(r'\w+')
    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets, "tokens", stop_words)
    print("Before filtering out tweets with 3 words or less, cleanGlish has %s tweets"
          % len(cleanGlish_tweets["Content"]))
    # Drop tweets with fewer than 3 non-stopword tokens.
    cleanGlish_tweets["num_words"] = [len(toks) for toks in cleanGlish_tweets["clean_tokens"]]
    cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()
    print("After filtering, cleanGlish2 has %s tweets" % len(cleanGlish_tweets2["Content"]))

    # Vectorize the surviving tweets; derive the vocabulary and biterms.
    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)

    # Train the BTM and time the run.
    start_time = time.time()
    btm = oBTM(num_topics=num_top, V=vocab)
    topics = btm.fit_transform(biterms, iterations=100)
    run_time = time.time() - start_time
    print("For %s..." % company_name)
    print("BTM took %s seconds to train" % run_time)

    # Attach each tweet's dominant topic (stored as a string) to the frame.
    cleanGlish_tweets2["topic"] = [
        str(topics[i].argmax()) for i in range(len(cleanGlish_tweets2["Content"]))
    ]

    # Examine topic coherence scores:
    print("\nTopic Coherence:")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    # Save the tweet topics:
    respath2 = respath + str(company_name) + resEnding
    cleanGlish_tweets2.to_excel(respath2)
def perform_BTM(fpath, num_top):
    """Run BTM topic modeling (num_top topics) over one company's reply tweets.

    Same pipeline as the other perform_BTM variant, except tweets are first
    tagged as original tweets (OT) vs. in-reply-to tweets (IRT) and only the
    IRT subset is modeled. The raw text is preserved in a "Content2" column
    before preprocessing, and results are written out to an Excel file.
    """
    company_data = pd.read_excel(fpath)
    company_name = company_data.iloc[0, company_data.columns.get_loc("Author Name")]
    print("\n\n\n\n\nBeginning BTM modeling for %s" % company_name)
    print("This is using %s topics" % num_top)

    # Drop retweets: they aren't technically the company account's own tweets.
    retweet_mask = company_data["Content"].str.contains("^RT @")
    company_tweets2 = company_data[~retweet_mask].copy()

    # Tag tweets as IRT (reply) vs. OT (original) BEFORE any text is altered.
    initIRT = [bool(re.search("^@", text)) for text in company_tweets2["Content"]]
    initOT = [not flag for flag in initIRT]
    company_tweets2["IRT"] = initIRT
    company_tweets2["OT"] = initOT
    # A missing 'In Reply To' value means the tweet is an original tweet.
    company_tweets2["In Reply To"] = company_tweets2["In Reply To"].replace(np.nan, "OT", regex=True)

    # Refine the initial OT vs. IRT split, then keep only the IRT tweets.
    company_tweets3 = cleanSplit(company_tweets2, "Content", "IRT", "In Reply To", "Author", "OT")
    company_tweets = company_tweets3[company_tweets3["IRT"] == True].copy()

    # Preserve the raw tweet text before preprocessing mutates "Content".
    company_tweets["Content2"] = company_tweets["Content"]

    # Normalize 'smart' apostrophes and quotation marks to keyboard equivalents.
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"’", "'")
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"‘", "'")
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"“", "\"")
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"”", "\"")

    # Strip possessives ("Disney's" -> "Disney"), then remaining apostrophes
    # ("I'm" -> "Im").
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'s", "")
    company_tweets["Content"] = company_tweets["Content"].str.replace(r"'", "")

    # Standardize (links, mentions, casing, whitespace) then lemmatize the text.
    textual_tweets = standardize_text(company_tweets, "Content")
    textual_tweets["Content"] = lem_with_postag(textual_tweets, "Content")

    # Keep only tweets originally in English that still have text left.
    English_tweets = textual_tweets[textual_tweets["Language"] == "en"]
    nonempty_mask = English_tweets["Content"] != ""
    cleanGlish_tweets = English_tweets[nonempty_mask]

    # Build the stopword set; add apostrophe-free variants because the tweet
    # text had its apostrophes removed above, plus a few manual entries.
    stop_words = set(stopwords.words("english"))
    stop_words_frame = pd.DataFrame(stop_words)
    stop_words_frame["Words"] = stop_words
    apostrophe_free = stop_words_frame["Words"].str.replace(r"'", "")
    for word in apostrophe_free:
        if word not in stop_words:
            stop_words.add(word)
    stop_words.add("wed")
    stop_words.add("us")
    # Lemmatization converts "us" to "u", so "u" must be a stopword too.
    stop_words.add("u")

    # The vectorizer filters the stopwords during fit_transform below.
    vec = CountVectorizer(stop_words=stop_words)

    # Tokenize and strip stopwords so too-short tweets can be filtered out.
    tokenizer = RegexpTokenizer(r'\w+')
    cleanGlish_tweets["tokens"] = cleanGlish_tweets["Content"].apply(tokenizer.tokenize)
    cleanGlish_tweets["clean_tokens"] = clean_tokenize(cleanGlish_tweets, "tokens", stop_words)
    print("Before filtering out tweets with 3 words or less, cleanGlish has %s tweets"
          % len(cleanGlish_tweets["Content"]))
    # Drop tweets with fewer than 3 non-stopword tokens.
    cleanGlish_tweets["num_words"] = [len(toks) for toks in cleanGlish_tweets["clean_tokens"]]
    cleanGlish_tweets2 = cleanGlish_tweets[cleanGlish_tweets["num_words"] >= 3].copy()
    print("After filtering, cleanGlish2 has %s tweets" % len(cleanGlish_tweets2["Content"]))

    # Vectorize the surviving tweets; derive the vocabulary and biterms.
    X = vec.fit_transform(cleanGlish_tweets2["Content"]).toarray()
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)

    # Train the BTM and time the run.
    start_time = time.time()
    btm = oBTM(num_topics=num_top, V=vocab)
    topics = btm.fit_transform(biterms, iterations=100)
    run_time = time.time() - start_time
    print("For %s..." % company_name)
    print("BTM took %s seconds to train" % run_time)

    # Attach each tweet's dominant topic index to the frame.
    cleanGlish_tweets2["topic"] = [
        topics[i].argmax() for i in range(len(cleanGlish_tweets2["Content"]))
    ]

    # Examine topic coherence scores:
    print("\nTopic Coherence:")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    # Save the tweet topics:
    respath2 = respath + str(company_name) + resEnding
    cleanGlish_tweets2.to_excel(respath2)