from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore


def train_classifier(papers: list, num_topics: int):
    """Train LDA models on the selected documents.

    The documents are cleaned, the words are indexed, and one model is
    trained per candidate topic count; the caller then picks one
    interactively.

    Args:
        papers: list of papers, each item containing the corpus of a document
        num_topics: number of topics to train

    Returns:
        Tuple of (trained LDA model, dictionary).
    """
    # `clean` is this project's own tokenizer/normalizer, defined elsewhere
    papers_clean = [clean(paper) for paper in papers]
    dictionary = corpora.Dictionary(papers_clean)
    doc_term_matrix = [dictionary.doc2bow(paper) for paper in papers_clean]

    models = []
    print("Start generating models")
    # NOTE: the candidate topic counts are hardcoded here; the num_topics
    # argument is currently unused
    for n in range(13, 14):
        ldamodel = LdaMulticore(doc_term_matrix, num_topics=n,
                                id2word=dictionary, passes=50)
        topic_words = [w[0]
                       for t in range(ldamodel.num_topics)
                       for w in ldamodel.show_topic(t)]
        unique_words = set(topic_words)
        models.append(ldamodel)
        # report topic diversity: unique top words / total top words
        print(n, len(unique_words), len(unique_words) / float(len(topic_words)))

    choice = 1
    while True:
        try:
            choice = int(input("Enter the model you want to train labels for:\n"))
        except ValueError:
            print("Not an integer")
            continue
        if choice > len(models) or choice < 1:
            print("Model does not exist")
        else:
            break
    return models[choice - 1], dictionary
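
# Hedged usage sketch for train_classifier. The `clean` helper is assumed to
# exist in the project; the two sample documents are invented for illustration.
if __name__ == "__main__":
    sample_papers = [
        "Topic models discover latent themes in document collections.",
        "Latent Dirichlet Allocation represents documents as topic mixtures.",
    ]
    model, dictionary = train_classifier(sample_papers, num_topics=13)
    for topic_id in range(model.num_topics):
        print(topic_id, model.show_topic(topic_id, topn=5))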
def generate_lda_topics(self):
    from gensim.corpora import Dictionary, MmCorpus
    from gensim.models.ldamulticore import LdaMulticore
    import pyLDAvis
    import pyLDAvis.gensim
    import warnings

    trigram_sentences = LineSentence(self.trigram_sentences_filepath)
    trigram_dictionary = Dictionary(trigram_sentences)
    # trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(self.trigram_dictionary_filepath)

    def trigram_bow_generator(filepath):
        # stream the corpus from disk, one bag-of-words vector per sentence
        for sentence in LineSentence(filepath):
            yield trigram_dictionary.doc2bow(sentence)

    MmCorpus.serialize(
        self.trigram_bow_filepath,
        trigram_bow_generator(self.trigram_sentences_filepath))
    trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=3,
                           id2word=trigram_dictionary,
                           workers=3)
    lda.save(self.lda_model_filepath)
    lda = LdaMulticore.load(self.lda_model_filepath)

    print(lda.show_topic(0))
    print(lda.show_topic(1))
    print(lda.show_topic(2))

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)
    pyLDAvis.save_html(LDAvis_prepared, self.LDAvis_html_filepath)
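
# Hedged companion sketch: reloading the artifacts written by
# generate_lda_topics in a later session, so the model need not be retrained.
# The bare path names stand in for the self.* attributes used above.
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
lda = LdaMulticore.load(lda_model_filepath)
for topic_id in range(lda.num_topics):
    print(topic_id, lda.show_topic(topic_id))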
def perform(self, option="load"):
    """Perform LDA analysis to generate topics and the topic distribution
    for each app. (The `option` argument is currently unused.)"""
    logging.info("Start LDA analysis")
    ldamodel = LdaMulticore(self.corpus, num_topics=self.ntopic,
                            id2word=self.dictionary, passes=self.iteration)
    logging.info("LDA multicore modeling done")
    ldamodel.save(self.lda_out_file_name)
    self.topics = {}
    for i in range(self.ntopic):
        self.topics["topic{}".format(i)] = ldamodel.show_topic(i, topn=self.nword)
        logging.info("Topic{}".format(i))
        # show_topic returns (word, probability) pairs, so take element 0
        words = [w[0] for w in self.topics["topic{}".format(i)]]
        logging.info(words)
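
# Hedged follow-up sketch: once perform() has run, the saved model can be
# reloaded and queried for each app's topic distribution. The bare names
# (lda_out_file_name, corpus) stand in for the self.* attributes above.
from gensim.models.ldamulticore import LdaMulticore

ldamodel = LdaMulticore.load(lda_out_file_name)
for app_id, bow in enumerate(corpus):
    # get_document_topics returns (topic_id, probability) pairs per document
    print(app_id, ldamodel.get_document_topics(bow, minimum_probability=0.01))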
from collections import Counter

from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

# nlp, make_df, filtration, dataframe_to_dict, bag_of_words_generator, and
# pickle_object are project helpers defined elsewhere


def LDA_Machine(lst_dict, handle_lst):
    assert isinstance(lst_dict, list), "Please enter a list of dictionaries"
    assert isinstance(handle_lst, list), "Please enter a list of handles"

    file_path_corpus = "/home/igabr/new-project-4/mm_corpus/"

    cnt_1 = -1
    cnt_2 = -1

    for handle in handle_lst:
        cnt_1 += 1
        clean_tweet_list = []
        handle_tweets = lst_dict[cnt_1][handle]['content']

        if handle_tweets == []:
            continue

        for raw_tweet in handle_tweets:
            clean_tweet = ""
            tokenized_tweet = nlp(raw_tweet)
            for token in tokenized_tweet:
                # skip whitespace, punctuation, stopwords, digits, and
                # tokens shorter than three characters
                if (token.is_space or token.is_punct or token.is_stop
                        or token.is_digit or len(token) <= 2):
                    continue
                clean_tweet += str(token.lemma_) + " "
            clean_tweet_list.append(clean_tweet)

        clean_tweet_list = list(map(str.strip, clean_tweet_list))
        clean_tweet_list = [x for x in clean_tweet_list if x != ""]
        lst_dict[cnt_1][handle]['tokenized_tweets'] = clean_tweet_list
        print("{} tokenized_tweets inserted!".format(handle))
        print()

    master_df = make_df(lst_dict)

    to_remove = list(master_df[master_df['tokenized_tweets'].isnull()].index)
    index_to_remove = [handle_lst.index(i) for i in to_remove]
    new_handle_list = [
        v for i, v in enumerate(handle_lst)
        if i not in frozenset(index_to_remove)
    ]

    master_df.dropna(subset=['tokenized_tweets'], inplace=True)
    master_df = filtration(master_df, "tokenized_tweets")
    clean_lst_dict = dataframe_to_dict(master_df)
    print()
    print("Cleaning of master dataframe complete!")

    for handle in new_handle_list:
        cnt_2 += 1
        try:
            list_of_tweets = clean_lst_dict[cnt_2][handle]['tokenized_tweets']
        except KeyError:
            continue

        gensim_format_tweets = [tweet.split() for tweet in list_of_tweets]

        gensim_dictionary = Dictionary(gensim_format_tweets)
        gensim_dictionary.filter_extremes(no_below=10, no_above=0.4)
        gensim_dictionary.compactify()  # remove gaps left by removed words

        MmCorpus.serialize(
            file_path_corpus + "{}.mm".format(handle),
            bag_of_words_generator(gensim_format_tweets, gensim_dictionary))
        # load the corpus back from disk
        corpus = MmCorpus(file_path_corpus + "{}.mm".format(handle))

        if corpus.num_terms == 0:
            continue

        lda = LdaMulticore(corpus, num_topics=10, id2word=gensim_dictionary,
                           passes=100, workers=100)
        lda.save(file_path_corpus + "lda_model_{}".format(handle))
        print("LDA model for {} saved!".format(handle))

        word_list = []
        for i in range(10):
            for term, frequency in lda.show_topic(i, topn=100):
                if frequency != 0:
                    word_list.append(term)

        LDA_Counter = Counter(word_list)
        clean_lst_dict[cnt_2][handle]['LDA'] = LDA_Counter
        print("Inserted LDA Counter into {} dictionary".format(handle))

    pickle_object(clean_lst_dict, "2nd_degree_connections_LDA_complete")
    print("Script Complete")
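
# Hedged usage sketch for LDA_Machine. The input shape below is inferred from
# how lst_dict is indexed above (one dict per handle, keyed by handle, with a
# 'content' list of raw tweet strings); the handle and tweet are placeholders.
import spacy

nlp = spacy.load('en')  # assumption: the same spaCy model the project uses

handles = ["some_handle"]
tweets = [{"some_handle": {"content": ["an example tweet about topic models"]}}]
LDA_Machine(tweets, handles)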
import re
import codecs
import urllib.request as ur

import numpy as np
import spacy
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, Phrases
from gensim.models.ldamulticore import LdaMulticore
from sklearn.metrics import pairwise_distances


class recommendationsys_LDA:
    def __init__(self, ngram):
        # load the spaCy English model
        self.nlp = spacy.load('en')
        self.extrawords = ["'s", "st", "th", "’s", "-PRON-", "’", "htt",
                           "ht", "km", "pm", "am"]

        # parse the latest emoji code table
        html = str(ur.urlopen('http://www.unicode.org/Public/emoji/5.0/emoji-data.txt').read())
        codes = list(map(
            lambda x: ('-'.join(['\\U' + a.zfill(8) for a in x.split('..')])
                       .encode().decode('unicode-escape')),
            re.findall(r'(?<=\\n)[\w.]+', html)))
        self.emojiPattern = re.compile('[' + ','.join(codes) + ']',
                                       flags=re.UNICODE)

        PROJECT_DIRECTORY = 'output/project/' + project_name
        self.f_titles = PROJECT_DIRECTORY + '/titlesLF_target.txt'
        self.f_authors = PROJECT_DIRECTORY + '/authors_target.txt'

        self.authorcontent_clean = {}
        self.ngram_bow_corpus = []
        self.ldavec = {}
        self.ngram_dictionary = None
        self.ngram = ngram
        self.num_topics = None

    def clean_text(self, text):
        # lowercase, and replace '\n' with '.'
        text = text.lower()
        text = text.replace('\n', ' . ')

        # remove @-mentions, hashtag signs, URLs, stray punctuation, and 'rt'
        myre = re.compile(u'('
                          '@\S*\s?|#|'           # @ mention names, hashtag sign
                          'http[s]?[:…]+\S*\s|'  # URLs
                          '[-~\^\$\*\+\{\}\[\]\\\|\(\)/“"]|'
                          'rt[:]? |'
                          '…'
                          ')+', re.UNICODE)
        text = myre.sub(' ', text)
        # remove emojis (this is for USC-2)
        text = self.emojiPattern.sub(' ', text)
        text = text.replace('&', 'and')
        return text

    # ---------------------------
    # make the recommendations
    # ---------------------------
    def recomendation(self, username, topicn=0, userlist=None):
        similarities = self.ldacosinesimilarity(username, topicn)
        # if userlist is empty, run on the whole dataset
        if not userlist:
            result = list(similarities.items())
        else:
            result = [(i, similarities[i]) for i in userlist]
        # sort the result by similarity
        result = sorted(result, key=lambda x: x[1])
        return result

    # ---------------------------
    # load and clean the data
    # ---------------------------
    def loadandclean(self, n=-1):
        with codecs.open(self.f_titles, encoding='utf_8') as f_t:
            with codecs.open(self.f_authors, encoding='utf_8') as f_a:
                for l_a, l_t in zip(f_a, f_t):
                    # remove the '\n' at the end
                    key = l_a[:-1].lower()
                    l_t = self.clean_text(l_t)
                    if key in self.authorcontent_clean:
                        self.authorcontent_clean[key].append(l_t)
                    else:
                        self.authorcontent_clean[key] = [l_t]
                    if n != -1 and len(self.authorcontent_clean) == n:
                        break

        for key, value in self.authorcontent_clean.items():
            self.authorcontent_clean[key] = self.lemmatized_sentence_corpus(value)

    # ------------------------------------------------------
    # build the n-gram content based on the clean content
    # ------------------------------------------------------
    def punct_space(self, token):
        """Helper to filter out tokens that are punctuation, whitespace,
        stopwords, extra words, or shorter than two characters."""
        return (token.is_punct or token.is_space
                or token.lemma_ in spacy.lang.en.stop_words.STOP_WORDS
                or token.lemma_ in self.extrawords
                or len(str(token)) < 2)

    def lemmatized_sentence_corpus(self, contents):
        """Use spaCy to parse and lemmatize the contents, and return a
        list of tokenized sentences."""
        sentents = []
        for content in self.nlp.pipe(contents, batch_size=500, n_threads=8):
            for sent in content.sents:
                tokens = [token.lemma_ for token in sent
                          if not self.punct_space(token)]
                sentents.append(tokens)
        return sentents

    # prepare the parameters for LDA
    def ldainit(self):
        self.user_sentences = self.authorcontent_clean
        self.user_bigramsentences = {}
        self.all_sentences = []
        self.all_bigram_sentences = []

        sentences = list(self.authorcontent_clean.values())
        self.all_sentences = [item for sublist in sentences for item in sublist]

        # build the bigram model
        if self.ngram == 2:
            self.bigram_model = Phrases(self.all_sentences)
            for user, content in self.user_sentences.items():
                bigram_s = [self.bigram_model[s] for s in content]
                self.user_bigramsentences[user] = bigram_s
                self.all_bigram_sentences += self.user_bigramsentences[user]

    def trainlda(self, topics_n=10):
        self.num_topics = topics_n
        alltexts = []
        for name, sentences in self.user_sentences.items():
            sentences = [item for sublist in sentences for item in sublist]
            alltexts.append(sentences)

        self.ngram_dictionary = Dictionary(alltexts)
        # filter tokens that are very rare or too common from the
        # dictionary (filter_extremes) and reassign integer ids (compactify)
        self.ngram_dictionary.filter_extremes(no_below=10, no_above=0.8)
        self.ngram_dictionary.compactify()

        ngram_bow_corpus = [self.ngram_dictionary.doc2bow(sentence)
                            for sentence in alltexts]

        self.lda = LdaMulticore(ngram_bow_corpus,
                                num_topics=topics_n,
                                id2word=self.ngram_dictionary,
                                workers=3)

        # calculate the coherence scores
        topics = []
        for i in range(self.lda.num_topics):
            terms = [n[0] for n in self.lda.show_topic(i)]
            topics.append(terms)

        cm_umass = CoherenceModel(topics=topics, corpus=ngram_bow_corpus,
                                  dictionary=self.ngram_dictionary,
                                  coherence='u_mass')
        cm_cv = CoherenceModel(topics=topics, texts=alltexts,
                               dictionary=self.ngram_dictionary,
                               coherence='c_v')
        cm_cuci = CoherenceModel(topics=topics, texts=alltexts,
                                 dictionary=self.ngram_dictionary,
                                 coherence='c_uci')
        cm_cnpmi = CoherenceModel(topics=topics, texts=alltexts,
                                  dictionary=self.ngram_dictionary,
                                  coherence='c_npmi')

        return (topics_n, cm_umass.get_coherence(), cm_cv.get_coherence(),
                cm_cuci.get_coherence(), cm_cnpmi.get_coherence())

    def explore_topic(self, topic_number, topn=25):
        """Accept a user-supplied topic number and print out a formatted
        list of the top terms."""
        print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
        for term, frequency in self.lda.show_topic(topic_number, topn):
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

    def runlda(self, username):
        if self.ngram == 1:
            user_sentences = self.user_sentences[username]
        elif self.ngram == 2:
            user_sentences = self.user_bigramsentences[username]

        # flatten the list of lists into a single list
        user_sentences = [item for sublist in user_sentences for item in sublist]
        user_bow = self.ngram_dictionary.doc2bow(user_sentences)
        user_lda = self.lda[user_bow]
        return user_lda

    # compute the LDA topic vector for every user
    def runldavec(self):
        if not self.ldavec:
            for key, value in self.user_sentences.items():
                vec = np.zeros(self.num_topics)
                result = self.runlda(key)
                for i in result:
                    vec[i[0]] = i[1]
                self.ldavec[key] = vec

    def runtopntopic(self, n):
        self.topntopics = []
        for key, value in self.ldavec.items():
            idx = value.argsort()
            self.topntopics += list(idx[-n:])
        self.topntopics = list(set(self.topntopics))

    # compute the LDA cosine similarity between a given user and all others
    def ldacosinesimilarity(self, username, topn=0):
        if username not in self.authorcontent_clean:
            print('User not found')
            return
        if topn < 0:
            print('topn should be >= 0')
            return
        topn = int(topn)
        cosinesimilaritydic = {}
        if not self.ldavec:
            self.runldavec()
        if topn == 0:
            usertopicvec = self.ldavec[username]
        else:
            self.runtopntopic(topn)
            usertopicvec = self.ldavec[username][self.topntopics]
        for key, value in self.ldavec.items():
            if key != username:
                pairtopicvec = value if topn == 0 else value[self.topntopics]
                cosinesimilarity = pairwise_distances(
                    np.array(usertopicvec).reshape(1, -1),
                    np.array(pairtopicvec).reshape(1, -1),
                    metric='cosine')[0][0]
                cosinesimilaritydic[key] = cosinesimilarity
        return cosinesimilaritydic
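
# Hedged usage sketch for recommendationsys_LDA. `project_name` and the title
# and author files under output/project/ are assumed to exist; the author name
# and topic count are placeholders.
rec = recommendationsys_LDA(ngram=2)
rec.loadandclean()
rec.ldainit()
print(rec.trainlda(topics_n=10))  # (n, u_mass, c_v, c_uci, c_npmi)
print(rec.recomendation('some_author', topicn=5))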
# NOTE: the original snippet begins mid-call; the leading assignment is
# reconstructed from context (`corpus` comes from the commented block below)
lda = LdaMulticore(corpus,
                   num_topics=6,
                   id2word=trigram_dictionary,
                   workers=3)
lda.save(lda_model_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)


def explore_topic(topic_number, topn=6):
    """Accept a user-supplied topic number and print out a formatted
    list of the top terms."""
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print(f"{term:20} : {frequency:.3f}")


explore_topic(topic_number=3)
print("done")

# ========= from earlier =========
# dictionary = corpora.Dictionary(text_data)
# corpus = [dictionary.doc2bow(text) for text in text_data]
# import pickle
# pickle.dump(corpus, open('corpus.pkl', 'wb'))
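
# Hedged companion sketch: reloading the corpus pickled in the commented
# "from earlier" block (file name 'corpus.pkl' as given there), then printing
# every topic via explore_topic.
import pickle

with open('corpus.pkl', 'rb') as f:
    corpus = pickle.load(f)
for topic_number in range(lda.num_topics):
    explore_topic(topic_number)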
import numpy as np
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from sklearn.metrics import pairwise_distances

# UserProfile is a project-specific class defined elsewhere


class LdaRecsys:
    def __init__(self):
        self.user_corp = {}
        self.corp_dict = {}
        self.corp_bow = []
        self.user_profile = UserProfile()
        self.user_profile.getUserList()
        self.user_list = self.user_profile.user_list
        self.cosine_distance_dict = {}
        self.lda_vect_dict = {}

    def loadCorpDict(self, user_list=[]):
        return

    def saveCorpDict(self):
        return

    def buildCorpDict(self, user_list=[], no_below_doc=10, no_above_doc=0.5):
        if not user_list:
            if not self.user_list:
                print('user list is empty! will run getUserList')
                self.user_profile.getUserList()
                self.user_list = self.user_profile.user_list
                print('got the user list')
            user_list = self.user_list
            if not user_list:
                print('user list is still empty!')
                return []

        self.user_corp = self.user_profile.getCorpProfile(self.user_list)
        self.corp_dict = Dictionary(list(self.user_corp.values()))
        # Keep tokens that appear in at least no_below_doc documents and in
        # no more than no_above_doc of all documents (a fraction of the
        # total corpus size, not an absolute number).
        self.corp_dict.filter_extremes(no_below=no_below_doc,
                                       no_above=no_above_doc)
        self.corp_dict.compactify()

    def buildCorpBow(self):
        if not self.user_corp or not self.corp_dict:
            self.buildCorpDict()
        self.corp_bow = {}
        for user, corp in self.user_corp.items():
            self.corp_bow[user] = self.corp_dict.doc2bow(corp)

    def saveCorpBow(self):
        return

    def loadCorpBow(self):
        return

    def trainLDA(self, topics_num, iter_num=50):
        # reset the cosine_distance_dict on every training run
        self.cosine_distance_dict = {}
        self.topics_num = topics_num
        corp_bow = list(self.corp_bow.values())
        self.lda = LdaMulticore(corp_bow,
                                num_topics=topics_num,
                                id2word=self.corp_dict,
                                iterations=iter_num,
                                workers=4)

    def runLDA(self, user_name):
        if user_name in self.corp_bow:
            user_bow = self.corp_bow[user_name]
        else:
            print('no such user! Please check the screen name')
            return
        user_lda = self.lda[user_bow]
        return user_lda

    def buildLdaVect(self):
        for user, bow in self.corp_bow.items():
            vect = np.zeros(self.topics_num)
            user_lda = self.lda[bow]
            for i in user_lda:
                vect[i[0]] = i[1]
            self.lda_vect_dict[user] = vect

    def ldaCosineDistance(self, user_name):
        if not self.lda_vect_dict:
            self.buildLdaVect()
        if user_name not in self.lda_vect_dict:
            print('no such user')
            return
        cosine_distance_dict = {}
        user_vect = self.lda_vect_dict[user_name]
        for user, lda_vect in self.lda_vect_dict.items():
            cosine_distance = pairwise_distances(
                np.array(user_vect).reshape(1, -1),
                np.array(lda_vect).reshape(1, -1),
                metric='cosine')[0][0]
            cosine_distance_dict[user] = cosine_distance
        self.cosine_distance_dict[user_name] = cosine_distance_dict

    def makeRecommendation(self, user_name, topn_recommendation=10):
        if user_name not in self.cosine_distance_dict:
            self.ldaCosineDistance(user_name)
        user_recommendations = self.cosine_distance_dict[user_name]
        n = 0
        for recommendation, cosine_distance in sorted(
                user_recommendations.items(), key=lambda x: x[1]):
            print((recommendation, cosine_distance))
            n += 1
            if n == topn_recommendation:
                break
        return user_recommendations

    def showTopic(self, topic_number, topn_word=5):
        """
        topic_number: which topic to show
        topn_word: show the top n words in this topic
        """
        print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
        for term, frequency in self.lda.show_topic(topic_number, topn_word):
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

    def showUserTopic(self, user_name, topn_word=10):
        if user_name not in self.corp_bow:
            print('no such user! please check the screen name')
            return
        user_bow = self.corp_bow[user_name]
        user_lda = self.lda[user_bow]
        user_lda = sorted(user_lda, key=lambda x: -x[1])
        for topic_number, freq in user_lda:
            print('topic number {} {}'.format(topic_number, freq))
            print('|____')
            self.showTopic(topic_number, topn_word)
            print('\n')
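
# Hedged usage sketch for LdaRecsys. UserProfile and its getCorpProfile are
# project-specific and not shown here; the screen name is a placeholder.
recsys = LdaRecsys()
recsys.buildCorpDict()
recsys.buildCorpBow()
recsys.trainLDA(topics_num=10)
recsys.makeRecommendation('some_screen_name', topn_recommendation=5)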
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# generate the LDA model
my_num_topics = 30
# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics,
#                                            id2word=dictionary, passes=20)
ldamodel = LdaMulticore(corpus, num_topics=my_num_topics, id2word=dictionary,
                        workers=3, alpha=1e-5, eta=5e-1)
print(ldamodel.print_topics(num_topics=my_num_topics, num_words=5))

print(corpus[0])
print(corpus[1])
print(corpus[2])
print(ldamodel[corpus[0]])
print(ldamodel[corpus[1]])
print(ldamodel[corpus[2]])
# print(ldamodel.print_topics(20))

model_basename = '/home/osboxes/w/wlda/trymodel'
ldamodel.save(model_basename)

# draw a word cloud of the top 200 terms for each topic
for t in range(ldamodel.num_topics):
    plt.figure()
    plt.imshow(WordCloud().fit_words(dict(ldamodel.show_topic(t, 200))))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()
    # plt.pause(0.0001)
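
# Hedged follow-up sketch: the saved model can be reloaded later from the same
# basename (gensim writes several companion files alongside it). A small alpha
# (1e-5) is a sparse document-topic prior, pushing each document toward few
# topics; eta=0.5 smooths the topic-word distributions.
from gensim.models.ldamulticore import LdaMulticore

ldamodel = LdaMulticore.load(model_basename)
print(ldamodel.print_topics(num_topics=5, num_words=5))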
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore

# create a stream of sentences for the corpus dictionary
input_data = [sent.split(' ') for sent in data['text_clean_bigram'].tolist()]

# topic model with LDA
id2word = Dictionary(input_data)

# term-document frequency
corpus = [id2word.doc2bow(text) for text in input_data]

# number of topics
num_topics = 10

# build the LDA model
lda_model = LdaMulticore(corpus=corpus,
                         id2word=id2word,
                         num_topics=num_topics,
                         workers=2)

# the topics don't separate well, since these tweets all relate to disasters
print(lda_model.show_topic(topicid=0, topn=25))
print(lda_model.show_topic(topicid=1, topn=25))
print(lda_model.show_topic(topicid=2, topn=25))
print(lda_model.show_topic(topicid=3, topn=25))

# predict for a single string
string_input = data['text_clean_bigram'][1]
lda_topic_prob(string_input, input_data, lda_model)

# predict for all strings
# takes ~20 minutes to run
data['lda_topic_prob'] = data['text_clean_bigram'].apply(
    lambda x: lda_topic_prob(x, input_data, lda_model))

# transform the LDA topic probabilities into a dataframe representation
# takes a while to run
lda_topic_df = topic_df(data=data,
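
# lda_topic_prob and topic_df are referenced above but not defined in this
# snippet. Below is a minimal, hypothetical sketch of what lda_topic_prob
# might look like, assuming it returns the model's topic-probability pairs
# for one already-cleaned string; the real helper may differ.
def lda_topic_prob(text, input_data, lda_model):
    bow = id2word.doc2bow(text.split(' '))
    # minimum_probability=0.0 keeps all topics, so rows align across documents
    return lda_model.get_document_topics(bow, minimum_probability=0.0)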