def getRelationDetailByHDP(sentence_list):
    # Cluster to obtain the results
    corpus = []
    pairs_all, position_all = segmentor.segListWithNerTag(sentence_list)
    words_list = []
    for pairs in pairs_all:
        word_list = []
        for pair in pairs:
            if "v" in pair.flag or "n" in pair.flag:
                word_list.append(pair.word)
        words_list.append(word_list)
    # words_list = list(map(lambda pairs: map(lambda x: x.word, pairs), pairs_all))
    from gensim import corpora
    dictionary = corpora.Dictionary(words_list)
    for words in words_list:
        corpus.append(dictionary.doc2bow(words))
    from gensim.models import HdpModel
    hdp = HdpModel(corpus, dictionary)
    a = hdp.print_topics()
    words = {}
    for topic in a:
        word_details = str(topic[1]).split(" + ")
        for word_detail in word_details:
            word = str(word_detail[word_detail.index("*") + 1:])
            num = float(str(word_detail[:word_detail.index("*")]))
            if word not in words:
                words[word] = num
            else:
                words[word] += num
    words = sorted(words.items(), key=lambda d: d[1])
    return words
    # Afterwards, extract the high-frequency verbs and nouns from the syntactic parse
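# Usage sketch for getRelationDetailByHDP (the sentences below are placeholders and the
# `segmentor` used above must already be initialised elsewhere). The returned list is
# sorted by weight in ascending order, so the strongest verb/noun terms come last.
# ranked_terms = getRelationDetailByHDP(["first sample sentence ...", "second sample sentence ..."])
# print(ranked_terms[-10:])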
def runModels(self, number_of_topics, corpus, dictionary, start, end):
    # do hdp model
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    hdpmodel.print_topics(num_topics=int(number_of_topics), num_words=10)
    hdptopics = hdpmodel.show_topics(num_topics=int(number_of_topics))
    # result_dict=addTotalTermResults(hdptopics)  # add results to total kept in a list
    # addToResults(result_dict)
    # output results
    self.printResults(number_of_topics, hdptopics, 'hdp', start, end)

    # do lda model
    ldamodel = LdaModel(corpus=corpus,
                        num_topics=number_of_topics,
                        id2word=dictionary,
                        random_state=100,
                        update_every=1,
                        chunksize=100,
                        passes=10,
                        alpha='auto',
                        per_word_topics=True)
    ldamodel.save('lda' + str(number_of_topics) + '.model')
    ldatopics = ldamodel.show_topics(num_topics=int(number_of_topics))
    # result_dict=addTotalTermResults(ldatopics)
    # addToResults(result_dict)
    self.printResults(number_of_topics, ldatopics, 'lda', start, end)

    visualisation = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    location = os.path.join(pn, 'topic_model_results')
    # visualize outputs in html
    pyLDAvis.save_html(
        visualisation,
        os.path.join(location,
                     'LDA_Visualization' + str(number_of_topics) + "_" + start + "_" + end + '.html'))
def create_hdp(num_topic, dictionary):
    print("__________________________Create HDP_________________________")
    corpus, dic = generate_corpus(dictionary)
    hdpmodel = HdpModel(corpus=corpus, id2word=dic)
    topics = hdpmodel.print_topics(num_topics=num_topic, num_words=7)
    # see list of topics
    for topic in topics:
        print(topic)
    return hdpmodel
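# Usage sketch for create_hdp. The `generate_corpus` helper is defined elsewhere in the
# original project, so the argument below is only a placeholder for whatever input it
# expects, and 20 topics is an arbitrary illustrative choice.
# hdp_model = create_hdp(num_topic=20, dictionary=preprocessed_documents)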
def train_topics(args):
    print(f"Arguments: {args}")
    nlp = spacy.load("en", disable=["parser", "ner"])
    files = args["text"]
    lines = extract_stories(files)

    def tokenize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        allowed_postags = set(allowed_postags)
        docs = nlp.pipe(texts)
        text_tokens = []
        for doc in docs:
            tokens = [
                token.lemma_ for token in doc
                if token.pos_ in allowed_postags and not token.is_punct and not token.is_stop
            ]
            text_tokens.append(tokens)
        return text_tokens

    docs = tokenize(lines, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    print("Preprocessed Docs")

    bigram = gensim.models.Phrases(docs, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[docs], threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    docs = make_bigrams(docs)
    docs = make_trigrams(docs)

    print("Create Dictionary")
    # Create Dictionary
    corpus_dict = corpora.Dictionary(docs)
    # Create Corpus
    texts = docs
    # Term Document Frequency
    corpus = [corpus_dict.doc2bow(text) for text in texts]

    print("Train Model")
    hdp = HdpModel(corpus, corpus_dict)
    print(hdp.print_topics(num_topics=50, num_words=20))
    hdp.save(args["target"])
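# Hypothetical invocation of train_topics (a sketch only: the file name and model path
# are placeholders, and extract_stories is defined elsewhere in the original project).
# train_topics({"text": ["stories.txt"], "target": "hdp_topics.model"})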
def get_num_topics(self):
    self.rev_train['title'] = self.strip_newline(self.rev_train.title)
    self.rev_test['title'] = self.strip_newline(self.rev_test.title)
    # rev_train.text[21:22].values
    words_tr = list(self.sent_to_words(self.rev_train.title))
    words_te = list(self.sent_to_words(self.rev_test.title))
    words_tr = self.remove_stopwords(words_tr)
    bigram_tr, trigram_tr = self.bigrams(words_tr)
    trigrams_tr = [trigram_tr[bigram_tr[review]] for review in words_tr]
    lemma_lg = self.lemmatization(trigrams_tr)
    with open(os.path.join('.', 'data', 'lemma_lg.pkl'), 'wb') as f:
        pickle.dump(lemma_lg, f)
    id2word_lg = gensim.corpora.Dictionary(lemma_lg)
    id2word_lg.filter_extremes(no_below=2, no_above=0.6)
    id2word_lg.compactify()
    id2word_lg.save(os.path.join('.', 'data', 'train_dict_lg'))
    corpus_lg = [id2word_lg.doc2bow(text) for text in lemma_lg]
    with open(os.path.join('.', 'data', 'corpus_lg.pkl'), 'wb') as f:
        pickle.dump(corpus_lg, f)
    hdp = HdpModel(corpus_lg, id2word_lg, chunksize=100)
    n_topics = len(hdp.print_topics())
    hdptopics = hdp.print_topics(num_topics=n_topics)
    for tp in hdptopics:
        print(tp)
    return n_topics
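# Note (a sketch, not part of the original method): HdpModel.print_topics() defaults to
# 20 topics, so n_topics above is effectively capped at 20 even when HDP keeps more
# topics alive. Passing num_topics=-1 asks for every topic up to the model's truncation
# level instead, e.g.:
# all_topics = hdp.print_topics(num_topics=-1)
# n_topics = len(all_topics)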
def try_news_cluster():
    docs = feed_doc()
    df_threshold_lower = 50
    df_threshold_upper = 500
    dictionary = corpora.Dictionary(doc for doc in docs)
    print 'dictionary ready'
    low_df = [
        tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
        if docfreq <= df_threshold_lower
    ]
    high_df = [
        tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
        if docfreq > df_threshold_upper
    ]
    dictionary.filter_tokens(low_df + high_df)
    dictionary.compactify()
    corpus = [dictionary.doc2bow(doc) for doc in feed_doc()]
    print 'corpus ready'
    hdp = HdpModel(corpus, dictionary)
    for topic in hdp.print_topics(num_topics=50, num_words=20):
        print topic
def hierarchical_dirichlet_process_topic_extraction():
    """
    Function performs topic extraction on Tweets using the Gensim HDP model.

    :return: None.
    """
    from gensim import corpora, matutils
    from gensim.models import HdpModel

    # HDP, like LDA, works on raw term counts because it is a probabilistic graphical model.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)
    tf_feature_names = tf_vectorizer.get_feature_names()

    log.info("\n.fit_transform - Learn the vocabulary dictionary and return term-document matrix.")
    log.info(f"{tf}\n")
    log.info("\n.get_feature_names - Array mapping from feature integer indices to feature name")
    log.info(f"{tf_feature_names}\n")

    # `corpus` and `dictionary` were not defined in the original snippet; building them
    # from the scikit-learn term-document matrix is the assumed intent here.
    corpus = matutils.Sparse2Corpus(tf, documents_columns=False)
    dictionary = corpora.Dictionary.from_corpus(corpus, id2word=dict(enumerate(tf_feature_names)))

    # Train the HDP model.
    hdp = HdpModel(corpus, dictionary)
    time.sleep(3)

    # # For use as wrapper with Scikit-Learn API.
    # model = HdpTransformer(id2word=dictionary)
    # distribution = model.fit_transform(corpus)

    # Display the top words for each topic.
    topic_info = hdp.print_topics(num_topics=20, num_words=10)
    for topic in topic_info:
        print(topic)
class TcModel:
    """
    Using the gensim LDA model to implement the topic cluster.
    """

    def __init__(self):
        self.original_data = []
        self.text = []
        self.token = []
        self.corpus = []
        self.id2word = []
        self.model_name = ''
        self.num_topics = 10
        self.iterations = 100
        self.model = None
        self.stop_words = stopwords.words('english')
        self.stop_words.extend(['be', 'say', '-PRON-', 'ms', 'mr', 'year', 'cent'])

    def _tokenize_words(self, text):
        token = []
        total = len(text)
        for i in range(total):
            token.append(gensim.utils.simple_preprocess(text[i], deacc=True))
        return token

    def _phrase(self, token):
        bigram = Phrases(token, min_count=5, threshold=100)
        bigram_mod = Phraser(bigram)
        trigram = Phrases(bigram_mod[token], min_count=5, threshold=100)
        trigram_mod = Phraser(trigram)
        return [trigram_mod[bigram_mod[doc]] for doc in token]

    def _lemmatization(self, token):
        nlp = spacy.load('en', disable=['parser', 'ner'])
        return_text = []
        allow_postags = ['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']
        for i in token:
            sentence = nlp(" ".join(i))
            return_text.append([token.lemma_ for token in sentence if token.pos_ in allow_postags])
        return return_text

    def find_most_common(self, token, plot=False):
        word_list = []
        extra_stopwords = []
        for i in token:
            word_list.extend(i)
        word_dic = collections.Counter(word_list)
        # print(word_dic.most_common(100))
        tf = list(word_dic.values())
        tf.sort(reverse=True)
        if plot == True:
            print(tf[:100])
            plt.plot(range(500), tf[:500])
            plt.xlabel('word sequence')
            plt.ylabel('Term Frequency')
            plt.show()
        m_list = []
        for i in range(len(tf) - 1):
            m_list.append(tf[i] - tf[i + 1])
        k = tf[m_list.index(max(m_list))]
        print(k)
        k = 5000
        for i in word_dic:
            if word_dic[i] > k:
                extra_stopwords.append(i)
        print(extra_stopwords)
        return extra_stopwords

    def _remove_stopwords(self, token):
        return_text = []
        self.stop_words.extend(self.find_most_common(token))
        for i in token:
            return_text.append([word for word in i if word not in self.stop_words])
        return return_text

    def _doc_topic(self):
        """
        Matrix = [doc_id,title,topic,probability,summary,content]
        """
        matrix = []
        for num in range(len(self.corpus)):
            row = self.model[self.corpus[num]]
            row = sorted(row, key=lambda x: x[1], reverse=True)
            for i, j in row:
                if float(j) < 0.05:
                    continue
                value = [
                    self.original_data.ix[num]['id'],
                    self.original_data.ix[num]['title'], i, j,
                    self.original_data.ix[num]['summary'],
                    self.original_data.ix[num]['content']
                ]
                if value not in matrix:
                    matrix.append(value)
        matrix = pd.DataFrame(matrix,
                              columns=['doc_id', 'title', 'topic', 'probability', 'summary', 'content'])
        self.doc_topic = matrix
        print(matrix)
        return matrix

    def _topic_doc(self):
        matrix = []
        for i in range(self.num_topics):
            doc_list = [
                i for i in list(self.doc_topic[self.doc_topic.topic == i].sort_values(
                    by='probability', ascending=False)['doc_id'])
            ]
            if doc_list == []:
                self.num_topics = i
                break
            output = ",".join([str(i) for i in doc_list])
            print('topic {}: {}'.format(i, output))
            matrix.append([i, output])
        return matrix

    def _readable_topic(self, sent_num=3):
        output = []
        for i in range(self.num_topics):
            sent = ''
            content = []
            score_list = []
            topic_term = dict(self.model.show_topic(i, topn=1000))
            topic_list = self.doc_topic[self.doc_topic.topic == i]
            max_pro = heapq.nlargest(5, topic_list['probability'])
            for pro in max_pro:
                content.append(list(topic_list[topic_list.probability == pro]['content'])[0])
            content = ' '.join(content)
            content = [text for text in sent_tokenize(content)]
            for j in range(len(content)):
                words = gensim.utils.simple_preprocess(content[j], deacc=True)
                corpus = self.model.id2word.doc2bow(words)
                score = 0
                for word, num in corpus:
                    word = self.model.id2word.get(word)
                    if word in topic_term.keys():
                        score += num * topic_term[word]
                score_list.append(score)
            score_list = list(set(score_list))
            max_score = heapq.nlargest(sent_num, score_list)
            for j in range(len(max_score)):
                max_sent = score_list.index(max_score[j])
                print('topic {}: {}'.format(i, content[max_sent]))
                sent = sent + str('sentence {}: {}\n'.format(j + 1, content[max_sent]))
            output.append([i, sent])
        return output

    def _topic_key(self):
        output = []
        for i in range(self.num_topics):
            output.append([i, ','.join([item[0] for item in self.model.show_topic(i)])])
        print(output)
        return output

    def train(self, path, num_topics=20, iterations=500, n_gram=True,
              lemmatization=True, stop_words=True, tfidf=True, model='lda'):
        """
        Train the topic cluster model.

        Input value:
            data: pd.DataFrame format ['id','title','content','summary']
            num_topics: (int) the number of topics
            iterations: (int) total number of iteration times

        example:
            >>> tc = TcModel()
            >>> tc.train(path)
        """
        data = load_data(str(path + '/output/data.csv'))
        self.original_data = data
        self.text = list(data['content'])
        self.num_topics = num_topics
        self.iterations = iterations
        self.model_name = model
        print('tokenizing...')
        self.token = self._tokenize_words(self.text)
        if n_gram == True:
            print('phrasing...')
            self.token = self._phrase(self.token)
        if lemmatization == True:
            print('lemmatization...')
            self.token = self._lemmatization(self.token)
        if stop_words == True:
            print('remove stop words...')
            self.token = self._remove_stopwords(self.token)
        self.id2word = Dictionary(self.token)
        self.corpus = [self.id2word.doc2bow(text) for text in self.token]
        if tfidf == True:
            print('calculate tfidf...')
            tfidf_model = TfidfModel(self.corpus)
            self.corpus = [tfidf_model[i] for i in self.corpus]
        if model == 'lda':
            self.model = LdaModel(corpus=self.corpus,
                                  id2word=self.id2word,
                                  num_topics=self.num_topics,
                                  iterations=self.iterations)
        if model == 'lsi':
            self.model = LsiModel(corpus=self.corpus,
                                  id2word=self.id2word,
                                  num_topics=self.num_topics)
        if model == 'hdp':
            self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
            self.num_topics = self.model.get_topics().shape[0]
        self.topic_key = pd.DataFrame(self._topic_key(), columns=['topic_id', 'key_words'])
        self.doc_topic = self._doc_topic()
        self.topic_doc = pd.DataFrame(self._topic_doc(), columns=['topic_id', 'document_id'])
        self.topic_sent = pd.DataFrame(self._readable_topic(),
                                       columns=['topic_id', 'most relative sentence'])

    def save(self, path='default'):
        # timestr = time.strftime('%Y%m%d%H%M%S',time.localtime(time.time()))
        if path == 'default':
            path = 'model'
            try:
                os.mkdir(path)
            except:
                pass
        else:
            try:
                os.mkdir(path)
            except:
                pass
        if self.model_name == 'lda':
            self.model.save(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model.save(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model.save(str(path + '/hdp.model'))
        f = open(str(path + '/original_data.pickle'), 'wb')
        pickle.dump(self.original_data, f)
        f.close()
        f = open(str(path + '/text.pickle'), 'wb')
        pickle.dump(self.text, f)
        f.close()
        f = open(str(path + '/token.pickle'), 'wb')
        pickle.dump(self.token, f)
        f.close()
        f = open(str(path + '/corpus.pickle'), 'wb')
        pickle.dump(self.corpus, f)
        f.close()
        path = path + '/result'
        self.save_result(path)
        avg, cosine_matrix = self.similarity()
        sns.set()
        label = []
        col = []
        for i in range(self.num_topics):
            cosine_matrix[i][i] = 0.5
            col.append('topic {}'.format(i))
        cosine_matrix = pd.DataFrame(cosine_matrix)
        cosine_matrix.columns = col
        cosine_matrix.index = col
        sns.heatmap(cosine_matrix, cmap='YlGnBu')
        plt.savefig(path + '/topic_similarity.jpg')
        cosine_matrix.to_csv(str(path + '/cosine_matrix.csv'))

    def save_result(self, path='default'):
        if path == 'default':
            path = 'model/result'
            try:
                os.mkdir(path)
            except:
                pass
        else:
            try:
                os.mkdir(path)
            except:
                pass
        # topic_key = pd.DataFrame(self.print_topics(num_topics=self.num_topics,num_words=10),columns=['topic id','key words'])
        # topic_key.to_csv(str(path+'/topic_key.csv'),index=False)
        # doc_topic = self._doc_topic()
        # doc_topic.to_csv(str(path+'/doc_topic.csv'))
        # topic_doc = pd.DataFrame(self._topic_doc(),columns=['topic id','document id'])
        # topic_doc.to_csv(str(path+'/topic_doc.csv'),index=False)
        # topic_sent = pd.DataFrame(self._readable_topic(),columns=['topic id','most relative sentence'])
        # topic_sent.to_csv(str(path+'/topic_sent.csv'),index=False)
        f = open(str(path + '/topic_key.pickle'), 'wb')
        pickle.dump(self.topic_key, f)
        f.close()
        f = open(str(path + '/doc_topic.pickle'), 'wb')
        pickle.dump(self.doc_topic, f)
        f.close()
        f = open(str(path + '/topic_doc.pickle'), 'wb')
        pickle.dump(self.topic_doc, f)
        f.close()
        f = open(str(path + '/topic_sent.pickle'), 'wb')
        pickle.dump(self.topic_sent, f)
        f.close()

    def load(self, path='default'):
        """
        :param path: the path of the trained model.
        :return:
        """
        if path == 'default':
            path = 'model'
        file_list = os.listdir(path)
        for file in file_list:
            if file.endswith('.model'):
                self.model_name = file.split('.')[0]
        if self.model_name == 'lda':
            self.model = LdaModel.load(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model = LsiModel.load(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model = HdpModel.load(str(path + '/hdp.model'))
        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.model.get_topics().shape[0]
        else:
            self.num_topics = self.model.num_topics
        # self.iterations = self.model.iterations
        f = open(str(path + '/original_data.pickle'), 'rb')
        self.original_data = pickle.load(f)
        f.close()
        f = open(str(path + '/text.pickle'), 'rb')
        self.text = pickle.load(f)
        f.close()
        f = open(str(path + '/token.pickle'), 'rb')
        self.token = pickle.load(f)
        f.close()
        f = open(str(path + '/corpus.pickle'), 'rb')
        self.corpus = pickle.load(f)
        f.close()
        path = path + '/result'
        f = open(str(path + '/topic_key.pickle'), 'rb')
        self.topic_key = pickle.load(f)
        f.close()
        f = open(str(path + '/doc_topic.pickle'), 'rb')
        self.doc_topic = pickle.load(f)
        f.close()
        f = open(str(path + '/topic_doc.pickle'), 'rb')
        self.topic_doc = pickle.load(f)
        f.close()
        f = open(str(path + '/topic_sent.pickle'), 'rb')
        self.topic_sent = pickle.load(f)
        f.close()
        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.topic_doc.shape[0]
        else:
            self.num_topics = self.model.num_topics

    def update(self, path, iterations=100, n_gram=True, lemmatization=True,
               stop_words=True, model='lda'):
        """
        :param path: The path of the training file
        :param iterations: Only for the lda model
        :param n_gram: whether to use the n-gram feature, default is True
        :param lemmatization: whether to use lemmatization, default is True
        :param stop_words: whether to remove stop words, default is True
        :param model: which model to use, default is 'lda'
        :return:
        """
        data = load_data(path + '/output/data.csv')
        pd.concat([self.original_data, data], axis=0)
        text = list(data['content'])
        self.text.extend(text)
        print('tokenizing...')
        token = self._tokenize_words(text)
        self.token.extend(token)
        if n_gram == True:
            print('phrasing...')
            token = self._phrase(token)
            self.token.extend(token)
        if lemmatization == True:
            print('lemmatization...')
            token = self._lemmatization(token)
            self.token.extend(token)
        if stop_words == True:
            print('remove stop words...')
            token = self._remove_stopwords(token)
            self.token.extend(token)
        corpus = [self.id2word.doc2bow(text) for text in self.token]
        self.corpus.extend(corpus)
        self.model.update(corpus=corpus, iterations=iterations)

    def print_topics(self, num_topics=-1, num_words=10):
        """
        :param num_topics: (int, optional) – The number of topics to be selected
        :param num_words: (int, optional) – The number of words to be included per topic
        :return: list of (int, list of (str, float))
        """
        if num_topics == -1:
            num_topics = self.num_topics
        pprint.pprint(self.model.print_topics(num_topics=num_topics, num_words=num_words))
        return self.model.print_topics(num_topics=num_topics, num_words=num_words)

    def score(self):
        """
        Print the Coherence score of the model.
        """
        # print('\nPerplexity: ', self.model.log_perplexity(self.corpus))
        coherence_model_lda = CoherenceModel(model=self.model,
                                             texts=self.token,
                                             corpus=self.corpus,
                                             dictionary=self.id2word,
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)

    def vis(self):
        """
        Visualization of the data through the browser.
        """
        vis = pyLDAvis.gensim.prepare(self.model, self.corpus, self.id2word)
        pyLDAvis.show(vis)

    def consine(self, v1, v2):
        cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        return cosine

    def similarity(self):
        topic_vector = self.model.get_topics()
        num_topics = topic_vector.shape[0]
        consine_matrix = np.diag(np.ones(num_topics))
        consine_list = []
        for i in range(num_topics - 1):
            for j in range(i + 1, num_topics):
                consine_matrix[i][j] = self.consine(topic_vector[i], topic_vector[j])
                consine_matrix[j][i] = consine_matrix[i][j]
                consine_list.append(consine_matrix[i][j])
        average = np.average(consine_list)
        return average, consine_matrix

    def to_gephi(self):
        _, cosine_matrix = self.similarity()
        edge = []
        for i in range(self.num_topics - 1):
            for j in range(i + 1, self.num_topics):
                edge.append(['topic {}'.format(i), 'topic {}'.format(j), cosine_matrix[i][j]])
        for i in range(self.doc_topic.shape[0]):
            edge.append([
                'topic {}'.format(self.doc_topic.ix[i]['topic']),
                self.doc_topic.ix[i]['doc_id'],
                self.doc_topic.ix[i]['probability']
            ])
        # edge = []
        # node = []
        # topic_vector = self.model.get_topics()
        # decomposition
        # pca = PCA(n_components=1000)
        # topic_vector = pca.fit_transform(topic_vector)
        # print(len(topic_vector[0]))
        # for i in range(len(topic_vector)):
        #     for j in range(len(topic_vector[i])):
        #         edge.append(['topic {}'.format(i),j,topic_vector[i][j]])
        #     node.append(['topic {}'.format(i),'topic {}'.format(i)])
        #
        # return node,edge
        #

    def to_neo4j(self):
        output = []
        for i in range(self.num_topics):
            output.append('CREATE(:Topic{id:"topic %d"})' % i)
            for word, pro in self.model.show_topic(i):
                output.append(
                    'MATCH (t:Topic) where t.id = "topic %d" CREATE t-[:Include{probability:%f}]-> (:Word{word:"%s"})'
                    % (i, pro, word))
        for i in range(len(self.original_data)):
            output.append('CREATE(:Document{id:%d})' % (self.original_data.ix[i]['id']))
        for i in range(len(self.doc_topic)):
            output.append(
                'MATCH (t:Topic),(d:Document) WHERE t.id = "topic %d" and d.id = %d CREATE t-[:Include{probability:%f}]->d'
                % (self.doc_topic.ix[i]['topic'], self.doc_topic.ix[i]['doc_id'],
                   self.doc_topic.ix[i]['probability']))
        return output
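# Usage sketch for the TcModel class above (paths and options are illustrative; train()
# expects <path>/output/data.csv with id/title/content/summary columns, per the docstring).
# tc = TcModel()
# tc.train('project_dir', num_topics=20, model='hdp')   # or model='lda' / 'lsi'
# tc.print_topics(num_words=10)
# tc.score()
# tc.save('model')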
class TcModel:
    """
    Using the gensim LDA model to implement the topic cluster.
    """

    def __init__(self):
        self.original_data = []
        self.text = []
        self.token = []
        self.corpus = []
        self.id2word = []
        self.model_name = ''
        self.num_topics = 10
        self.iterations = 100
        self.model = None
        self.stop_words = stopwords.words('english')
        self.stop_words.extend(['be', 'say', '-PRON-', 'ms', 'Mr', 'Ms', 'mr', 'year', 'cent',
                                'per', 'www', 'http', 'com'])

    def _phrase(self, token):
        bigram = Phrases(token, min_count=5, threshold=100)
        bigram_mod = Phraser(bigram)
        # trigram = Phrases(bigram_mod[token],min_count=5,threshold=100)
        # trigram_mod = Phraser(trigram)
        # return [trigram_mod[bigram_mod[doc]] for doc in token]
        return [bigram_mod[doc] for doc in token]

    def _tokenize_words(self, text):
        token = []
        total = len(text)
        for i in range(total):
            token.append(gensim.utils.simple_preprocess(text[i], deacc=True))
        return token

    def _preprocess(self, doc, lemma=True, stop_words=True):
        nlp = spacy.load('en')
        return_text = []
        allow_NER = ["NORP", "FAC", "ORG", "GPE", "LOC", "PERSON", "PRODUCT", "LANGUAGE", "EVENT"]
        allow_POS = ["ADJ", "NOUN", "VERB"]
        for i in doc:
            i = re.sub("[\!\/_,%^*(+\"\')]+|[+——()?【】'’“”!,。?、~@#¥%……&*()]+", " ", i)
            i = re.sub("[\s+]", " ", i)
            sentence = nlp(i, disable=['parser'])
            return_text.append([ent.text for ent in sentence.ents if ent.label_ in allow_NER])
            if lemma == True and stop_words == True:
                return_text[-1].extend([token.lemma_ for token in sentence
                                        if token.ent_type_ == ''
                                        and token.lemma_ not in self.stop_words
                                        and token.pos_ in allow_POS])
            elif lemma == True and stop_words == False:
                return_text[-1].extend([token.lemma_ for token in sentence
                                        if token.ent_type_ == '' and token.pos_ in allow_POS])
            elif lemma == False and stop_words == False:
                return_text[-1].extend([token for token in sentence
                                        if token.ent_type_ == '' and token.pos_ in allow_POS])
            elif lemma == False and stop_words == True:
                return_text[-1].extend([token for token in sentence
                                        if token.ent_type_ == ''
                                        and token.lemma_ not in self.stop_words
                                        and token.pos_ in allow_POS])
        return return_text

    def find_most_common(self, token, plot=False):
        word_list = []
        extra_stopwords = []
        for i in token:
            word_list.extend(i)
        word_dic = collections.Counter(word_list)
        # print(word_dic.most_common(100))
        tf = list(word_dic.values())
        tf.sort(reverse=True)
        if plot == True:
            print(tf[:100])
            plt.plot(range(500), tf[:500])
            plt.xlabel('word sequence')
            plt.ylabel('Term Frequency')
            plt.show()
        m_list = []
        for i in range(len(tf) - 1):
            m_list.append(tf[i] - tf[i + 1])
        k = tf[m_list.index(max(m_list))]
        print(k)
        k = 5000
        for i in word_dic:
            if word_dic[i] > k:
                extra_stopwords.append(i)
        print(extra_stopwords)
        return extra_stopwords

    def _doc_topic(self):
        """
        Matrix = [doc_id,title,topic,probability,summary,content]
        """
        matrix = []
        for num in range(len(self.corpus)):
            row = self.model[self.corpus[num]]
            row = sorted(row, key=lambda x: x[1], reverse=True)
            for i, j in row:
                if float(j) < 0.05:
                    continue
                value = [self.original_data.ix[num]['id'],
                         self.original_data.ix[num]['title'], i, j,
                         self.original_data.ix[num]['summary'],
                         self.original_data.ix[num]['content']]
                if value not in matrix:
                    matrix.append(value)
        matrix = pd.DataFrame(matrix,
                              columns=['doc_id', 'title', 'topic', 'probability', 'summary', 'content'])
        self.doc_topic = matrix
        print(matrix)
        return matrix

    def _topic_doc(self):
        matrix = []
        for i in range(self.num_topics):
            doc_list = [i for i in list(
                self.doc_topic[self.doc_topic.topic == i].sort_values(
                    by='probability', ascending=False)['doc_id'])]
            if doc_list == []:
                self.num_topics = i
                break
            output = ",".join([str(i) for i in doc_list])
            print('topic {}: {}'.format(i, output))
            matrix.append([i, output])
        return matrix

    def _readable_topic(self, sent_num=5):
        output = []
        for i in range(self.num_topics):
            sent = ''
            content = []
            score_list = []
            topic_term = dict(self.model.show_topic(i, topn=1000))
            content = ' '.join(list(
                self.doc_topic[self.doc_topic['topic'] == i].drop_duplicates('doc_id')
                .sort_values('probability', ascending=False)[:10]['content']))
            content = sent_tokenize(content)
            for j in range(len(content)):
                words = gensim.utils.simple_preprocess(content[j], deacc=True)
                corpus = self.model.id2word.doc2bow(words)
                score = 0
                for word, num in corpus:
                    word = self.model.id2word.get(word)
                    if word in topic_term.keys():
                        score += num * topic_term[word]
                score_list.append(score)
            # score_list = list(set(score_list))
            max_score = heapq.nlargest(sent_num, score_list)
            for j in range(len(max_score)):
                max_sent = score_list.index(max_score[j])
                print('topic {}: {}'.format(i, content[max_sent]))
                sent = sent + str('sentence {}: {}\n'.format(j + 1, content[max_sent]))
            output.append([i, sent])
        return output

    def _topic_key(self):
        output = []
        for i in range(self.num_topics):
            output.append([i, ','.join([item[0] for item in self.model.show_topic(i, topn=30)])])
        print(output)
        return output

    def train(self, path, num_topics=20, iterations=1000, n_gram=True,
              lemmatization=True, stop_words=True, tfidf=True, model='lda'):
        """
        Train the topic cluster model.

        Input value:
            data: pd.DataFrame format ['id','title','content','summary']
            num_topics: (int) the number of topics
            iterations: (int) total number of iteration times

        example:
            >>> tc = TcModel()
            >>> tc.train(path)
        """
        data = load_data(str(path + '/output/data.csv'))
        self.original_data = data
        self.text = list(data['content'])
        self.num_topics = num_topics
        self.iterations = iterations
        self.model_name = model
        print('preprocessing...')
        self.token = self._preprocess(self.text, lemma=lemmatization, stop_words=stop_words)
        self.id2word = Dictionary(self.token)
        self.corpus = [self.id2word.doc2bow(text) for text in self.token]
        if tfidf == True:
            print('calculate tfidf...')
            tfidf_model = TfidfModel(self.corpus)
            self.corpus = tfidf_model[self.corpus]
        if model == 'lda':
            self.model = LdaModel(corpus=self.corpus,
                                  id2word=self.id2word,
                                  num_topics=self.num_topics,
                                  iterations=self.iterations)
        if model == 'lsi':
            self.model = LsiModel(corpus=self.corpus,
                                  id2word=self.id2word,
                                  num_topics=self.num_topics)
        if model == 'hdp':
            self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
            self.num_topics = self.model.get_topics().shape[0]
        self.topic_key = pd.DataFrame(self._topic_key(), columns=['topic_id', 'key_words'])
        self.doc_topic = self._doc_topic()
        self.topic_doc = pd.DataFrame(self._topic_doc(), columns=['topic_id', 'document_id'])
        self.topic_sent = pd.DataFrame(self._readable_topic(),
                                       columns=['topic_id', 'most relative sentence'])

    def save(self, path='default'):
        # timestr = time.strftime('%Y%m%d%H%M%S',time.localtime(time.time()))
        if path == 'default':
            path = 'model'
            try:
                os.mkdir(path)
            except:
                pass
        else:
            try:
                os.mkdir(path)
            except:
                pass
        if self.model_name == 'lda':
            self.model.save(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model.save(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model.save(str(path + '/hdp.model'))
        f = open(str(path + '/original_data.pickle'), 'wb')
        pickle.dump(self.original_data, f)
        f.close()
        f = open(str(path + '/text.pickle'), 'wb')
        pickle.dump(self.text, f)
        f.close()
        f = open(str(path + '/token.pickle'), 'wb')
        pickle.dump(self.token, f)
        f.close()
        f = open(str(path + '/corpus.pickle'), 'wb')
        pickle.dump(self.corpus, f)
        f.close()
        self.to_wordcloud(path)
        self.to_neo4j(path)
        path = path + '/result'
        self.save_result(path)
        avg, cosine_matrix = self.similarity()
        sns.set()
        label = []
        col = []
        for i in range(self.num_topics):
            cosine_matrix[i][i] = 1
            # for j in range(i,self.num_topics):
            #     cosine_matrix[i][j] = 0
            col.append('topic{}'.format(i))
        cosine_matrix = pd.DataFrame(cosine_matrix)
        cosine_matrix.columns = col
        cosine_matrix.index = col
        sns.heatmap(cosine_matrix, cmap='YlGnBu')
        plt.savefig(path + '/topic_similarity.jpg')
        cosine_matrix.to_csv(str(path + '/cosine_matrix.csv'))

    def save_result(self, path='default'):
        if path == 'default':
            path = 'model/result'
            try:
                os.mkdir(path)
            except:
                pass
        else:
            try:
                os.mkdir(path)
            except:
                pass
        # topic_key = pd.DataFrame(self.print_topics(num_topics=self.num_topics,num_words=10),columns=['topic id','key words'])
        # topic_key.to_csv(str(path+'/topic_key.csv'),index=False)
        # doc_topic = self._doc_topic()
        # doc_topic.to_csv(str(path+'/doc_topic.csv'))
        # topic_doc = pd.DataFrame(self._topic_doc(),columns=['topic id','document id'])
        # topic_doc.to_csv(str(path+'/topic_doc.csv'),index=False)
        # topic_sent = pd.DataFrame(self._readable_topic(),columns=['topic id','most relative sentence'])
        # topic_sent.to_csv(str(path+'/topic_sent.csv'),index=False)
        f = open(str(path + '/topic_key.pickle'), 'wb')
        pickle.dump(self.topic_key, f)
        f.close()
        f = open(str(path + '/doc_topic.pickle'), 'wb')
        pickle.dump(self.doc_topic, f)
        f.close()
        f = open(str(path + '/topic_doc.pickle'), 'wb')
        pickle.dump(self.topic_doc, f)
        f.close()
        f = open(str(path + '/topic_sent.pickle'), 'wb')
        pickle.dump(self.topic_sent, f)
        f.close()

    def load(self, path='default'):
        """
        :param path: the path of the trained model.
        :return:
        """
        if path == 'default':
            path = 'model'
        file_list = os.listdir(path)
        for file in file_list:
            if file.endswith('.model'):
                self.model_name = file.split('.')[0]
        if self.model_name == 'lda':
            self.model = LdaModel.load(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model = LsiModel.load(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model = HdpModel.load(str(path + '/hdp.model'))
        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.model.get_topics().shape[0]
        else:
            self.num_topics = self.model.num_topics
        # self.iterations = self.model.iterations
        f = open(str(path + '/original_data.pickle'), 'rb')
        self.original_data = pickle.load(f)
        f.close()
        f = open(str(path + '/text.pickle'), 'rb')
        self.text = pickle.load(f)
        f.close()
        f = open(str(path + '/token.pickle'), 'rb')
        self.token = pickle.load(f)
        f.close()
        f = open(str(path + '/corpus.pickle'), 'rb')
        self.corpus = pickle.load(f)
        f.close()
        path = path + '/result'
        f = open(str(path + '/topic_key.pickle'), 'rb')
        self.topic_key = pickle.load(f)
        f.close()
        f = open(str(path + '/doc_topic.pickle'), 'rb')
        self.doc_topic = pickle.load(f)
        f.close()
        f = open(str(path + '/topic_doc.pickle'), 'rb')
        self.topic_doc = pickle.load(f)
        f.close()
        f = open(str(path + '/topic_sent.pickle'), 'rb')
        self.topic_sent = pickle.load(f)
        f.close()
        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.topic_doc.shape[0]
        else:
            self.num_topics = self.model.num_topics

    def update(self, path, iterations=100, n_gram=True, lemmatization=True,
               stop_words=True, model='lda'):
        """
        :param path: The path of the training file
        :param iterations: Only for the lda model
        :param n_gram: whether to use the n-gram feature, default is True
        :param lemmatization: whether to use lemmatization, default is True
        :param stop_words: whether to remove stop words, default is True
        :param model: which model to use, default is 'lda'
        :return:
        """
        data = load_data(path + '/output/data.csv')
        pd.concat([self.original_data, data], axis=0)
        text = list(data['content'])
        self.text.extend(text)
        print('preprocessing...')
        self.token = self._preprocess(self.text, lemma=lemmatization, stop_words=stop_words)
        corpus = [self.id2word.doc2bow(text) for text in self.token]
        self.corpus.extend(corpus)
        self.model.update(corpus=corpus, iterations=iterations)

    def print_topics(self, num_topics=-1, num_words=10):
        """
        :param num_topics: (int, optional) – The number of topics to be selected
        :param num_words: (int, optional) – The number of words to be included per topic
        :return: list of (int, list of (str, float))
        """
        if num_topics == -1:
            num_topics = self.num_topics
        pprint.pprint(self.model.print_topics(num_topics=num_topics, num_words=num_words))
        return self.model.print_topics(num_topics=num_topics, num_words=num_words)

    def score(self):
        """
        Print the Coherence score of the model.
        """
        # print('\nPerplexity: ', self.model.log_perplexity(self.corpus))
        coherence_model_lda = CoherenceModel(model=self.model,
                                             texts=self.token,
                                             corpus=self.corpus,
                                             dictionary=self.id2word,
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)

    def vis(self):
        """
        Visualization of the data through the browser.
        """
        vis = pyLDAvis.gensim.prepare(self.model, self.corpus, self.id2word)
        pyLDAvis.show(vis)

    def consine(self, v1, v2):
        cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        return cosine

    def similarity(self):
        topic_vector = self.model.get_topics()
        num_topics = topic_vector.shape[0]
        consine_matrix = np.diag(np.ones(num_topics))
        consine_list = []
        for i in range(num_topics - 1):
            for j in range(i + 1, num_topics):
                consine_matrix[i][j] = self.consine(topic_vector[i], topic_vector[j])
                consine_matrix[j][i] = consine_matrix[i][j]
                consine_list.append(consine_matrix[i][j])
        average = np.average(consine_list)
        return average, consine_matrix

    def to_gephi(self):
        _, cosine_matrix = self.similarity()
        edge = []
        for i in range(self.num_topics - 1):
            for j in range(i + 1, self.num_topics):
                edge.append(['topic {}'.format(i), 'topic {}'.format(j), cosine_matrix[i][j]])
        for i in range(self.doc_topic.shape[0]):
            edge.append(['topic {}'.format(self.doc_topic.ix[i]['topic']),
                         self.doc_topic.ix[i]['doc_id'],
                         self.doc_topic.ix[i]['probability']])
        return edge

    def to_wordcloud(self, path):
        try:
            os.mkdir(path + '/wordcloud')
        except:
            pass
        path = path + '/wordcloud'
        cont = []
        for i in range(self.num_topics):
            key_word = dict(self.model.show_topic(i, topn=1000))
            # cont = " ".join([word * int(value*10000) for word,value in key_word])
            # cont = ",".join([(word + ",") * int(value*10000) for word,value in key_word])
            wordcloud = WordCloud(max_words=300, background_color="white",
                                  height=600, width=800).generate_from_frequencies(key_word)
            wordcloud.to_file(path + "/topic{}.png".format(i))

    def to_neo4j(self, path):
        try:
            os.mkdir(path + '/database')
        except:
            pass
        path = path + '/database'
        self.original_data.to_csv(path + '/document.csv', index=False)
        topic = []
        relationship = []
        words = []
        for i in range(self.num_topics):
            topic.append(['topic {}'.format(i)])
            for word, pro in self.model.show_topic(i):
                words.append([word])
                relationship.append(['topic {}'.format(i), pro, word])
        topic = pd.DataFrame(topic)
        topic.columns = ['id']
        topic.to_csv(path + '/topic.csv', index=False)
        words = pd.DataFrame(words)
        words.columns = ['word']
        words.to_csv(path + '/words.csv', index=False)
        for i in range(len(self.doc_topic)):
            relationship.append(['topic {}'.format(self.doc_topic.ix[i]['topic']),
                                 self.doc_topic.ix[i]['probability'],
                                 self.doc_topic.ix[i]['doc_id']])
        _, consine_matrix = self.similarity()
        for i in range(self.num_topics - 1):
            for j in range(i + 1, self.num_topics):
                relationship.append(['topic %d' % i, consine_matrix[i][j], 'topic %d' % j])
        relationship = pd.DataFrame(relationship)
        relationship.columns = ['source', 'probability', 'target']
        relationship.to_csv(path + '/relationship.csv', index=False)
        f = open(path + '/script.txt', 'w')
        f.write('load csv with headers from "file:///document.csv" as line \n'
                'merge (d:Document{id:toInteger(line.id),title:line.title,summary:line.title,content:line.content})\n\n')
        f.write('load csv with headers from "file:///topic.csv" as line\n'
                'merge (t:Topic{id:line.id})\n\n')
        f.write('load csv with headers from "file:///words.csv" as line\n'
                'merge (w:Word{id:line.word})\n\n')
        f.write('load csv with headers from "file:///relationship.csv" as line\n'
                'match (from:Topic{id:line.source}),(to:Word{id:line.target})\n'
                'merge (from)-[r:Key_word{probability:line.probability}]->(to)\n\n')
        f.write('load csv with headers from "file:///relationship.csv" as line\n'
                'match (from:Topic{id:line.source}),(to:Document{id:toInteger(line.target)})\n'
                'merge (from)-[r:Include{probability:line.probability}]->(to)\n\n')
        f.write('load csv with headers from "file:///relationship.csv" as line\n'
                'match (from:Topic{id:line.source}),(to:Topic{id:line.target})\n'
                'merge (from)<-[r:Similarity{probability:line.probability}]->(to)\n\n')
        f.close()
from gensim.models import HdpModel
import os

from utils import read_traj_synthetic, ObsQuantizer

__author__ = 'Romain Tavenard romain.tavenard[at]univ-rennes2.fr'

corpus_raw = []
path = "data/toy_pierre"
max_iter = 100

for fname in os.listdir(path):
    if not fname.endswith(".txt"):
        continue
    fullname = os.path.join(path, fname)
    traj = read_traj_synthetic(fullname)
    corpus_raw.append(traj)

oq = ObsQuantizer(min_x=-25, max_x=5, min_y=-10, max_y=10)
corpus_gensim = oq.fit(corpus_raw)

hdp = HdpModel(corpus=corpus_gensim, id2word=oq.dictionary)
for iter in range(max_iter):
    hdp.update(corpus=corpus_gensim)

topic_info = hdp.print_topics(num_topics=20, num_words=10)
print(hdp)
for topic in topic_info:
    print(topic)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : gensim_hdp.py
# @Author: TW
# @Date  : 2018/3/23
# @Desc  :
from gensim import corpora
from gensim.models import HdpModel

import tw_word2vec.word2vec as tw_w2v
from tw_segment import jiebaseg

default_model: dict = tw_w2v.get_word2vec_dic("../data/needed_zh_word2vec.bin")
keys = list(default_model.keys())
# Dictionary expects an iterable of tokenized documents; wrap the vocabulary list so
# every word2vec key becomes one dictionary entry (passing `keys` directly would split
# each word into individual characters).
dictionary = corpora.Dictionary([keys])
corpus = []
with open("../data/rawZhData/news_raw_wc2017-12-19.txt", "r") as f:
    for line in f.readlines():
        line = line.strip()
        bow = dictionary.doc2bow(jiebaseg.segOnly(line))
        corpus.append(bow)
hdp = HdpModel(corpus, dictionary)
hdp.print_topics(num_topics=20, num_words=10)
from gensim import corpora
from gensim.models import HdpModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

stop_words = set(stopwords.words('english'))

file = open("ap_changed.txt", "r+")
documents = file.readlines()

# word_tokens = word_tokenize([for doc in documents])
texts = [[
    re.sub(r'[^a-z]', '', text.lower()) for text in doc.split()
    if not text in stop_words and re.sub(r'[^a-z]', '', text)
] for doc in documents]
# print('TEXT: ', texts[0])

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
print(hdpmodel.print_topics(1000, 1))
# print(hdptopics)
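# Optional follow-up (a sketch): an HdpModel can hand back an LdaModel approximation of
# itself via suggested_lda_model(), which is convenient for tools that expect a
# fixed-topic LDA interface.
lda_from_hdp = hdpmodel.suggested_lda_model()
print(lda_from_hdp.num_topics)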
def hlda(corpus, eldictionary, topic, probably_words):
    hdp = HdpModel(corpus, eldictionary)
    topic_info = hdp.print_topics(num_topics=20, num_words=10)
    print topic_info
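# Usage sketch for hlda (arguments are placeholders; note that in the snippet above the
# `topic` and `probably_words` parameters are accepted but the call uses fixed values
# of 20 topics and 10 words).
# hlda(corpus, dictionary, 20, 10)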
# (fragment: the enclosing loop and its try: header are not shown in this excerpt)
        # content.download()
        # content.parse()
        # content.nlp()
        article = {}
        # stemmed_words = set(stem_tokens(content.cleaned_text, stemmer))
        article['keywords'] = text
        article['url'] = u
        article['title'] = content.title
        articles.append(article)
    except:
        continue

# add existing articles to the new articles
if len(timelines) > 0:
    recent_timelines = [*map(lambda t: t[0], timelines)]
    articles = recent_timelines + articles

texts = [*map(lambda x: x['keywords'], articles)]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
hdp = HdpModel(corpus, dictionary)
# print(hdp.print_topics(num_topics=3, num_words=10))
print(hdp.show_topics(num_topics=-1, num_words=10))
topics = hdp.print_topics(num_topics=-1)
texst = 1