def train_mgp(spacy_lang='pt_core_news_sm', train_file='train.csv',
              category_column='category', title_column='title',
              number_regex='[0-9]', number_code='NUMBER',
              model_file='mgp.model', scores_file='scores.npy'):
    nlp = spacy.load(spacy_lang)
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    df = pd.read_csv(train_file)
    len_categories = len(df[category_column].drop_duplicates().values.tolist())
    mgp = MovieGroupProcess(K=len_categories + 100, alpha=0.1, beta=0.1, n_iters=10)
    docs = df[title_column].values.tolist()
    tokens = []
    for item in docs:
        processed_item = re.sub(number_regex, number_code, item.lower())
        tmp = tokenizer(processed_item)
        tokens.append([str(x) for x in tmp if not (x.is_punct or x.is_stop)])
    # keep only tokens that occur more than once in the corpus
    tokens_freq_dict = dict(Counter([x for y in tokens for x in y]).most_common())
    for idx, item in enumerate(tokens):
        tokens[idx] = list(filter(lambda x: tokens_freq_dict[x] > 1, tokens[idx]))
    vocab_size = len(set(x for y in tokens for x in y))
    y = mgp.fit(tokens, vocab_size)
    scores = []
    for item in tokens:
        scores.append(np.array(mgp.score(item)))
    scores = np.array(scores)
    with open(model_file, 'wb') as f:
        pickle.dump(mgp, f)
    np.save(scores_file, scores)
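# A minimal usage sketch (not part of the original source): reload the artifacts written by
# train_mgp above and assign a cluster to one new, already tokenized title. The token list
# below is a made-up placeholder; real input must go through the same preprocessing.
import pickle
import numpy as np

with open('mgp.model', 'rb') as f:
    mgp = pickle.load(f)
scores = np.load('scores.npy')  # per-document score matrix saved during training

label, prob = mgp.choose_best_label(['exemplo', 'de', 'titulo', 'NUMBER'])
print(label, prob)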
def sentences_to_gsdmm(sentences, num_topics):
    # Return dictionary, topic ndarray, and scores for each document.
    dictionary, corpus = get_dict_corpus(sentences)
    # corpus is the list of documents in tokenized, BOW format:
    # a sequence of (token_id, count) tuples per document.
    corpus_tokens = [[a[0] for a in sent] for sent in corpus]
    max_token = max([max(a) for a in corpus_tokens if a])
    mgp = MovieGroupProcess(K=num_topics, alpha=0.1, beta=0.5, n_iters=50)
    mgp.fit(corpus_tokens, max_token)
    topics = mgp.cluster_word_distribution  # array of BOW dicts keyed by token id
    # score the same token-id lists the model was fit on
    scores = [mgp.score(sent) for sent in corpus_tokens]
    # topics are returned as dicts of token id => value;
    # scores as an array (len docs) of arrays (len topics)
    mapping = {v: k for k, v in dictionary.token2id.items()}
    # create ndarray from the topic map
    topic_ndarray = np.zeros((len(topics), max_token + 1))
    for i, topic in enumerate(topics):
        for k, v in topic.items():
            topic_ndarray[i][k] = v
    return dictionary, topic_ndarray, scores
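# A usage sketch (assumption, not part of the original source): because sentences_to_gsdmm
# returns the gensim dictionary, the token-id columns of topic_ndarray can be mapped back to
# words. `sentences` stands for whatever tokenized input get_dict_corpus expects.
import numpy as np

dictionary, topic_ndarray, scores = sentences_to_gsdmm(sentences, num_topics=10)
for topic_id, row in enumerate(topic_ndarray):
    top_ids = np.argsort(row)[::-1][:10]
    words = [dictionary[int(i)] for i in top_ids if row[int(i)] > 0]
    print(topic_id, words)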
def train(topics):
    mgp = MovieGroupProcess(K=topics, alpha=0.1, beta=0.1, n_iters=30)
    vocab = set(word for doc in data_words for word in doc)
    n_terms = len(vocab)
    y = mgp.fit(data_words, n_terms)
    pickle.dump(mgp, open(path / 'Models/GSDMMModel.pkl', 'wb'))
    pickle.dump(y, open(path / 'Models/GSDMMLabel.pkl', 'wb'))
    print('Finished training')
def train_model(self, corpus):
    if super().check_if_force_build():
        mgp = MovieGroupProcess(**self.gsdmm_args)
        vocab = set(x for doc in corpus for x in doc)
        n_terms = len(vocab)
        model = mgp.fit(corpus, n_terms)
        self.model = model
        self.save()
    else:
        pass
def main():
    reviews = get_yelp_reviews(yelp_reviews_file)
    sentences = preprocess(reviews)
    vocab = set(word for sentence in sentences for word in sentence)
    n_terms = len(vocab)
    mgp = MovieGroupProcess(K=25, alpha=0.1, beta=0.1, n_iters=30)
    mgp.fit(sentences, n_terms)
    pickle.dump(mgp, open("Chapter06/mgp.pkl", "wb"))
    doc_count = np.array(mgp.cluster_doc_count)
    print(doc_count)
    top_clusters = doc_count.argsort()[-15:][::-1]
    print(top_clusters)
    top_words_by_cluster(mgp, top_clusters, 10)
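# top_words_by_cluster is called above but not defined in this snippet; a minimal sketch of
# what such a helper could look like (an assumption, not the original code), based on the
# cluster_word_distribution attribute that MovieGroupProcess exposes.
def top_words_by_cluster(mgp, clusters, num_words):
    for cluster in clusters:
        word_freqs = mgp.cluster_word_distribution[cluster]
        top_words = sorted(word_freqs.items(), key=lambda kv: kv[1], reverse=True)[:num_words]
        print(cluster, top_words)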
def train_mgp(
        self,
        docs,
        K=10,  # noqa: N803
        alpha=0.1,
        beta=0.1,
        n_iters=30):
    vocab = set(x for doc in docs for x in doc.tokens)
    n_terms = len(vocab)
    mgp = MovieGroupProcess(K=K, alpha=alpha, beta=beta, n_iters=n_iters)
    with StreamToLogger(log_level=logging.DEBUG).activate():
        mgp.fit([doc.tokens for doc in docs], n_terms)
    return mgp
def gsdmm_train(training_text, extra_stop_words, K=20, alpha=0.1, beta=0.1, n_iters=30):
    stop_words.update(extra_stop_words)

    # Preprocessing: tokenize, drop short words and stop words, then stem
    docs = []
    for text in training_text:
        tokens = word_tokenize(text)
        docs.append([])
        for word in tokens:
            if len(word) < 3:
                continue
            if word not in stop_words:
                docs[-1].append(stemmer.stem(word))

    vocab = set(x for doc in docs for x in doc)
    n_terms = len(vocab)
    mgp = MovieGroupProcess(K=K, alpha=alpha, beta=beta, n_iters=n_iters)
    y = mgp.fit(docs, n_terms)

    # Save model
    with open("v1.model", "wb") as f:
        pickle.dump(mgp, f)

    doc_count = np.array(mgp.cluster_doc_count)
    #print('Number of documents per topic :', doc_count)
    #print('*' * 20)
    top_index = doc_count.argsort()[-10:][::-1]
    #print('Most important clusters (by number of docs inside):', top_index)
    #print('*' * 20)
    idx = 0
    for distr in mgp.cluster_word_distribution:
        sorted_x = sorted(distr.items(), key=operator.itemgetter(1))
        #print(idx, sum([y for x, y in sorted_x]), sorted_x[-20:])
        idx += 1
    return mgp
def mpg_tester(data, k=5, a=0.1, b=0.1, iters=30):
    """Fits a GSDMM model to create k clusters of topics out of data."""
    vocab = set(x for doc in data for x in doc)
    n_terms = len(vocab)
    mgpk = MovieGroupProcess(K=k, alpha=a, beta=b, n_iters=iters)
    y = mgpk.fit(data, n_terms)
    doc_count = np.array(mgpk.cluster_doc_count)
    print('Number of documents per topic :', doc_count)
    print('*' * 20)
    top_index = doc_count.argsort()[-10:][::-1]
    print('Most important clusters (by number of docs inside):', top_index)
    print('*' * 20)
    # Show the top 20 words by term frequency for each cluster
    for i in range(k):
        print(i)
        A = mgpk.cluster_word_distribution[i]
        print(heapq.nlargest(20, A, key=A.get))
    return mgpk
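# A usage sketch (assumption, not part of the original source): run mpg_tester on a few
# pre-tokenized toy documents and assign each one to its most probable cluster.
docs = [['cheap', 'flight', 'deal'], ['pizza', 'delivery', 'late'], ['flight', 'delayed', 'again']]
model = mpg_tester(docs, k=2, iters=10)
assignments = [model.choose_best_label(doc) for doc in docs]  # list of (cluster, probability) pairs
print(assignments)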
samples = []
samples_raw = []
left_out = []
labels_samples = []
labels_left_out = []
lemmatized = []
for sentence in sentences:
    curr = [lem.lemmatize(token) for token in sentence.split() if token not in stopwords]
    lemmatized.append(' '.join(elem for elem in curr))
vect.fit_transform(lemmatized)
for i in range(len(sentences)):
    curr = [token for token in sentences[i].split() if token in vect.vocabulary_]
    if len(curr) > 0:
        samples.append(curr)
        samples_raw.append(sentences[i])
        labels_samples.append(labels[i])
    else:
        left_out.append(sentences[i])
        labels_left_out.append(labels[i])
print(len(vect.vocabulary_))
mgp = MovieGroupProcess(K=40, alpha=0.1, beta=0.1, n_iters=30)
mgp.fit(samples, len(vect.vocabulary_))
results = [mgp.choose_best_label(sample) for sample in samples]
if len(results) < len(sentences):
    results.extend(["Left out"] * (len(sentences) - len(results)))
pd.DataFrame({'Sentence': samples_raw + left_out,
              'Label': labels_samples + labels_left_out,
              'Cluster': results}).to_csv('clustering_results.csv')
@lru_cache(100000)
def lemmatize(w):
    return wnl.lemmatize(w)


def tokenize(txt):
    txt = "\n".join([line.strip() for line in txt.split("\n") if ":" not in line])
    return [lemmatize(t) for t in tt.tokenize(txt) if str.isalpha(t) and t not in stops]


newsgroups_train = [tokenize(txt) for txt in tqdm(fetch_20newsgroups(subset="train").data[:100])]
print("Texts prepared.")
# print("\n".join(newsgroups_train))
print("---")
mgp = MovieGroupProcess(K=40, alpha=0.1, beta=0.1, n_iters=30)
vocab_size = len(set([w for txt in newsgroups_train for w in txt]))
y = mgp.fit(newsgroups_train, vocab_size)
with open("trained.pickle", "wb") as wf:
    pickle.dump([mgp, y], wf)
for doc, label in zip(newsgroups_train, y):
    print(label, " ".join(doc[:20]))
for map in mgp.cluster_word_distribution:
    print(map)
# temp_list.append(eachword)
# my_output += eachword
# my_output += ' '
# except:
#     print('no synonyms')

data_list.append(my_output)  # store the segmented post
f4 = open(mydir + 'segments/' + "%s" % key, 'w', encoding='UTF-8')
f4.write(my_output)
f4.close()

data_list = [text.split() for text in data_list]
print("data_list: ")
print(data_list)
V = compute_V(data_list)

mgp = MovieGroupProcess(K=50, n_iters=1000, alpha=0.02, beta=0.01)
print("GSDMM algorithm started!")
y = mgp.fit(data_list, V)
print("GSDMM algorithm finished!")

# Build the cluster result dict and write it to the cluster_result file
result = {}
aid_bid = [line_content.strip()
           for line_content in codecs.open(mydir + 'result_file/aid_bid')]
f5 = open(mydir + 'result_file/cluster_result', 'w', encoding='UTF-8')
for index in range(len(data_list)):
    z = mgp.choose_best_label(data_list[index])
    f5.write("%s" % aid_bid[index] + " " + "%s" % z[0] + "\n")
    result["%s" % aid_bid[index]] = z[0]
print(result)
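# compute_V is called above (and in the snippets below) but not defined in this excerpt;
# a minimal sketch of a helper with the behaviour its usage implies (an assumption):
# count the unique tokens across all documents, i.e. the vocabulary size that
# MovieGroupProcess.fit expects as its second argument.
def compute_V(texts):
    V = set()
    for text in texts:
        for word in text:
            V.add(word)
    return len(V)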
print('{}/10 processing {}'.format(i + 1, top_ten_product[i]))
df_product = df[df.F5_Product == top_ten_product[i]].copy()
product_embedding = np.array(df_product.embedding.to_list())

# dimensionality reduction
print('dimension reduction')
reduced_embedding = umap.UMAP(n_components=32).fit_transform(product_embedding)

# split each document (each PAR) into a list of unique tokens, as required for gsdmm model input
texts = [list(set(text.split())) for text in df_product.Processed_PAR]
# compute number of unique words in the vocabulary
V = compute_V(texts)

# build GSDMM model
## n_iters=10: the number of clusters drops quickly and stabilizes within 10 iterations
## K=50: K must be greater than the number of ground-truth clusters; we assume at most 50 topics per product
## alpha in range(0, 1): performance is stable within the range, but when alpha = 0, GSDMM converges very quickly
## beta in range(0, 0.2): the number of clusters drops as beta increases; performance is stable within the range
mgp = MovieGroupProcess(K=50, n_iters=10, alpha=0.1, beta=0.1)
# fit model and return list of labels for each document
cluster_labels = mgp.fit(texts, V)

# get silhouette_score
s_score = evaluate(reduced_embedding, cluster_labels)
print('{} processing completed, s_score={}'.format(top_ten_product[i], s_score))

# append cluster labels to the dataframe
df_product['cluster_labels'] = cluster_labels
df_product.to_csv('./data/{}_labels.csv'.format(top_ten_product[i]))
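# evaluate is called above (and in the tuning snippet below) but not defined in this excerpt;
# given the "get silhouette_score" comment, a minimal sketch of such a helper (an assumption)
# using scikit-learn:
from sklearn.metrics import silhouette_score

def evaluate(embedding, cluster_labels):
    # silhouette score of the GSDMM cluster assignment over the UMAP-reduced embedding
    return silhouette_score(embedding, cluster_labels)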
def main(args):
    mongourl = "mongodb://localhost:25541"
    snippet_db = "snippetdb"
    snippet_collection = "snippets"
    # max_dict_size = 50000
    output_stem = "snippet_topicmodel/snippet"

    #### generate dictionary from large random sample of snippets ####
    raw_dict_fn = output_stem + '_wordids_raw.txt.bz2'
    corpus_dict = []
    if (REBUILD_DICT or not os.path.isfile(raw_dict_fn)):
        print("=== Generating Dictionary")
        # as there are 26 million snippets, this is slow
        corpus_reader_text = MongoCorpusReader(mongourl, snippet_db, snippet_collection,
                                               rand_pct=.5)
        corpus_dict = gensim.corpora.Dictionary(corpus_reader_text)
        # save raw dict (raw text compresses better than pickle, though may be slower to load)
        corpus_dict.save_as_text(raw_dict_fn)

    final_dict_fn = output_stem + '_wordids_final.txt.bz2'
    if (REFILTER_DICT or REBUILD_DICT or not os.path.isfile(final_dict_fn)):
        print("=== Filtering Dictionary")
        if not corpus_dict:
            corpus_dict = gensim.corpora.Dictionary.load_from_text(raw_dict_fn)
        # trim corpus_dict to words appearing in at least 500 documents and not more than 2.5% of docs (up to DEFAULT_DICT_SIZE)
        corpus_dict.filter_extremes(no_below=500, no_above=0.025)  # keep_n=max_dict_size
        # save final dict
        corpus_dict.save_as_text(final_dict_fn)

    # load back the id->word mapping directly from file;
    # this seems to save memory compared to keeping the object as it was
    corpus_dict = gensim.corpora.Dictionary.load_from_text(final_dict_fn)

    if GENSIM_LDA:
        #### transform from BOW to TFIDF ####
        tfidf_model_fn = output_stem + "_tfidf.model"
        num_topics = 125
        if (REBUILD_TFIDF or REBUILD_DICT or REFILTER_DICT
                or not os.path.isfile(tfidf_model_fn)):
            print("=== Transforming from BOW to TFIDF")
            # this is from a random sample stratified by station (we sample 20k samples per station)
            corpus_reader_bow = MongoCorpusReader(mongourl, snippet_db, snippet_collection,
                                                  rand_idx_high=20000,
                                                  corpus_dict=corpus_dict)
            tfidf = gensim.models.TfidfModel(corpus_reader_bow)
            # save tfidf model
            tfidf.save(tfidf_model_fn)
            # TBD: do we even need this now?
            # save tfidf vectors for all documents in matrix market format for later use
            # (this will be big and may take a while)
            gensim.corpora.MmCorpus.serialize(output_stem + '_tfidf.mm',
                                              tfidf[corpus_reader_bow],
                                              progress_cnt=10000)
            # NOTE: the .mm file is subsequently bzipped and needs to be decompressed before reading!
        else:
            print("=== Loading TFIDF")
            tfidf = gensim.models.TfidfModel.load(tfidf_model_fn)

        if REGEN_LDA:
            #### transform from TFIDF to LDA topic model ####
            print("=== Generating LDA from TFIDF")
            # we're again using the same random sample when transforming from tfidf to the LDA model
            corpus_reader_tfidf = MongoCorpusReader(mongourl, snippet_db, snippet_collection,
                                                    rand_idx_high=20000,
                                                    corpus_dict=corpus_dict,
                                                    tfidf=tfidf)
            # single-core version:
            # recommend trying this first to tune the alpha and eta settings and see how many passes are needed to converge
            #~ lda = gensim.models.ldamodel.LdaModel(corpus=corpus_reader_tfidf, id2word=corpus_dict,
            #~                                       num_topics=num_topics,
            #~                                       iterations=100, passes=3,
            #~                                       #alpha='auto', eta='auto',  # these seem to increase perplexity
            #~                                       chunksize=8000)
            # use the multicore version with the recommended workers = #cores - 1 (not including hyperthreads)
            lda = gensim.models.ldamulticore.LdaMulticore(
                workers=3, corpus=corpus_reader_tfidf, id2word=corpus_dict,
                num_topics=num_topics, eval_every=0,
                iterations=100, passes=5, chunksize=4000)
            # save lda model
            lda.save(output_stem + '_lda.model-125.bz2')

        print("=== Loading LDA models")
        lda = gensim.models.ldamulticore.LdaMulticore.load(output_stem + '_lda.model-125.bz2')

        print("=== Gathering top topics and topic coherence")
        # show components of top topics
        corpus_reader_tfidf_small = MongoCorpusReader(mongourl, snippet_db, snippet_collection,
                                                      rand_idx_high=500,
                                                      corpus_dict=corpus_dict,
                                                      tfidf=tfidf)
        top_topics = lda.top_topics(corpus=corpus_reader_tfidf_small, num_words=15)
        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)
        print(top_topics)

    elif GSDMM:
        print("=== Loading documents for GSDMM")
        MAX_CLUSTERS = 500
        # this is from a random sample stratified by station (we sample 20k samples per station)
        corpus_reader_idxs = MongoCorpusReader(
            mongourl, snippet_db, snippet_collection,
            rand_idx_low=200, rand_idx_high=8200,
            corpus_dict=corpus_dict, fulldoc=True,
            dict_output="index",
            additional_queries={"recluster": True})
        # gsdmm works with either text words or word indices; the latter is slightly faster and uses less memory
        docs = [doc['dict_out'] for doc in corpus_reader_idxs]
        doc_ids = [doc['_id'] for doc in corpus_reader_idxs]
        print("loaded %d docs" % len(docs))

        CALC_VOCAB_SIZE = True
        if CALC_VOCAB_SIZE:
            print("=== Calculating size of used vocabulary")
            V = set()
            for text in docs:
                for word in text:
                    V.add(word)
            vocab_size = len(V)
            V = None
            print("size of used vocab: %d words" % vocab_size)
            # len(corpus_dict) is >= vocab_size since our sample < the corpus
        else:
            # but with a large enough sample, they're nearly the same
            vocab_size = len(corpus_dict)

        print("=== Clustering documents using GSDMM")
        mgp = MovieGroupProcess(K=MAX_CLUSTERS, alpha=0.12, beta=0.08, n_iters=20)
        doc_clusters = mgp.fit(docs, vocab_size)
        print("cluster count %d" % len(set(doc_clusters)))

        cluster_fn = output_stem + '_gsdmm_doc_clusters_2.pickle'
        data = {"doc_ids": doc_ids, "doc_clusters": doc_clusters}
        with open(cluster_fn, 'wb') as f:
            pickle.dump(data, f)

        for c in range(50):
            print("\nCluster %d:\n" % c)
            print(mgp.cluster_word_distribution[c])

    else:
        print(" NO final analysis selected.")

    print("DONE")
                                           passes=100,
                                           alpha=0.73,
                                           eta=0.97,
                                           per_word_topics=True)
topics = lda_model.print_topics(num_topics=10, num_words=6)
t = 0
for topic in topics:
    print("Topic {} -> {}".format(t, topic))
    t += 1

###### gsdmm
random.seed(1992)
# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
mgp = MovieGroupProcess(K=10, alpha=0.73, beta=0.97, n_iters=30)

vocab = set(x for doc in tokenized_series for x in doc)
n_terms = len(vocab)
n_docs = len(tokenized_series)

# Fit the model on the data given the chosen seeds
print('Training GSDMM model....')
y = mgp.fit(tokenized_series, n_terms)

# probability the doc belongs to the topic
print(
    sum([
        max(mgp.score(tokenized_series[n]))
# hyperparameter tuning
result_dict = {}

# split each document (each PAR) into a list of unique tokens, as required for gsdmm model input
texts = [list(set(text.split())) for text in df.Processed_PAR]
# compute number of unique words in the vocabulary
V = compute_V(texts)

# build GSDMM model
## n_iters=10: the number of clusters drops quickly and stabilizes within 10 iterations
## K=100: K must be greater than the number of ground-truth clusters; we assume at most 100 topics in total
## alpha in range(0, 1): performance is stable within the range, but when alpha = 0, GSDMM converges very quickly
## beta in range(0, 0.2): the number of clusters drops as beta increases; performance is stable within the range
alpha = [a / 10 for a in range(1, 11)]
beta = [b / 20 for b in range(1, 5)]

print('start tuning')
for a in alpha:
    for b in beta:
        print('processing a={}, b={}'.format(a, b))
        mgp = MovieGroupProcess(K=100, n_iters=10, alpha=a, beta=b)
        # fit model and return list of labels for each document
        cluster_labels = mgp.fit(texts, V)
        # get silhouette_score
        s_score = evaluate(reduced_embedding, cluster_labels)
        print('s_score={}'.format(s_score))
        result_dict[(a, b)] = s_score

max_key = max(result_dict, key=result_dict.get)
print(result_dict)
print('Best Result: a={}, b={}, s_score={}'.format(max_key[0], max_key[1], result_dict[max_key]))
# Input format for the model: list of lists of tokens
docs = tokenized_data['tokens'].tolist()
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
print("Voc size:", n_terms)
print("Number of documents:", len(docs))

# %% [markdown]
# ## Training

# %%
# Train a new model
# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
mgp = MovieGroupProcess(K=10, alpha=0.1, beta=0.1, n_iters=30)

vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
n_docs = len(docs)

# Fit the model on the data given the chosen seeds
y = mgp.fit(docs, n_terms)

# Save model
with open('dumps/trained_models/model_v2.model', "wb") as f:
    pickle.dump(mgp, f)

# %%
# Load the model used in the post