Example #1
# Imports assumed by this snippet (MovieGroupProcess comes from the gsdmm package).
import re
import pickle
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from gsdmm import MovieGroupProcess


def train_mgp(spacy_lang='pt_core_news_sm', train_file='train.csv',
              category_column='category', title_column='title',
              number_regex='[0-9]', number_code='NUMBER',
              model_file='mgp.model', scores_file='scores.npy'):
    nlp = spacy.load(spacy_lang)
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    df = pd.read_csv(train_file)
    len_categories = len(df[category_column].drop_duplicates().values.tolist())
    mgp = MovieGroupProcess(K=len_categories+100, alpha=0.1, beta=0.1, n_iters=10)
    docs = df[title_column].values.tolist()
    tokens = []
    for item in docs:
        processed_item = re.sub(number_regex, number_code, item.lower())
        tmp = tokenizer(processed_item)
        tokens.append([str(x) for x in tmp if not (x.is_punct or x.is_stop)])

    tokens_freq_dict = dict(
        Counter([x for y in tokens for x in y]).most_common())
    for idx, item in enumerate(tokens):
        tokens[idx] = list(
            filter(lambda x: tokens_freq_dict[x] > 1, tokens[idx]))

    vocab_size = len(set(x for y in tokens for x in y))
    y = mgp.fit(tokens, vocab_size)
    scores = []
    for item in tokens:
        scores.append(np.array(mgp.score(item)))
    scores = np.array(scores)
    with open(model_file, 'wb') as f:
        pickle.dump(mgp, f)
    np.save(scores_file, scores)
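The snippet above only covers training. A minimal sketch of the matching inference step, assuming the same preprocessing and the default file names used above (the predict_title helper is not part of the original example):

import pickle
import re

import spacy


def predict_title(title, spacy_lang='pt_core_news_sm', number_regex='[0-9]',
                  number_code='NUMBER', model_file='mgp.model'):
    # Reuse the training-time preprocessing: lowercase, mask digits,
    # tokenize, and drop punctuation and stop words.
    nlp = spacy.load(spacy_lang)
    processed = re.sub(number_regex, number_code, title.lower())
    doc_tokens = [str(t) for t in nlp(processed) if not (t.is_punct or t.is_stop)]
    with open(model_file, 'rb') as f:
        mgp = pickle.load(f)
    # choose_best_label returns (cluster_index, probability).
    return mgp.choose_best_label(doc_tokens)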
Example #2
def sentences_to_gsdmm(sentences, num_topics):
    # return dictionary, topic ndarray, and scores for each document
    dictionary, corpus = get_dict_corpus(sentences)
    # corpus is the list of documents in token, BOW format.
    # a sequence of tuples where the first entry is the token ID,
    # and the second is the count of that token

    corpus_tokens = [[a[0] for a in sent] for sent in corpus]
    max_token = max([max(a) for a in corpus_tokens if a])
    mgp = MovieGroupProcess(K=num_topics, alpha=0.1, beta=0.5, n_iters=50)
    mgp.fit(corpus_tokens, max_token)
    topics = mgp.cluster_word_distribution
    # array of BOW dicts of token ids
    scores = [mgp.score(doc) for doc in corpus_tokens]  # score the same token-ID documents used for fitting
    # return topics represented as dicts of word => val
    # return scores represented as array (len docs) of arrays (len topics)
    mapping = {v: k for k, v in dictionary.token2id.items()}

    # create ndarray from topic map
    topic_ndarray = np.zeros((len(topics), max_token + 1))
    for i, topic in enumerate(topics):
        for k, v in topic.items():
            topic_ndarray[i][k] = v

    return dictionary, topic_ndarray, scores
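get_dict_corpus is not shown in this snippet; judging by the comments, a gensim-based helper along these lines would produce the expected dictionary and BOW corpus (an assumption, not the original code):

from gensim.corpora import Dictionary


def get_dict_corpus(sentences):
    # sentences: list of token lists.
    # Returns the gensim Dictionary plus the corpus in BOW format,
    # i.e. a list of (token_id, count) tuples per document.
    dictionary = Dictionary(sentences)
    corpus = [dictionary.doc2bow(sentence) for sentence in sentences]
    return dictionary, corpus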
Example #3
def train(topics):
    mgp = MovieGroupProcess(K=topics, alpha=0.1, beta=0.1, n_iters=30)
    vocab = set(word for doc in data_words for word in doc)
    n_terms = len(vocab)
    y = mgp.fit(data_words, n_terms)
    with open(path / 'Models/GSDMMModel.pkl', 'wb') as f:
        pickle.dump(mgp, f)
    with open(path / 'Models/GSDMMLabel.pkl', 'wb') as f:
        pickle.dump(y, f)
    print('Finished training')
Example #4
 def train_model(self, corpus):
     if super().check_if_force_build():
         mgp = MovieGroupProcess(**self.gsdmm_args)
         vocab = set(x for doc in corpus for x in doc)
         n_terms = len(vocab)
         model = mgp.fit(corpus, n_terms)
         self.model = model
         self.save()
     else:
         pass
Example #5
def main():
    reviews = get_yelp_reviews(yelp_reviews_file)
    sentences = preprocess(reviews)
    vocab = set(word for sentence in sentences for word in sentence)
    n_terms = len(vocab)
    mgp = MovieGroupProcess(K=25, alpha=0.1, beta=0.1, n_iters=30)
    mgp.fit(sentences, n_terms)
    with open("Chapter06/mgp.pkl", "wb") as f:
        pickle.dump(mgp, f)
    doc_count = np.array(mgp.cluster_doc_count)
    print(doc_count)
    top_clusters = doc_count.argsort()[-15:][::-1]
    print(top_clusters)
    top_words_by_cluster(mgp, top_clusters, 10)
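top_words_by_cluster is called here but not defined in the snippet; a plausible sketch of such a helper (name and signature taken from the call above, the body is an assumption):

def top_words_by_cluster(mgp, clusters, n_words):
    # Print the n_words most frequent words for each selected cluster,
    # based on the fitted cluster word counts.
    for cluster in clusters:
        dist = mgp.cluster_word_distribution[cluster]
        top = sorted(dist.items(), key=lambda kv: kv[1], reverse=True)[:n_words]
        print(cluster, [word for word, _ in top])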
Example #6
    def train_mgp(
            self,
            docs,
            K=10,  # noqa: N803
            alpha=0.1,
            beta=0.1,
            n_iters=30):
        vocab = set(x for doc in docs for x in doc.tokens)
        n_terms = len(vocab)

        mgp = MovieGroupProcess(K=K, alpha=alpha, beta=beta, n_iters=n_iters)
        with StreamToLogger(log_level=logging.DEBUG).activate():
            mgp.fit([doc.tokens for doc in docs], n_terms)
        return mgp
Example #7
def gsdmm_train(training_text,
                extra_stop_words,
                K=20,
                alpha=0.1,
                beta=0.1,
                n_iters=30):
    stop_words.update(extra_stop_words)

    # Preprocessing
    docs = []
    for text in training_text:
        tokens = word_tokenize(text)
        docs.append([])
        for word in tokens:
            if len(word) < 3:
                continue
            if word not in stop_words:
                docs[-1].append(stemmer.stem(word))

    vocab = set(x for doc in docs for x in doc)
    n_terms = len(vocab)

    mgp = MovieGroupProcess(K=K, alpha=alpha, beta=beta, n_iters=n_iters)

    y = mgp.fit(docs, n_terms)

    # Save the model
    with open("v1.model", "wb") as f:
        pickle.dump(mgp, f)

    doc_count = np.array(mgp.cluster_doc_count)
    #print('Number of documents per topic :', doc_count)
    #print('*'*20)

    top_index = doc_count.argsort()[-10:][::-1]
    #print('Most important clusters (by number of docs inside):', top_index)
    #print('*'*20)

    for idx, distr in enumerate(mgp.cluster_word_distribution):
        sorted_x = sorted(distr.items(), key=operator.itemgetter(1))
        #print(idx, sum([y for x,y in sorted_x]), sorted_x[-20:])

    return mgp
Example #8
def mpg_tester(data, k=5, a=0.1, b=0.1, iters=30):
    "fits GSDMM model to create k clusters of topics out of data"
    vocab = set(x for doc in data for x in doc)
    n_terms = len(vocab)
    mgpk = MovieGroupProcess(K=k, alpha=a, beta=b, n_iters=iters)
    y = mgpk.fit(data, n_terms)
    doc_count = np.array(mgpk.cluster_doc_count)
    print('Number of documents per topic :', doc_count)
    print('*' * 20)
    top_index = doc_count.argsort()[-10:][::-1]
    print('Most important clusters (by number of docs inside):', top_index)
    print('*' * 20)
    # Show the top 20 words in term frequency for each cluster
    for i in range(k):
        print(i)
        A = mgpk.cluster_word_distribution[i]
        print(heapq.nlargest(20, A, key=A.get))
    return mgpk
Example #9
samples = []
samples_raw = []
left_out = []
labels_samples = []
labels_left_out = []

lemmatized = []
for sentence in sentences:
	curr = [lem.lemmatize(token) for token in sentence.split() if token not in stopwords]
	lemmatized.append(' '.join(elem for elem in curr))

vect.fit_transform(lemmatized)

for i in range(len(sentences)):
	curr = [token for token in sentences[i].split() if token in vect.vocabulary_]
	if len(curr) > 0:
		samples.append(curr)
		samples_raw.append(sentences[i])
		labels_samples.append(labels[i])
	else:
		left_out.append(sentences[i])
		labels_left_out.append(labels[i])
print(len(vect.vocabulary_))

mgp = MovieGroupProcess(K=40, alpha=0.1, beta=0.1, n_iters=30)
mgp.fit(samples,len(vect.vocabulary_))

results = [mgp.choose_best_label(sample) for sample in samples]
if len(results) < len(sentences):
	results.extend(["Left out"] * (len(sentences)-len(results)))

pd.DataFrame({'Sentence':samples_raw+left_out, 'Label':labels_samples+labels_left_out, 'Cluster':results}).to_csv('clustering_results.csv')
Example #10

@lru_cache(100000)
def lemmatize(w):
    return wnl.lemmatize(w)


def tokenize(txt):
    txt = "\n".join([line.strip() for line in txt.split("\n") if ":" not in line])
    return [lemmatize(t) for t in tt.tokenize(txt) if t.isalpha() and t not in stops]


newsgroups_train = [tokenize(txt) for txt in tqdm(fetch_20newsgroups(subset="train").data[:100])]

print("Texts prepared.")
# print("\n".join(newsgroups_train))
print("---")

mgp = MovieGroupProcess(K=40, alpha=0.1, beta=0.1, n_iters=30)
vocab_size = len(set([w for txt in newsgroups_train for w in txt]))
y = mgp.fit(newsgroups_train, vocab_size)

with open("trained.pickle", "wb") as wf:
    pickle.dump([mgp, y], wf)

for doc, label in zip(newsgroups_train, y):
    print(label, " ".join(doc[:20]))

for dist in mgp.cluster_word_distribution:
    print(dist)
Example #11
                #                     temp_list.append(eachword)
                #                     my_output += eachword
                #                     my_output += ' '
                # except:
                #     print('no synonyms')
        data_list.append(my_output)
        # store the segmented post in a file
        f4 = open(mydir + 'segments/' + "%s" % key, 'w', encoding='UTF-8')
        f4.write(my_output)
        f4.close()

    data_list = [text.split() for text in data_list]
    print("data_list: ")
    print(data_list)
    V = compute_V(data_list)
    mgp = MovieGroupProcess(K=50, n_iters=1000, alpha=0.02, beta=0.01)
    print("GSDMM algorithm started!")
    y = mgp.fit(data_list, V)
    print("GSDMM algorithm finished!")
    # build the clustering result dict and write it to the cluster_result file
    result = {}
    aid_bid = [
        line_content.strip()
        for line_content in codecs.open(mydir + 'result_file/aid_bid')
    ]
    f5 = open(mydir + 'result_file/cluster_result', 'w', encoding='UTF-8')
    for index in range(len(data_list)):
        z = mgp.choose_best_label(data_list[index])
        f5.write("%s" % aid_bid[index] + " " + "%s" % z[0] + "\n")
        result["%s" % aid_bid[index]] = z[0]
    print(result)
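compute_V is referenced here (and in later examples) but not shown; it presumably returns the vocabulary size expected by MovieGroupProcess.fit. A minimal sketch under that assumption:

def compute_V(texts):
    # Number of unique tokens across all tokenized documents.
    return len({word for text in texts for word in text})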
Example #12
    print('{}/10 processing {}'.format(i + 1, top_ten_product[i]))
    df_product = df[df.F5_Product == top_ten_product[i]].copy()
    product_embedding = np.array(df_product.embedding.to_list())

    # dimensionality reduction
    print('dimension reduction')
    reduced_embedding = umap.UMAP(
        n_components=32).fit_transform(product_embedding)

    # split document (each PAR) into list of unique tokens as required for gsdmm model input
    texts = [list(set(text.split())) for text in df_product.Processed_PAR]
    # compute number of unique words in the vocabulary
    V = compute_V(texts)

    # build GSDMM model
    ## n_iters=10, number of clusters drops quickly and gets stable within 10 iterations
    ## K=50, K must be greater than number of ground truth clusters. We assume there are at most 50 topics for each product
    ## alpha=range(0,1), performance is stable within the range, but when alpha = 0, GSDMM converges very quickly
    ## beta=range(0,0.2), number of clusters drop as beta increases, performance is stable within the range
    mgp = MovieGroupProcess(K=50, n_iters=10, alpha=0.1, beta=0.1)
    # fit model and return list of labels for each document
    cluster_labels = mgp.fit(texts, V)
    # get silhouette_score
    s_score = evaluate(reduced_embedding, cluster_labels)
    print('{} processing completed, s_score={}'.format(top_ten_product[i],
                                                       s_score))

    # append cluster label to dataframe
    df_product['cluster_labels'] = cluster_labels
    df_product.to_csv('./data/{}_labels.csv'.format(top_ten_product[i]))
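evaluate is not included in the snippet; given the comments, it most likely wraps scikit-learn's silhouette score over the reduced embedding and the GSDMM labels. A hedged sketch of that assumption:

from sklearn.metrics import silhouette_score


def evaluate(embedding, cluster_labels):
    # Silhouette score of the clustering in the reduced embedding space;
    # requires at least two distinct cluster labels.
    return silhouette_score(embedding, cluster_labels)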
Example #13
def main(args):
    mongourl = "mongodb://localhost:25541"
    snippet_db = "snippetdb"
    snippet_collection = "snippets"
    #    max_dict_size = 50000
    output_stem = "snippet_topicmodel/snippet"

    #### generate dictionary from large random sample of snippets ####
    raw_dict_fn = output_stem + '_wordids_raw.txt.bz2'
    corpus_dict = []
    if (REBUILD_DICT or not os.path.isfile(raw_dict_fn)):
        print("=== Generating Dictionary")
        # as there are 26 million snippets, this is slow
        corpus_reader_text = MongoCorpusReader(mongourl,
                                               snippet_db,
                                               snippet_collection,
                                               rand_pct=.5)
        corpus_dict = gensim.corpora.Dictionary(corpus_reader_text)

        # save raw dict (raw text compresses better than pickle, though may be slower to load)
        corpus_dict.save_as_text(raw_dict_fn)

    final_dict_fn = output_stem + '_wordids_final.txt.bz2'
    if (REFILTER_DICT or REBUILD_DICT or not os.path.isfile(final_dict_fn)):
        print("=== Filtering Dictionary")
        if not corpus_dict:
            corpus_dict = gensim.corpora.Dictionary.load_from_text(raw_dict_fn)
        # trim corpus_dict to contain words appearing in at least 500 documents and not more than 2.5% of docs (up to DEFAULT_DICT_SIZE)
        corpus_dict.filter_extremes(no_below=500,
                                    no_above=0.025)  # keep_n=max_dict_size)
        # save final dict
        corpus_dict.save_as_text(final_dict_fn)

    # load back the id->word mapping directly from file
    # this seems to save memory, compared to keeping the object as it was
    corpus_dict = gensim.corpora.Dictionary.load_from_text(final_dict_fn)

    if GENSIM_LDA:
        #### transform from BOW to TFIDF ####
        tfidf_model_fn = output_stem + "_tfidf.model"
        num_topics = 125
        if (REBUILD_TFIDF or REBUILD_DICT or REFILTER_DICT
                or not os.path.isfile(tfidf_model_fn)):
            print("=== Transforming from BOW to TFIDF")

            # this is from a random sample stratified by station (we sample 20k samples per station)
            corpus_reader_bow = MongoCorpusReader(mongourl,
                                                  snippet_db,
                                                  snippet_collection,
                                                  rand_idx_high=20000,
                                                  corpus_dict=corpus_dict)
            tfidf = gensim.models.TfidfModel(corpus_reader_bow)

            # save tfidf model
            tfidf.save(tfidf_model_fn)

            # TBD: do we even need this now?
            # save tfidf vectors for all documents in matrix market format for later use
            # (this will be big and may take a while)
            gensim.corpora.MmCorpus.serialize(output_stem + '_tfidf.mm',
                                              tfidf[corpus_reader_bow],
                                              progress_cnt=10000)
            # NOTE: .mm file is subsequently bzipped and needs to be decompressed before reading!

        else:
            print("=== Loading TFIDF")
            tfidf = gensim.models.TfidfModel.load(tfidf_model_fn)

        if REGEN_LDA:
            #### transform from TFIDF to LDA topic model ####
            print("=== Generating LDA from TFIDF")
            # we're again using the same random sample when transforming from tfidf to LDA model
            corpus_reader_tfidf = MongoCorpusReader(mongourl,
                                                    snippet_db,
                                                    snippet_collection,
                                                    rand_idx_high=20000,
                                                    corpus_dict=corpus_dict,
                                                    tfidf=tfidf)
            # single core version:
            # recommend trying this first to get alpha and eta settings and see how many we need to get converging
            #~ lda = gensim.models.ldamodel.LdaModel(corpus=corpus_reader_tfidf, id2word=corpus_dict,
            #~ num_topics=num_topics,
            #~ iterations=100, passes=3,
            #~ #alpha='auto', eta='auto', # these seem to increase perplexity
            #~ chunksize=8000)
            # use multicore version with recommended #/cores - 1 (not including hyperthreads)
            lda = gensim.models.ldamulticore.LdaMulticore(
                workers=3,
                corpus=corpus_reader_tfidf,
                id2word=corpus_dict,
                num_topics=num_topics,
                eval_every=0,
                iterations=100,
                passes=5,
                chunksize=4000)
            #  save lda model
            lda.save(output_stem + '_lda.model-125.bz2')

        print("=== Loading LDA models")
        lda = gensim.models.ldamulticore.LdaMulticore.load(
            output_stem + '_lda.model-125.bz2')

        print("=== Gathering top topics and topic coherence")
        # show components of top topics
        corpus_reader_tfidf_small = MongoCorpusReader(mongourl,
                                                      snippet_db,
                                                      snippet_collection,
                                                      rand_idx_high=500,
                                                      corpus_dict=corpus_dict,
                                                      tfidf=tfidf)
        top_topics = lda.top_topics(corpus=corpus_reader_tfidf_small,
                                    num_words=15)

        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)

        print(top_topics)

    elif GSDMM:

        print("=== Loading documents for GSDMM")
        MAX_CLUSTERS = 500
        # this is from a random sample stratified by station (we sample 20k samples per station)
        corpus_reader_idxs = MongoCorpusReader(
            mongourl,
            snippet_db,
            snippet_collection,
            rand_idx_low=200,
            rand_idx_high=8200,
            corpus_dict=corpus_dict,
            fulldoc=True,
            dict_output="index",
            additional_queries={"recluster": True})
        # gsdmm works with either text words or word indices; the latter is slightly faster and uses less memory

        docs = [doc['dict_out'] for doc in corpus_reader_idxs]
        doc_ids = [doc['_id'] for doc in corpus_reader_idxs]
        print("loaded %d docs" % len(docs))

        CALC_VOCAB_SIZE = True
        if CALC_VOCAB_SIZE:
            print("=== Calculating size of used vocabulary")
            V = set()
            for text in docs:
                for word in text:
                    V.add(word)
            vocab_size = len(V)
            V = None
            print("size of used vocab: %d words" % vocab_size)
            # len(corpus_dict) is >= vocab_size since our sample < the corpus
        else:
            # but with a large enough sample, they're nearly the same
            vocab_size = len(corpus_dict)

        print("=== Clustering documents using GSDMM")
        mgp = MovieGroupProcess(K=MAX_CLUSTERS,
                                alpha=0.12,
                                beta=0.08,
                                n_iters=20)
        doc_clusters = mgp.fit(docs, vocab_size)

        print("cluster count %d" % len(set(doc_clusters)))
        cluster_fn = output_stem + '_gsdmm_doc_clusters_2.pickle'

        data = {"doc_ids": doc_ids, "doc_clusters": doc_clusters}
        with open(cluster_fn, 'wb') as f:
            pickle.dump(data, f)

        for c in range(50):
            print("\nCluster %d:\n" % c)
            print(mgp.cluster_word_distribution[c])

    else:
        print(" NO final analysis selected.")

    print("DONE")
Example #14
                                            passes=100,
                                            alpha=0.73,
                                            eta=0.97,
                                            per_word_topics=True)

topics = lda_model.print_topics(num_topics=10, num_words=6)

for t, topic in enumerate(topics):
    print("Topic {} -> {}".format(t, topic))

######gsdmm
random.seed(1992)
# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
mgp = MovieGroupProcess(K=10, alpha=0.73, beta=0.97, n_iters=30)

vocab = set(x for doc in tokenized_series for x in doc)
n_terms = len(vocab)
n_docs = len(tokenized_series)

# Fit the model on the data given the chosen seeds
print('Training GSDMM model....')

y = mgp.fit(tokenized_series, n_terms)

#probability the doc belongs to the topic

print(
    sum([
        max(mgp.score(tokenized_series[n]))
Example #15
# hyperparameters tuning
result_dict = {}
# split document (each PAR) into list of unique tokens as required for gsdmm model input
texts = [list(set(text.split())) for text in df.Processed_PAR]
# compute number of unique words in the vocabulary
V = compute_V(texts)

# build GSDMM model
## n_iters=10, number of clusters drops quickly and gets stable within 10 iterations
## K=100, K must be greater than number of ground truth clusters. We assume there are at most 100 topics in total
## alpha=range(0,1), performance is stable within the range, but when alpha = 0, GSDMM converges very quickly
## beta=range(0,0.2), number of clusters drop as beta increases, performance is stable within the range
alpha = [a / 10 for a in range(1, 11)]
beta = [b / 20 for b in range(1, 5)]
print('start tuning')
for a in alpha:
    for b in beta:
        print('processing a={}, b={}'.format(a, b))
        mgp = MovieGroupProcess(K=100, n_iters=10, alpha=a, beta=b)
        # fit model and return list of labels for each document
        cluster_labels = mgp.fit(texts, V)
        # get silhouette_score
        s_score = evaluate(reduced_embedding, cluster_labels)
        print('s_score={}'.format(s_score))
        result_dict[(a, b)] = s_score

max_key = max(result_dict, key=result_dict.get)
print(result_dict)
print('Best Result: a={}, b={}, s_score={}'.format(max_key[0], max_key[1],
                                                   result_dict[max_key]))
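Once the best (alpha, beta) pair is known, a final model can be refit with those values, following the same pattern as the loop above (a sketch, not part of the original snippet):

best_a, best_b = max_key
mgp = MovieGroupProcess(K=100, n_iters=10, alpha=best_a, beta=best_b)
cluster_labels = mgp.fit(texts, V)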
Example #16
# Input format for the model : list of strings (list of tokens)
docs = tokenized_data['tokens'].tolist()
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)

print("Voc size:", n_terms)
print("Number of documents:", len(docs))

# %% [markdown]
# ## Training

# %%
# Train a new model

# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
mgp = MovieGroupProcess(K=10, alpha=0.1, beta=0.1, n_iters=30)

vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
n_docs = len(docs)

# Fit the model on the data given the chosen seeds
y = mgp.fit(docs, n_terms)

# Save model
with open('dumps/trained_models/model_v2.model', "wb") as f:
    pickle.dump(mgp, f)

# %%
# Load the model used in the post
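The snippet stops right after this comment; a minimal continuation that mirrors the save path above (assumed, since the original cell is truncated):

import pickle

with open('dumps/trained_models/model_v2.model', 'rb') as f:
    mgp = pickle.load(f)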