Example #1
import time

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation as lda


def fit_topics(data, embeddings, vocab, K):
    """Fit a topic model to bag-of-words data."""

    tic = time.time()

    model = lda(n_components=K,
                max_iter=100,
                learning_method='online',
                learning_offset=50.,
                doc_topic_prior=1.,
                random_state=0,
                verbose=1)

    model.fit(data)
    topics = model.components_
    lda_centers = np.matmul(topics, embeddings)
    # Note: scikit-learn's LatentDirichletAllocation uses (online) variational Bayes,
    # not Gibbs sampling, despite the label printed below.
    print('LDA Gibbs topics')
    n_top_words = 20
    # print_top_words is a helper defined elsewhere in the source project
    print_top_words(model, vocab)
    topics_words = []
    for i, topic_dist in enumerate(topics):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        topics_words.append(topic_words)

    topic_proportions = model.transform(data)
    print("LDA fit done in %0.3fs." % (time.time() - tic))

    return topics, lda_centers, topic_proportions, topics_words
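A minimal usage sketch for the snippet above (not from the source): it assumes `data` is a documents-by-vocabulary count matrix, `embeddings` is a vocabulary-by-dimension word-embedding matrix, and `vocab` is the matching word list. The corpus, the random embeddings, and the stand-in `print_top_words` helper below are placeholders.

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


def print_top_words(model, feature_names, n_top_words=20):
    # stand-in for the helper the excerpt assumes but does not show
    for topic_idx, topic in enumerate(model.components_):
        top = [feature_names[i] for i in topic.argsort()[:-(n_top_words + 1):-1]]
        print('Topic {}: {}'.format(topic_idx, ' '.join(top)))


docs = ["the cat sat on the mat",
        "dogs chase cats around the yard",
        "stocks rose while bonds fell"]
cv = CountVectorizer()
data = cv.fit_transform(docs).toarray()        # (n_docs, n_words) bag-of-words counts
vocab = cv.get_feature_names()                 # word list aligned with the matrix columns
embeddings = np.random.rand(len(vocab), 50)    # placeholder word vectors, one row per word

topics, centers, proportions, topic_words = fit_topics(data, embeddings, vocab, K=2)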
def lda_cluster(train_text):
    vectorizer = TfidfVectorizer(input='content',
                                 stop_words='english',
                                 lowercase=True,
                                 encoding='UTF-8',
                                 strip_accents='unicode',
                                 analyzer='word',
                                 ngram_range=(1, 2),
                                 max_features=100)

    # feature_matrix will be sparse, use "feature_matrix.toarray()" to get an array representation
    feature_matrix = vectorizer.fit_transform(train_text)
    corpus_vocab = vectorizer.get_feature_names()
    corpus_vocab = np.array(corpus_vocab)

    # now we want to get the most important words in each document
    feature_matrix = feature_matrix.toarray()

    # TF-IDF values lie between 0 and 1, so treat a word as present in a
    # document only if its weight exceeds the TFID_cutoff threshold:
    feature_matrix = feature_matrix > TFID_cutoff
    feature_matrix = feature_matrix.astype(int)

    # now feature matrix is a matrix of 1 or 0 (bag of words)
    # we use corpus_vocab to get the actual corresponding word
    my_lda = lda(n_components=num_clusters, random_state=0)

    # run the LDA algorithm
    my_lda.fit_transform(feature_matrix)

    # extract the per-topic word weights
    word_topics = my_lda.components_  # shape (num_topics, num_words); unnormalized weights, not probabilities

    top_words_list = []
    top_words_group = []

    # for each topic, find its highest-weighted words
    for idx in range(num_clusters):
        category = word_topics[idx][:]

        # sort the probabilities of the associated words by index
        # the below will go from smallest to largest
        sorted_words = np.argsort(category)
        top_words = sorted_words[-1 * num_LDA_words:]

        print('###########')
        print('For Category number {0}'.format(idx))
        for word in top_words:
            top_words_list.append(corpus_vocab[word])
            top_words_group.append(idx)
            print(corpus_vocab[word])

    imp_word_list = pd.DataFrame(list(zip(top_words_list, top_words_group)),
                                 columns=['word', 'group'])
    return imp_word_list, num_clusters
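A minimal usage sketch (not from the source): `lda_cluster` relies on module-level settings and imports that the excerpt does not show; the values below are assumptions for illustration.

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation as lda
from sklearn.feature_extraction.text import TfidfVectorizer

TFID_cutoff = 0.2     # assumed TF-IDF weight a word needs to count as present
num_clusters = 3      # assumed number of LDA topics
num_LDA_words = 5     # assumed number of top words reported per topic

train_text = ["machine learning models need training data",
              "topic models uncover themes in text collections",
              "the football team won the championship game"]

imp_word_list, k = lda_cluster(train_text)
print(imp_word_list)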
Example #3
    def run(self):
        # n_topics was renamed to n_components in scikit-learn 0.19 and removed in 0.21
        self.model = lda(n_components=100, learning_method='online')
        self.model.fit(self.doc_terms)

        feature_names = self.cv.get_feature_names()
        topic_matrix = []
        for topic_idx, topic in enumerate(self.model.components_):
            row = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            topic_matrix.append(row)

        self.topics = np.array(topic_matrix)
 def fitlda(x):
     from sklearn.decomposition import LatentDirichletAllocation as lda
     from pandas import DataFrame as df
     # n_topics was renamed to n_components in scikit-learn 0.19 and removed in 0.21
     l = lda(n_components=ntopic)
     if isinstance(x, df):
         x = x.fillna(0)
     l.fit(x)
     return l
Example #5
def fit_topics(data, embeddings, vocab, K):
    """Fit a topic model to bag-of-words data."""
    model = lda(n_components=K, max_iter=1500, random_state=1)
    model.fit(data)
    topics = model.components_
    lda_centers = np.matmul(topics, embeddings)
    print('LDA Gibbs topics')
    n_top_words = 20
    topics_words = []
    for i, topic_dist in enumerate(topics):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
        topics_words.append(topic_words)
    print('\n')
    topic_proportions = model.transform(data)

    return topics, lda_centers, topic_proportions, topics_words
Example #6
 def train_topic_model(self, docs, n_clusters):
     lda_random_state = 100
     lda_n_iter = 100
     n_top_words = 20
     print("Topic modeling using LDA...")
     # Note: LatentDirichletAllocation is designed for raw term counts; feeding it
     # TF-IDF weights, as done here, is unusual but follows the original code.
     d2w_vect = TfidfVectorizer(stop_words='english', max_df=0.30)
     d2w = d2w_vect.fit_transform(docs)
     model = lda(n_components=n_clusters,
                 max_iter=lda_n_iter,
                 random_state=lda_random_state)
     model.fit(d2w)
     print("\nTopical words:")
     print("-" * 20)
     words = d2w_vect.get_feature_names()  # column index -> word (vocabulary_.items() is not in column order)
     for i, topic_dist in enumerate(model.components_):
         top_word_ids = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
         topic_words = [words[id_] for id_ in top_word_ids]
         print('Topic {}: {}'.format(i, ', '.join(topic_words)))
     topic_values = model.transform(d2w)  # model is already fitted above
     return topic_values
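A possible follow-up (not from the source): `topic_values` is a documents-by-topics matrix, one row per document summing to 1, so a hard cluster label per document is just the argmax of its row. The matrix below is a placeholder standing in for the method's return value.

import numpy as np

topic_values = np.array([[0.90, 0.05, 0.05],      # placeholder output of train_topic_model
                         [0.10, 0.75, 0.15]])
labels = np.argmax(topic_values, axis=1)          # dominant topic per document -> [0, 1]
print(labels)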

filename = "labeled_data.csv"  #input("enter .csv: ")
texts = "original_post"  #input("enter text field name: ")
label = "5CAT"  #input("enter label field name: ")
texts, labels = read_file(filename, texts, label)

filtered_texts = pre_clean(texts)
filtered_texts = number_filter(filtered_texts)
filtered_texts = drop_filter(filtered_texts)
texts = untokenize(filtered_texts)
#tf,voc = tf_idf(filtered_texts)
#print(tf.head(2))

tf_matrix, voc = tf(texts)  # avoid shadowing the tf() helper with its own result

clf = lda(n_components=8)
model = clf.fit(tf_matrix)

##print(model.components_)
##print(len(model.components_[0]))
##for i in (model.components_):
##    print(max(i))
##print(len(model.components_))

print_top_words(model, voc, 20)

#trans = model.transform(texts)

#print(trans)
Example #8
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation as lda
from scipy.sparse import csr_matrix

df2 = pd.read_pickle('all_comments_per_writer_df.pkl')
df2['count'] = 1
print(df2.shape)
print(df2.name.nunique())

article_id_u = sorted(df2.article_id.unique())
name_u = sorted(df2.name.unique())
data = df2['count'].tolist()
# astype('category', categories=...) was removed in newer pandas; use pd.Categorical instead
row = pd.Categorical(df2.name, categories=name_u).codes
col = pd.Categorical(df2.article_id, categories=article_id_u).codes

sparse_matrix = csr_matrix((data, (row, col)),
                           shape=(len(name_u), len(article_id_u)))
model = lda(n_components=15)
res = model.fit_transform(sparse_matrix)
df1 = pd.DataFrame(res)
df1.index = name_u
df1.to_csv('lda_15.csv')
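A possible follow-up (not from the source): each row of lda_15.csv holds one writer's 15 topic loadings, so the names most associated with a topic can be read off by sorting that topic's column. The topic index chosen below is arbitrary.

import pandas as pd

loadings = pd.read_csv('lda_15.csv', index_col=0)           # names x 15 topic loadings
print(loadings['0'].sort_values(ascending=False).head(10))  # columns are saved as '0'..'14'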
Example #9
""" Speeches III """
import pickle
import glob
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation as lda
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

n_features = 1000
n_components = 15
n_top_words = 10
n_topics = 2
count_m = pickle.load(open('./output/speech_matrix.pk', 'rb'))
lda_m = lda(n_components=n_topics, random_state=0)
topics = lda_m.fit_transform(count_m)
files = glob.glob(r'.\data\speeches\R0*')  # raw string: backslashes in Windows paths
corpus = []

for name in files:
    try:
        with open(name, encoding='utf-8') as f:
            corpus.append(f.read())

    except UnicodeDecodeError:
        print(name)
text = "".join(corpus)


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i]
                     for i in topic.argsort()[:-(n_top_words + 1):-1]]
        print('Topic {}: {}'.format(topic_idx, ' '.join(top_words)))
Example #10
import numpy as np
import ot  # POT: Python Optimal Transport
import scipy.io as sio
from sklearn.decomposition import LatentDirichletAllocation as lda
from sklearn.metrics.pairwise import euclidean_distances

# embeddings_new and stem_vocab are project-specific helpers not shown in this excerpt


def iptdata(data_path,
            embeddings_path,
            T=70,
            glove_embeddings=True,
            stemming=True):
    data_all = sio.loadmat(data_path, squeeze_me=True,
                           chars_as_strings=True)  # dict

    if 'Y' in data_all:
        y_all = data_all['Y'].astype(int)  # np.int was removed in NumPy 1.24; use builtin int
    else:
        y_all = np.concatenate(
            (data_all['yte'].astype(int), data_all['ytr'].astype(int)),
            axis=1)

    if 'X' in data_all:
        embed_all = data_all['X']
    else:
        embed_all = np.concatenate((data_all['xte'], data_all['xtr']), axis=1)

    if 'BOW_X' in data_all:
        BOW_all = data_all['BOW_X']
    else:
        BOW_all = np.concatenate((data_all['BOW_xte'], data_all['BOW_xtr']),
                                 axis=1)

    if 'words' in data_all:
        words_all = data_all['words']
    else:
        words_all = np.concatenate(
            (data_all['words_tr'], data_all['words_te']), axis=1)

    vocab = []
    vocab_embed = {}

    l = len(words_all)
    for i in range(l):
        word_i = words_all[i]
        embed_i = embed_all[i]
        bow_i = BOW_all[i]
        w = len(word_i)
        for j in range(w):
            if type(word_i[j]) == str:
                if word_i[j] not in vocab:
                    vocab.append(word_i[j])
                    vocab_embed[word_i[j]] = embed_i[:, j]
            else:
                break

    vocab_BOW = np.zeros((l, len(vocab)), dtype=int)

    l = len(words_all)
    for i in range(l):
        word_i = words_all[i]
        bow_i = BOW_all[i]

        w = len(word_i)
        words_idx = []
        for j in range(w):
            if type(word_i[j]) == str:
                words_idx.append(vocab.index(word_i[j]))
            else:
                break

        vocab_BOW[i, words_idx] = bow_i.astype(int)

    if glove_embeddings:
        vocab, vocab_embed, vocab_BOW = embeddings_new(vocab, vocab_BOW,
                                                       embeddings_path)

    if stemming:
        vocab, vocab_embed, vocab_BOW = stem_vocab(vocab_BOW, vocab,
                                                   vocab_embed)

    ####################################################

    l1_BOW, l2_BOW = vocab_BOW.shape
    embed_dat = [[] for _ in range(l1_BOW)]
    for i in range(l2_BOW):
        for d in range(l1_BOW):
            if vocab_BOW[d, i] > 0:
                for _ in range(vocab_BOW[d, i]):
                    embed_dat[d].append(vocab_embed[vocab[i]])

    # stack each document's token embeddings (keep the vocab_embed dict intact for the step below)
    doc_embed = []
    for doc_i in embed_dat:
        doc_embed.append(np.array(doc_i))

    # Matrix of word embeddings, one row per vocabulary word
    embeddings = np.array([vocab_embed[w] for w in vocab])

    model = lda(n_components=T, random_state=1)  # T is the number of topics passed to iptdata
    model.fit(vocab_BOW)
    topics = model.components_
    n_top_words = 20
    topic_dict = {}
    topic_proportions = model.transform(vocab_BOW)

    #cost_embeddings_cos = cosine_similarity(embeddings, embeddings)
    cost_embeddings = euclidean_distances(embeddings, embeddings)**1
    cost_topics = np.zeros((topics.shape[0], topics.shape[0]))
    cost_m = np.zeros((topics.shape[0], topics.shape[0]))

    for i in range(cost_topics.shape[0]):
        for j in range(i + 1, cost_topics.shape[1]):
            #print(i,j)
            # i_list = topic_dict[i].astype(bool)
            # j_list = topic_dict[j].astype(bool)
            #
            # topic_i = topics[i][i_list]
            # topic_j = topics[j][j_list]
            #
            # cost_e = cost_embeddings[i_list][:,j_list]
            # # np.ascontiguousarray(topic_i)
            # print(topic_i.flags['C_CONIGUOUS'])
            # # np.ascontiguousarray(topic_j)
            # print(topic_j.flags['C_CONTIGUOUS'])
            # cost_e = np.ascontiguousarray(cost_e)
            # print(cost_e.flags['C_CONTIGUOUS'])
            # cost_m[i,j] = ot.emd2(topic_i, topic_j, cost_e, numItermax=10000)
            cost_topics[i, j] = ot.emd2(topics[i],
                                        topics[j],
                                        cost_embeddings,
                                        numItermax=10000)
    cost_topics = cost_topics + np.transpose(cost_topics)

    outputs = {
        'BOW': vocab_BOW,
        'class': y_all - 1,
        'topic_proportions': topic_proportions,
        'cost_embeddings': cost_embeddings,
        'cost_topics': cost_topics
    }

    return outputs
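A possible follow-up (not from the source): the returned topic proportions and topic-to-topic cost matrix can be combined into a document-to-document optimal-transport distance, mirroring how the function compares topics. The .mat path is a placeholder, and the GloVe/stemming options are switched off because their helper functions are not shown in this excerpt.

import numpy as np
import ot

out = iptdata('some_dataset.mat', embeddings_path=None,
              glove_embeddings=False, stemming=False)        # placeholder input file
p = np.ascontiguousarray(out['topic_proportions'][0], dtype=np.float64)
q = np.ascontiguousarray(out['topic_proportions'][1], dtype=np.float64)
doc_dist = ot.emd2(p, q, out['cost_topics'], numItermax=10000)
print(doc_dist)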
Example #11
File: lda.py Project: kosyachniy/tg
import csv

import numpy as np
# import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation as lda
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23

COMPILATION = 'polytics'


def read(name, sign=','):
    with open(name + '.csv', 'r') as file:
        return [i for i in csv.reader(file, delimiter=sign, quotechar=' ')]


vectors = read('data/history/{}/vectors'.format(COMPILATION))
dataset = np.array(vectors, dtype='float')

model = lda(n_components=2)  # e.g. 60, max_iter=30, n_jobs=6, learning_method='batch', verbose=1
model.fit(dataset)

for i in model.components_:
    print(i)

# sklearn's LatentDirichletAllocation has no show_topic() (that is gensim's LdaModel API);
# inspect a topic's strongest terms through its components_ row instead
print(np.argsort(model.components_[0])[::-1][:10])

joblib.dump(model, 'data/history/{}/lda.txt'.format(COMPILATION))
Example #12
mod.components_.shape
mod.transform(matrix)

topics = pd.DataFrame({'topic1': mod.components_[0],
                       'topic2': mod.components_[1]},
                      index=tf.get_feature_names())

mod.transform(matrix)

topics['topic1'].sort_values(ascending=False).head()

topics['topic2'].sort_values(ascending=False).head()

## Using lda

from sklearn.decomposition import LatentDirichletAllocation as lda

mod1 = lda(n_components=2)  # n_topics was renamed to n_components in scikit-learn 0.19

mod1.fit(matrix)

mod1.components_

topics_lda = pd.DataFrame({'topic1': mod1.components_[0],
                           'topic2': mod1.components_[1]},
                          index=tf.get_feature_names())

mod1.transform(matrix)

topics_lda['topic1'].sort_values(ascending=False).head()

topics_lda['topic2'].sort_values(ascending=False).head()

## Visualising lda model
Example #13
 def run(self):
     self.model = lda(learning_method='online')
     self.model.fit(self.doc_terms)