Example #1
def get_lda_model(X, y):

    from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
    # Build LDA Model
    lda_model = LatentDirichletAllocation(
        n_components=20,  # Number of topics
        max_iter=10,  # Max learning iterations
        learning_method='online',
        random_state=100,  # Random state
        batch_size=128,  # n docs in each learning iter
        evaluate_every=-1,  # compute perplexity every n iters, default: Don't
        n_jobs=-1,  # Use all available CPUs
    )
    lda_output = lda_model.fit_transform(X, y)

    print(lda_model)  # Model attributes
    from pprint import pprint
    # Log Likelihood: higher the better
    print("Log Likelihood: ", lda_model.score(X, y))

    # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
    # this call raises an error, most likely because of the extra `y` argument
    # print("Perplexity: ", lda_model.perplexity(X, y))

    # See model parameters
    pprint(lda_model.get_params())
    return lda_model
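A minimal sketch of the working call, on the assumption that the extra y argument is what triggers the error noted above (perplexity accepts only the document-word matrix). It would sit just before the return in get_lda_model:

    # hedged fix: compute perplexity from X alone; lower is better
    print("Perplexity: ", lda_model.perplexity(X))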
Example #2
    def applyLDA2(self, number_of_clusters, country_specific_tweets):
        train, feature_names = self.extractFeatures(country_specific_tweets,False)
        
        name = "lda"
        if self.results:
            print("Fitting LDA model with tfidf", end= " - ")
        t0 = time()     
        lda = LatentDirichletAllocation(n_components=number_of_clusters, max_iter=5,
                                        learning_method='online', learning_offset=50.,
                                        random_state=0)

        lda.fit(train)
        
        if self.results:
            print("done in %0.3fs." % (time() - t0))
        
        parameters = lda.get_params()
        topics = lda.components_
        doc_topic = lda.transform(train)
        top10, labels = self.printTopicCluster(topics, doc_topic, feature_names)
        labels = numpy.asarray(labels)
        
        if self.results:
            print("Silhouette Coefficient {0}: {1}".format(name, metrics.silhouette_score(train, labels)))
        
        return name, parameters, top10, labels
Example #3
def LDA_sklearn(text_data, num_topics, iterations, visualization = False, gridsearch = False ):
    vectorizer = OwnCountVectorizer(max_df = 0.95, min_df = 2, stop_words = 'english', lowercase = True,
                                    token_pattern = r'[a-zA-Z\-][a-zA-Z\-]{2,}', ngram_range = (2, 3),
                                    decode_error = 'ignore')
    vectorized_text_data = vectorizer.fit_transform(text_data)
    lda_model = LatentDirichletAllocation(n_components = num_topics, max_iter = iterations, learning_method = 'online',
                                          random_state = 100, batch_size = 120, evaluate_every = -1, n_jobs = -1)
    lda_output = lda_model.fit_transform(vectorized_text_data)
    print(lda_model)  # model attributes
    print('Log likelihood: ', lda_model.score(vectorized_text_data))  # log-likelihood: the higher the better
    print('Perplexity: ', lda_model.perplexity(vectorized_text_data))  # perplexity = exp(-1. * log-likelihood per word); the lower the better
    pprint(lda_model.get_params())  # see model parameters

    # GridSearch the best model
    search_params = {'n_components': [41, 45, 50, 55, 60], 'learning_decay': [.5, .7, .9]}
    lda = LatentDirichletAllocation() # initialize the model
    model = GridSearchCV(lda, param_grid = search_params) # initialize the gridsearch class
    model.fit(vectorized_text_data) # do the grid search

    best_lda_model = model.best_estimator_ # best model
    print('Best parameters: ', model.best_params_)  # best parameters
    print('Best Log-likelihood score: ', model.best_score_)
    print('Model perplexity: ', best_lda_model.perplexity(vectorized_text_data))

    # Compare LDA model performance scores

    # Get log-likelihoods from the GridSearchCV outputs (cv_results_)
    n_topics = [41, 45, 50, 55, 60]
    decays = model.cv_results_['param_learning_decay']
    scores = model.cv_results_['mean_test_score']
    log_likelihoods_5 = [round(score) for score, decay in zip(scores, decays) if decay == 0.5]
    log_likelihoods_7 = [round(score) for score, decay in zip(scores, decays) if decay == 0.7]
    log_likelihoods_9 = [round(score) for score, decay in zip(scores, decays) if decay == 0.9]

    # Show graph
    plt.figure(figsize = (10, 8))
    plt.plot(n_topics, log_likelihoods_5, label = '0.5')
    plt.plot(n_topics, log_likelihoods_7, label = '0.7')
    plt.plot(n_topics, log_likelihoods_9, label = '0.9')
    plt.title('Gridsearch output on choosing optimal LDA model')
    plt.xlabel('Number of topics')
    plt.ylabel('Log likelihood scores')
    plt.legend(title = 'Learning decay', loc = 'best')
    plt.show()

    if visualization:
        panel = pyLDAvis.sklearn.prepare(lda_model, vectorized_text_data, vectorizer, mds = 'tsne')
        pyLDAvis.show(panel)
    else:
        return lda_output[0] # for verification that it works
Example #4
def embeddings_LDA(data):
    n_features = 1000

    # use scikit-learn implementation
    # https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
    tf_vectorizer = CountVectorizer(max_df=0.95, 
                                    min_df=0.2,
                                    max_features=n_features,
                                    stop_words=None)                         
    tf = tf_vectorizer.fit_transform(data)
    lda = LatentDirichletAllocation(n_components=3, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
    lda.fit(tf)

    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words=100)
    params = lda.get_params()
    print(params)
    # Show topic distribution over words
    # https://stackoverflow.com/questions/44208501/getting-topic-word-distribution-from-lda-in-scikit-learn
    topic_embeddings = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
    print(topic_embeddings)
    # TODO project with t-SNE
    # t SNE ok for user embeddings.
    # Will be harder for product embeddings, too many dimensions.
    tsne = TSNE(n_components=2, verbose=0, perplexity=2, n_iter=300)  # perplexity must be < n_samples (only 3 topic vectors here)
    tsne_results = tsne.fit_transform(topic_embeddings)

    N = 10000
    df = pd.DataFrame(tsne_results)
    rndperm = np.random.permutation(df.shape[0])

    df_subset = df.loc[rndperm[:N],:].copy()
    df_subset['tsne-one'] = tsne_results[:,0]
    df_subset['tsne-two'] = tsne_results[:,1]

    plt.figure(figsize=(16,4))
    ax = plt.subplot(1, 3, 3)
    sns.scatterplot(
        x="tsne-one", y="tsne-two",
        # no label column exists in df_subset, so no hue/palette is passed
        data=df_subset,
        alpha=0.3,
        ax=ax
    )
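print_top_words is the helper defined in the linked scikit-learn example; a minimal sketch of such a helper, in case it is not available locally (an assumption, not necessarily the project's implementation):

def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weighted words of each learned topic
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d: %s" % (topic_idx, " ".join(top_words)))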
Example #5
def perform_lda_analysis(txtDir='', numOfTxts=None, numOfTopics=5, maxIter=20,
                         learningMode='online', randomState=100, batchSize=128,
                         evaluateEvery=-1, nJobs=-1):
    """

    :param txtDir:
    :param numOfTxts: an integer or None for selecting all files
    :param numOfTopics:
    :param maxIter:
    :param learningMode:
    :param randomState:
    :param batchSize:
    :param evaluateEvery:
    :param nJobs:
    :return:
    """
    warnings.simplefilter("ignore", DeprecationWarning)
    txtLst = []
    for fname in os.listdir(txtDir)[:numOfTxts]:
        with codecs.open(os.path.join(txtDir, fname), 'r', 'utf-8-sig') as fh:
            txt = get_content_words(fh.read())
            txtLst.append(txt)
    vectorizer = CountVectorizer(analyzer='word', min_df=4, lowercase=True,
                                 token_pattern='[a-zA-Z0-9]{3,}')

    dataVector = vectorizer.fit_transform(txtLst)
    dataDense = dataVector.todense()
    print("Sparsicity: ", ((dataDense > 0).sum() / dataDense.size) * 100, "%")

    lda_model = LatentDirichletAllocation(n_components=numOfTopics,
                                          max_iter=maxIter,
                                          learning_method=learningMode,
                                          random_state=randomState,
                                          batch_size=batchSize,
                                          evaluate_every=evaluateEvery,
                                          n_jobs=nJobs)

    lda_result = lda_model.fit_transform(dataVector)
    results = { 'result':lda_result,
                'logLikelihood': lda_model.score(dataVector), # the higher the better
                'perplexity': lda_model.perplexity(dataVector), # the lower the better
                'params': lda_model.get_params()
                }
    pprint(results)
    return results
Example #6
def get_model_metrics(model: LatentDirichletAllocation, doc_mat: np.ndarray):
    """
    Print basic diagnostics for a fitted LDA model.

    Args:
        model (): fitted LatentDirichletAllocation instance
        doc_mat (): document-word count matrix the model was fitted on

    Returns:
        None
    """

    print(doc_mat.shape)

    print('Perplexity: ', model.perplexity(doc_mat))

    print('Log likelihood', model.score(doc_mat))

    print('Params', model.get_params())
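A hypothetical usage sketch for get_model_metrics; the documents, vectorizer and model below are illustrative only and not part of the original project:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ["topic models find latent themes",
        "lda places dirichlet priors on topics",
        "count vectors feed the lda model"]
doc_mat = CountVectorizer().fit_transform(docs).toarray()  # dense document-word matrix
model = LatentDirichletAllocation(n_components=2, random_state=0).fit(doc_mat)
get_model_metrics(model, doc_mat)  # prints shape, perplexity, log likelihood and params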
Example #7
def _learn_lda(data, **kwargs):
    from sklearn.decomposition import LatentDirichletAllocation
    if hasattr(data.retention,
               'datatype') and data.retention.datatype == 'features':
        features = data.copy()
    else:
        if 'ngram_range' not in kwargs:
            kwargs.update({'ngram_range': (1, 2)})
        features = data.retention.extract_features(**kwargs)
    # valid constructor keywords for LatentDirichletAllocation
    lda_filter = LatentDirichletAllocation().get_params()
    if 'random_state' not in kwargs:
        kwargs.update({'random_state': 0})
    kwargs = {i: j for i, j in kwargs.items() if i in lda_filter}
    lda = LatentDirichletAllocation(**kwargs)
    lda.fit(features)
    mech_desc = pd.DataFrame(lda.components_, columns=features.columns)
    return mech_desc, lda
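The kwargs filtering above leans on get_params to enumerate the valid constructor keywords; a minimal standalone sketch of the same idiom (the helper name and arguments here are illustrative):

from sklearn.decomposition import LatentDirichletAllocation

def filter_lda_kwargs(**kwargs):
    # Keep only keyword arguments that LatentDirichletAllocation's constructor accepts
    valid = LatentDirichletAllocation().get_params()
    return {k: v for k, v in kwargs.items() if k in valid}

lda = LatentDirichletAllocation(**filter_lda_kwargs(n_components=5, random_state=0, bogus_option=1))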
Example #8
File: lda.py Project: dpakpdl/NLP
def analyser(data):
    _, data_vectorized = get_vectorized_data(data)
    # Build LDA Model
    lda_model = LatentDirichletAllocation(
        n_components=20,  # Number of topics
        max_iter=10,  # Max learning iterations
        learning_method='online',
        random_state=100,  # Random state
        batch_size=128,  # n docs in each learning iter
        evaluate_every=-1,  # compute perplexity every n iters, default: Don't
        n_jobs=-1,  # Use all available CPUs
    )
    lda_output = lda_model.fit_transform(data_vectorized)

    print(lda_output)

    # Log Likelihood: higher the better
    print("Log Likelihood: ", lda_model.score(data_vectorized))

    # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
    print("Perplexity: ", lda_model.perplexity(data_vectorized))

    # See model parameters
    pprint(lda_model.get_params())
Example #9
    timestamp = time.time()
    print("加载停词表...")
    load_stopwords(swlist)
    print("加载停词表耗时:", time.time() - timestamp, "s")
    timestamp = time.time()
    print("分词...")
    lemmatizer = WordNetLemmatizer()
    load_corpus(expected_tags, lemmatizer, swlist, corpus)
    print("分词耗时:", time.time() - timestamp, "s")
    tf_vectorizer = CountVectorizer(stop_words="english", lowercase=False)
    word_freq = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()
    print("语料库总词数:", len(tf_feature_names))
    lda = LatentDirichletAllocation(max_iter=50, doc_topic_prior=0.5, \
        topic_word_prior=0.1, learning_method="batch", random_state=0)
    for n_topics in [5, 10, 20]:
        lda.set_params(n_components=n_topics)
        params = lda.get_params(False)
        print("\nLDA模型参数:")
        for key, value in params.items():
            print(key, "<-", value)
        timestamp = time.time()
        print("LDA(" + "n_components = " + str(n_topics) + ")训练...")
        lda.fit(word_freq)
        print("LDA(" + "n_components = " + str(n_topics) + ")训练耗时:",
              time.time() - timestamp, "s")
        print("输出结果到" + "../../output_python/topic" + str(n_topics) +
              "/topic-top" + str(n_top_words) + "keywords.txt")
        save_top_topciwords(lda, tf_feature_names, n_top_words)
    print("\n结束时间:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
Example #10
    def getDecomposition(self):

        # cluster images into a dictionary
        # number has to be finalised after testing
        dictionary_size = 25
        h, w = self.dataMatrix.shape
        # print(w)
        # print(h)

        kmeans = MiniBatchKMeans(n_clusters=dictionary_size,
                                 init='k-means++',
                                 batch_size=250,
                                 random_state=0,
                                 verbose=0)
        kmeans.fit(self.dataMatrix)
        kmeans.get_params()
        kmeans.cluster_centers_
        labels = kmeans.labels_

        # histogram of labels for each image = term-document matrix
        num_train_images = h
        self.dataMatrix
        #num_kps needs to be calculated dynamically
        num_kps = 192
        A = np.zeros((dictionary_size, num_train_images))
        ii = 0
        jj = 0
        for img_idx in range(num_train_images):
            if img_idx == 0:
                A[:, img_idx], bins = np.histogram(labels[0:num_kps],
                                                   bins=range(dictionary_size +
                                                              1))
            else:
                ii = int(ii + num_kps)
                jj = int(ii + num_kps)
                A[:, img_idx], bins = np.histogram(labels[ii:jj],
                                                   bins=range(dictionary_size +
                                                              1))
                # print str(ii) + ':' + str(jj)
        # end for
        # plt.figure()
        # plt.spy(A.T, cmap='gray')
        # plt.gca().set_aspect('auto')
        # plt.title('AP tf-idf corpus')
        # plt.xlabel('dictionary')
        # plt.ylabel('documents')
        # plt.show()

        # print(self.dataMatrix)

        # Needs to be finalised
        num_topics = 25

        lda_vb = LatentDirichletAllocation(n_components=num_topics,
                                           max_iter=10,
                                           learning_method='online',
                                           batch_size=512,
                                           random_state=0,
                                           n_jobs=1)

        lda_vb.fit(self.dataMatrix.T)
        lda_vb.get_params()
        topics = lda_vb.components_
        H = lda_vb.transform(self.dataMatrix.T)

        # print(topics)
        # print(H.T)

        return topics, H.T
Example #11
    # fit LDA model
    print("Fitting LDA model...")
    lda_vb = LatentDirichletAllocation(n_components=num_topics,
                                       max_iter=10,
                                       learning_method='online',
                                       batch_size=512,
                                       random_state=0,
                                       n_jobs=-1)

    tic = time()
    lda_vb.fit(A_tfidf_sp)  #online VB
    toc = time()
    print "elapsed time: %.4f sec" % (toc - tic)
    print "LDA params"
    print lda_vb.get_params()

    print "number of EM iter: %d" % lda_vb.n_batch_iter_
    print "number of dataset sweeps: %d" % lda_vb.n_iter_

    #topic matrix W: K x V
    #components[i,j]: topic i, word j
    topics = lda_vb.components_

    f = plt.figure()
    plt.matshow(topics, cmap='gray')
    plt.gca().set_aspect('auto')
    plt.title('learned topic matrix')
    plt.ylabel('topics')
    plt.xlabel('dictionary')
    plt.show()
Example #12
class NewsBias:
    def __init__(self):
        self.tf_vectorizer = []
        self.tf = []
        self.lda_model = []
        self.feature_names = []
        self.topics_mat = []
        self.sentiment_by_topic = []

    def fix_sites(self, mongo_db):
        fix_cnn(mongo_db)
        fix_huffpo(mongo_db)

    def from_mongo(self, db_name):
        df = get_df(db_name)
        df = clean_df(df)
        df = df[pd.notnull(df['processed_text'])]
        df = df[df['processed_text'] != '']

        return df

    def from_csv(self, csv_name):
        try:
            df = pd.read_csv('data/' + csv_name, parse_dates=False)
            return df
        except:
            print('CSV file does not exist!')
            print('Make sure CSV file is in data folder.')
            return False

    def to_csv(self, df, filename):
        filename = 'data/' + filename
        df.to_csv(filename, index=False)
        print('CSV file saved to: ' + filename)

    def update_from_bucket(self, filename):
        path = os.getcwd()
        # Example filename: 'dsiprojectdata/rss_feeds_new.tar'
        result = from_bucket(filename, path)
        if not result:
            print('Error updating data from bucket!')
            print(
                'Make sure you include folder and file in filename from bucket.'
            )

    def update_to_bucket(self, filename, bucketname, mongo_db=False):
        # If mongo database then just give database name as filename
        if mongo_db:
            cwd = os.getcwd()
            # Give permission to bash file then run
            p1 = subprocess.Popen(['chmod', '+x', 'backup.sh'],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            out1, err1 = p1.communicate()
            p2 = subprocess.Popen([cwd + '/backup.sh', filename],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            out2, err2 = p2.communicate()
        else:
            p = subprocess.Popen(['/usr/bin/aws', 's3', 'cp',
                                  filename, 's3://' + bucketname + '/'],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()

    def run_lda(self, df, max_features=1000, n_topics=20):
        df = df[pd.notnull(df['processed_text'])]
        processed_text = df['processed_text'].values.tolist()
        # Include quotes and tweets in LDA
        processed_quote = df['processed_quote'].values.tolist()
        processed_tweet = df['processed_tweet'].values.tolist()
        processed_all = []
        for text, quote, tweet in zip(processed_text, processed_quote, processed_tweet):
            # Check if quote is nan
            if type(quote) == float:
                quote = ''
            if type(tweet) == float:
                tweet = ''
            processed_all.append(text + quote + tweet)
        try:
            self.tf_vectorizer = CountVectorizer(max_df=0.95,
                                                 min_df=0.05,
                                                 max_features=max_features,
                                                 stop_words='english')
            self.tf = self.tf_vectorizer.fit_transform(processed_all)
        except:
            import pdb
            pdb.set_trace()
        self.lda_model = LatentDirichletAllocation(n_components=n_topics,
                                                   max_iter=5,
                                                   learning_method='online',
                                                   learning_offset=50.,
                                                   random_state=0,
                                                   n_jobs=-1)

        self.lda_model.fit(self.tf)

        self.feature_names = np.array(self.tf_vectorizer.get_feature_names())
        self.topics_mat = self.lda_model.components_

        return self.lda_model

    def run_gensim_lda(self, df, n_topics=20):
        self.lda_model = gensim_lda(df, n_topics)

    def get_top_word_by_topic(self, topic, n_words):
        return self.feature_names[np.argsort(
            self.topics_mat[topic, :])[::-1]][:n_words]

    def visualize_lda(self, df, display=False):
        if self.lda_model == []:
            self.run_lda(df)
        max_features = self.tf_vectorizer.get_params()['max_features']
        n_topics = self.lda_model.get_params()['n_components']
        vis_data = pyLDAvis.sklearn.prepare(self.lda_model,
                                            self.tf,
                                            self.tf_vectorizer,
                                            R=n_topics,
                                            n_jobs=-1)
        pyLDAvis.save_html(
            vis_data, 'plots/pyLDAvis_' + str(max_features) + 'feats_' +
            str(n_topics) + 'topics.html')
        if display:
            pyLDAvis.show(vis_data)

    def get_sentiment_of_words(self, df):
        sentiment_of_words = sentiment_of_words_wordnet(df)

        return sentiment_of_words

    def get_sentiment_by_topic(self, df, display=False):
        n_topics = self.lda_model.get_params()['n_components']

        self.sentiment_by_topic = sentiment_by_topic_wordnet(
            df, self.topics_mat, self.feature_names)

        if display:
            for i, site in enumerate(self.sentiment_by_topic.keys()):
                plt.subplot(3, 4, i + 1)
                score = []
                for topic in range(n_topics):
                    score.append(self.sentiment_by_topic[site][topic][3])
                score = np.array(score)
                score /= sum(np.abs(score))
                plt.bar(np.arange(len(score)), score, align='center')
                plt.ylabel('Score')
                plt.title('Score by Topic for ' + site)
            plt.subplots_adjust(hspace=0.4, wspace=0.4)
            plt.show()

        return self.sentiment_by_topic

    def length_of_articles_hist(self, df):
        for i, site in enumerate(df['source'].unique()):
            plt.subplot(3, 4, i + 1)
            new_df = df[df['source'] == site]
            article_len = [
                len(article.split(' ')) for article in new_df['article_text']
            ]
            plt.hist(article_len, density=True)
            plt.xlabel('Length of Article')
            plt.ylabel('# of Articles')
            plt.title('Length of articles for ' + site)
        plt.subplots_adjust(hspace=0.4, wspace=0.4)
        plt.show()

    def pickle_everything(self):
        filename = '../pickles/lda_model.pkl'
        pickle.dump(self.lda_model, open(filename, 'wb'), protocol=2)

        filename = '../pickles/tf_vectorizer.pkl'
        pickle.dump(self.tf_vectorizer, open(filename, 'wb'), protocol=2)
Example #13
#              random_state=100, topic_word_prior=None,
#              total_samples=1000000.0, verbose=0)
lda = LatentDirichletAllocation(n_components=no_topics,
                                max_iter=100,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(tf)

# Log Likelihood: higher the better
print("Log Likelihood: ", lda.score(tf))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda.perplexity(tf))

# See model parameters
print(lda.get_params())

no_top_words = 15
# display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

# text = "S'han detectat interrupcions degut a incidencies. Accident a Torredembarra."

# # LDA
# x = lda.transform(tf_vectorizer.transform([text]))[0]
# print ("Pel primer text, LDA es: ", x )

# text2 = "Exemple dun tweet que no te res a veure amb el tema i espero que no generi correlacions amb topics entrenats."

# # LDA
# x = lda.transform(tf_vectorizer.transform([text2]))[0]
Example #14
        print(feature)
    print(" ")

# In[86]:

print("Log Liklihood", ldavect.score(dtm))  # higher the better

# In[87]:

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", ldavect.perplexity(dtm))

# In[88]:

# See model parameters
print(ldavect.get_params())

# In[89]:

#create a transform dataframe for the LDA model
ldadf_tr = ldavect.fit_transform(dtm)
ldadf_tr
print("Completed in %0.4fs." % (time() - t0))

# In[90]:

# Now optimize with a grid search to find the best parameters for the LDA topic model
grid_params = {'n_components': [5, 8, 10, 15], 'learning_decay': [.5, .7, .9]}

# In[91]:
Example #15
def lda_topic_modeling(csv_file, n_topics):
    # Import dataset
    df = pd.read_csv(csv_file,
                     delimiter=';',
                     usecols=['post_name', 'post_description', 'post_tagline'])
    print(df.head(15))

    # Remove email and new line characters
    data = remove_email_and_new_line_chars(df.post_tagline)
    print('\n')
    pprint(data[:1])
    data = remove_email_and_new_line_chars(df.post_description)
    print('\n')
    pprint(data[:1])

    # Tokenize and Clean-up text
    data_words = list(sent_to_words(data))
    print('\n', data_words[:1])

    # Lemmatization
    nlp = spacy.load('en_core_web_sm', disable=[
        'parser', 'ner'
    ])  # Initialize spacy 'en' model, keeping only tagger
    # component (for efficiency)
    data_lemmatized = lemmatization(data_words, ['NOUN', 'ADJ', 'VERB', 'ADV'],
                                    nlp)  # Do lemmatization keeping only
    # Noun, Adj, Verb, Adverb
    print('\n', data_lemmatized[:2])

    # Create the Document-Word matrix
    vectorizer = CountVectorizer(
        analyzer='word',
        min_df=10,  # minimum required occurrences of a word
        stop_words='english',  # remove english stop words
        lowercase=True,  # convert all words to lowercase
        token_pattern='[a-zA-Z0-9]{3,}',  # word must contain at least 3 characters
    )
    data_vectorized = vectorizer.fit_transform(data_lemmatized)

    # Check the sparsity of the document-word matrix
    data_dense = data_vectorized.todense()  # Materialize the sparse data
    print("\nShare of non-zero cells: ",
          ((data_dense > 0).sum() / data_dense.size) * 100,
          "%")  # Percentage of non-zero cells

    # Build LDA model with sklearn
    lda_model = LatentDirichletAllocation(
        n_components=10,  # Number of topics
        max_iter=10,  # Max learning iterations
        learning_method='online',
        random_state=100,  # Random state
        batch_size=128,  # number of documents in each learning iter
        evaluate_every=-1,  # compute perplexity every n iters, default: Don't
        n_jobs=-1,  # Use all available CPUs
    )
    lda_output = lda_model.fit_transform(data_vectorized)
    print('\n', lda_model)  # Model attributes

    # Diagnose model performance with perplexity and log-likelihood
    print(
        "\nLog Likelihood: ",
        lda_model.score(data_vectorized))  # Log Likelihood: higher the better
    print(
        "Perplexity: ",
        lda_model.perplexity(data_vectorized))  # Perplexity: Lower the better.
    # Perplexity = exp(-1. * log-likelihood per word)
    pprint(lda_model.get_params())  # See model parameters

    # GridSearch the best LDA model
    search_params = {
        'n_components': [n_topics],
        'learning_decay': [.5, .7, .9]
    }  # Define Search Param
    model = GridSearchCV(lda_model,
                         param_grid=search_params,
                         n_jobs=1,
                         cv=3,
                         error_score='raise')  # Init Grid Search Class
    model.fit(data_vectorized)  # Do the Grid Search

    # Find the best topic model and its parameters
    best_lda_model = model.best_estimator_  # Best Model
    print("\nBest Model's Params: ", model.best_params_)  # Model Parameters
    print("Best Log Likelihood Score: ",
          model.best_score_)  # Log Likelihood Score
    print("Model Perplexity: ",
          best_lda_model.perplexity(data_vectorized))  # Perplexity

    # How to see the dominant topic in each document
    df_document_topic = create_document_topic_matrix(best_lda_model,
                                                     data_vectorized, data)
    n_documents = 15  # number of documents to display along with their dominant topic
    df_document_topics = df_document_topic.head(n_documents).style.applymap(
        color_green).applymap(make_bold).highlight_max(color='yellow', axis=1)
    print('\n', df_document_topics)

    # Get the top 15 keywords each topic
    topic_keywords = show_topics(
        vectorizer, best_lda_model,
        15)  # Show top n keywords for each topic in order of
    # highest probability. In this case n is equal to 15
    df_topic_keywords = pd.DataFrame(
        topic_keywords)  # Topic- Keywords Dataframe
    df_topic_keywords.columns = [
        'Word ' + str(i) for i in range(df_topic_keywords.shape[1])
    ]
    df_topic_keywords.index = [
        'Topic ' + str(i) for i in range(df_topic_keywords.shape[0])
    ]
    print('\n', df_topic_keywords)
    return df_document_topic
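create_document_topic_matrix, show_topics and the styling helpers are defined elsewhere in this project; a minimal sketch of what a helper like create_document_topic_matrix might do (an assumption, not the project's actual code):

import numpy as np
import pandas as pd

def create_document_topic_matrix(model, data_vectorized, docs):
    # Document-topic distribution, one row per document, plus the dominant topic index
    doc_topic = model.transform(data_vectorized)
    df = pd.DataFrame(np.round(doc_topic, 2),
                      columns=['Topic ' + str(i) for i in range(doc_topic.shape[1])],
                      index=['Doc ' + str(i) for i in range(len(docs))])
    df['dominant_topic'] = np.argmax(doc_topic, axis=1)
    return df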
Example #16
class LDA(GenericModel):
    def __init__(self, **kwargs):
        self._corpus_matrix = None
        self._query_vector = None

        self.vectorizer = None
        self.lda_model = LatentDirichletAllocation(n_jobs=-1)

        super().__init__()

        self.similarity_measure = None
        self.set_basic_params(**kwargs)

        self.set_vectorizer(**kwargs)
        self.set_lda_model(**kwargs)

    def set_name(self, name):
        super().set_name(name)

    def set_model_gen_name(self, gen_name):
        super().set_model_gen_name(gen_name)

    def set_basic_params(self, **kwargs):
        self.set_name('LDA' if LDA_Model_Hyperp.NAME.value not in
                      kwargs.keys() else kwargs[LDA_Model_Hyperp.NAME.value])
        self.set_model_gen_name('lda')
        self.set_similarity_measure(
            sm.SimilarityMeasure.COSINE if LDA_Model_Hyperp.SIMILARITY_MEASURE.
            value not in kwargs.keys() else kwargs[LDA_Model_Hyperp.
                                                   SIMILARITY_MEASURE.value])

    def set_similarity_measure(self, sim_measure):
        self.similarity_measure = sim_measure

    def set_vectorizer(self, **kwargs):
        self.vectorizer = TfidfVectorizer(
            stop_words='english', use_idf=True, smooth_idf=True
        ) if LDA_Model_Hyperp.VECTORIZER.value not in kwargs.keys(
        ) else kwargs[LDA_Model_Hyperp.VECTORIZER.value]
        vec_params = {
            key.split('__')[2]: kwargs[key]
            for key, val in kwargs.items() if '__vectorizer__' in key
        }
        self.vectorizer.set_params(**vec_params)

    def set_lda_model(self, **kwargs):
        lda_model_params = {
            key.split('__')[2]: kwargs[key]
            for key, val in kwargs.items() if '__lda_model__' in key
        }
        self.lda_model.set_params(**lda_model_params)

    def recover_links(self, corpus, query, test_cases_names,
                      bug_reports_names):
        self._corpus_matrix = self.vectorizer.fit_transform(corpus)
        self._query_vector = self.vectorizer.transform(query)

        self.out_1 = self.lda_model.fit_transform(self._corpus_matrix)
        self.out_2 = self.lda_model.transform(self._query_vector)

        metric = self.similarity_measure
        if metric == sm.SimilarityMeasure.COSINE:
            self._sim_matrix = pairwise.cosine_similarity(X=self.out_1,
                                                          Y=self.out_2)
        elif metric == sm.SimilarityMeasure.JSD:
            self._sim_matrix = pairwise_distances(X=self.out_1,
                                                  Y=self.out_2,
                                                  metric=SimilarityMeasure.jsd)
        elif metric == sm.SimilarityMeasure.EUCLIDIAN_DISTANCE:
            self._sim_matrix = pairwise_distances(X=self.out_1,
                                                  Y=self.out_2,
                                                  metric='euclidean')

        #self._sim_matrix =  super().normalize_sim_matrix(self._sim_matrix)
        self._sim_matrix = pd.DataFrame(data=self._sim_matrix,
                                        index=test_cases_names,
                                        columns=bug_reports_names)

        self._record_docs_feats(corpus, query, test_cases_names,
                                bug_reports_names)

    def _record_docs_feats(self, corpus, query, test_cases_names,
                           bug_reports_names):
        self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus)
        self.mrw_brs = self._recover_mrw_list(bug_reports_names, query)

        self.dl_tcs = self._recover_dl_list(test_cases_names, corpus)
        self.dl_brs = self._recover_dl_list(bug_reports_names, query)

        index = list(test_cases_names) + list(bug_reports_names)
        self.docs_feats_df = pd.DataFrame(index=index, columns=['mrw', 'dl'])

        for tc_name, mrw in self.mrw_tcs:
            self.docs_feats_df.at[tc_name, 'mrw'] = mrw

        for tc_name, dl in self.dl_tcs:
            self.docs_feats_df.at[tc_name, 'dl'] = dl

        for br_name, mrw in self.mrw_brs:
            self.docs_feats_df.at[br_name, 'mrw'] = mrw

        for br_name, dl in self.dl_brs:
            self.docs_feats_df.at[br_name, 'dl'] = dl

    def _recover_dl_list(self, artf_names, artf_descs):
        tokenizer = PorterStemmerBased_Tokenizer()
        dl_list = []
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            dl_list.append((artf_name, len(tokenizer.__call__(artf_desc))))
        return dl_list

    def _recover_mrw_list(self, artf_names, artf_descs):
        N_REL_WORDS = 6
        mrw_list = []  # list of tuples (artf_name, mrw_list={})

        for artf_name, artf_desc in zip(artf_names, artf_descs):
            X = self.vectorizer.transform([artf_desc])
            df1 = pd.DataFrame(X.T.toarray())
            df1['token'] = self.vectorizer.get_feature_names()
            df1.sort_values(by=0, ascending=False, inplace=True)
            mrw = list(df1.iloc[0:N_REL_WORDS, 1].values)
            mrw_list.append((artf_name, mrw))

        return mrw_list

    def model_setup(self):
        return {
            "Setup": [{
                "Name": self.get_name()
            }, {
                "Similarity Measure and Minimum Threshold":
                self.get_sim_measure_min_threshold()
            }, {
                "Top Value": self.get_top_value()
            }, {
                "LDA Model": self.lda_model.get_params()
            }, {
                "Vectorizer": self.vectorizer.get_params()
            }, {
                "Vectorizer Type": type(self.vectorizer)
            }]
        }

    def get_name(self):
        return super().get_name()

    def get_model_gen_name(self):
        return super().get_model_gen_name()

    def get_similarity_measure(self):
        return self.similarity_measure

    def get_sim_matrix(self):
        return super().get_sim_matrix()

    def get_tokenizer_type(self):
        return type(self.tokenizer)

    def save_sim_matrix(self):
        super().save_sim_matrix()

    def get_query_vector(self):
        return self._query_vector

    def get_corpus_matrix(self):
        return self._corpus_matrix

    def get_vectorizer_type(self):
        return type(self.vectorizer)

    def print_topics(self):
        feature_names = self.vectorizer.get_feature_names()
        n_top_words = 10

        for topic_idx, topic in enumerate(self.lda_model.components_):
            message = "Topic #%d: " % topic_idx
            message += " ".join([
                feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ])
            print(message)
Example #17
class LDA(object):
    """ Class for Latent Dirichlet Allocation model """
    def __init__(self, filename):
        self.filename = filename
        self.vectorized_data = None
        self.df_topic_keywords = None
        self.fitted = False

        # load fitted model if exists

        try:
            self.load_model(self.filename)
            self.fitted = True

        except IOError:
            self.vectorizer = CountVectorizer(lowercase=False)
            self.model = LatentDirichletAllocation()

    def __str__(self):
        print_string = 'LDA model. Params:\n'
        params = self.model.get_params()

        for key in params:
            print_string += '{0}: {1}\n'.format(key, params[key])

        return print_string

    def check_model(self):
        if not self.fitted or self.model is None:
            raise ValueError('Model is not fitted or not created')

    def fit(self, corpora):
        """
        Fit LDA model by texts collection

        :param corpora: list of str
        """

        self.vectorized_data = self.vectorizer.fit_transform(corpora)

        search_params = {'n_components': [10, 15, 20, 25, 30]}
        model = GridSearchCV(self.model, param_grid=search_params, cv=3)
        model.fit(self.vectorized_data)

        self.model = model.best_estimator_
        self.fitted = True
        self.construct_df_topics()
        self.save_model()

    def predict(self, text, distribution_only=True):
        """
        Predict most relevant words for given text

        :param text: list of str (documents)
        :param distribution_only: bool, compute only document-topics distribution
        :return: list of (keyword, probability score)
        """

        self.check_model()

        if distribution_only:
            return self.model.transform(self.vectorizer.transform(text))

        topic_probability_scores = self.model.transform(
            self.vectorizer.transform(text))[0]
        topics = self.df_topic_keywords.iloc[
            argmax(topic_probability_scores), :].values.tolist()
        topics = list(zip(topics, topic_probability_scores))
        return sorted(topics, key=lambda x: x[1], reverse=True)

    def compute_similarity(self, text1, text2):
        """
        Compute the Jensen-Shannon distance between probability arrays of two texts

        :param text1: list of str
        :param text2: list of str
        :return: float in [0, 1], bigger - less similar
        """

        text1_dist = self.predict(text1)[0]
        text2_dist = self.predict(text2)[0]
        return jensenshannon(text1_dist, text2_dist)

    def construct_df_topics(self, n_words=20):
        """ Construct pd.DataFrame with top %n_words% keywords for each topic """

        self.check_model()
        topic_keywords = []
        keywords = array(self.vectorizer.get_feature_names())

        for topic_weights in self.model.components_:
            top_keyword_locs = (-topic_weights).argsort()[:n_words]
            topic_keywords.append(keywords.take(top_keyword_locs))

        self.df_topic_keywords = pd.DataFrame(topic_keywords)
        self.df_topic_keywords.columns = [
            'Word ' + str(i) for i in range(self.df_topic_keywords.shape[1])
        ]
        self.df_topic_keywords.index = [
            'Topic ' + str(i) for i in range(self.df_topic_keywords.shape[0])
        ]

    def stats(self):
        self.check_model()
        print('Log Likelihood:', self.model.score(self.vectorized_data))
        print('Perplexity:', self.model.perplexity(self.vectorized_data))

    def visualize(self):
        """ Start local web-server and display LDA fitted model """

        self.check_model()
        show(
            prepare(self.model,
                    self.vectorized_data,
                    self.vectorizer,
                    mds='tsne'))

    def load_model(self, filename):
        """ Load LDA model, CountVectorizer instance and term-document matrix from binary file """

        with open(filename, 'rb') as file:
            model_dict = pickle.load(file)

        self.model = model_dict['model']
        self.vectorizer = model_dict['vec']
        self.vectorized_data = model_dict['vec_data']
        self.df_topic_keywords = model_dict['df']

    def save_model(self):
        """ Save fitted LDA model by pickle """

        self.check_model()

        with open(self.filename, 'wb') as file:
            pickle.dump(
                {
                    'model': self.model,
                    'vec': self.vectorizer,
                    'vec_data': self.vectorized_data,
                    'df': self.df_topic_keywords
                }, file)
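A hypothetical usage sketch for the wrapper above; the filename and documents are placeholders:

lda = LDA('lda_model.pkl')  # loads a previously pickled model if the file exists
if not lda.fitted:
    corpora = ["first example document about topics",
               "second document mentioning dirichlet allocation",
               "third document on latent variables"]
    lda.fit(corpora)  # grid-searches n_components, then pickles the result
lda.stats()  # log-likelihood and perplexity on the training data
print(lda.predict(["an unseen document"], distribution_only=True))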
Example #18
    print "number of docs: %d" %A_tfidf_sp.shape[0]
    print "dictionary size: %d" %A_tfidf_sp.shape[1]

    #tf-idf dictionary    
    tfidf_dict = tfidf.get_feature_names()
             
    #fit LDA model
    print "Fitting LDA model..."
    lda_vb = LatentDirichletAllocation(n_topics = num_topics, max_iter=10, learning_method='online', batch_size = 512, random_state=0, n_jobs=-1)

    tic = time()
    lda_vb.fit(A_tfidf_sp)  #online VB
    toc = time()
    print "elapsed time: %.4f sec" %(toc - tic)
    print "LDA params"
    print lda_vb.get_params()

    print "number of EM iter: %d" % lda_vb.n_batch_iter_
    print "number of dataset sweeps: %d" % lda_vb.n_iter_

    #topic matrix W: K x V
    #components[i,j]: topic i, word j
    topics = lda_vb.components_
        
    f = plt.figure()
    plt.matshow(topics, cmap = 'gray')   
    plt.gca().set_aspect('auto')
    plt.title('learned topic matrix')
    plt.ylabel('topics')
    plt.xlabel('dictionary')
    plt.show()
Example #19
# Save the dropped sentences
drop_doc_path = "selected_%d_dropped.txt" % total
with open("data/"+drop_doc_path, "w") as doc:
    for line in doc_dropped:
        doc.write(line + "\n")

# Training LDA
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=None, stop_words='english')
tf = tf_vectorizer.fit_transform(doc_dropped)
lda = LatentDirichletAllocation(n_components=10, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
lda.fit(tf)
vocab_dict = tf_vectorizer.vocabulary_
components = lda.components_
component_names = tf_vectorizer.get_feature_names()

lda_params = lda.get_params(deep=True)
with open('data/params/lda_params.save', 'wb') as f:
    pickle.dump(lda_params, f, protocol=pickle.HIGHEST_PROTOCOL)

# Save topics and their associate words
for topic_idx, topic in enumerate(components):
    message = "topic_%d" % topic_idx
    print(message)
    idx = topic.argsort()
    with open("data/topics/" + message + ".txt", "w") as doc:
        for i in idx:
            doc.write(component_names[i] + "\n")

tf_sentence_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                         max_features=None,
                                         stop_words='english',
Example #20
fileTfVector = fileVector.fit_transform(data)

print(fileTfVector.shape)

# LDA training: train on 16 novels
topic = args.k
model = LDA(n_components=topic, max_iter=50, learning_method='batch')
docres = model.fit_transform(fileTfVector[:16])

# print(docres)
value, indices = torch.max(torch.tensor(docres), 1)
print(indices)
print("{}个主题识别出来了{}个主题".format(topic, len(list(set(indices.tolist())))))
# print(len(model.components_))

res = model.transform(fileTfVector)
assert len(res) == len(labels)
df_labels = pd.DataFrame(labels)
# df_labels.to_excel("labels.xlsx")
df_res = pd.DataFrame(res)
# df_res.to_excel("ldaVector.xlsx")
df = pd.concat([df_labels, df_res], axis=1)
df.to_excel("labels_with_vector.xlsx")

with open("history.txt", "a", encoding="utf-8") as f:
    print(topic, file=f)
    print(model.get_params(), file=f)
    print("perplexity:", model.perplexity(fileTfVector[:16]), file=f)
    print("", file=f)
Example #21
import pandas as pd
columnMap = pd.read_csv(dataDirectory + dataFile + "-columnMap.txt",header=None, names=("Idx","Term"))

targetMap = pd.read_csv(dataDirectory + dataFile + "-targetMap.txt",header=None, names=("Target","Idx"))

NUM_TOPICS=5

from sklearn.decomposition import LatentDirichletAllocation
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=400, random_state=10,
                                      learning_method='batch', learning_decay=0, evaluate_every=1,
                                      perp_tol=0.01, topic_word_prior=1/1000, verbose=1)

#lda_hr= lda_model.fit_transform(X)

lda_hr= lda_model.fit(X)
# checking model specification
lda_model.get_params()

def print_topics(model, feature_names, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(feature_names[i], round(topic[i],2))
        for i in topic.argsort()[:-top_n -1:-1]])
    
print("LDA Model:")
print_topics(lda_hr, columnMap['Term'])
print("=" * 40)


# grid search
from sklearn.model_selection import GridSearchCV
search_params = {'n_components': [2, 3, 4, 5, 6, 7]}
Example #22
    batch_size=128,  # No of docs in each iter
    evaluate_every=-1,  # Compute perplexity every n iters
    n_jobs=-1)  # Use all available CPUs

lda_output = lda_model.fit_transform(samples)
print(lda_model)

# Diagnose model performance with perplexity and log-likelihood
# Log Likelihood: higher the better
print("Log Likelihood: ", lda_model.score(samples))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(samples))

# See model parameters
pprint(lda_model.get_params())

# Perform GridSearch for the best LDA model
# Define Search Param
search_params = {
    'n_components': [6, 7, 8, 9],  # candidate numbers of topics
    'learning_decay': [0.5, 0.7, 0.9],
    'max_iter': [6, 7, 8, 9],
    'random_state': [2018]
}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
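The snippet stops after constructing the grid search; a minimal continuation sketch, assuming samples is the same vectorized document-word matrix used above:

model.fit(samples)  # run the grid search
best_lda_model = model.best_estimator_  # best model found
print("Best Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)
print("Model Perplexity: ", best_lda_model.perplexity(samples))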
Example #23
                                      batch_size=128,            
# n docs in each learning iter
                                      evaluate_every = -1,       
# compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               
# Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)  # takes time...
print(lda_model)  # print the model attributes
###############################################################################
    # 4bis. Diagnose model performance with perplexity and log-likelihood
print("Log Likelihood: ", lda_model.score(data_vectorized))  # higher the better
# out: Log Likelihood: -8509466.557993239
print("Perplexity: ", lda_model.perplexity(data_vectorized))  # lower the better; perplexity = exp(-1. * log-likelihood per word)
# out: Perplexity: 1039.767935888455
print(lda_model.get_params())  # print the LDA parameters
###############################################################################

    # 5. Use GridSearchCV.fit and .best_estimator_ to find the best n_components for the LDA model
# Define Grid-Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', 
                                learning_offset=50., random_state=0)

model = GridSearchCV(lda, param_grid=search_params)
model.fit(data_vectorized) #takes time !

# The grid search constructs one LDA model for each combination of parameter values.
# Echoing the fitted GridSearchCV object shows its configuration (output truncated):
GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,