Example #1
 def test_lda_params(self):
     n_topics = 10
     model1 = guidedlda.GuidedLDA(n_topics, alpha=0.3)
     self.assertIsNotNone(model1)
     model2 = guidedlda.GuidedLDA(n_topics=n_topics, alpha=0.3, eta=0.4)
     self.assertIsNotNone(model2)
     self.assertRaises(ValueError, guidedlda.GuidedLDA, n_topics, alpha=-3)
     self.assertRaises(ValueError, guidedlda.GuidedLDA, n_topics, eta=-3)
     self.assertRaises(ValueError, guidedlda.GuidedLDA, n_topics, alpha=-3, eta=-3)
Example #2
 def fit(self, dtm, seed_topics=None, seed_confidence=None):
     """ Fits topic model using guidedlda model.
         Args:
             dtm = numpy array or pandas dataframe, document-term-matrix
             guided = boolean, guided LDA or regular LDA
             seed_topics = list, list of words belonging to a topic
             seed_confidence = float, confidence of seed_topics
             n_topics = int, number of topics to model
             n_iter = int, number of iterations
             random_state = int,
             refresh = int,
         Returns:
             model = guidedlda object, fitted topic model
     """
     # run guided LDA if either seed argument was provided, otherwise regular LDA
     guided = bool(seed_topics) or bool(seed_confidence)
     # convert dtm to numpy array if input is in pandas
     if isinstance(dtm, pd.DataFrame):
         dtm = np.array(dtm)
     if not isinstance(dtm, np.ndarray):
         raise ValueError(
             'Please input a valid pandas dataframe or numpy array for dtm!'
         )
     # fit LDA model
     if guided:
         if not isinstance(seed_topics, dict):
             raise ValueError("Please enter a dictionary for seed_topics.")
         elif not isinstance(seed_confidence, float):
             raise ValueError("Please enter a float for seed_confidence.")
         elif self.n_topics < len(seed_topics):
             raise ValueError(
                 "n_topics must be greater than number of seed topics!")
         print("Guided LDA")
         model = guidedlda.GuidedLDA(n_topics=self.n_topics,
                                     n_iter=self.n_iter,
                                     random_state=self.random_state,
                                     refresh=self.refresh)
         model._fit(dtm, seed_topics, seed_confidence)
     elif not guided:
         print("Regular LDA")
         model = guidedlda.GuidedLDA(n_topics=self.n_topics,
                                     n_iter=self.n_iter,
                                     random_state=self.random_state,
                                     refresh=self.refresh)
         model.fit(dtm)
     self.model = model
     return model
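A minimal usage sketch for the wrapper above. The enclosing class is not shown in this snippet, so the name TopicModeler below is purely illustrative; the sketch only assumes the instance carries the n_topics, n_iter, random_state and refresh attributes the method reads, and that seed_topics maps document-term-matrix column ids to topic ids.

import numpy as np

modeler = TopicModeler(n_topics=2, n_iter=100, random_state=1, refresh=20)  # hypothetical class

dtm = np.array([[2, 0, 1],
                [0, 3, 1],
                [1, 1, 4]])            # toy document-term matrix
seed_topics = {0: 0, 2: 1}             # column id -> seeded topic id
model = modeler.fit(dtm, seed_topics=seed_topics, seed_confidence=0.15)
doc_topic = model.transform(dtm)       # per-document topic distributions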
Example #3
 def test_lda_getting_started(self):
     X = np.array([[1, 1], [2, 1], [3, 1], [4, 1], [5, 8], [6, 1]])
     model = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=1)
     doc_topic = model.fit_transform(X)
     self.assertIsNotNone(doc_topic)
     self.assertIsNotNone(model.doc_topic_)
     self.assertIsNotNone(model.components_)
Example #4
 def setUpClass(cls):
     test_dir = os.path.dirname(__file__)
     nyt_ldac_fn = os.path.join(test_dir, 'nyt.ldac')
     vocab = guidedlda.datasets.load_vocab(guidedlda.datasets.NYT)
     cls.word2id = word2id = dict((v, idx) for idx, v in enumerate(vocab))
     cls.dtm = dtm = guidedlda.utils.ldac2dtm(open(nyt_ldac_fn), offset=0)
     cls.n_iter = n_iter = 1
     cls.n_topics = n_topics = 5
     cls.random_seed = random_seed = 1
     cls.model = model = guidedlda.GuidedLDA(n_topics=n_topics,
                                             n_iter=n_iter,
                                             random_state=random_seed)
     cls.seed_topic_list = [
         ['game', 'team', 'win', 'player', 'season', 'second', 'victory'],
         [
             'percent', 'company', 'market', 'price', 'sell', 'business',
             'stock', 'share'
         ], ['music', 'write', 'art', 'book', 'world', 'film'],
         [
             'political', 'government', 'leader', 'official', 'state',
             'country'
         ]
     ]
     cls.seed_topics = seed_topics = {}
     for t_id, st in enumerate(cls.seed_topic_list):
         for word in st:
             seed_topics[word2id[word]] = t_id
     cls.doc_topic = model.fit_transform(dtm,
                                         seed_topics=seed_topics,
                                         seed_confidence=0.15)
Example #5
def do_lda(seed_topics, data):
    model = guidedlda.GuidedLDA(n_topics=30,
                                n_iter=5000,
                                random_state=7,
                                refresh=10)
    model.fit(data, seed_topics=seed_topics, seed_confidence=0.25)
    pickle.dump(model, open("guidedlda_30.pickle", "wb"))
    return model
Example #6
def non_guided_analysis(X, vocab, topic_num, n_top_words=TOP_K_WORDS):
    """
    Non-guided (regular LDA) analysis on the given dtm
    """
    model = guidedlda.GuidedLDA(n_topics=topic_num,
                                n_iter=ITERATION_NUMS,
                                random_state=7,
                                refresh=20)
    model.fit(X)
    retrieve_words_from(model, vocab, topic_num, n_top_words)
Example #7
 def test_guidedlda_getting_started(self):
     X = np.array([[1, 0], [2, 0], [3, 0], [4, 0], [0, 8], [6, 0]])
     model = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=1)
     seed_topics = {0: 0, 1: 1}
     doc_topic = model.fit_transform(X, seed_topics=seed_topics, seed_confidence=0.9)
     self.assertIsNotNone(doc_topic)
     self.assertIsNotNone(model.doc_topic_)
     self.assertIsNotNone(model.components_)
     self.assertEqual(model.word_topic_[0].argmax(), 0)
     self.assertEqual(model.word_topic_[1].argmax(), 1)
Example #8
def main(folder):
    word2idx = pickle.load(open(os.path.join(folder, "word_idx.p"), "rb"))
    print(word2idx)
    # Load seed topics
    seed_topics_dic, topics = seed_topics(word2idx)
    
    idx_to_word = {v: k for k, v in word2idx.items()}
    # Load data
    print("Starting training...")
    lda = guidedlda.GuidedLDA(n_topics=len(topics), n_iter=100, random_state=7, refresh=20)
    
    
    ## Concat data
    row, col, data = np.array(()), np.array(()), np.array(())
    
    matrix_data_list = glob.glob(os.path.join(folder, "matrix_data_*.p"))
    np.random.shuffle(matrix_data_list)
    for doc in tqdm.tqdm(matrix_data_list):
        print("Loading", doc)
        res = pickle.load(open(doc, "rb"))
        row = np.append(row, np.int32(res["I"]))
        col = np.append(col, np.int32(res["J"]))
        data = np.append(data, np.int32(res["data"]))

    # build the full sparse document-term matrix once, then fit on it
    X = coo_matrix((np.int32(data), (np.int32(row), np.int32(col))))
    lda.fit(X, seed_topics=seed_topics_dic, seed_confidence=0)
    
    print("Training done")
    def print_top_words(model, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            message = "Topic #{} - {}: ".format(topic_idx, topics[topic_idx])
            message += " ".join([idx_to_word[i]
                                 for i in topic.argsort()[:-n_top_words - 1:-1]])
            print(message)
    
    def print_sentence_and_topic(sentence, topic):
        print(colored("Sentence:", "blue"), colored(sentence, "green"))
        print(colored("Topic:   ", "blue"), colored(topic, "red"))
    
    print_top_words(lda, 20)
    np.save(open(os.path.join(folder, "guided_components.npy"), "wb"), lda.components_)
    
    ## Test for input sentences
    stemmer = WordNetLemmatizer() 
    while True:
        sentence = input()
        list_words = [w.lower() for w in sentence.split()]
        np_array = np.zeros([1, len(word2idx.keys())])
        for word in list_words:
            stemmed_word = stemmer.lemmatize(word)
            if stemmed_word in word2idx:
                print(stemmed_word)
                np_array[0, word2idx[stemmed_word]] += 1
        topic_dist = lda.transform(np.int32(np_array))
        print_sentence_and_topic(sentence, topics[np.argmax(topic_dist)])
Example #9
 def setUpClass(cls):
     test_dir = os.path.dirname(__file__)
     reuters_ldac_fn = os.path.join(test_dir, 'reuters.ldac')
     cls.dtm = scipy.sparse.csr_matrix(
         guidedlda.utils.ldac2dtm(open(reuters_ldac_fn),
                                  offset=0)).astype(np.int64)
     cls.n_iter = n_iter = 1
     cls.n_topics = n_topics = 10
     cls.random_seed = random_seed = 1
     cls.model = guidedlda.GuidedLDA(n_topics=n_topics,
                                     n_iter=n_iter,
                                     random_state=random_seed)
Example #10
 def setUpClass(cls):
     test_dir = os.path.dirname(__file__)
     reuters_ldac_fn = os.path.join(test_dir, 'reuters.ldac')
     cls.dtm = dtm = guidedlda.utils.ldac2dtm(open(reuters_ldac_fn),
                                              offset=0)
     cls.n_iter = n_iter = 1
     cls.n_topics = n_topics = 10
     cls.random_seed = random_seed = 1
     cls.model = model = guidedlda.GuidedLDA(n_topics=n_topics,
                                             n_iter=n_iter,
                                             random_state=random_seed)
     cls.doc_topic = model.fit_transform(dtm)
Example #11
    def test_lda_monotone(self):
        dtm = self.dtm
        model = self.model
        n_topics = self.n_topics
        random_seed = self.random_seed

        # fit model with additional iterations, verify improvement in log likelihood
        n_iter = self.n_iter * 2
        model_new = guidedlda.GuidedLDA(n_topics=n_topics,
                                        n_iter=n_iter,
                                        random_state=random_seed)
        model_new.fit(dtm)
        self.assertGreater(model_new.loglikelihood(), model.loglikelihood())
Example #12
def train_model(dataset,
                vocab,
                seed_topic_list,
                model_output_path,
                n_topics=5,
                n_top_words=10):
    """Function takes a dataset and creates a new model based on the privided input dataset
            
    Args:
        dataset (dtm): Dataset in dtm format as guided by guidedlda.datasets.* 
        vocab (list(str)): Global vocab.
        seed_topic_list (array(list(str)): Seed topic keywords used for GuidedLDA.
        model_output_path (str): Path to the ooutput of trained model
        n_topics (int): Number of topics, default=5
        n_top_words (int): Number of top words to be extracted for each topic. default=10

    Returns:
        Creates and model and writes to model_output_path        
    """
    word2id = dict((v, idx) for idx, v in enumerate(vocab))
    model = guidedlda.GuidedLDA(n_topics=n_topics,
                                n_iter=100,
                                random_state=7,
                                refresh=20)

    seed_topics = {}
    for t_id, st in enumerate(seed_topic_list):
        for word in st:
            seed_topics[word2id[word]] = t_id

    model.fit(dataset, seed_topics=seed_topics, seed_confidence=0.25)

    #List the top words of each topic from the trained model.
    topic_word = model.topic_word_
    for i, topic_dist in enumerate(topic_word):
        top_index = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
        topic_words = np.array(vocab)[top_index]
        print('Topic [{}]: {}'.format(TOPIC_INDEX[i], ' '.join(topic_words)))

    print('\n')
    #Test on some files
    #doc_topic = model.transform(dataset)
    #for i in range(9):
    #    print("Top topic: {} [Document Key words: '{}']".format(TOPIC_INDEX[doc_topic[i].argmax()],
    #                                                  ', '.join(np.array(vocab)[list(reversed(dataset[i,:].argsort()))[0:5]])))

    # Dump the model for future production use.
    #model.purge_extra_matrices()
    with open('{}.pickle'.format(model_output_path), 'wb') as file_handle:
        pickle.dump(model, file_handle)
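A hedged sketch of calling train_model with the NYT sample data that ships with guidedlda. TOPIC_INDEX, which the function references when printing, is assumed to be a module-level list with at least n_topics human-readable names; pickle and numpy are assumed to be imported as in the other examples.

import guidedlda

X = guidedlda.datasets.load_data(guidedlda.datasets.NYT)
vocab = guidedlda.datasets.load_vocab(guidedlda.datasets.NYT)

seed_topic_list = [
    ['game', 'team', 'win', 'player', 'season'],
    ['percent', 'company', 'market', 'price', 'sell'],
    ['music', 'write', 'art', 'book', 'film'],
]
# assumed to exist, e.g. TOPIC_INDEX = ['sports', 'business', 'culture', 'topic_3', 'topic_4']
train_model(X, vocab, seed_topic_list, 'nyt_guided', n_topics=5, n_top_words=10)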
Example #13
    def optimize_lda(params,
                     param_dict=param_dict,
                     data_vect=data_vect,
                     seed_topics=seed_topics,
                     seed_confidence=seed_confidence,
                     n_iter=n_iter_param,
                     refresh=refresh_param,
                     random_seed=random_seed,
                     corpus=corpus,
                     dictionary=dictionary,
                     feature_names=feature_names,
                     metric=metric_to_optimize):
        '''
        Function to minimize in hyper parameter optimization
        metric: 'coherence_consistent', 'loglikelihood'
        '''
        model_guidedlda = guidedlda.GuidedLDA(random_state=random_seed,
                                              n_iter=n_iter,
                                              refresh=refresh,
                                              **params,
                                              **param_dict)
        model_guidedlda.fit(X=data_vect,
                            seed_topics=seed_topics,
                            seed_confidence=seed_confidence)

        # to check whether model's params change every trials or not
        # print('alpha,', 'beta,', 'eta,', 'n_topics,', 'random_state')
        # print(model_guidedlda.alpha, model_guidedlda.beta, model_guidedlda.eta, model_guidedlda.n_topics, model_guidedlda.random_state)

        if metric == 'loglikelihood':
            metric_value = abs(model_guidedlda.loglikelihood())
        elif metric == 'coherence_consistent':
            n_top_words = 20
            topic_word = model_guidedlda.topic_word_
            topics_lists = []
            for i, topic_dist in enumerate(topic_word):
                topic_words = list(
                    np.array(feature_names)[np.argsort(topic_dist)]
                    [:-(n_top_words + 1):-1])
                topics_lists.append(topic_words)
            coherence_model_lda = gensim.models.CoherenceModel(
                topics=topics_lists,
                corpus=corpus,
                dictionary=dictionary,
                coherence='u_mass')
            coherence_lda = coherence_model_lda.get_coherence()
            metric_value = abs(coherence_lda)
        else:
            raise ValueError('Choose the metric_to_optimize of the function')
        return metric_value
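The closure above is shaped for hyperopt-style optimisers: it takes a single dict of trial parameters and returns a scalar to minimise. A sketch of wiring it into hyperopt's fmin under that assumption; the search-space keys are guesses at what **params is meant to expand to and must not overlap the keys already fixed in param_dict.

from hyperopt import fmin, tpe, hp, Trials

space = {
    'n_topics': hp.choice('n_topics', [5, 10, 15, 20]),
    'alpha': hp.uniform('alpha', 0.01, 1.0),
    'eta': hp.uniform('eta', 0.01, 1.0),
}
trials = Trials()
best = fmin(fn=optimize_lda, space=space, algo=tpe.suggest,
            max_evals=25, trials=trials)
print(best)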
Example #14
    def do_seeded_lda(self, X, vocab, word2id, n_topics, n_iter, alpha, eta, random_state, seed_topic_list, seed_confidence):
        model = guidedlda.GuidedLDA(n_topics=n_topics, n_iter=n_iter, alpha=alpha, eta=eta, random_state=random_state, refresh=1)

        seed_topics = {}
        for t_id, st in enumerate(seed_topic_list):
            for word in st:
                seed_topics[word2id[word]] = t_id

        model.fit(X, seed_topics=seed_topics, seed_confidence=seed_confidence)

        n_top_words = 10
        topic_word = model.topic_word_
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
            print('Topic {}: {}'.format(i, ' '.join(topic_words)))
        return model.topic_word_
Example #15
    def test_lda_zero_iter(self):
        dtm = self.dtm
        model = self.model
        doc_topic = self.doc_topic
        n_topics = self.n_topics
        random_seed = self.random_seed

        # fit a new model with 0 iterations
        n_iter = 0
        model_new = guidedlda.GuidedLDA(n_topics=n_topics,
                                        n_iter=n_iter,
                                        random_state=random_seed)
        doc_topic_new = model_new.fit_transform(dtm)
        self.assertIsNotNone(model_new)
        self.assertIsNotNone(doc_topic_new)
        self.assertLess(model_new.loglikelihood(), model.loglikelihood())
        self.assertFalse((doc_topic_new == doc_topic).all())
Example #16
def guided_analysis(X,
                    word2id,
                    vocab,
                    topic_num,
                    confidence,
                    n_top_words=TOP_K_WORDS):
    """
    Guided Analysis on the given dtm
    """
    model = guidedlda.GuidedLDA(n_topics=topic_num,
                                n_iter=ITERATION_NUMS,
                                random_state=7,
                                refresh=20)
    model.fit(X,
              seed_topics=load_seed_topics(word2id),
              seed_confidence=confidence)
    return model
Example #17
def runGuidedLDA(doc_collection,no_topics,stop_words,seed_topics):
	print('Start SKLearnLDA...')
	tf_vec=CountVectorizer(max_df=0.95,min_df=2,max_features=no_features,stop_words=stop_words)
	termfreq=tf_vec.fit_transform(doc_collection)
	feature_names=tf_vec.get_feature_names()
	#Run LDA using scikit-learn
	print('Constructing GUIDED LDA model...')
	startlda=time.time()
	ldamodel=guidedlda.GuidedLDA(n_topics=no_topics, n_iter=100, random_state=7, refresh=20).fit(termfreq,seed_topics=seed_topics, seed_confidence=0.15)
	print('LDA Model Construction Took:'+str((time.time()-startlda)/60)+' minutes.')
	startldavecs=time.time()
	print('Constructing LDA vectors...')
	#ldavecs = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit_transform(termfreq,docidentifiers)	
	ldavecs=ldamodel.transform(termfreq)
	print('LDA Vector Construction Took:'+str((time.time()-startldavecs)/60)+' minutes.')
	print('Completed SKLearnLDA!')
	return termfreq,ldamodel,ldavecs,feature_names
Example #18
    def test_lda_random_seed(self):
        dtm = self.dtm
        doc_topic = self.doc_topic
        n_iter = self.n_iter
        n_topics = self.n_topics
        random_seed = self.random_seed
        random_state = self.model.random_state

        # refit model with same random seed and verify results identical
        model_new = guidedlda.GuidedLDA(n_topics=n_topics,
                                        n_iter=n_iter,
                                        random_state=random_seed)
        rands_init = model_new._rands.copy()
        doc_topic_new = model_new.fit_transform(dtm)
        rands_fit = model_new._rands.copy()
        random_state_new = model_new.random_state
        np.testing.assert_array_equal(doc_topic_new, doc_topic)
        np.testing.assert_array_equal(random_state_new, random_state)

        # verify random variates are not changed
        np.testing.assert_array_equal(rands_init, rands_fit)
Example #19
def build_model(data,
                num_topics,
                seed_topic_list,
                seed_conf,
                top_n=10,
                include_vis=True):

    #form bow matrix to feed as input into training guidedlda model
    data = [' '.join(text) for text in data]
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data).toarray()
    vocab = vectorizer.get_feature_names()
    word2id = dict((v, idx) for idx, v in enumerate(vocab))

    #Creates dictionary that assigns words to topics via their
    #topic id given by the id2word assignment
    seed_topics = {}
    for topic_id, subset in enumerate(seed_topic_list):
        for word in subset:
            if word in word2id:
                seed_topics[word2id[word]] = topic_id

    # Build GuidedLDA model
    guidedlda_model = guidedlda.GuidedLDA(n_topics=num_topics,
                                          n_iter=100,
                                          random_state=7,
                                          refresh=20)
    guidedlda_model.fit(X, seed_topics=seed_topics, seed_confidence=seed_conf)

    top_vocab(guidedlda_model, vocab, top_n)

    # Saves model for production later
    with open('results/guided_lda/guided_lda_{}'.format(num_topics),
              'wb') as f:
        pickle.dump(guidedlda_model, f)
    return guidedlda_model
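A brief usage sketch for build_model. Documents are passed as lists of tokens because the function joins them before vectorising; it assumes the helper top_vocab is defined in the same module and that the results/guided_lda output directory already exists.

docs = [
    ['solar', 'panel', 'energy', 'grid'],
    ['bus', 'rail', 'transit', 'city'],
    ['energy', 'price', 'grid', 'solar'],
]
seed_topic_list = [['solar', 'energy'], ['transit', 'rail']]
model = build_model(docs, num_topics=2, seed_topic_list=seed_topic_list, seed_conf=0.3)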
Example #20
vectorizer = CountVectorizer(min_df=5,
                             max_df=0.9,
                             stop_words='english',
                             lowercase=True,
                             ngram_range=(1, 3))
data_vectorized = vectorizer.fit_transform(train_clean_sentences)
# build vocab/word2id from the vectorizer so seed ids line up with the dtm columns
vocab = vectorizer.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(vocab))
seed_topic_list = [[
    'carbon', 'pricing', 'greenhouse', 'backstop', 'infrastructure'
], ['mobility', 'transit', 'transportation'],
                   ['social', 'exclusion', 'alone', 'friend', 'family']]
model = guidedlda.GuidedLDA(n_topics=4, n_iter=100, refresh=20)
seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        if word in word2id:  # seed words removed by min_df/max_df or stop words are skipped
            seed_topics[word2id[word]] = t_id

model.fit(data_vectorized, seed_topics=seed_topics, seed_confidence=0.15)
n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                             1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

## try Word2Vec
gensim_list = []
Example #21
File: pipeline5.py  Project: 77ph/tgnews
docnames = list(docs.keys())

docnames = np.array(docnames)
vocab = np.array(list(vocab))
vocab_sorter = np.argsort(vocab)

print("done get data.")
""" create en_gLDA pipeline """
print("Extracting tf features for gLDA...")
tf_vectorizer = CountVectorizer(max_df=0.95,
                                min_df=2,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)
gLDA = guidedlda.GuidedLDA(n_topics=n_components,
                           n_iter=100,
                           random_state=7,
                           refresh=20)
gLDA.fit(tf)

filename_glda = 'glda_model.sav'

#gLDA = pickle.load(open(filename_glda, 'rb'))
#gLDA.fit(tf)
pickle.dump(gLDA, open(filename_glda, 'wb'))
## Use tf-idf features for SGD.
#print("Extracting tf-idf features for SGD...")
#tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
#                                   max_features=n_features,
#                                   stop_words='english')
#t0 = time()
#tfidf = tfidf_vectorizer.fit_transform(data_samples)
Example #22
def run():
    # Read in data set
    df = pd.read_csv('Movie_Metadata_Sentiments.csv')
    # Subset only emotions required to get overall emotion detected from the text content
    sub_df = df[['anger', 'joy', 'fear', 'sadness']]
    df['Max'] = sub_df.idxmax(axis=1)
    # Split into train and test data set
    train, test = train_test_split(df, test_size=0.2, random_state=1)
    # Save to csv file
    df.to_csv('Movie_Metadata_Sentiments_Modified.csv', encoding='utf-8', header=True)
    test.to_csv('Movie_Metadata_Sentiments_Test.csv', encoding='utf-8', header=True)
    # Pre-process data to be fed into Guide LDA Model
    processed_docs = extract_data(train)

    dictionary = gensim.corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=5, no_above=0.4, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    dict1 = dictionary.token2id
    X = get_term_matrix(processed_docs, dictionary)

    print("Guided LDA")
    emolex_df = pd.read_csv('NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', names=["word", "emotion", "association"],
                            sep='\t')

    # Create seed list for each category
    anger_df = emolex_df[(emolex_df.association == 1) & (emolex_df.emotion == 'anger')].word
    anger_seed = [item for item in anger_df]

    joy_df = emolex_df[(emolex_df.association == 1) & (emolex_df.emotion == 'joy')].word
    joy_seed = [item for item in joy_df]

    fear_df = emolex_df[(emolex_df.association == 1) & (emolex_df.emotion == 'fear')].word
    fear_seed = [item for item in fear_df]

    sadness_df = emolex_df[(emolex_df.association == 1) & (emolex_df.emotion == 'sadness')].word
    sadness_seed = [item for item in sadness_df]

    # Append all topic list to be fed into model
    seed_topic_list = [anger_seed, joy_seed, fear_seed, sadness_seed]

    seed_topics = {}
    for t_id, st in enumerate(seed_topic_list):
        for word in st:
            try:
                seed_topics[dict1[word]] = t_id
            except KeyError:
                # seed word not in the filtered dictionary; skip it
                pass

    # Train the GuidedLDA model
    model = guidedlda.GuidedLDA(alpha=0.1, n_topics=4, n_iter=1000, random_state=7, refresh=20)
    model.fit(X, seed_topics=seed_topics, seed_confidence=0.20)

    # Check top n words in each topic (emotions)
    n_top_words = 15
    topic_word = model.topic_word_
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(list(dict1.keys()))[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    # Save the model to be imported into Telegram bot
    with open('Movie_Metadata_Sentiments_GLDA.model', 'wb') as file_handle:
        Pickle.dump(model, file_handle)

    # Save the Dictionary and Corpus
    name = "Movie_Metadata_Sentiments"
    dictionary.save("./dictionaries/" + name + ".dict")
    corpora.MmCorpus.serialize("./corpus/" + name + ".mm", bow_corpus)
    # Test the model on test set
    test_model()
Example #23
def grid_search_lda_SED(texts,
                        seed_topic_list,
                        n_topics_range,
                        priors_range,
                        out_dir,
                        n_top_words=20,
                        seed_confidence=0.15,
                        iterations=2000,
                        save_doc_top=True,
                        verbose=True):
    '''
    Fit many topic models to pick the most tuned hyperparameters.
    Guidedlda version.

    Each fitted model is saved, the filename having the following format:
    {n topics}T_{alpha}A_{eta}E_seed.{file extension}


    Parameters
    ----------
    texts : iterable
        already preprocessed text data you want to build seeds on.

    seed_topic_list : list of lists
        nested list of words, where seed_topic_list[x][y]
        is the y-th seed word belonging to topic x.

    n_topics_range : iterable of int | int
        Number of topics to fit the model with.
        When fitting a single model, :int: is enough.
        Otherwise, input list of ints, a range, or other iterables.

    priors_range : list of tuples
        where every 1st element is alpha, every 2nd is eta. 

    out_dir : str
        path to a directory, where results will be saved (in a child directory).

    n_top_words : int, optional (default: 20)
        when extracting top words associated with each topics, how many to pick?

    seed_confidence : float, optional (default: 0.15)
        When initializing the LDA, where are you on the spectrum
        of sampling from seeds (1), vs. sampling randomly (0)?

    iterations : int, optional (default: 2000)
        maximum number of iterations to fit a topic model with.

    save_doc_top : bool
        save document-topic matrices from models?

    verbose : bool, optional (default: True)
        print progress comments.


    Exports
    -------
    out_dir/report_lines/*
        pickled dict with model information
        (n topics, model coherence, per-topic coherence, hyperparameters)
        
    out_dir/models/*
        gensim objects, where the model is saved.
        
    out_dir/plots/*
        pyLDAvis visualizations of the model
    '''
    # INITIALIZATION
    # prepare folders
    make_folders(out_dir)

    # paths
    report_dir = os.path.join(out_dir, "report_lines", "")
    model_dir = os.path.join(out_dir, "models", "")
    plot_dir = os.path.join(out_dir, "plots", "")
    doctop_dir = os.path.join(out_dir, 'doctop_mats', '')

    # if a single model is to be fitted,
    # make sure it can be "iterated"
    if isinstance(n_topics_range, int):
        n_topics_range = [n_topics_range]

    # PREPARE DATA
    # for guidedlda fitting
    X, seed_priors, vectorizer = init_guidedlda(
        texts=texts,
        seed_topic_list=seed_topic_list,
    )

    # for coherence counting
    bows, dictionary = gensim_format(texts)

    # TRAIN MODELS
    i = 0
    for n_top in chain(n_topics_range):

        # iterate over priors
        for alpha_, eta_ in priors_range:

            # track time
            start_time = time()  # track time
            # track iterations
            topic_fname = str(n_top) + "T_"
            alpha_fname = str(alpha_).replace('.', '') + 'A_'
            eta_fname = str(eta_).replace('.', '') + 'E_'

            # paths for saving
            filename = topic_fname + alpha_fname + eta_fname + 'seed'
            report_path = os.path.join(report_dir + filename + '.ndjson')
            model_path = os.path.join(model_dir + filename + '.joblib')
            pyldavis_path = os.path.join(plot_dir + filename +
                                         '_pyldavis.html')
            doctop_path = os.path.join(doctop_dir + filename + '_mat.ndjson')

            # train model
            model = guidedlda.GuidedLDA(n_topics=n_top,
                                        n_iter=iterations,
                                        alpha=alpha_,
                                        eta=eta_,
                                        random_state=7,
                                        refresh=10)

            # TODO: iterate seed_confidence?
            model.fit(X,
                      seed_topics=seed_priors,
                      seed_confidence=seed_confidence)

            # track time usage
            training_time = time() - start_time
            if verbose:
                print('    Time: {}'.format(training_time))

            # save priors
            alpha = model.alpha
            eta = model.eta

            # extract topic words
            topics = []
            for i, topic_dist in enumerate(model.topic_word_):
                topic_words = (
                    # take vocab (list of tokens in order)
                    np.array(vectorizer.get_feature_names())
                    # take term-topic distribution (topic_dist),
                    # where topic_dist[0] is probability of vocab[0] in that topic
                    # and sort vocab in descending order
                    [np.argsort(topic_dist)]
                    # select and reverse order so that only the top n_top_words are kept
                    [:-(n_top_words + 1):-1])
                # array to list
                topic_words = [word for word in topic_words]
                topics.append(topic_words)

            # calculate topic coherence based on the extracted topics
            coh_score, coh_topics = coherence_guidedlda(topics=topics,
                                                        bows=bows,
                                                        dictionary=dictionary,
                                                        texts=texts)

            # save report
            report = (n_top, alpha, eta, training_time, coh_score, coh_topics)
            with open(report_path, 'w') as f:
                ndjson.dump(report, f)

            # save model
            dump(model, model_path)

            # produce a visualization
            nice = pyLDAvis.sklearn.prepare(model, X, vectorizer)
            pyLDAvis.save_html(nice, pyldavis_path)

            # save document-topic matrix
            if save_doc_top:
                doc_topic = (model.transform(X).tolist())

                with open(doctop_path, 'w') as f:
                    ndjson.dump(doc_topic, f)

    return None
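A sketch of one call to the grid search above. It assumes the helpers the function relies on (make_folders, init_guidedlda, gensim_format, coherence_guidedlda) are importable from the same module; the texts, seed lists and ranges are illustrative only.

texts = [
    ['carbon', 'tax', 'emission', 'climate'],
    ['bus', 'rail', 'transit', 'city'],
    ['carbon', 'pricing', 'climate', 'policy'],
]
seed_topic_list = [['carbon', 'climate'], ['transit', 'rail']]

grid_search_lda_SED(texts=texts,
                    seed_topic_list=seed_topic_list,
                    n_topics_range=range(5, 16, 5),
                    priors_range=[(0.1, 0.01), (0.5, 0.1)],
                    out_dir='grid_out',
                    iterations=500,
                    save_doc_top=False)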
Example #24
    def __init__(self,
                 data,
                 n_iter=500,
                 eta=0.2,
                 alpha=0.2,
                 seed_confidence=10):
        # Generate sparse matrix representation of documents with stopwords removed.
        self._stopwords = text.ENGLISH_STOP_WORDS.union(
            ['appeared', '8217', '8230', '000'])
        self._vectoriser = CountVectorizer(stop_words=self._stopwords,
                                           max_features=1500)
        self._data = list(data.values)
        self._docs = self._vectoriser.fit_transform(self._data)
        self._features = self._vectoriser.get_feature_names()

        # Specify the guided topics
        self._seed_topics = [
            ['btc', 'bitcoin', 'satoshi'],
            ['eth', 'ethereum', 'vitalik', 'foundation'],
            [
                'altcoin', 'altcoins', 'ltc', 'litecoin', 'xmr', 'monero',
                'zec', 'zcash', 'etc', 'classic', 'xrp', 'ripple', 'trx',
                'tron', 'ada', 'cardano', 'dash', 'digitalcash', 'xtz',
                'tezoz', 'usdt', 'tether'
            ], ['mining', 'hashrate', 'hashing', 'pools', 'reward'],
            ['exchange', 'bitfinex', 'poloniex', 'binance'],
            ['market', 'markets', 'analysis', 'index', 'prices'],
            ['asia', 'china', 'korea', 'japan', 'hong', 'singapore', 'taiwan'],
            ['icos', 'ico', 'offering', 'token', 'tokens', 'raise', 'raised'],
            ['regulation', 'legal', 'law', 'tax', 'taxes'],
            ['blockchain', 'protocol', 'scaling'],
            ['bull', 'bear', 'bullish', 'rally', 'bearish', 'trading'],
            ['technology', 'tech'],
            [
                'ledger', 'trezor', 'keepkey', 'coinomi', 'jaxx',
                'myetherwallet'
            ],
            [
                'fiat', 'reserve', 'gold', 'bank', 'dollar', 'pound', 'euro',
                'yen'
            ],
            [
                'business', 'investor', 'investors', 'revenue', 'enterprise',
                'commerce'
            ], ['commodity', 'oil', 'oil-backed'],
            ['sponsored', 'press', 'release'],
            ['theft', 'stolen', 'scam', 'criminal']
        ]
        self.topic_names = [
            'btc', 'eth', 'altcoins', 'mining', 'exchange', 'market', 'asia',
            'ico', 'regulation', 'blockchain', 'trading', 'technology',
            'wallet', 'fiat', 'business', 'commodity', 'sponsored', 'criminal'
        ]
        self._n_topics = len(self.topic_names)

        # Define LDA model parameters
        self.seed_confidence = seed_confidence
        self._model = guidedlda.GuidedLDA(self._n_topics,
                                          n_iter=n_iter,
                                          alpha=alpha,
                                          eta=eta)
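The constructor above only stores the seed word lists; before fitting, they still have to be converted into the {word_id: topic_id} mapping GuidedLDA expects. A minimal sketch of how a later method of the same class might do that, using only attributes defined in __init__ (the method name fit_model is hypothetical):

    def fit_model(self):
        # map each seed word to its column id in the vectorised documents
        word2id = dict((w, i) for i, w in enumerate(self._features))
        seed_topics = {}
        for t_id, words in enumerate(self._seed_topics):
            for w in words:
                if w in word2id:  # skip seed words absent from the vocabulary
                    seed_topics[word2id[w]] = t_id
        self._model.fit(self._docs,
                        seed_topics=seed_topics,
                        seed_confidence=self.seed_confidence)
        return self._model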
Example #25
 def load_glda_topic_model(self):
     print('Loading topics...')
     self.glda = guidedlda.GuidedLDA(n_topics=1)
     self.glda = joblib.load('glda_topic_model.lda')
     self.glda_vectorizer = self.load_glda_term_vector_topic_model()
     self.glda_tf_feature_names = self.glda_vectorizer.get_feature_names()
Example #26
    def build_lda(self, corpus, n_topics=10):

        n_components = n_topics
        n_top_words = 30

        docs = corpus

        #         print("Stemming...")
        #         stemmer = CorpusStemmer()
        #         docs = stemmer.transform(docs)

        #         print("DOC tagging...")
        #         tagger = CorpusPOSTagger()
        #         tagged_docs = tagger.transform(docs)
        #
        #         tag_constraints = []
        #
        #         # build tags based on Singular Noun, Noun and Adjetive, Noun
        #         label_tags = ['NN,NN', 'JJ,NN', 'NNS,NN' ]
        #         for tags in label_tags:
        #             tag_constraints.append(tuple(map(lambda t: t.strip(),
        #                                                  tags.split(','))))
        #
        #         cand_labels = self.find_labels(n_labels, label_min_df, tag_constraints, tagged_docs, n_cand_labels, docs)
        #
        #         print("Collected {} candidate labels".format(len(cand_labels)))
        #
        #         print("Calculate the PMI scores...")
        #
        #         pmi_cal = PMICalculator(
        #             doc2word_vectorizer=CountVectorizer(
        #                 max_df=.95,
        #                 min_df=5,
        #                 lowercase=True,
        #                 token_pattern= r'\b[a-zA-Z]{3,}\b',
        #                 stop_words=self.load_stopwords()
        #                 ),
        #             doc2label_vectorizer=LabelCountVectorizer())
        #
        #         pmi_w2l = pmi_cal.from_texts(docs, cand_labels)
        #
        bigram = gensim.models.Phrases(docs)

        print('Building BiGrams from the corpus...')
        texts = [bigram[line] for line in docs]

        stop = list(self.load_stopwords())
        stop.append('off')
        stop.append('http')
        stop.append('www')
        stop.append('edt')
        stop.append('est')
        stop.append('mdt')
        stop.append('pst')
        stop.append('pt')

        tf_vectorizer = LemmaCountVectorizer(max_df=.95,
                                             min_df=2,
                                             lowercase=True,
                                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                                             stop_words=stop)

        print("Building the Vectorizer for Topic model...")
        tf = tf_vectorizer.fit_transform(
            map(lambda sent: ' '.join(sent), texts))

        self.tf_feature_names = tf_vectorizer.get_feature_names()

        # Save vectorizer to disk as it is needed for service topic-extraction
        self.save_term_vector_topic_model(tf_vectorizer)
        self.save_fitted_term_vector(tf)

        # test semi-supervised topics
        print("Building the Alternative LDA Topic model...")
        vocab = tuple(self.tf_feature_names)

        seed_topic_list = {}
        word2id = {}

        if len(seed_topic_list) > 0:
            word2id = dict(
                (v, idx) for idx, v in enumerate(tuple(self.tf_feature_names)))


#         array_tf = tf.toarray()

# Guided LDA with seed topics.
#         seed_topic_list = {'percent': 7, 'year': 7, 'month': 7, 'quarter': 7}
        seed_topics = {}
        for term, topic in seed_topic_list.items():
            seed_topics[word2id[term]] = topic

        model = guidedlda.GuidedLDA(n_topics=n_components,
                                    n_iter=100,
                                    random_state=7,
                                    refresh=50)

        logger.propagate = False

        model.fit(tf, seed_topics=seed_topics, seed_confidence=0.15)

        #         model.fit(array_tf)

        self.lda = model

        print("Printing the Alternative LDA Topic model...")
        topic_word = model.topic_word_
        n_top_words = 30

        # print topics with words and score rank
        for i, topic_dist in enumerate(topic_word):
            #             topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
            #             print('Topic {}: {}'.format(i, ' '.join(topic_words)))
            topic_ = topic_dist
            # normalize to a probability distribution
            topic_ = topic_ / topic_.sum()
            bestn = matutils.argsort(topic_, 30, reverse=True)
            topic_ = [(self.tf_feature_names[id], topic_[id]) for id in bestn]
            topic_ = ' + '.join(['%.3f*"%s"' % (v, k) for k, v in topic_])
            print("Topic#", i, ":", topic_)

        print("Saving the LDA Topic model to .lda ...")
        self.save_lda_topic_model()

        print("Building the LDA Visualization for Topic model...")

        py_glda_vis = MyDisplay()

        list_topic_names = [
            'T00', 'T01', 'T02', 'T03', 'T04', 'T05', 'T06', 'T07', 'T08',
            'T09', 'T10', 'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17',
            'T18', 'T19', 'T20', 'T21', 'T22', 'T23', 'T24', 'T25', 'T26',
            'T27', 'T28', 'T29', 'T30', 'T31', 'T32', 'T33', 'T34', 'T35',
            'T36', 'T37', 'T38', 'T39'
        ]

        list_topic_labels = []

        visualization = py_glda_vis.prepare_glda(model,
                                                 tf,
                                                 tf_vectorizer,
                                                 mds='tsne')

        # processing the labels in the same order of topic relevance
        print(list(visualization[6:])[0])

        for i, topic in enumerate(list(visualization[6:])[0]):
            list_topic_labels.append(list_topic_names[topic - 1])

        topic_name = {"topic.names": list_topic_labels}

        print(topic_name)

        #         visualization_html = py_lda_vis.prepared_data_to_html(visualization,
        #                                                               json_names=topic_name)
        py_glda_vis.save_html(visualization,
                              'GLDA_Visualization_labels.html',
                              json_names=topic_name)

        py_glda_vis.save_html(visualization,
                              'GLDA_Visualization_nolabels.html')
        # this example is to be used for future Topic Model builder
        #         try:
        #             self.load_lda_topic_model()
        #         except FileNotFoundError:
        # if no lda topic_model file, build a new one

        #         self.encoded_html = base64.b64encode(visualization_html.encode())
        self.encoded_html = ''

        self.topics = {}
Example #27
def home():
    return "Hello, World!"  # return a string; note: everything below this early return is unreachable as written
    data = pd.read_csv('text data.csv')

    texts = data['Article']
    labels = data['Class']
    profession = data['Profession']
    stop = stopwords.words('english')
    stemmer = SnowballStemmer("english")
    preprocess = data['Article'].apply(
        lambda x: [item for item in x if item not in stop])
    preprocess = data["Article"].apply(lambda x: [stemmer.stem(y) for y in x])
    preprocess = data['Article'].str.replace("Context\n", " ")
    preprocess = data['Article'].str.replace("Context:", " ")
    preprocess = data['Article'].str.replace("CONTEXT:", " ")
    preprocess = data['Article'].str.replace("Context:", " ")
    preprocess = data['Article'].str.replace("Context.", " ")
    path = 'record.txt'
    with io.open(path, encoding='utf-8') as f:
        text = f.read().lower()
    print('corpus length:', len(text))
    chars = sorted(list(set(text)))
    print('total chars:', len(chars))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    # cut the text in semi-redundant sequences of maxlen characters
    maxlen = 40
    step = 3
    sentences = []
    next_chars = []
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i:i + maxlen])
        next_chars.append(text[i + maxlen])

    print('Vectorization...')
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        y[i, char_indices[next_chars[i]]] = 1

    vocab = guidedlda.datasets.load_vocab(guidedlda.datasets.REUTERS)
    # Guided LDA Implementation
    model = guidedlda.GuidedLDA(n_topics=52,
                                n_iter=100,
                                random_state=7,
                                refresh=20)
    model.fit(y)
    topic_word = model.topic_word_
    n_top_words = 8
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                                 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    dataSet = pd.read_csv("new_topics.csv")

    def text_process(mess):
        nopunc = [char for char in mess if char not in string.punctuation]
        nopunc = ''.join(nopunc)
        return [
            word for word in nopunc.split()
            if word.lower() not in stopwords.words('english')
        ]

    pipeline = Pipeline([
        ('bow', CountVectorizer(analyzer=text_process)),
        ('tfidf',
         TfidfTransformer()),  # integer counts to weighted TF-IDF scores
        ('classifier',
         MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
    ])
    pipeline.fit(dataSet["Topics"], dataSet["Profession"])
    a = dataSet["Topics"]
    prediction = pipeline.predict(a)
    print(classification_report(dataSet["Profession"], prediction))
    b = ["Data"]
    prediction = pipeline.predict(b)
    print("Predicted Profession:", prediction)
Example #28
 def test_lda_loglikelihoods(self):
     X = np.array([[1, 1], [2, 1], [3, 1], [4, 1], [5, 8], [6, 1]])
     model = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=1)
     model.fit(X)
     self.assertGreater(len(model.loglikelihoods_), 1)
Example #29
get_lda_summary(5, 10, climate['text_processed'], 'climate_lda')
get_lda_summary(5, 10, yv['text_processed'], 'yv_lda')
get_lda_summary(5, 10, hk['text_processed'], 'hk_lda')
get_lda_summary(5, 10, usa['text_processed'], 'usa_lda')

##########
## guided lda
##########

import guidedlda
vocab = count_vectorizer.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(vocab))

seed_topic_list = [['😢'], ['😡', '😠'], ['😀', '☺️'], ['🤣', '😂']]

model = guidedlda.GuidedLDA(n_topics=4, n_iter=100, random_state=7, refresh=20)

seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[word2id[word]] = t_id

model.fit(count_data, seed_topics=seed_topics, seed_confidence=0.15)

n_top_words = 10
topic_word = model.topic_word_

for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words +
                                                             1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))
Example #30
 def test_lda_constructor(self):
     n_topics = 10
     model1 = guidedlda.GuidedLDA(n_topics)
     self.assertIsNotNone(model1)
     model2 = guidedlda.GuidedLDA(n_topics=n_topics)
     self.assertIsNotNone(model2)