예제 #1
0
    def topic_analysis(self,
                       n_topics=10,
                       model_type='lda',
                       n_terms=50,
                       n_highlighted_topics=5,
                       plot=False,
                       save=False,
                       kwargs=None):
        '''
        DESC: Latent topic modeling of the matrix stored in self.tfidf, followed by a termite plot
        of the latent topics.
        --Input--
            n_topics: (int) number of latent topics
            model_type: (str) 'nmf','lsa','lda' or sklearn.decomposition.<model>
            n_terms: (int) number of key terms plotted in the termite plot (y-axis)
            n_highlighted_topics: (int) number of topics to highlight, taken in order of
                decreasing topic weight; at most 6
            plot: (bool) if True, display the figure with plt.show()
            save: (str) filename to save the plot to; any falsy value skips saving
            kwargs: (dict) model hyperparameters --> see sklearn.decomposition.<model>
        ----------------------------------
        --Output--
            Returns None. Sets self.model and self.topic_matrix (the transform of self.tfidf),
            fills self.latent_topics_top_terms (topic index -> top terms) and
            self.topic_w_weights (topic index -> weight), and draws the termite plot.
            If n_highlighted_topics > 6 a message is printed and nothing else is done.
        '''
        if n_highlighted_topics > 6:
            # The message now states the limit that the check above actually enforces.
            print('Value Error: n_highlighted_topics must be <= 6')
            return

        # Hyperparameters are unpacked into keyword arguments; handing the dict
        # over under a single `kwargs=` keyword would not forward its entries.
        self.model = textacy.TopicModel(model_type,
                                        n_topics=n_topics,
                                        **(kwargs or {}))
        self.model.fit(self.tfidf)
        self.topic_matrix = self.model.transform(self.tfidf)

        for topic_idx, top_terms in self.model.top_topic_terms(
                self.vectorizer.feature_names,
                topics=range(n_topics),
                weights=False):
            self.latent_topics_top_terms[topic_idx] = top_terms

        weights = list(self.model.topic_weights(self.topic_matrix))
        for topic, weight in enumerate(weights):
            self.topic_w_weights[topic] = weight

        # Rank topic indices by weight directly. Keying a dict by weight (as
        # before) silently dropped topics that happened to share a weight.
        ranked_topics = sorted(range(len(weights)),
                               key=lambda topic: weights[topic],
                               reverse=True)
        highlight = ranked_topics[:n_highlighted_topics]

        # One column label per topic instead of a fixed list of nine labels.
        col_labels = ['Topic {}'.format(i + 1) for i in range(n_topics)]

        self.model.termite_plot(self.tfidf,
                                self.vectorizer.feature_names,
                                topics=-1,
                                n_terms=n_terms,
                                highlight_topics=highlight,
                                col_labels=col_labels)
        plt.tight_layout()
        print('plotting...')
        if save:
            plt.savefig(save)
        if plot:
            plt.show()
        return
예제 #2
0
 def test_vectorization_and_topic_modeling_functionality(self):
     """Vectorize self.corpus with tf-idf, fit an NMF topic model, and check result types and sizes."""
     expected_topics = 10
     terms_per_topic = 10

     vectorizer_options = dict(
         weighting='tfidf',
         normalize=True,
         smooth_idf=True,
         min_df=2,
         max_df=0.95,
     )
     vectorizer = textacy.Vectorizer(**vectorizer_options)

     # Unigrams plus named entities, as strings, one term list per document.
     terms_per_doc = (
         doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
         for doc in self.corpus
     )
     dtm = vectorizer.fit_transform(terms_per_doc)

     topic_model = textacy.TopicModel('nmf', n_topics=expected_topics)
     topic_model.fit(dtm)
     doc_topics = topic_model.transform(dtm)

     self.assertIsInstance(dtm, sp.csr_matrix)
     self.assertIsInstance(doc_topics, np.ndarray)
     self.assertEqual(doc_topics.shape[1], expected_topics)

     topic_terms = topic_model.top_topic_terms(
         vectorizer.id_to_term, top_n=terms_per_topic)
     for index, terms in topic_terms:
         self.assertIsInstance(index, int)
         self.assertEqual(len(terms), terms_per_topic)
예제 #3
0
def compute(corpus, tick=utility.noop, method='sklearn_lda', vec_args=None, term_args=None, tm_args=None, **args):
    """Train a topic model on `corpus` and return it bundled with its derived data.

    Args:
        corpus: Corpus handed to `textacy_utility.extract_corpus_terms` and
            `textacy_utility.get_corpus_documents`.
        tick: Progress callback. Called with no arguments at the start and
            between stages of the sklearn path, and with `0` just before returning.
        method: `'<engine>_<algorithm>'`. A name starting with `'sklearn'`
            builds a `textacy.TopicModel`; a name starting with `'gensim_'`
            uses an engine looked up in `setup_gensim_algorithms(...)`.
        vec_args: Vectorizer options, combined with `DEFAULT_VECTORIZE_PARAMS`
            through `utility.extend`.
        term_args: Options forwarded to `textacy_utility.extract_corpus_terms`.
        tm_args: Topic model options. Unpacked as keyword arguments for the
            sklearn path (treated as empty when None) and passed as-is to
            `setup_gensim_algorithms` for the gensim path.
        **args: Extra options, recorded in the result's `options`. The gensim
            path reads the flag `'tfidf_weiging'` from here.

    Returns:
        types.SimpleNamespace with `topic_model`, `id2term`, `bow_corpus`,
        `doc_term_matrix`, `processed`, `perplexity_score`, `coherence_score`,
        `options` and `coherence_scores` (always None).

    Raises:
        ValueError: If `method` starts with neither `'sklearn'` nor `'gensim_'`.
    """
    # Fail early with a clear message; previously an unknown method fell
    # through both branches and surfaced later as a NameError on `model`.
    if not method.startswith(('sklearn', 'gensim_')):
        raise ValueError('Unsupported topic model method: {!r}'.format(method))

    tick()

    vec_args = utility.extend({}, DEFAULT_VECTORIZE_PARAMS, vec_args)

    # Materialised once because the gensim path iterates the term lists several times.
    terms = [list(doc) for doc in textacy_utility.extract_corpus_terms(corpus, term_args)]

    perplexity_score = None
    coherence_score = None
    vectorizer = None
    doc_topic_matrix = None
    doc_term_matrix = None

    documents = textacy_utility.get_corpus_documents(corpus)

    if method.startswith('sklearn'):

        vectorizer = textacy.Vectorizer(**vec_args)
        doc_term_matrix = vectorizer.fit_transform(terms)

        # `tm_args` defaults to None, which cannot be unpacked with `**`.
        model = textacy.TopicModel(method.split('_')[1], **(tm_args or {}))
        model.fit(doc_term_matrix)

        tick()

        doc_topic_matrix = model.transform(doc_term_matrix)

        tick()

        id2word = vectorizer.id_to_term
        bow_corpus = gensim.matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False)

        # FIXME: perplexity and coherence are not computed for sklearn models;
        # both scores keep their initial value of None.

    else:

        algorithm = method.split('_')[1].upper()

        id2word = gensim.corpora.Dictionary(terms)
        bow_corpus = [id2word.doc2bow(tokens) for tokens in terms]

        # NOTE(review): the option key is spelled 'tfidf_weiging'; it is kept
        # unchanged because callers may already pass it under that spelling.
        if args.get('tfidf_weiging', False):
            tfidf_model = gensim.models.tfidfmodel.TfidfModel(bow_corpus)
            bow_corpus = [tfidf_model[d] for d in bow_corpus]

        algorithms = setup_gensim_algorithms(corpus, bow_corpus, id2word, tm_args)

        engine = algorithms[algorithm]['engine']
        engine_options = algorithms[algorithm]['options']

        model = engine(**engine_options)

        if hasattr(model, 'log_perplexity'):
            perplexity_score = model.log_perplexity(bow_corpus, len(bow_corpus))

        # Coherence is best-effort: a failure is logged and the score stays None.
        try:
            coherence_model_lda = gensim.models.CoherenceModel(
                model=model, texts=terms, dictionary=id2word, coherence='c_v')
            coherence_score = coherence_model_lda.get_coherence()
        except Exception as ex:
            logger.error(ex)
            coherence_score = None

    processed = topic_model_utility.compile_metadata(
        model,
        bow_corpus,
        id2word,
        documents,
        vectorizer=vectorizer,
        doc_topic_matrix=doc_topic_matrix,
        n_tokens=200
    )

    model_data = types.SimpleNamespace(
        topic_model=model,
        id2term=id2word,
        bow_corpus=bow_corpus,
        doc_term_matrix=doc_term_matrix,
        processed=processed,
        perplexity_score=perplexity_score,
        coherence_score=coherence_score,
        options=dict(method=method, vec_args=vec_args, term_args=term_args, tm_args=tm_args, **args),
        coherence_scores=None
    )

    tick(0)

    return model_data
예제 #4
0
        for sgr in textrank:
            print(sgr)
        print("==key_terms_from_semantic_network==")
        for trip in key_terms_from_semantic_network:
            print(trip)
        print("==matches==")
        for match in matches:
            print(match)
        print("\n")

    vectorizer = textacy.Vectorizer(weighting='tfidf',
                                    normalize=True,
                                    smooth_idf=True,
                                    min_df=3,
                                    max_df=0.95)
    doc_term_matrix = vectorizer.fit_transform(
        (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
         for doc in corpus))
    print(repr(doc_term_matrix))

    models = ['nmf', 'lda', 'lsa']
    for m in models:
        model = textacy.TopicModel(m, n_topics=10)
        model.fit(doc_term_matrix)
        doc_topic_matrix = model.transform(doc_term_matrix)
        print("==", m, "==")
        print(doc_topic_matrix.shape)
        for topic_idx, top_terms in model.top_topic_terms(
                vectorizer.id_to_term, top_n=10):
            print('topic', topic_idx, ':', '   '.join(top_terms))
# Script: build one document per debate title from HanDeSeT.csv, vectorize the
# documents, fit a 10-topic NMF model and print the top terms of each topic.
nlp = spacy.load('en')

debates = []
# Read inside a context manager so the file handle is closed once parsing is
# done; newline='' is what the csv module expects for files it reads.
with open('../debate_csvs/HanDeSeT.csv', 'r', newline='') as csv_file:
    data = csv.reader(csv_file)
    for row in data:
        # Column 1 holds the title; the text is spread over columns 6-11 and
        # is concatenated into a single string per row.
        debates.append(
            [row[1], ''.join(row[i] for i in range(6, 12))])

df = pd.DataFrame(debates, columns=['title', 'text'])
# Merge all rows that share a title into one newline-separated text.
chat_concat = (df.sort_values('title').groupby('title')['text'].agg(
    lambda col: '\n'.join(col.astype(str))))
docs = list(chat_concat.apply(nlp))
corpus = textacy.corpus.Corpus(nlp, docs=docs)
vectorizer = textacy.Vectorizer(tf_type='linear',
                                apply_idf=True,
                                idf_type='smooth',
                                norm='l2',
                                min_df=2,
                                max_df=5)
doc_term_matrix = vectorizer.fit_transform(
    (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
     for doc in corpus))
model = textacy.TopicModel('nmf', n_topics=10)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)

for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term,
                                                  top_n=10):
    print('topic', topic_idx, ':', '   '.join(top_terms))
예제 #6
0
    records, 'text')
# Build the corpus from the text and metadata streams prepared above.
corpus = textacy.Corpus(
    'en',
    texts=text_stream,
    metadatas=metadata_stream,
)
corpus  # bare expression: only echoes a value in an interactive session

# Lazy generator of lower-cased, punctuation-free document texts.
corpus_processed = (
    textacy.preprocess_text(doc.text, lowercase=True, no_punct=True)
    for doc in corpus
)

# Term-frequency vectorizer; the document-term matrix is built from unigrams
# and named entities of the documents in `corpus`.
vectorizer = textacy.Vectorizer(
    weighting='tf',
    normalize=True,
    smooth_idf=True,
    min_df=2,
    max_df=0.95,
)
doc_term_matrix = vectorizer.fit_transform(
    doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
    for doc in corpus
)
print(repr(doc_term_matrix))

# Fit a 10-topic LDA model and project the documents onto the topics.
model = textacy.TopicModel('lda', n_topics=10)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)
doc_topic_matrix.shape  # bare expression: only echoes a value in an interactive session

# Print the ten top terms of every topic, separated by three spaces.
for topic_idx, top_terms in model.top_topic_terms(
        vectorizer.id_to_term, top_n=10):
    print('topic {} : {}'.format(topic_idx, '   '.join(top_terms)))

model.termite_plot(
    doc_term_matrix,
    vectorizer.id_to_term,
    topics=-1,
    n_terms=25,
    sort_terms_by='seriation',
)