Example #1
def main():
    doc = get_doc()
    print('doc len:', len(doc))

    train_texts = list(build_texts(doc))
    print('train len:', len(train_texts))

    bigram = gensim.models.Phrases(
        train_texts, min_count=10)  # for bigram collocation detection
    stops = set(stopwords.words('english'))  # nltk stopwords list

    train_texts = process_texts(train_texts, bigram, stops)
    print('bigramed train_texts', len(train_texts))
    vocabulary = Dictionary(train_texts)
    print('vocab size:', len(vocabulary))
    # remove extremes: keep tokens that appear in at least 3 documents
    # and in no more than 30% of documents
    vocabulary.filter_extremes(no_below=3, no_above=0.3)
    # vocabulary.filter_n_most_frequent(50)  # filter out the 50 most common tokens
    # filter_tokens(bad_ids=None, good_ids=None)
    corpus = [vocabulary.doc2bow(text) for text in train_texts]
    print('corpus size:', len(corpus))
    lda = LdaModel(corpus=corpus,
                   id2word=vocabulary,
                   num_topics=10,
                   chunksize=1500,
                   iterations=200,
                   alpha='auto')
    print(
        pd.DataFrame([[word for rank, (word, prob) in enumerate(words)]
                      for topic_id, words in lda.show_topics(
                          formatted=False, num_words=6, num_topics=35)]))
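
The snippet above relies on helpers (get_doc, build_texts, process_texts) defined elsewhere in its project; as a rough sketch, the imports it assumes would be along these lines:

import gensim
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from nltk.corpus import stopwords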
Example #2
File: cluster.py  Project: PaulHuygen/xtas
def lda(docs, k):
    """Latent Dirichlet allocation topic model.

    Uses Gensim's LdaModel after tokenizing using scikit-learn's
    TfidfVectorizer.

    Parameters
    ----------
    k : integer
        Number of topics.
    """
    from gensim.matutils import Sparse2Corpus
    from gensim.models import LdaModel

    # Use a scikit-learn vectorizer rather than Gensim's equivalent
    # for speed and consistency with LSA and k-means.
    vect = _vectorizer()
    corpus = vect.fit_transform(fetch(d) for d in docs)
    corpus = Sparse2Corpus(corpus, documents_columns=False)  # scikit-learn puts documents in rows

    model = LdaModel(corpus=corpus, num_topics=k)

    topics = model.show_topics(formatted=False)
    vocab = vect.get_feature_names()
    #return [(vocab[int(idx)], w) for topic in topics for w, idx in topic]
    return [[(vocab[int(idx)], w) for w, idx in topic] for topic in topics]
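
_vectorizer and fetch are defined elsewhere in the xtas project; purely as an illustrative stand-in (not the project's actual implementation), they could look like:

from sklearn.feature_extraction.text import TfidfVectorizer

def _vectorizer():
    # placeholder; xtas configures its own tokenization and options
    return TfidfVectorizer(min_df=2, stop_words='english')

def fetch(d):
    # in xtas this resolves a document reference to its text;
    # here we assume d is already a plain string
    return d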
Example #3
    def get(self, s, e):
        # Load the raw data, without any preprocessing
        dataObject = getDatas()
        data = dataObject.get()
        dataEpisode = dataObject.getDataEpisode(data, s, e)

        # Preprocess the words of the episode
        tokenEpisode = []
        tokenEpisode.append(
            [token for token in self.preprocessEpisode(dataEpisode)])
        dictionnaryEpisode = Dictionary(tokenEpisode)

        # creating our model corpus
        model_corpus = []
        for episode in tokenEpisode:
            model_corpus.append(dictionnaryEpisode.doc2bow(episode))

        # Creating our list of topics with the LDA models
        topicsList = []
        string = "Voici les sujets recurrents pour l'episode " + e + " de la saison " + s
        topicsList.append(string)
        lda_model = LdaModel(
            corpus=model_corpus, id2word=dictionnaryEpisode, num_topics=3
        )  # We choose to get only the 3 most significant topics
        for topic_id, topic_keywords in lda_model.show_topics(formatted=False):
            string = "=== Pour le sujet au mot cle principal '" + str(
                lda_model.show_topic(topic_id, topn=1)[0]
                [0]) + "', les mots clefs representatifs sont ==="
            topicsList.append(string)
            # Browse the keywords of each topic
            for keyword in topic_keywords:
                string = "-> " + str(keyword[0]) + " (" + str(keyword[1]) + ")"
                topicsList.append(string)
        # Return our list of topics
        return topicsList
Example #4
def get_topic(text):
    np.random.seed(100)
    nlp = spacy.load('en')
    my_stop_words = [
        u'say', u'\'s', u'Mr', u'be', u'said', u'says', u'saying', u'get'
    ]
    for stopword in my_stop_words:
        lexeme = nlp.vocab[stopword]
        lexeme.is_stop = True
    doc = nlp(text)
    article = []
    texts = []
    for w in doc:
        # if it's not a stop word or punctuation mark, add it to our article!
        if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num:
            # we add the lematized version of the word
            article.append(w.lemma_)
    texts.append(article)
    # getting bigrams out of words using gensim
    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]
    # Creating corpus with our words
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(i) for i in texts]
    # Applying LDA and LSI models
    lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
    ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
    lsitopics = [[word for word, prob in topic]
                 for topicid, topic in lsimodel.show_topics(formatted=False)]
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in ldamodel.show_topics(formatted=False)]
    topics = []
    for i in ldatopics:
        topics.append(i[0])
    tags = nltk.pos_tag(topics)
    # removing verbs as generally nouns are topics
    lfinaltopics = [
        word for word, pos in tags
        if pos != 'VB' and pos != 'VBD' and pos != 'VBN' and pos != 'VBP'
        and pos != 'VBZ' and pos != 'VBG' and pos != 'JJ' and pos != 'RB'
    ]
    ldafinaltopics = list(set(lfinaltopics))
    lstopics = []
    for i in lsitopics:
        for j in i:
            lstopics.append(j)
    ltags = nltk.pos_tag(lstopics)
    lsifinaltopics = [
        word for word, pos in ltags
        if pos != 'VB' and pos != 'VBD' and pos != 'VBN' and pos != 'VBP'
        and pos != 'VBZ' and pos != 'VBG' and pos != 'RB' and pos != 'JJ'
    ]

    # Intersection of results from both models
    finaltopics = list(set(ldafinaltopics) & set(lsifinaltopics))
    final_topics = []
    for i in finaltopics:
        if len(i) >= 2:
            final_topics.append(i)
    return final_topics
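
get_topic assumes the following libraries are already imported at module level (a sketch; the spaCy model name reflects the older 'en' shortcut used above):

import gensim
import nltk
import numpy as np
import spacy
from gensim.corpora import Dictionary
from gensim.models import LdaModel, LsiModel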
Example #5
def ret_top_model(corpus):
    """
    Since LDAmodel is a probabilistic model, it comes up different topics each time we run it. To control the
    quality of the topic model we produce, we can see what the interpretability of the best topic is and keep
    evaluating the topic model until this threshold is crossed.

    Returns:
    -------
    lm: Final evaluated topic model
    top_topics: ranked topics in decreasing order. List of tuples
    """
    top_topics = [(0, 0)]
    rounds = 1
    high = 0.0
    out_lm = None
    #while top_topics[0][1] < 0.97 and rounds < 2: #0.97
    while True:
        lm = LdaModel(corpus=corpus, num_topics=20, id2word=dictionary, minimum_probability=0)
        coherence_values = {}
        for n, topic in lm.show_topics(num_topics=-1, formatted=False):
            topic = [word for word, _ in topic]
            cm = CoherenceModel(topics=[topic], texts=train_texts, dictionary=dictionary, window_size=10)
            coherence_values[n] = cm.get_coherence()
        top_topics = sorted(coherence_values.items(), key=operator.itemgetter(1), reverse=True)
        if high < top_topics[0][1]:
            high = top_topics[0][1]
            out_lm = lm
        print('round ',rounds,':',top_topics[0][1])
        if rounds > 2:
            break
        rounds+=1
    return out_lm, top_topics, high
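
Note that ret_top_model takes the corpus as an argument but reads dictionary and train_texts from module scope; a minimal sketch of how those globals might be prepared (toy data, for illustration only):

import operator
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel

train_texts = [['cat', 'dog', 'pet'], ['stock', 'market', 'trade'], ['cat', 'pet', 'vet']]
dictionary = Dictionary(train_texts)
corpus = [dictionary.doc2bow(text) for text in train_texts]
lm, top_topics, high = ret_top_model(corpus)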
Example #6
def topic_analysis(df, nTopics=5, cleanTextCol='cleaned_text'):
  df[cleanTextCol]=df[cleanTextCol].fillna('')
  cleandata = df[cleanTextCol].fillna('').apply(lambda x: x.split(' '))
  dictionary = corpora.Dictionary(cleandata)
  tokens = [dictionary.doc2bow(d) for d in cleandata]
  model = LdaModel(tokens, num_topics=nTopics, id2word=dictionary, 
                update_every=1, chunksize=50, passes=10,
                per_word_topics=True, alpha='auto')
  docweights = [model.get_document_topics(t, minimum_probability=0) for t in tokens]
  doctopics = pd.DataFrame(docweights).apply(lambda x: x.apply(lambda y: y[-1] if y else 0))
  doctopics.columns = [f'topic{n+1}' for n in doctopics.columns]
  doctopics['KeyTopic']=doctopics.apply(lambda y:doctopics.columns[y==y.max()][0], axis=1)

  # create topicdescribe
  topics = model.show_topics(num_words=6)
  keywords = [re.findall(r'\*"(.*?)"',d[1]) for d in topics]
  weights = [re.findall(r'([\d\.]+)\*', d[1]) for d in topics]
  kwdf= pd.DataFrame(keywords, columns=[f'keyword_{n}' for n in range(len(keywords[0]))])
  wtdf= pd.DataFrame(weights, columns=[f'weight_{n}' for n in range(len(weights[0]))])
  topicDescribe =  kwdf.merge(wtdf,left_index=True, right_index=True)
  topicDescribe = topicDescribe[sorted(topicDescribe.columns, key=lambda x: x.split('_')[-1])]
  topicDescribe['KeyTopic'] = [f'topic{n+1}' for n in range(len(topics))]
  topicDescribe['TopicKeywords'] = [' '.join(k) for k in keywords]
  topicDescribe['DocCount'] = doctopics['KeyTopic'].value_counts().sort_index().values
  topicDescribe = topicDescribe[['KeyTopic']+[col for col in topicDescribe.columns if \
    col != 'KeyTopic']]
  
  doctopics= doctopics.merge(topicDescribe[['KeyTopic','TopicKeywords']], on='KeyTopic', how='left')
  return doctopics, topicDescribe, model, tokens, dictionary
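
A minimal usage sketch with made-up data (on very small corpora some topics may never become a document's key topic, which the DocCount step above does not guard against):

import pandas as pd

df = pd.DataFrame({'cleaned_text': [
    'gensim lda topic modelling of documents',
    'topic keywords and document weights',
    'word clouds for topic visualisation',
    'coherence scores compare topic models',
    'stock market prices and trading volume',
    'market volatility and trading strategies',
]})
doctopics, topicDescribe, model, tokens, dictionary = topic_analysis(df, nTopics=2)
print(topicDescribe[['KeyTopic', 'TopicKeywords', 'DocCount']])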
Example #7
def ret_top_model(threshold, corpus, dictionary, texts):
    """
    Since LdaModel is a probabilistic model, it comes up with different topics each time we run it. To control the
    quality of the topic model we produce, we can see what the interpretability of the best topic is and keep
    evaluating the topic model until this threshold is crossed.
    
    Returns:
    -------
    lm: Final evaluated topic model
    top_topics: ranked topics in decreasing order. List of tuples
    """
    top_topics = [(0, 0)]
    while top_topics[0][1] < threshold:
        lm = LdaModel(corpus=corpus, id2word=dictionary)
        coherence_values = {}
        for n, topic in lm.show_topics(num_topics=-1, formatted=False):
            topic = [word for word, _ in topic]
            cm = CoherenceModel(topics=[topic],
                                texts=texts,
                                dictionary=dictionary,
                                window_size=10)
            coherence_values[n] = cm.get_coherence()
        top_topics = sorted(coherence_values.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
    return lm, top_topics
Example #8
def lda(docs, k):
    """Latent Dirichlet allocation topic model.

    Uses Gensim's LdaModel after tokenizing using scikit-learn's
    TfidfVectorizer.

    Parameters
    ----------
    k : integer
        Number of topics.
    """
    from gensim.matutils import Sparse2Corpus
    from gensim.models import LdaModel

    # Use a scikit-learn vectorizer rather than Gensim's equivalent
    # for speed and consistency with LSA and k-means.
    vect = _vectorizer()
    corpus = vect.fit_transform(fetch(d) for d in docs)
    corpus = Sparse2Corpus(corpus, documents_columns=False)  # scikit-learn puts documents in rows

    model = LdaModel(corpus=corpus, num_topics=k)

    topics = model.show_topics(formatted=False)
    vocab = vect.get_feature_names()
    #return [(vocab[int(idx)], w) for topic in topics for w, idx in topic]
    return [[(vocab[int(idx)], w) for w, idx in topic] for topic in topics]
Example #9
    def word_cloud(self, model: LdaModel, stopwords_path, save_path):
        with open(stopwords_path, 'r', encoding='utf8') as f:
            words = f.readlines()

        stopwords = add_stop_words(words)
        print('stop words added')
        word_cloud = PersianWordCloud(only_persian=True,
                                      max_words=10,
                                      stopwords=stopwords,
                                      width=800,
                                      height=800,
                                      background_color='black',
                                      min_font_size=1,
                                      max_font_size=300)
        topics = model.show_topics(formatted=False)

        for i, topic in enumerate(topics):
            topic_words = dict(topic[1])
            print(topic_words)
            new = {}
            for word in topic_words.keys():
                reshaped = get_display(arabic_reshaper.reshape(word))
                new[reshaped] = topic_words[word]
            print(new)
            word_cloud.generate_from_frequencies(new)
            image = word_cloud.to_image()
            image.show()
            s = save_path + '_topic_' + str(i) + '.png'
            print(s)
            image.save(s)
Example #10
def chosen_lda(corpus, dictionary, data, n_topics, alpha=.1, eta=0.01):
    '''
    This function trains a Gensim LDA model on chosen hyperparameters
    
    Arguments:
    ----------
    corpus : matrix-format corpus (BOW or TF-IDF)
    dictionary : corpus-related dictionary
    data : text data for coherence score computation
    n_topics : number of desired topics
    alpha : alpha parameter (from 0 to infinity)
    eta : beta parameter (from 0 to infinity)
    
    Outputs:
    ----------
    lda : trained model
    '''
    
    lda = LdaModel(corpus=corpus,
                id2word=dictionary,
                num_topics=n_topics,  # use the requested number of topics rather than a hard-coded value
                random_state=100,
                alpha=alpha,
                eta=eta)
    
    ldatopics = [[word for word, prob in topic] for topicid, topic in lda.show_topics(formatted=False)]
    lda_coherence = CoherenceModel(topics=ldatopics, texts=data, dictionary=dictionary, window_size=10).get_coherence()
    print(lda_coherence)
    lda.print_topics(num_topics=n_topics)
    
    lda.save('../03_Dump/model')
    return lda
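
A usage sketch, assuming data is a list of tokenized documents; note that the function saves the model to the relative path '../03_Dump/model', so that directory must already exist:

from gensim.corpora import Dictionary

data = [['solar', 'energy', 'panel'], ['wind', 'turbine', 'energy'], ['battery', 'storage', 'grid']]
dictionary = Dictionary(data)
corpus = [dictionary.doc2bow(doc) for doc in data]
lda = chosen_lda(corpus, dictionary, data, n_topics=2)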
Example #11
def do_cluster(obj, query):
    texts = [article['title'] for article in obj]

    processor = Processor(query)

    tokens = [processor.get_tokens(text) for text in texts]

    dictionary = corpora.Dictionary(tokens)

    corpus = [dictionary.doc2bow(token) for token in tokens]

    num_clusters = max(1, len(texts) // 5)  # LdaModel needs an integer topic count
    model = LdaModel(corpus,
                     num_topics=num_clusters,
                     id2word=dictionary,
                     update_every=5,
                     chunksize=10000,
                     passes=50)

    # size 10
    topic_matrix = model.show_topics(formatted=False, num_topics=num_clusters)

    clusters = [{
        "keywords": [str(word) for word, _ in topic[1]],
        "articles": []
    } for topic in topic_matrix]

    for i, document in enumerate(corpus):

        topic = np.array(model.get_document_topics(document))
        cluster = int(topic[np.argmax(topic[:, 1])][0])

        clusters[cluster]['articles'].append(obj[i])

    return clusters
Example #12
def comparison(texts):
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsimodel = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LSI Model output')
    print(lsimodel.show_topics())

    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    print('hdp model output')
    print(hdpmodel.show_topics())

    ldamodel = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LDA Model output')
    print(ldamodel.show_topics())


    pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)]

    hdptopics = [[word for word, prob in topic] for topicid, topic in hdpmodel.show_topics(formatted=False)]

    ldatopics = [[word for word, prob in topic] for topicid, topic in ldamodel.show_topics(formatted=False)]

    lsi_coherence = CoherenceModel(topics=lsitopics[:10], texts=texts, dictionary=dictionary,
                                   window_size=10).get_coherence()

    hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=texts, dictionary=dictionary,
                                   window_size=10).get_coherence()

    lda_coherence = CoherenceModel(topics=ldatopics, texts=texts, dictionary=dictionary, window_size=10).get_coherence()

    def evaluate_bar_graph(coherences, indices):
        assert len(coherences) == len(indices)
        n = len(coherences)
        x = np.arange(n)
        plt.bar(x, coherences, width=0.2, tick_label=indices, align='center')
        plt.xlabel('Models')
        plt.ylabel('Coherence Value')
        plt.show()

    evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence], ['LSI', 'HDP', 'LDA'])
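
Note that pyLDAvis 3.x renamed its gensim helper module, so on newer installs the prepare call above would be written roughly as:

import pyLDAvis.gensim_models as gensimvis

vis = gensimvis.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)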
Example #13
def LDA_gensim(corpus, num_topics, id2word, passes, iterations, coherence):
    ''' LATENT DIRICHLET ALLOCATION
    # Generative model that assumes each document is a mixture of topics, each topic is a mixture of words.
    '''
    print('Latent Dirichlet Allocation')
    # id2word = id2word.id2token  # make an index-to-word dictionary
    lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=passes, iterations=iterations)
    lda_topics = lda_model.show_topics(formatted=False)
    # compute coherence score (text_input is assumed to be the tokenized texts, defined at module level)
    coherence_model = CoherenceModel(model=lda_model, texts=text_input, dictionary=id2word, coherence=coherence)
    coherence_lda = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    return lda_model
Example #14
class LDA(GenericModel):
    """
    Wrapper for Gensim LdaModel and LdaMulticore
    """
    def __init__(self, *args, **kwargs):
        """
        All provided arguments will be passed to LdaModel or
        LdaMulticore constructors (the latter in case 'workers'
        is present in keyword arguments)

        :param args: positional arguments to initialize model with
        :param kwargs: keyword arguments to pass to model constructor
        """
        if 'workers' in kwargs.keys():
            self.__model__ = LdaMulticore(*args, **kwargs)
        else:
            self.__model__ = LdaModel(*args, **kwargs)

    def fit(self, data: Any, *args, **kwargs):
        # Actually, I think there is no need for this as
        # we can simply use update() for uninitialized model
        self.__model__.update(corpus=data, *args, **kwargs)

    def update(self, data: Any, *args, **kwargs):
        self.__model__.update(corpus=data, *args, **kwargs)

    def get_topics(self,
                   docs: Optional[Iterable[Any]] = None,
                   *args,
                   **kwargs):
        if docs is None:
            topics = self.__model__.show_topics(formatted=False,
                                                *args,
                                                **kwargs)
        else:
            topics = map(
                partial(self.__model__.get_document_topics,
                        per_word_topics=True), docs)
        topics, t_copy, t_copy_1 = tee(topics, 3)

        ids = map(lambda x: x[0], topics)
        words = map(lambda x: x[1], t_copy)
        words = map(lambda x: list(zip(*x))[0], words)
        scores = map(lambda x: x[1], t_copy_1)
        scores = map(lambda x: list(zip(*x))[1], scores)

        topics = zip(ids, zip(words, scores))

        return topics
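
A minimal usage sketch for the wrapper (the GenericModel base class and the functools/itertools/typing imports come from the surrounding module; data below is illustrative):

from gensim.corpora import Dictionary

texts = [['graph', 'minors', 'trees'], ['human', 'interface', 'computer']]
dictionary = Dictionary(texts)
bow = [dictionary.doc2bow(t) for t in texts]

model = LDA(corpus=bow, id2word=dictionary, num_topics=2)        # plain LdaModel
# model = LDA(corpus=bow, id2word=dictionary, num_topics=2, workers=2)  # LdaMulticore
for topic_id, (words, scores) in model.get_topics(num_topics=2):
    print(topic_id, words)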
Example #15
def topicModeling(corpus, dictionary, texts):

    ldamodel = LdaModel(corpus=corpus,
                        num_topics=3,
                        id2word=dictionary,
                        passes=5)

    x = ldamodel.show_topics()  #show generated topics

    #----------------------------------------------------------
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series(
                    [int(topic_num),
                     round(prop_topic, 4), topic_keywords]),
                                                       ignore_index=True)
            else:
                break
    sent_topics_df.columns = [
        'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'
    ]

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

    #-------Generate Visualization------------------------------

    pyLDAvis.enable_notebook()

    topicModel = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    pyLDAvis.save_html(
        topicModel,
        '/Users/[email protected]/Documents/projects/PEM/elon.html')

    pyLDAvis.show(topicModel)

    return x, sent_topics_df
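
pandas removed DataFrame.append in 2.0; on current pandas the per-document loop above can collect plain rows and build the frame once, for example:

rows = []
for i, row in enumerate(ldamodel[corpus]):
    row = sorted(row, key=lambda x: x[1], reverse=True)
    topic_num, prop_topic = row[0]                      # dominant topic only
    wp = ldamodel.show_topic(topic_num)
    topic_keywords = ", ".join(word for word, prop in wp)
    rows.append((int(topic_num), round(prop_topic, 4), topic_keywords))
sent_topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])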
Example #16
    def runModels(self, number_of_topics, corpus, dictionary, start, end):

        #do hdp model

        hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

        hdpmodel.print_topics(num_topics=int(number_of_topics), num_words=10)
        hdptopics = hdpmodel.show_topics(num_topics=int(number_of_topics))

        #   result_dict=addTotalTermResults(hdptopics)

        #add results to total kept in a list
        #   addToResults(result_dict)

        #output results
        self.printResults(number_of_topics, hdptopics, 'hdp', start, end)

        # do lda model
        ldamodel = LdaModel(corpus=corpus,
                            num_topics=int(number_of_topics),
                            id2word=dictionary,
                            random_state=100,
                            update_every=1,
                            chunksize=100,
                            passes=10,
                            alpha='auto',
                            per_word_topics=True)

        ldamodel.save('lda' + str(number_of_topics) + '.model')
        ldatopics = ldamodel.show_topics(num_topics=int(number_of_topics))

        #   result_dict=addTotalTermResults(ldatopics)
        #   addToResults(result_dict)
        self.printResults(number_of_topics, ldatopics, 'lda', start, end)

        visualisation = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

        location = os.path.join(pn, 'topic_model_results')

        #visualize outputs in html
        pyLDAvis.save_html(
            visualisation,
            os.path.join(
                location, 'LDA_Visualization' + str(number_of_topics) + "_" +
                start + "_" + end + '.html'))
Example #17
def prepare_twitter_data(data_file, type_of_analysis):
    labels = []
    text_fake, text_normal = '', ''
    df = pd.read_csv(data_file, sep='|', encoding='utf-8', keep_default_na=False)
    print('removing duplicates')
    df = utils.remove_duplicates(df)
    print('getting preprocessed train articles')
    idx = 0
    for key, item in enumerate(df['article_text']):
        idx += 1
        if df['is_fake'].values[key] == 1:
            text_fake += get_preprocessed_text(item)
            labels.append('FAKE')
        else:
            text_normal += get_preprocessed_text(item)
            labels.append('NOT_FAKE')
        if idx % 100 == 0:
            print('got {} of {} preprocessed train articles'.format(idx, len(df)))

    print('Finished gathering train text items')

    train = pd.DataFrame()
    train['data'] = df[type_of_analysis]
    train['labels'] = df['is_fake']


    #  TOPIC MODELLING
    nlp = German()
    stop_words = get_stop_words('de')
    stop_words.append('foto')
    stop_words.append('⬅')

    for stopword in stop_words:
        lexeme = nlp.vocab[stopword]
        lexeme.is_stop = True

    texts = get_spacy_corpus(train['data'], nlp, logging=True, topic_modelling=True)
    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
    print(ldamodel.show_topics())
    return do_create_twitter(train, None)
Example #18
def BasicLDA(doclist, num_topics):
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    num_topics = num_topics
    texts = clean(doclist)
    print(texts[1])
    # frequency = {}
    # for text in texts:
    #     for token in text:
    #         if token not in frequency:
    #             frequency[token] = 0
    #         else:
    #             frequency[token] += 1
    dictionary = corpora.Dictionary(texts)
    size_dictionary = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, chunksize=500, passes=10, iterations=100)
    topics = []
    for i in lda.show_topics(num_topics=-1, num_words=20):
        print(i)
        topics.append(i)

    for i in lda.get_document_topics(corpus):  # i is each document's topic distribution, in bag-of-words order
        s = str(i)
        pattern1 = r'\((\d+),'
        a = re.findall(pattern1, s)
        print(a)  # extract the topic ids contained in each document

        word_list = []  # holds the word distributions of all topics in the current document
        for idx in a:  # take each topic id
            w = topics[int(idx)]  # take that topic's word distribution
            word_list.append(w)  # store the word distributions in topic-id order

        l = [list(k)[1] for k in i]  # list(k)[1] is the probability of each topic
        doc2top = {}
        for num in range(len(l)):
            doc2top[l[num]] = word_list[num]

        print(doc2top)
        break
        # print(list(chain.from_iterable(zip(l, word_list))))

    elapsed = time.perf_counter() - start
    return lda, corpus, dictionary, size_dictionary, elapsed
Example #19
def build_topic(name, jsoname, contentField="text", usetfidf=False, output=False, plot=True, filetype='csv', sfilename=None):
    data_JLM = None
    if sfilename:
        with open(sfilename) as f:
            fr_stop = [i.split('\n')[0] for i in f.readlines()]
    else:
        fr_stop = get_stop_words('fr')
    try:
        with open(jsoname) as json_data:
            data_JLM = json.load(json_data)
        docs = data_JLM["tweets"]
        dictionary = gensim.corpora.Dictionary(docs)
        corpus = [dictionary.doc2bow(doc) for doc in docs]
    except:
        docs = []
        corpus = []
        dictionary = gensim.corpora.Dictionary(docs)
        if filetype == 'csv':
            data = pd.read_csv(name, sep=';')[contentField]
        else:
            exit()
        tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr', TAGDIR="../ouest-france2/TreeTagger")
        for text in data:
            try:
                tags = tagger.tag_text(text)
            except:
                continue

            doc = []
            for i in tags:
                tmp = i.split("\t")
                if len(tmp) < 3 or len(tmp[0]) < 4:
                    continue
                if tmp[1] in ["NOM", "VER"] and tmp[-1] not in fr_stop and "’" not in tmp[-1]:
                    doc += [tmp[-1]]
            if "cantine" in doc and "porc" in doc:
                docs += [doc]
            dic = dictionary.doc2bow(doc, allow_update=True)
            corpus += [dic]
        with open(jsoname, 'w') as outfile:
            json.dump({"tweets": docs}, outfile)

    if usetfidf:
        tfidf = gensim.models.TfidfModel(corpus)
        corpus = tfidf[corpus]
    # kmeanTest(docs, 10)
    ldamodel = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5)
    if output:
        print(ldamodel.show_topics(num_topics=-1, num_words=10))

    if plot:
        data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
        pyLDAvis.show(data)
Example #20
    def topicsLDA_gensim(self,
                         num_topics=10,
                         num_words=10,
                         num_iterations=2000,
                         chunksize=20000,
                         decay=0.5):
        lda = LdaModel(corpus=self.corpus,
                       num_topics=num_topics,
                       id2word=self.id2word,
                       chunksize=chunksize,
                       iterations=num_iterations,
                       alpha='auto',
                       eta='auto',
                       decay=decay)

        # documents for each topic
        if self.doc2class:
            doc_idx = 0
            for line in lda[self.corpus]:
                # get topic with maximum percentage
                if line:
                    topic_idx = max(line, key=lambda item: item[1])[0]
                else:
                    # if there is no topic assign a random one
                    topic_idx = random.randint(0, num_topics - 1)
                # make the dictionary
                if self.doc2topicLDA_gensim.get(
                        self.doc2class[doc_idx]) is None:
                    self.doc2topicLDA_gensim[self.doc2class[doc_idx]] = {}
                    for i in range(0, num_topics):
                        self.doc2topicLDA_gensim[
                            self.doc2class[doc_idx]][i] = 0
                self.doc2topicLDA_gensim[
                    self.doc2class[doc_idx]][topic_idx] += 1
                doc_idx += 1
            print(self.doc2topicLDA_gensim)
        # return topics
        return lda.show_topics(num_topics=num_topics,
                               num_words=num_words,
                               formatted=False)
Example #21
def ret_top_model():
    top_topics = [(0, 0)]
    rounds = 1
    high = 0.0
    out_lm = None
    #while top_topics[0][1] < 0.97 and rounds < 2: #0.97
    while True:
        lm = LdaModel(corpus=corpus, num_topics=20, id2word=dictionary)
        coherence_values = {}
        for n, topic in lm.show_topics(num_topics=-1, formatted=False):
            topic = [word for word, _ in topic]
            cm = CoherenceModel(topics=[topic], texts=train_texts, dictionary=dictionary, window_size=10)
            coherence_values[n] = cm.get_coherence()
        top_topics = sorted(coherence_values.items(), key=operator.itemgetter(1), reverse=True)
        if high < top_topics[0][1]:
            high = top_topics[0][1]
            out_lm = lm
        print('round ',rounds,':',top_topics[0][1])
        if rounds > 2:
            break
        rounds+=1
    return out_lm, top_topics, high
Example #22
def test_gm_lda():
    test_file_path = "training_data/positive_sent_sample"
    text_file_path = "training_data/seg_sent_without_label"
    stop_words_file_path = "filter_words/stop_words"

    text_file = codecs.open(test_file_path, "r", encoding='utf8')
    stop_words_file = codecs.open(stop_words_file_path, 'r', encoding='utf8')
    docs = []
    stop_words = []
    for stop_w in stop_words_file:
        stop_words.append(stop_w.replace("\r\n", ""))

    for line in text_file:
        line = remove_stop_words(line, stop_words)
        docs.append(line.replace(" \r\n", "").split(" "))
    '''
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tf_idf = transformer.fit_transform(vectorizer.fit_transform(docs))
    words = vectorizer.get_feature_names()
    weight = tf_idf.toarray()
    '''

    dic = corpora.Dictionary(docs)  # build the dictionary: each word maps to an integer id
    corpus = [dic.doc2bow(text) for text in docs]  # word counts converted to bag-of-words vectors
    ids = dic.token2id
    lda = LdaModel(
        corpus=corpus,
        id2word=dic,
        num_topics=20,
        alpha='auto',
    )

    result_list = lda.show_topics(num_topics=20, num_words=30)

    for tup in result_list:
        print(tup[1])
        print("============================")
Example #23
def find_topic():
    """
    LdaModel params
        passes: Number of passes through the entire corpus
        chunksize: how many documents to load into memory
        update_every: number of chunks to process prior to moving onto the M step of EM
    """
    with gzip.open(config['fun2vec']['corpus'], 'rb') as f:
        words = pickle.load(f)
    # build the dictionary
    dictionary = corpora.Dictionary(words)
    dictionary.filter_extremes(no_below=30, no_above=0.3)

    # build the corpus
    corpus = [dictionary.doc2bow(_words) for _words in words]
    # corpora.MmCorpus.serialize('cop.mm', corpus)
    lda = LdaModel(corpus,
                   num_topics=10,
                   chunksize=10000,
                   update_every=2,
                   id2word=dictionary)
    lda.save(config['topic_model'])
    pprint(lda.show_topics(num_words=20))
Example #24
class LdaWord2VecModel:
    def __init__(self,
                 corpus,
                 w2v_size=100,
                 topics=100,
                 w2v_path='',
                 lda_path=''):
        """initialize LdaWord2VecModel
        initialize and train the LdaWord2VecModel according to the args.
        
        Args:
            corpus: the corpus used to train, a sequence of sequence of words.
            w2v_size: size of word vector, 100 by default.
            topics: num_topics of LDA model, 100 by default.
            w2v_path: the path to load or save word vector model, '' by default, which means not loading or saving.
            lda_path: the path to load or save LDA model, '' by default, which means not loading or saving.
        """
        # keep the constructor arguments on the instance
        self.topics = topics
        self.w2v_size = w2v_size

        # train or load the word2vec model
        if w2v_path != '':
            if os.path.exists(w2v_path):
                self.w2v_model = Word2Vec.load(w2v_path)
            else:
                self.w2v_model = Word2Vec(corpus, size=w2v_size)
                self.w2v_model.save(w2v_path)
        else:
            self.w2v_model = Word2Vec(corpus, size=w2v_size)

        # train or load the LDA model
        if lda_path != '':
            if os.path.exists(lda_path):
                self.lda_model = LdaModel.load(lda_path)
            else:
                word_dict = Dictionary(corpus)
                bow_corpus = self.BowCorpus(word_dict, corpus)
                self.lda_model = LdaModel(bow_corpus,
                                          id2word=word_dict,
                                          num_topics=topics)
                self.lda_model.save(lda_path)
        else:
            word_dict = Dictionary(corpus)
            bow_corpus = self.BowCorpus(word_dict, corpus)
            self.lda_model = LdaModel(bow_corpus,
                                      id2word=word_dict,
                                      num_topics=topics)

        # compute a vector for each topic as the weighted sum of its top words' vectors
        topic_bow = self.lda_model.show_topics(num_topics=-1)
        self.topic_vecs = []
        for topic in topic_bow:
            vec = np.zeros(w2v_size, dtype=float)
            for word_tuple in topic[1].split(' + '):
                weight, word = word_tuple.split('*')
                if word[1:-1] in self.w2v_model.wv:
                    vec += self.w2v_model.wv[word[1:-1]] * float(weight)
            self.topic_vecs.append(vec)

    def get_topics(self, topics=10, words=10):
        return self.lda_model.show_topics(num_topics=topics, num_words=words)

    def get_topic_vecs(self):
        return self.topic_vecs

    def get_word_vecs(self):
        return self.w2v_model.wv

    def predict(self, doc):
        # compute the document vector by summing the vectors of its known words
        doc_vec = np.zeros(self.w2v_size, dtype=float)
        for sent in doc:
            for word in sent:
                if word in self.w2v_model.wv:
                    doc_vec += self.w2v_model.wv[word]

        # find the topic vector with the highest cosine similarity
        topic = -1
        cos_max = 0
        for i in range(len(self.topic_vecs)):
            cos = np.dot(doc_vec, self.topic_vecs[i]) / np.sqrt(
                sum(doc_vec**2) * sum(self.topic_vecs[i]**2))
            if cos >= cos_max:
                topic = i
                cos_max = cos

        return topic

    class BowCorpus:
        def __init__(self, word_dict, corpus):
            self.word_dict = word_dict
            self.corpus = corpus

        def __iter__(self):
            for doc in self.corpus:
                yield self.word_dict.doc2bow(doc)
Example #25
                         eval_every=1,
                         chunksize=4000,
                         passes=20,
                         iterations=100,
                         alpha='auto',
                         eta='auto',
                         random_state=42)

    print("Run for %d topics in %.2f mins" % (num_topics,
                                              (perf_counter() - start) / 60))
    start = perf_counter()
    # pprint(lda_model.print_topics(num_topics=num_topics, num_words=num_keywords))
    print("\n\n")

    topics_shown = lda_model.show_topics(num_topics=num_topics,
                                         num_words=num_keywords,
                                         formatted=False)
    # print(topics_shown)
    word_freq = {}
    word_sum = {}
    topics_to_save = {}
    for num, rep in topics_shown:
        if num not in topics_to_save:
            topics_to_save[num] = {}
        for word, freq in rep:
            if word not in word_freq:
                word_freq[word] = 0
            word_freq[word] = word_freq[word] + 1
            if word not in word_sum:
                word_sum[word] = 0
            word_sum[word] = word_sum[word] + freq
Example #26
def return_suggested_articles(request):
    """
    returns suggested articles based on topic of one currently being viewed

    Parameters
    ----------
    request : request (flask.Request): The request object

    Returns
    -------
    JSON of google search queries for articles to read

    """

    # get the requested json for the webpage
    request_json = request.get_json(silent=True)

    # get the headline and article
    headline = request_json['headline']
    article = request_json['article']
    print('requested json headline and article text')

    # make into one text file
    combined_article = headline + '. ' + article

    # set to 1 for single doc lda, 0 for tfidf
    do_single_document_LDA = 1

    # number of query words to return
    n_search_words = 5

    # can identify ngrams, but slows down performance
    do_ngrams = 1

    ### SINGLE DOC LDA PARAMS

    # set the number of topics to generate (5 seems to work pretty well)
    num_lda_topics = 5

    # set the number of passes
    n_passes = 10

    # if avoiding repeated words (only relevant if num_lda_topics > 1)
    do_unique_search_words = 0

    print('Downloading stop words')
    # download stopwords list
    # if use_bucket:
    download_blob('debiaser_data', 'sw1k.csv', '/tmp/sw1k.csv')

    # load stop words into pandas and then into list
    stop_words = pd.read_csv('/tmp/sw1k.csv')

    # remove from memory
    os.remove('/tmp/sw1k.csv')

    stop_words = stop_words['term']
    stop_words = [word for word in stop_words]

    # # adding some custom words
    stop_words.append('said')
    stop_words.append('youre')
    stop_words.append('mph')
    stop_words.append('inc')
    stop_words.append('cov')
    stop_words.append('jr')
    stop_words.append('dr')
    stop_words.append('ads')
    stop_words.append('cookies')
    stop_words.append('factset')

    print('Downloading news organizations from AllSidesMedia')
    # download all_sides_media list
    # if use_bucket:
    download_blob('debiaser_data',
                  'allsides_final_plus_others_with_domains.csv',
                  '/tmp/allsides_final_plus_others_with_domains.csv')

    # load domain names into a dataframe and then keep only the domain column
    all_sides = pd.read_csv('/tmp/allsides_final_plus_others_with_domains.csv')

    # remove from memory
    os.remove('/tmp/allsides_final_plus_others_with_domains.csv')

    # get the domain
    # all_sides_names = all_sides['name']
    all_sides_domains = all_sides['domain']
    # all_sides_names_domains = pd.concat([all_sides_names,all_sides_domains],axis=1)

    # get dictionary of entities in article
    # entity_dict = entity_recognizer(combined_article,nlp)

    if do_single_document_LDA:

        print('splitting article into sentences')
        # break up into sentences
        combined_article = tokenize.sent_tokenize(combined_article)

    else:

        # make into one element list for downstream processing
        combined_article = [combined_article]

    print('pre processing article text')
    # process article
    article_processed = process_all_articles(combined_article, nlp)

    print('removing stopwords')
    # remove stopwords
    article_processed = remove_stopwords(article_processed, stop_words)

    # floor for the frequency of words to remove
    # word_frequency_threshold = 1

    # get corpus, dictionary, bag of words
    # processed_corpus, processed_dictionary, bow_corpus = get_simple_corpus_dictionary_bow(article_processed,
    #                                                                                       word_frequency_threshold)

    if do_single_document_LDA:

        if do_ngrams:
            # load bigram trigram quadgram models
            bigram_mod_fname = '/tmp/bigram_mod.pkl'
            trigram_mod_fname = '/tmp/trigram_mod.pkl'
            quadgram_mod_fname = '/tmp/quadgram_mod.pkl'

            download_blob('debiaser_data', 'bigram_mod.pkl', bigram_mod_fname)
            download_blob('debiaser_data', 'trigram_mod.pkl',
                          trigram_mod_fname)
            download_blob('debiaser_data', 'quadgram_mod.pkl',
                          quadgram_mod_fname)

            with open(bigram_mod_fname, 'rb') as pickle_file:
                bigram_mod = pickle.load(pickle_file)

            with open(trigram_mod_fname, 'rb') as pickle_file:
                trigram_mod = pickle.load(pickle_file)

            with open(quadgram_mod_fname, 'rb') as pickle_file:
                quadgram_mod = pickle.load(pickle_file)

            print('FINDING QUADGRAMS')

            # make up to quad grams
            article_processed = make_quadgrams(article_processed, bigram_mod,
                                               trigram_mod, quadgram_mod)

            # remove to free memory
            os.remove(bigram_mod_fname)
            os.remove(trigram_mod_fname)
            os.remove(quadgram_mod_fname)

        print('generating dictionary and bag of words vector...')
        start = time.process_time()
        processed_corpus, processed_dictionary, bow_corpus = get_simple_corpus_dictionary_bow(
            article_processed)
        print('TIME FOR GENERATING DICTIONARY AND BOW VECTOR')
        print(time.process_time() - start)

        print('generating lda model...')
        start = time.process_time()
        # generate the LDA model
        lda = LdaModel(corpus=bow_corpus,
                       num_topics=num_lda_topics,
                       id2word=processed_dictionary,
                       passes=n_passes)
        print('TIME FOR GENERATING LDA MODEL')
        print(time.process_time() - start)

        # get the topics from the lda model
        lda_topics = lda.show_topics(formatted=False)

        # ALL INTERESTING BUT DEPRECATED FOR NOW
        # WILL FOLLOW SIMPLER APPROACH:
        # Just take top word in each generated topic

        # get top words per topic
        lda_top_topic_words_string, lda_top_topic_words_list = get_lda_top_topic_words(
            lda_topics, num_lda_topics, do_unique_search_words, n_search_words)

    # doing tfidf
    else:

        # specify file name
        tfidf_matrix_filename = '/tmp/tfidf_matrix.pkl'

        # download the tfidf matrix
        print('DOWNLOADING TFIDF MODEL')
        download_blob('debiaser_data', 'tfidf_matrix.pkl',
                      tfidf_matrix_filename)

        with open(tfidf_matrix_filename, 'rb') as pickle_file:
            tfidf = pickle.load(pickle_file)

        # remove from memory
        os.remove(tfidf_matrix_filename)

        if do_ngrams:
            # load bigram trigram quadgram models
            bigram_mod_fname = '/tmp/bigram_mod.pkl'
            trigram_mod_fname = '/tmp/trigram_mod.pkl'
            quadgram_mod_fname = '/tmp/quadgram_mod.pkl'

            download_blob('debiaser_data', 'bigram_mod.pkl', bigram_mod_fname)
            download_blob('debiaser_data', 'trigram_mod.pkl',
                          trigram_mod_fname)
            download_blob('debiaser_data', 'quadgram_mod.pkl',
                          quadgram_mod_fname)

            with open(bigram_mod_fname, 'rb') as pickle_file:
                bigram_mod = pickle.load(pickle_file)

            with open(trigram_mod_fname, 'rb') as pickle_file:
                trigram_mod = pickle.load(pickle_file)

            with open(quadgram_mod_fname, 'rb') as pickle_file:
                quadgram_mod = pickle.load(pickle_file)

            # make up to quad grams
            combined_article = make_quadgrams(combined_article, bigram_mod,
                                              trigram_mod, quadgram_mod)

            # remove to free memory
            os.remove(bigram_mod_fname)
            os.remove(trigram_mod_fname)
            os.remove(quadgram_mod_fname)

        # download dictionary
        id2word_fname = '/tmp/id2word.pkl'
        download_blob('debiaser_data', 'id2word_ec2.pkl', id2word_fname)

        with open(id2word_fname, 'rb') as pickle_file:
            processed_dictionary = pickle.load(pickle_file)

        # remove to free memory
        os.remove(id2word_fname)

        print('GENERATING BOW VECTOR FOR ARTICLE')
        # get bag of words representation
        bow_corpus_article = [
            processed_dictionary.doc2bow(text) for text in combined_article
        ]

        print('GETTING TF IDF SCORE')
        tfidf_vector = tfidf[bow_corpus_article[0]]

        # sort the tfidf vector
        tfidf_vector = sorted(tfidf_vector, key=getKey, reverse=True)

        # if there are fewer words than search words, then just use how many words there are
        if len(tfidf_vector) < n_search_words:
            n_search_words = len(tfidf_vector)

        top_tfidf_values = [
            tfidf_vector[i][0] for i in range(0, n_search_words)
        ]
        print(top_tfidf_values)

        top_words_list = [
            processed_dictionary[i].replace("_", " ") for i in top_tfidf_values
        ]

        top_words_string = ' '
        for word in top_words_list:
            if word not in top_words_string:
                top_words_string += ' ' + word

    # get dictionary of google queries
    queries_dict = {}

    for domain in all_sides_domains:

        # if this is single document lda
        if do_single_document_LDA:
            query = 'www.news.google.com/search?q=site:' + domain + lda_top_topic_words_string

        # if this is tfidf
        else:
            query = 'www.news.google.com/search?q=site:' + domain + top_words_string

        queries_dict[domain] = query

    return json.dumps(queries_dict)
Example #27
#coherence_model_lda = CoherenceModel(model=lda_model, texts=nps_comment_filtered, dictionary=id2word, coherence='c_v')
#coherence_lda = coherence_model_lda.get_coherence()
#print('\nCoherence Score: ', coherence_lda)

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
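
This snippet assumes a trained lda_model and a stop_words list from earlier cells, plus roughly these imports:

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from wordcloud import WordCloud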
Example #28
def lda_analysis(input_data, num_topics=3, random_state=1):

    # treat each set of documents as a separate corpus and find topics?
    for key, value in input_data.items():
        _texts = []
        for k, v in input_data[key].items():
            _texts.append(' '.join(input_data[key][k]['lemmas']))

        texts = [simple_preprocess(doc) for doc in _texts]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(line) for line in texts]

        # build lda model:
        lda_model = LdaModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics,
                             random_state=random_state)

        # get document topic distribution:
        doc_topic_dist = get_corpus_topics(_texts, lda_model)

        topic_terms = lda_model.show_topics(num_words=100)
        # get top words for each topic:
        topic_term_dict = OrderedDict()
        rel_terms = []
        for topic_dist in topic_terms:
            topic_id = topic_dist[0]
            topic_term_dict[topic_id] = {}
            topic_terms = topic_dist[1]
            for _split in topic_terms.split('+'):
                topic_term_prob = _split.split('*')[0]
                topic_term = str(_split.split('*')[1]).replace('"', '').strip()
                topic_term_dict[topic_id][topic_term] = float(topic_term_prob)
                # rel_terms.append(topic_term)

        summary_sentences = {}
        sen_ranker = []
        # calculate rank for each sentence with respect to each topic:
        for k, v in input_data[key].items():
            sen = k
            # sen = sen.lower()
            sen_length = len(sen.split(' '))
            sen_id = input_data[key][sen]['doc_id']
            if sen_length <= 7:
                continue
            sen_topic = []
            # compute score for each topic:
            for topic in range(num_topics):
                rel_sen_terms = list(
                    set(input_data[key][k]['lemmas'])
                    & set(topic_term_dict[topic].keys()))
                sen_score = 0
                for term in rel_sen_terms:
                    sen_score += topic_term_dict[topic][term]

                sen_topic.append((topic, sen_score, sen, sen_id))

            # select top one from sen_topic and append to sen_ranker:
            top_sen_topic = sorted(sen_topic, key=lambda x: x[1],
                                   reverse=True)[0]
            sen_ranker.append(top_sen_topic)

        for _sen in sen_ranker:
            topic = _sen[0]
            sen_score = _sen[1]
            sen = _sen[2]
            sen_id = _sen[3]
            input_data[key][sen].update({"LDAscore": sen_score})
            input_data[key][sen].update({"lda_topic_id": topic})

    return input_data
Example #29
def topic_model_gensim_lda(col: str, prefix=None, min_topics=19,max_topics=19,step=2) -> None:
    def trigram_bow_generator(filepath: str):
        '''
        generator function to read docs from a file
        and yield a bag-of-words representation
        '''
        for doc in LineSentence(filepath):
            yield trigram_dictionary.doc2bow(doc)

    if prefix is None:
        prefix = ''
    # for topic modeling
    
    trigram_docs_filepath = data_dir_processed / f'{prefix}{col}_transformed_docs_all.txt'
    print(f'Loading input file {trigram_docs_filepath}')
    trigram_dictionary_filepath = data_dir_processed / f'{prefix}{col}_trigram_dict_all.dict'
    trigram_bow_filepath = data_dir_processed / f'{prefix}{col}_trigram_bow_corpus_all.mm'

    #resp_whytfa_trigram_transformed_docs_all.txt

    # turn to posix filepaths until gensim supports this
    # trigram_docs_filepath = trigram_docs_filepath.as_posix()
    trigram_docs_filepath =  trigram_docs_filepath.as_posix()
    trigram_dictionary_filepath = trigram_dictionary_filepath.as_posix()
    trigram_bow_filepath = trigram_bow_filepath.as_posix()

    # TODO - change 1 == 1 lines to overwrite_interim

    # this is a bit time consuming - make the if statement True
    # if you want to learn the dictionary yourself.
    if 1 == 1:
        trigram_docs = LineSentence(trigram_docs_filepath)
        # learn the dictionary by iterating over all of the docs
        trigram_dictionary = Dictionary(trigram_docs)
        print(trigram_dictionary)
        #for k, v in trigram_dictionary.iteritems():
        #    print (f'{k}, {v}')


        # filter tokens that are very rare or too common from
        # the dictionary (filter_extremes) and reassign integer ids (compactify)
        trigram_dictionary.filter_extremes(no_below=min_absolute_frequency,
                                           no_above=max_relative_frequency,
                                           keep_n=max_features,
                                           )
        trigram_dictionary.compactify()
        print(trigram_dictionary)
        #for k, v in trigram_dictionary.iteritems():
        #    print (f'{k}, {v}')

        if verbose:
            logger.info(f'Saving trigram dictionary: {trigram_dictionary_filepath} {len(trigram_dictionary)}')
        trigram_dictionary.save(trigram_dictionary_filepath)

    # load the finished dictionary from disk
    if verbose:
        logger.info(f'Loading trigram dictionary: {trigram_dictionary_filepath}')
    trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

    # this is a bit time consuming - make the if statement True
    # if you want to build the bag-of-words corpus yourself.
    if 1 == 1:
        # generate bag-of-words representations for
        # all docs and save them as a matrix
        if verbose:
            print(f'Saving corpus: {trigram_bow_filepath}')
        MmCorpus.serialize(trigram_bow_filepath,
                           trigram_bow_generator(trigram_docs_filepath))
    # load the finished bag-of-words corpus from disk
    if verbose:
        print(f'Loading corpus: {trigram_bow_filepath}')
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
    num_topics_range = range(min_topics, max_topics + 1, step)

    #iterations = 2000
    #chunksize = 100  # more than the number of docs?

    passes = 10
    # iterations = 400
    iterations = 100
    # chunksize = len(trigram_bow_corpus)
    chunksize = 100  # more than the number of docs?
    eta = 'auto'
    #eval_every = None  # Don't evaluate model perplexity, takes too much time.
    workers=1
    print(f'cpu_count:{cpu_count()}')
    alpha='auto'
    if multicore:
        # for multicore; one fewer than the number of cores
        workers = cpu_count() - 1
        if verbose:
            print(f'Multiprocessing with {workers} cores (one fewer than the number of cores)')
    else:
        # for single core; alpha='auto' cannot be used with LdaMulticore
        alpha = 'auto'

    # now_str = datetime.now(timezone('US/Pacific')).strftime('%Y-%m-%d-%H-%M-%S')
    now_str = ''#datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    save_dir = data_dir_processed / f'{prefix}{col}_gensim_lda_models_{now_str}'
    if not save_dir.exists():
        save_dir.mkdir(parents=True, exist_ok=True)
    # save_dir_s3 = f'{data_dir_processed_s3}/{prefix}{col}_gensim_lda_models_{now_str}'

    # lm_list = []
    c_v = []
    u_mass = []
    perp = []
    #alg='LDA'
    alg='Mallet'

    for num_topics in num_topics_range:

        if(alg == 'Mallet'):
            logger.info('Using Mallet...')
            #try the Mallet implementation
            ldamallet = LdaMallet(mallet_path, corpus=trigram_bow_corpus, num_topics=num_topics, id2word=trigram_dictionary,workers=workers,iterations=iterations)

            ldamallet_filepath = (save_dir / f'gensim_ldamallet_{num_topics}_topics').as_posix()
            ldamallet.save(ldamallet_filepath)

            for t in ldamallet.show_topics(num_topics=-1, num_words=10, formatted=False):
                words = [w[0] for w in t[1]]
                logger.info('topic {:2d}\t{}'.format(t[0], ' '.join(words)))

            # Show Topics
            #print(ldamallet.show_topics(formatted=False))

            # Compute Coherence Score
            cm = CoherenceModel(model=ldamallet, texts=trigram_docs, dictionary=trigram_dictionary, coherence='c_v')
            c_v.append(cm.get_coherence())
            cm = CoherenceModel(model=ldamallet, corpus=trigram_bow_corpus,
                            dictionary=trigram_dictionary, coherence='u_mass')#, processes=workers)
            u_mass.append(cm.get_coherence())
            #perp_lower_bound = ldamallet.log_perplexity(trigram_bow_corpus)
            #perp.append(2**(-perp_lower_bound))
            perp.append(0)

        else:
            logger.info('Using LDA...')
            #TODO: try with and without alpha
            ldamodel = LdaModel(corpus=trigram_bow_corpus, id2word=trigram_dictionary,
                                num_topics=num_topics, passes=passes, iterations=iterations,
                                chunksize=chunksize, eta=eta, #eval_every=eval_every,
                                alpha=alpha,
                                random_state=np.random.RandomState(seed=10101010),
                                )
            #ldamodel = LdaMulticore(corpus=trigram_bow_corpus, id2word=trigram_dictionary,
            #                     num_topics=num_topics, passes=passes, iterations=iterations,
            #                     chunksize=chunksize, eta=eta, #eval_every=eval_every,
            #                     random_state=np.random.RandomState(seed=10101010),
            #                     workers=workers
            #                     )                                 
             
            ldamodel_filepath = (save_dir / f'gensim_lda_{num_topics}_topics').as_posix()
            ldamodel.save(ldamodel_filepath)

            for t in ldamodel.show_topics(num_topics=-1, num_words=50, formatted=False):
                words = [w[0] for w in t[1]]
                logger.info('topic {:2d}\t{}'.format(t[0], ' '.join(words)))

            cm = CoherenceModel(model=ldamodel, texts=trigram_docs,
                            dictionary=trigram_dictionary, coherence='c_v')#, processes=workers)
            c_v.append(cm.get_coherence())
            cm = CoherenceModel(model=ldamodel, corpus=trigram_bow_corpus,
                            dictionary=trigram_dictionary, coherence='u_mass') #, processes=workers)
            u_mass.append(cm.get_coherence())
            perp_lower_bound = ldamodel.log_perplexity(trigram_bow_corpus)
            perp.append(2**(-perp_lower_bound))

    coh_perp = pd.DataFrame(
        data=np.array([c_v, u_mass, perp]).T,
        columns=['c_v', 'u_mass', 'perp'],
        index=list(num_topics_range))
    coh_perp.index.name = 'num_topics'
    coh_perp_filepath = save_dir / 'coherence_perplexity.csv'
    coh_perp.to_csv(coh_perp_filepath)
    logger.info('coherence_docs={0}, coherence_corpus={1}, perplexity={2}'.format(c_v, u_mass, perp))
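
A minimal follow-up sketch (not part of the original example; it assumes the save_dir and CSV produced by the run above) showing how the logged scores could be used to pick a topic count, taking the num_topics with the highest c_v coherence:

import pandas as pd

coh_perp = pd.read_csv(save_dir / 'coherence_perplexity.csv', index_col='num_topics')
best_num_topics = coh_perp['c_v'].idxmax()  # c_v coherence: higher is better
print(f'best num_topics by c_v coherence: {best_num_topics}')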
Example #30
0
File: demo.py Project: pielstroem/Topics
def upload_file():
    """
    Upload csv files and create:
        * ~/out/corpus.dict
        * ~/out/corpus.lda
        * ~/out/corpus.lda.state
        * ~/out/corpus.mm
        * ~/out/corpus.mm.index
        * ~/out/corpus_doclabels.txt
        * ~/out/corpus_topics.txt
        * ~/mycorpus.txt

    As well as (for example):
        * ~/swcorp/Doyle_AStudyinScarlet.txt
        * ~/swcorp/Lovecraft_AttheMountainofMadness.txt
        * etc.
    """

    # INPUT
    # columns to read from csv file
    columns = ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity']

    # parts-of-speech to include into the model
    pos_tags = ['ADJ', 'NN', 'V']

    # stopwords
    regex = re.compile(r'\w+')
    stopwords = request.files['stoplist']
    stopwords = str(stopwords.readlines())
    stopwords = regex.findall(stopwords)
    stopwords.extend(("'", "'d", "'s")) # temporary solution
    print(stopwords)

    # document size (in words)
    doc_size = 1000

    # uses the pipeline's ParagraphId to split text into documents,
    # overrides doc_size - 1: on, 0: off
    doc_split = 0

    # no. of topics to be generated
    no_of_topics = 30

    # no. of lda iterations - usually, the more the better, but
    # increases computing time
    no_of_passes = 1

    # perplexity estimation every n chunks -
    # the smaller the better, but increases computing time
    eval = 1

    # documents to process at once
    chunk = 100

    # "symmetric", "asymmetric", "auto", or array
    # (default: a symmetric 1.0/num_topics prior) affects sparsity of
    # the document-topic (theta) distribution
    alpha = "symmetric"

    # custom alpha may increase topic coherence, but may also produce
    # more topics with zero probability alpha = np.array([ 0.02, 0.02,
    # 0.02, 0.03, 0.03, 0.03, 0.04, 0.04, 0.04, 0.05, 0.05, 0.04, 0.04,
    # 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02])

    # can be a number (int/float), an array, or None
    # affects topic-word (lambda) distribution - not necessarily
    # beneficial to topic coherence
    eta = None

    # PREPROCESSING
    files = request.files.getlist('files')
    docs = []
    doc_labels = []

    print("\n reading files ...\n")

    for file in files:
        file_label = secure_filename(file.filename).split('.')[0]

        df = pd.read_csv(file, sep="\t", quoting=csv.QUOTE_NONE)
        df = df[columns]
        df = df.groupby('CPOS')

        doc = pd.DataFrame()
        for p in pos_tags:  # collect only the specified parts-of-speech
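            # note: DataFrame.append was removed in pandas 2.0; pd.concat([doc, df.get_group(p)]) is the modern equivalent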
            doc = doc.append(df.get_group(p))
            # construct documents
            if doc_split:  # size according to paragraph id
                doc = doc.groupby('ParagraphId')
                for para_id, para in doc:
                    docs.append(para['Lemma'].values.astype(str))
                    doc_labels.append(
                        ''.join([file_label, " #", str(para_id)]))
            else:  # size according to doc_size
                doc = doc.sort_values(by='TokenId')
                i = 1
                while doc_size < doc.shape[0]:
                    docs.append(
                        doc[:doc_size]['Lemma'].values.astype(str))
                    doc_labels.append(
                        ''.join([file_label, " #", str(i)]))
                    doc = doc.drop(doc.index[:doc_size])
                    i += 1
                docs.append(doc['Lemma'].values.astype(str))
                doc_labels.append(''.join([file_label, " #", str(i)]))

            if not os.path.exists(os.path.join(os.getcwd(), "swcorp")):
                os.makedirs(os.path.join(os.getcwd(), "swcorp"))

            swpath = os.path.join('swcorp', "".join(file_label))

            with open(swpath + ".txt", 'w', encoding="utf-8") as text:
                text.write(" ".join(
                    word for word in doc['Lemma'].values.astype(str)
                    if word not in stopwords))

    print("\n normalizing and vectorizing ...\n")

    # texts = [
    #   [word for word in doc if word not in stopwords] for doc in docs]

    print("\n stopwords removed ...\n")

    print("\n writing mastercorpus ...\n")

    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    with open(mastercorpus, 'w', encoding="utf-8") as data:
        folder = glob.glob("swcorp/*")
        for path in folder:
            with open(path, 'r', encoding="utf-8") as text:
                # collapse whitespace so each corpus file becomes a single line
                textline = ' '.join(text.read().split())
                if path != folder[-1]:
                    data.write(textline + "\n")
                else:
                    data.write(textline)

    # MAIN PART
    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    dictionary = corpora.Dictionary(
        line.lower().split() for line in open(
            mastercorpus, encoding="utf-8"))

    class MyCorpus(object):
        def __iter__(self):
            for line in open(mastercorpus, encoding="utf-8"):
                # assume there's one document per line, tokens
                # separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

    # corpus = buildCorpus(mastercorpus, dictionary)

    corpus = MyCorpus()

    # corpus = glob.glob("swcorpus/*")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)): os.makedirs(os.path.join
    # (os.path.join(os.getcwd(), 'out'), foldername))

    MmCorpus.serialize(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus.mm'])), corpus)
    mm = MmCorpus('out/corpus.mm')

    print(mm)

    # doc_labels = glob.glob("corpus/*")

    print("fitting the model ...\n")

    model = LdaModel(
        corpus=mm, id2word=dictionary, num_topics=no_of_topics,
        passes=no_of_passes, eval_every=eval, chunksize=chunk,
        alpha=alpha, eta=eta)

    # model = LdaMulticore(corpus=corpus, id2word=dictionary,
    # num_topics=no_of_topics, passes=no_of_passes,
    # eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

    print(model, "\n")

    topics = model.show_topics(num_topics=no_of_topics)

    for i, item in enumerate(topics):
        print("topic #" + str(i) + ": " + str(item) + "\n")

    print("saving ...\n")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)):
    # os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'),
    # foldername))

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_doclabels.txt"])), "w", encoding="utf-8") as f:
        for item in doc_labels:
            f.write(item + "\n")

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_topics.txt"])), "w", encoding="utf-8") as f:
        for i, item in enumerate(topics):
            f.write(
                "".join(["topic #", str(i), ": ", str(item), "\n"]))

    dictionary.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'dict'])))
    # MmCorpus.serialize(
    # os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
    # [foldername, 'mm'])), corpus)
    model.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'lda'])))

    print("\n ta-daaaa ...\n")
    
    # VISUALIZATION
    no_of_topics = model.num_topics
    no_of_docs = len(doc_labels)
    doc_topic = np.zeros((no_of_docs, no_of_topics))
    
    for doc, i in zip(corpus, range(no_of_docs)):
        # topic_dist is a list of (topic_id, topic_prob) tuples
        topic_dist = model[doc]
        for topic in topic_dist:
            doc_topic[i][topic[0]] = topic[1]
    
    # get plot labels
    topic_labels = []
    for i in range(no_of_topics):
        # show_topic() returns (word, probability) tuples
        topic_terms = [x[0] for x in model.show_topic(i, topn=3)]
        topic_labels.append(" ".join(topic_terms))
        
    # cf. https://de.dariah.eu/tatom/topic_model_visualization.html

    if no_of_docs > 20 or no_of_topics > 20:
        plt.figure(figsize=(20, 20)) # if many items, enlarge figure
    plt.pcolor(doc_topic, norm=None, cmap='Reds')
    plt.yticks(np.arange(doc_topic.shape[0]) + 0.5, doc_labels)
    plt.xticks(
        np.arange(doc_topic.shape[1]) + 0.5, topic_labels, rotation='90')
    plt.gca().invert_yaxis()
    plt.colorbar()
    plt.tight_layout()
    plt.savefig("./static/corpus_heatmap.svg")
    return render_template('success.html')
Example #31
0
    vocab = Dictionary.load_from_text('./vocab.txt')
    corpus = UnlabeledCorpus('./rumor_train.csv', vocab)
    valid_corpus = UnlabeledCorpus('./rumor_valid.csv', vocab)
    valid_sentences = [doc for doc in valid_corpus][5000:]

    # varying number of topics
    # result = {}
    # for num_topics in [2, 4, 8, 16, 32, 64]:
    #     best_value = -100
    #     for i in range(5):
    #         model = LdaModel(corpus=corpus, id2word=vocab, num_topics=num_topics)
    #         likelihood = model.log_perplexity(valid_sentences)
    #         best_value = max(best_value, likelihood)
    #     result[num_topics]= best_value
    #
    # for num_topics, likelihood in result.items():
    #     print('num_topics: %d, best word_likelihood: %f' % (num_topics, likelihood))

    model = LdaModel(corpus=corpus, id2word=vocab, num_topics=8, passes=2)
    model.save('./lda_model.txt')
    # print topics to a file
    topics = model.show_topics(num_topics=100, num_words=50)
    with codecs.open('./topics.txt', 'w', 'utf-8') as out_f:
        for topic in topics:
            topic_id, topic_str = topic[0], topic[1]
            out_f.write('%d:\n%s\n' % (topic_id, topic_str))
        out_f.write('\n')
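
A hedged usage sketch (not from the original file; the tokenised example document is an assumption) for reusing the saved model to infer the topic mixture of a new document:

from gensim.models import LdaModel

lda = LdaModel.load('./lda_model.txt')
new_doc_tokens = ['some', 'tokenised', 'text']      # assumed, already-tokenised input
bow = vocab.doc2bow(new_doc_tokens)                 # vocab: the Dictionary loaded above
print(lda.get_document_topics(bow, minimum_probability=0.05))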



Example #32
0
def create_lda_model():
    logging.info('about to create all docs from chunks')
    start_time = datetime.datetime.now()
    create_all_docs()
    end_time = datetime.datetime.now()
    logging.info('total time is: %s', end_time - start_time)

    logging.info('about to load all docs')
    with open('./resources/LDA_processing/all_docs.pkl', mode='rb') as f:
        all_docs = pickle.load(f)

    logging.info('about to load english words')
    with open('./resources/LDA_input/english_full_list.txt') as f:
        english_words = f.read().splitlines()

    good_english_words = set(english_words[75:21000])
    del english_words
    logging.info('about to remove all stop-words and unknown words')
    texts = []
    for i, doc in enumerate(all_docs):
        filtered_doc = [word for word in doc if word in good_english_words]
        texts.append(filtered_doc)
        if i % 5000 == 0:
            logging.info('Finished doc: %s', i)

    logging.info('about to release memory of all_docs and english_words')
    del all_docs
    del good_english_words

    logging.info('about to save texts')
    with open('./resources/LDA_processing/texts.pkl', mode='wb') as f:
        pickle.dump(texts, f)

    logging.info('about to load texts')
    with open('./resources/LDA_processing/texts.pkl', mode='rb') as f:
        texts = pickle.load(f)

    logging.info('about to create dictionary')
    dictionary = corpora.Dictionary(texts)
    keys = dictionary.keys()
    logging.info('dict size before filter: %s', len(keys))
    dictionary.filter_extremes(keep_n=150000)
    dictionary.filter_extremes(no_below=150, no_above=0.05)
    keys = dictionary.keys()
    logging.info('dict size after filter: %s', len(keys))
    dictionary.save('./resources/LDA_processing/lda.dict')
    dictionary.save_as_text('./resources/LDA_processing/lda_dict.txt')

    logging.info('about to create corpus')
    corpus = [dictionary.doc2bow(text) for text in texts]

    logging.info('about to save corpus as mm file')
    corpora.MmCorpus.serialize('./resources/LDA_processing/corpus.mm', corpus)

    logging.info('about to load dictionary file')
    dictionary = corpora.Dictionary.load('./resources/LDA_processing/lda.dict')

    logging.info('about to load corpus as mm file')
    corpus = corpora.MmCorpus('./resources/LDA_processing/corpus.mm')

    logging.info('about to start LDA model')
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    logging.info('finished LDA model')

    logging.info('about to save ldaModel')
    lda.save('./resources/LDA_processing/LdaModel')

    logging.info('about to load ldaModel')
    lda = LdaModel.load('./resources/LDA_processing/LdaModel')

    logging.info('about to find topics')
    topics = lda.show_topics(num_topics=num_topics, num_words=10000, log=True, formatted=False)

    logging.info('about to save topics')
    with open('./resources/LDA_processing/topics.pkl', mode='wb') as f:
        pickle.dump(topics, f)

    dict_word_sets = find_words_from_lda_model()
    with open('./resources/LDA_processing/dict_word_sets.pkl', mode='wb') as f:
        pickle.dump(dict_word_sets, f)

    topics_words = extract_words_from_word_sets()
    with open('./resources/LDA_result/topic_words', mode='wt', encoding='utf-8') as f:
        f.write('\n'.join(topics_words))
Example #33
0
File: lda2.py Project: pielstroem/Topics
MmCorpus.serialize(corpusPath, corpus)

mm = MmCorpus(corpusPath)

doc_labels = makeDocLabels(path)

log.info('fitting the model ...')

# fitting the model
model = LdaModel(corpus=mm, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes,
                 eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

log.info('generated topics...')

# print topics
topics = model.show_topics(num_topics=no_of_topics)

for i, item in enumerate(topics):
    log.info('topic #%s: %s', i, item)


log.info('saving results...')

# create output folder
if not os.path.exists("out"): os.makedirs("out")

# save doc_labels for further use
with open(os.path.join(os.path.join(os.getcwd(), "out"),
                       ''.join([foldername, "_doclabels.txt"])), "w", encoding="utf-8") as f:
    for item in doc_labels:
        f.write(item + "\n")
# save topics for further use
Example #34
0
# Count words in the 'objective', keeping only those that occur at least 5 times
vectorizer = fe.text.CountVectorizer(stop_words='english', min_df=5)
X = vectorizer.fit_transform(h2020.objective)

# Convert to gensim format
corpus = Sparse2Corpus(X, documents_columns=False)

# Create mapping from word IDs (integers) to words (strings)
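# note: scikit-learn >= 1.0 renames get_feature_names() to get_feature_names_out()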
id2word = dict(enumerate(vectorizer.get_feature_names()))

# Fit LDA model with 10 topics
lda = LdaModel(corpus=corpus, num_topics=10, id2word=id2word)

# Show top 5 words for each of the 10 topics
lda.show_topics(num_topics=10, num_words=5)
'''
word2vec using gensim
'''

# Convert adjectives and verbs to corresponding lemmas using spaCy
objectives = [
    [x.lemma_
     if x.pos in (spacy.parts_of_speech.ADJ, spacy.parts_of_speech.VERB)
     else x.text
     for x in en(text)]
    for text in h2020.objective]

# Fit word2vec model
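# note: gensim >= 4.0 renames size to vector_size (and iter to epochs)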
w2c = Word2Vec(sentences=objectives, size=100, window=5, min_count=5)

Example #35
0
    dictionary
)
pyLDAvis.save_html(p, '../../results/reports/onsite_search_terms_lda_2017_2019_20_topic.html')


# The size of the bubble measures the importance of the topics, relative to the data.
# 
# The terms are ordered by saliency (how much the term tells you about the topic).
# 
# The relevance slider can be used to adjust saliency scores.
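
The call that builds p is not shown above; a typical construction (an assumption, reusing the ldamodel, corpus and dictionary names from the surrounding snippets, with a module path that depends on the pyLDAvis version) looks like:

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis  # 'pyLDAvis.gensim' in pyLDAvis < 3.x

p = gensimvis.prepare(ldamodel, corpus, dictionary)
pyLDAvis.save_html(p, 'lda_topics.html')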

num_topics = 20


censored = [9]
[x for x in ldamodel.show_topics(num_topics=num_topics) if x[0] not in censored]


# ### Tracking trends over time
# 
# Given a gensim model, label a corpus by topic and plot them over time. How do they change relative to one another?
# 
# Top topics may follow similar trends to global search patterns. Instead, look at "% of searches that are topic".

# First, we need to label the training data

from tqdm import tqdm


tqdm.pandas()
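
A hedged sketch of the labelling step described above (the frame name df, its bow and date columns, and the ldamodel name are assumptions, not from the original code): assign each document its dominant topic, then track each topic's share of searches per month rather than raw counts.

def dominant_topic(bow):
    # most probable topic id for one bag-of-words document
    topic_probs = ldamodel.get_document_topics(bow, minimum_probability=0.0)
    return max(topic_probs, key=lambda tp: tp[1])[0]

df['topic'] = df['bow'].progress_apply(dominant_topic)

# share of searches per topic per month
monthly = df.groupby([df['date'].dt.to_period('M'), 'topic']).size()
topic_share = monthly / monthly.groupby(level=0).transform('sum')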
Example #36
0
File: docs.py Project: rafunchik/shrimps

#  we will use Latent Dirichlet Allocation to try to categorise the abstracts
# this is slow the first time you run it, while the model is being trained
print("lda")
lda_filename = '/tmp/model.lda'
if not os.path.isfile(lda_filename):
    lda = LdaModel(corpus, num_topics=5,
                   id2word=dictionary,
                   update_every=5,
                   chunksize=10000,
                   passes=100)
    lda.save(lda_filename)
else:
    lda = LdaModel.load(lda_filename)
lda.show_topics()
topics_matrix = lda.show_topics(formatted=False, num_words=7)

print(topics_matrix)
print(len(topics_matrix))

for topic in topics_matrix:
    topic_words = topic[1]  # list of (word, probability) pairs
    print([str(word) for word, _ in topic_words])
#
# topics_matrix = np.array(topics_matrix)
#
# topic_words = topics_matrix[:, :, 1]
# for i in topic_words:
#     print([str(word) for word in i])
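
A short hedged sketch (reusing lda and corpus from the snippet above, assumed to be aligned with the abstract list) of the categorisation step the comment describes, assigning each abstract to its most probable topic:

doc_topics = []
for bow in corpus:
    dist = lda.get_document_topics(bow, minimum_probability=0.0)
    doc_topics.append(max(dist, key=lambda t: t[1])[0])
print(doc_topics[:10])  # dominant topic id per abstract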