def create_evaluation_perplexity(config, Kind):
    """Estimate held-out perplexity for the ``Kind`` corpus and append it to a CSV.

    Loads the dictionary and Mallet corpus named by ``config.corpus_fname``,
    holds out a random 10% of the documents, trains an ``LdaModel`` on the
    rest and appends ``[model_fname, bound]`` to
    ``evaluate-perplexity-results.csv`` under ``config.path``, where ``bound``
    is the value returned by ``LdaModel.log_perplexity`` on the held-out docs.

    Parameters
    ----------
    config : object exposing ``model_fname`` and ``corpus_fname`` (``%``
        templates), ``alpha``, ``passes``, ``num_topics`` and ``path``.
    Kind : class whose ``__name__`` is substituted into the templates.
    """
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    try:
        id2word = Dictionary.load(corpus_fname + '.dict')
        corpus = MalletCorpus(corpus_fname, id2word=id2word)
    except Exception:
        error('Corpora not built yet -- cannot evaluate')
        # If ``error`` returns instead of raising, stop here rather than
        # failing later with a NameError on ``corpus``.
        return

    num_docs = len(corpus)
    target_len = int(0.1 * num_docs)
    logger.info('Calculating perplexity with held-out %d of %d documents',
                target_len, num_docs)

    # Draw exactly ``target_len`` distinct, valid document indices.  The
    # previous ``randint(0, len(corpus))`` was inclusive of ``len(corpus)``,
    # an index no document has, which silently shrank the held-out set.
    held_out_ids = set(random.sample(range(num_docs), target_len))

    held_out = []
    training = []
    for doc_id, doc in enumerate(corpus):
        if doc_id in held_out_ids:
            held_out.append(doc)
        else:
            training.append(doc)

    model = LdaModel(training,
                     id2word=corpus.id2word,
                     alpha=config.alpha,
                     passes=config.passes,
                     num_topics=config.num_topics)

    pwb = model.log_perplexity(held_out)

    with open(config.path + 'evaluate-perplexity-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, pwb])
Exemplo n.º 2
0
    def Topic_Num_Decision(self, start, stop, size):
        """Train one LDA model per candidate topic count and score each one.

        Candidate counts are ``range(start, stop, size)``.  Every model is
        trained on ``self.corpus`` with ``self.dictionary``; its ``c_v``
        coherence is computed against ``self.news_doc`` and its
        ``log_perplexity`` against ``self.corpus``.  Each topic count is
        printed once its model has been scored.

        Returns
        -------
        tuple of three parallel lists: models, coherence values, and
        ``log_perplexity`` values.
        """
        models, coherences, perplexities = [], [], []

        for k in range(start, stop, size):
            lda = LdaModel(self.corpus, num_topics=k, id2word=self.dictionary)
            models.append(lda)

            scorer = CoherenceModel(model=lda,
                                    texts=self.news_doc,
                                    dictionary=self.dictionary,
                                    coherence="c_v")
            coherences.append(scorer.get_coherence())
            perplexities.append(lda.log_perplexity(self.corpus))
            print(k)

        return models, coherences, perplexities
Exemplo n.º 3
0
def get_gensim_topics(num_topics_list, sentences, print_flag = False):
    """
    Fit one gensim LDA model per entry of ``num_topics_list``.

    ``sentences`` must offer ``.apply(...).tolist()``; every element is run
    through ``retokenize`` and the resulting token lists are turned into a
    gensim dictionary and bag-of-words corpus.

    Returns two parallel lists: the ``log_perplexity`` of each model on the
    training corpus, and its ``c_v`` coherence.
    When ``print_flag == True`` the words of every printed topic are shown as
    a comma-separated line; a blank line is printed after every model
    regardless of the flag.
    """
    texts = sentences.apply(retokenize).tolist()
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(tokens) for tokens in texts]

    perplexity_ls, coherence_ls = [], []
    for num_topics in num_topics_list:
        lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, random_state=10)
        perplexity_ls.append(lda.log_perplexity(corpus))
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_ls.append(scorer.get_coherence())

        if print_flag == True:
            print('Num. Topics: ', num_topics)
            print('')
            for topic in lda.print_topics():
                # topic[1] looks like '0.012*"word" + 0.010*"other" + ...';
                # keep only the part after '*' and drop the quotes.
                weighted_terms = topic[1].split('+')
                terms = [term.split('*')[1].replace('"', '') for term in weighted_terms]
                print(', '.join(terms))
        print('')
    return perplexity_ls, coherence_ls
Exemplo n.º 4
0
    def get_model(dictionary, corpus, max_topics):
        """Train LDA models for 1 .. ``max_topics - 1`` topics and keep the most coherent.

        For every topic count an ``LdaModel`` is trained on ``corpus`` and
        scored with ``c_v`` coherence.  Perplexity is computed and printed but
        does not influence the selection.

        Parameters
        ----------
        dictionary : gensim dictionary passed as ``id2word``.
        corpus : bag-of-words corpus used for training and perplexity.
        max_topics : exclusive upper bound on the number of topics tried.

        Returns
        -------
        The model with the highest coherence, or ``None`` when
        ``max_topics <= 1`` and nothing was trained.

        NOTE(review): ``corpus_texts`` is not a parameter; it must be provided
        by an enclosing or module scope.
        """
        best_lda_model = None
        best_score = None
        best_num_topics = 1

        # Iterate over the topic count directly instead of keeping a separate
        # hand-incremented counter next to an unused loop variable.
        for current_num_topics in range(1, max_topics):
            lda_model = LdaModel(corpus=corpus,
                                 num_topics=current_num_topics,
                                 id2word=dictionary,
                                 passes=1000)
            coherence_model_lda = CoherenceModel(model=lda_model,
                                                 texts=corpus_texts,
                                                 dictionary=dictionary,
                                                 coherence='c_v')

            coherence_lda = coherence_model_lda.get_coherence()
            current_perplexity_score = lda_model.log_perplexity(corpus)
            # The selection score is plain coherence; kept as its own name so a
            # combined coherence/perplexity score can be swapped in later.
            current_score = coherence_lda
            print("Topics: ", current_num_topics, "Perplexity Score: ",
                  current_perplexity_score, "Coherence Score: ", coherence_lda)

            # Keep the model with the highest score seen so far.
            if best_score is None or current_score > best_score:
                best_score = current_score
                best_lda_model = lda_model
                best_num_topics = current_num_topics

        print("\nBest Num Topic: ", best_num_topics, best_score)
        return best_lda_model
Exemplo n.º 5
0
def calculateLDA(dictionary, corpus, texts, list_num_topics, saveModelPath=None):
    """
    Train an LDA model for each requested topic count and score it.

    Parameters:
    ----------
    dictionary:         Gensim dictionary
    corpus :            Gensim corpus
    texts :             Preprocessed texts (used for c_v coherence)
    list_num_topics:    iterable with the numbers of topics to train
    saveModelPath:      ``None`` (default) or ``[]`` -> models are not saved;
                        otherwise a string prefix, each model is saved to
                        ``saveModelPath + "K_<num_topics>.model"``

    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence value of each model, in the same order
    logPerplex_list : ``log_perplexity`` of each model on ``corpus``
    """
    # ``None`` replaces the former mutable ``[]`` default; an explicit ``[]``
    # is still accepted as "do not save" for existing callers.
    should_save = saveModelPath is not None and saveModelPath != []

    c_v = []
    lm_list = []
    logPerplex_list = []
    for num_topics in list_num_topics:
        print("\tNumber of topics:", num_topics)
        lm = LdaModel(corpus=corpus,
                      num_topics=num_topics,
                      id2word=dictionary,
                      random_state=0,
                      chunksize=5000,
                      passes=50,
                      eval_every=None,
                      alpha='auto',
                      eta='auto',
                      iterations=50)

        lm_list.append(lm)
        cm = CoherenceModel(model=lm,
                            corpus=corpus,
                            dictionary=dictionary,
                            texts=texts,
                            coherence='c_v',
                            processes=-1)
        logPerplex_list.append(lm.log_perplexity(corpus))
        c_v.append(cm.get_coherence())

        if should_save:
            lm.save(saveModelPath + "K_{}.model".format(num_topics))

    print("Number topics:", list_num_topics)
    print("Coherence scores:", c_v)
    print("LogPerplexity: ", logPerplex_list)
    return lm_list, c_v, logPerplex_list
Exemplo n.º 6
0
    def train_lda(self, cache_path):
        """Fit a 50-topic LDA model on the training data and report test-set perplexity.

        Steps:
          1. Convert every training item to gensim bag-of-words via
             ``self.bow_2_gensim`` and train an ``LdaModel`` on all of them as
             a single chunk.
          2. Write the top 10 words of each topic, one topic per line, to
             ``<cache_path>/ldatopic.txt``.
          3. Convert the test items the same way and print, in order: the total
             word count, ``log_perplexity``, the bound per word, and
             ``2 ** (-bound / word_count)``.

        Parameters
        ----------
        cache_path : directory in which ``ldatopic.txt`` is written.

        NOTE(review): assumes ``item[1]`` is a tensor holding the per-document
        word-count vector -- confirm against ``batchPostProcessor``.
        """
        print(cache_path)
        num_topics = 50

        trainBatchIter = BatchIterBert(self.trainDataIter,
                                       filling_last_batch=False,
                                       postProcessor=batchPostProcessor,
                                       batch_size=1)
        bow_list = []
        for item in trainBatchIter:
            bow = item[1].squeeze().detach().numpy().tolist()
            bow_list.append(self.bow_2_gensim(bow))
        print(len(bow_list))

        # Pass the plain list of documents: wrapping variable-length documents
        # in ``np.array`` builds a ragged array, which NumPy >= 1.24 rejects.
        lda = LdaModel(bow_list,
                       num_topics=num_topics,
                       passes=200,
                       chunksize=len(bow_list),
                       id2word=self.dictProcess.common_dictionary)

        topic_lines = []
        for topic_id in range(num_topics):
            top_words = [entry[0] for entry in lda.show_topic(topic_id, topn=10)]
            topic_lines.append(' '.join(top_words) + '\n')

        topic_file = os.path.join(cache_path, 'ldatopic.txt')
        with open(topic_file, 'w') as fo:
            fo.write(''.join(topic_lines))

        testBatchIter = BatchIterBert(self.testDataIter,
                                      filling_last_batch=False,
                                      postProcessor=batchPostProcessor,
                                      batch_size=1)

        test_bow_list = []
        word_count = 0
        for item in testBatchIter:
            bow = item[1].squeeze().detach().numpy().tolist()
            word_count += sum(bow)
            test_bow_list.append(self.bow_2_gensim(bow))

        print(word_count)
        ppl = lda.log_perplexity(test_bow_list, len(test_bow_list))
        print(ppl)
        bound = lda.bound(test_bow_list)
        print(bound / word_count)
        print(np.exp2(-bound / word_count))
Exemplo n.º 7
0
 def best_lda_model(self):
     """Plot ``log_perplexity`` on the test set against the number of topics.

     Trains one ``LdaModel`` for every topic count from 3 to 49 on
     ``self.corpus['content']`` and evaluates it on ``self.test.content``.
     Progress is printed for every topic count divisible by 10, and the
     (topic count, score) pairs are shown as a scatter plot.  Nothing is
     returned and no model is kept.
     """
     scores = []
     for n in range(3, 50):
         candidate = LdaModel(corpus=self.corpus['content'].tolist(),
                              id2word=self.dictionary,
                              num_topics=n)
         bound = candidate.log_perplexity(self.test.content.tolist(),
                                          total_docs=None)
         scores.append((n, bound))
         # Idea left disabled by the original author: remember the model with
         # the best score on self.model / self.perplexity.
         if not n % 10:
             print(n)
     plt.scatter(*zip(*scores))
     plt.show()
Exemplo n.º 8
0
def evaluate_perplexity(dictionary, corpus, texts, limit):
    """Train LDA models with 1..``limit`` topics and plot their ``log_perplexity``.

    Parameters
    ----------
    dictionary : gensim dictionary passed as ``id2word``.
    corpus : bag-of-words corpus used for training and evaluation.
    texts : unused.
    limit : largest number of topics to try (inclusive).

    Returns
    -------
    lda_list : the trained models, ordered by topic count.
    perplex : ``float16`` array of shape ``(1, limit)`` holding the
        ``log_perplexity`` of each model on ``corpus``.
    """
    topic_counts = range(1, limit + 1)
    perplex = np.zeros((1, limit), dtype=np.float16)

    lda_list = []
    for column, k in enumerate(topic_counts):
        print("Topic %d" % k)
        model = LdaModel(corpus=corpus, num_topics=k, id2word=dictionary)
        lda_list.append(model)
        perplex[0, column] = model.log_perplexity(corpus)

    # One curve: score per topic count.
    plt.plot(topic_counts, perplex.T)
    plt.xlabel("k topics")
    plt.ylabel("Perplexity")
    plt.show()

    return lda_list, perplex
Exemplo n.º 9
0
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3,
                             coherence_measure="c_v"):
    """Score LDA models over ``range(start, limit, step)`` topic counts.

    For each topic count a model is trained on ``corpus``; its perplexity is
    taken as ``2 ** (-log_perplexity(corpus))`` and its coherence is computed
    from ``texts`` and ``dictionary`` with ``coherence_measure``.

    Returns
    -------
    (coherence_values, perplexities) : two ``float32`` arrays, one entry per
    topic count tried.
    """
    scored = []
    for k in range(start, limit, step):
        lda = LdaModel(corpus, num_topics=k)
        perplexity = np.exp2(-lda.log_perplexity(corpus))
        scorer = CoherenceModel(model=lda,
                                texts=texts,
                                dictionary=dictionary,
                                coherence=coherence_measure)
        scored.append((scorer.get_coherence(), perplexity))

    coherence_values = np.array([pair[0] for pair in scored], dtype=np.float32)
    perplexities = np.array([pair[1] for pair in scored], dtype=np.float32)
    return coherence_values, perplexities
Exemplo n.º 10
0
class MyLda:
    """LDA model over the documents of ``myDictionary`` plus topic/document lookups.

    ``myDictionary`` must expose ``doc2bows`` (the bag-of-words documents) and
    ``dictionary`` (the gensim id-to-word mapping).
    """

    def __init__(self, myDictionary, num_topics=100, topic_threshold=0.15):
        """Train the model and build the topic <-> document index mappings."""
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.myDictionary = myDictionary
        self.model = LdaModel(
            self.myDictionary.doc2bows,
            id2word=self.myDictionary.dictionary,
            num_topics=num_topics,
        )
        self.topic2ids, self.id2topics = self.get_mappings()
        self.coherenceModel = None  # created on first get_coherence() call
        print("- Created MyLda with {} topics".format(self.num_topics))

    def get_mappings(self):
        """Return ``(topic2ids, id2topics)`` as two ``defaultdict(list)``.

        A document is linked to a topic when the topic's probability reaches
        ``topic_threshold``; the first topic reported for a document is always
        linked, whatever its probability.
        """
        topic2ids = defaultdict(list)
        id2topics = defaultdict(list)
        for doc_index, bow in enumerate(self.myDictionary.doc2bows):
            doc_topics = self.model.get_document_topics(bow)
            for rank, (topic, prob) in enumerate(doc_topics):
                if rank == 0 or prob >= self.topic_threshold:
                    topic2ids[topic].append(doc_index)
                    id2topics[doc_index].append(topic)
        return topic2ids, id2topics

    def get_topic_terms(self, topic):
        """Return the model's terms for ``topic``."""
        return self.model.get_topic_terms(topic)

    def get_top_topic(self):
        """Return ``(top_topics, average)``: the model's top topics and the sum
        of their scores divided by ``num_topics``."""
        top_topics = self.model.top_topics(corpus=self.myDictionary.doc2bows)
        total = sum(entry[1] for entry in top_topics)
        return top_topics, total / self.num_topics

    def get_perplexity(self):
        """Return ``log_perplexity`` of the model on the training documents."""
        return self.model.log_perplexity(self.myDictionary.doc2bows)

    def get_coherence(self):
        """Return the ``u_mass`` coherence, building the coherence model lazily."""
        if not self.coherenceModel:
            self.coherenceModel = CoherenceModel(
                model=self.model,
                corpus=self.myDictionary.doc2bows,
                dictionary=self.myDictionary.dictionary,
                coherence='u_mass',
            )
        return self.coherenceModel.get_coherence()
Exemplo n.º 11
0
def gridsearch_graph(dictionary, corpus, texts, list_num_topics):
    """
    Train one LDA model per topic count and collect its quality scores.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : preprocessed tweets
    list_num_topics: iterable with the numbers of topics to train

    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : c_v coherence of each model, in the same order
    logPerplex_list : ``log_perplexity`` of each model on ``corpus``
    """
    lm_list, c_v, logPerplex_list = [], [], []

    for k in list_num_topics:
        print("number of topics:", k)
        model = LdaModel(
            corpus=corpus,
            num_topics=k,
            id2word=dictionary,
            random_state=0,
            chunksize=5000,
            passes=50,
            eval_every=None,
            alpha='auto',
            eta='auto',
            iterations=50,
        )
        lm_list.append(model)
        logPerplex_list.append(model.log_perplexity(corpus))

        scorer = CoherenceModel(
            model=model,
            corpus=corpus,
            dictionary=dictionary,
            texts=texts,
            coherence='c_v',
            processes=-1,
        )
        c_v.append(scorer.get_coherence())

    # Plotting of c_v against the topic counts is intentionally disabled.

    return lm_list, c_v, logPerplex_list
Exemplo n.º 12
0
def cluster_questions(topic_num,
                      res_path,
                      q_path=r'datasets\DialogQA\Qall.txt',
                      a_path=r'datasets\DialogQA\Aall.txt'):
    """Cluster questions by the dominant LDA topic of their paired answer.

    An LDA model with ``topic_num`` topics is trained on the whitespace-split
    lines of ``a_path``.  Line *i* of ``q_path`` is then assigned to the most
    probable topic of answer line *i*, and each cluster is written to
    ``res_path + "<topic>.txt"``.

    Parameters
    ----------
    topic_num : number of LDA topics / output clusters.
    res_path : output prefix; created as a directory if it does not exist.
        It is concatenated directly with the file name, so it should end with
        a path separator.
    q_path, a_path : UTF-8 text files with one question / answer per line.

    Returns
    -------
    ``log_perplexity`` of the model on the answer corpus.
    """
    with open(a_path, 'r', encoding='utf-8') as f:
        common_texts = [text.split() for text in f]

    with open(q_path, 'r', encoding='utf-8') as f:
        questions = f.readlines()

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

    lda = LdaModel(common_corpus, num_topics=topic_num)

    questions_clusterd = [[] for _ in range(topic_num)]
    print('Questions : ', len(questions))
    perp = lda.log_perplexity(common_corpus)
    for i, q in enumerate(questions):
        doc_topics = lda[[common_corpus[i]]][0]
        if not doc_topics:
            # No topic reported for this answer: skip the question instead of
            # reusing the previous question's topic (or hitting an unbound name).
            continue
        topic = max(doc_topics, key=lambda pair: pair[1])[0]
        questions_clusterd[topic].append(q)

    # ``os._exists`` is a private helper that does not test the filesystem, so
    # the old check always called makedirs and failed on an existing directory.
    os.makedirs(res_path, exist_ok=True)
    for top in range(topic_num):
        with open(res_path + str(top) + '.txt', 'w', encoding='utf-8') as f:
            # Questions keep their trailing newline from readlines().
            f.writelines(questions_clusterd[top])

    return perp
Exemplo n.º 13
0
def lda_main(word_with_pos=WORD_WITH_POS, topic_num=LDA_TOPIC_NUM):
    """Train, save and visualise an LDA model for the JSON-lines corpus.

    Reads ``DATA_JSONLINE`` (one JSON object per line with a ``content`` list
    of pairs), removes stop words, builds and saves the dictionary and
    bag-of-words corpus, trains an ``LdaModel`` with ``topic_num`` topics,
    saves it under ``./models/``, appends ``topic_num , log_perplexity`` to
    ``perplexity.txt`` and writes a pyLDAvis HTML report.

    Parameters
    ----------
    word_with_pos : when truthy, each token is the concatenation of the first
        two fields of a ``content`` entry; otherwise only the first field.
        (The original note says the second field is the part-of-speech tag.)
    topic_num : number of LDA topics.
    """
    LDA_MODEL = './models/lda_{}.model'.format(topic_num)
    stop_word = read_stopword()
    begin_t = time.time()

    def func(line):
        """Turn one JSON line into a token list, optionally fusing word and tag."""
        content = json.loads(line.strip())['content']
        if word_with_pos:
            return [j[0] + j[1] for j in content if j[0] not in stop_word]
        return [j[0] for j in content if j[0] not in stop_word]

    with open(DATA_JSONLINE) as f:
        words = [func(line) for line in f]
    print('数据装载完毕! use ',
          time.time() - begin_t, 'sec.\n begin lda modeling')

    dic = corpora.Dictionary(words)
    corpus = [dic.doc2bow(text) for text in words]
    dic.save(DICTIONARY_PATH)
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus)
    lda = LdaModel(corpus=corpus, id2word=dic, num_topics=topic_num)
    lda.save(LDA_MODEL)

    # Open the results file only for the write so the handle is always closed
    # (it used to be opened at the top of the function and never closed).
    with open('perplexity.txt', 'a') as perplexity_f:
        print(topic_num, ',', lda.log_perplexity(corpus), file=perplexity_f)

    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dic)
    vis_html_path = 'ldavis_{}.html'.format(topic_num)
    pyLDAvis.save_html(vis_data, vis_html_path)
    print('LDA 建模完成!\nTotal use:', time.time() - begin_t, 'sec.')
Exemplo n.º 14
0
    def LDAmodel(words, num_topics=5, num_words=5):
        """Train an LDA model on tokenised documents and report its quality.

        Side effects: the bag-of-words corpus is pickled to ``corpus.pkl`` and
        the dictionary saved to ``dictionary.gensim`` in the working directory.

        Parameters
        ----------
        words : list of token lists, one per document.
        num_topics : number of topics to fit (and to print).
        num_words : number of words shown per topic.

        Returns
        -------
        topics : result of ``print_topics`` for the trained model.
        val_perplexity : ``log_perplexity`` of the model on the training corpus.
        val_coherence : ``c_v`` coherence of the model.
        """
        dictionary = corpora.Dictionary(words)
        # Term-document frequencies.
        corpus = [dictionary.doc2bow(word) for word in words]

        # Persist corpus and dictionary; the context manager guarantees the
        # pickle file is flushed and closed.
        with open('corpus.pkl', 'wb') as corpus_file:
            pickle.dump(corpus, corpus_file)
        dictionary.save('dictionary.gensim')

        # Train model
        ldamodel = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=20)
        topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)

        # Validation.  NOTE: log_perplexity is a per-word likelihood bound, not
        # the perplexity itself; perplexity = 2 ** (-bound), so a *higher*
        # bound corresponds to a lower (better) perplexity.
        val_perplexity = ldamodel.log_perplexity(corpus)
        # Coherence score.
        coherence_ldamodel = CoherenceModel(model=ldamodel, texts=words, dictionary=dictionary, coherence='c_v')
        val_coherence = coherence_ldamodel.get_coherence()

        return topics, val_perplexity, val_coherence
Exemplo n.º 15
0
def topic_model_gensim_lda(col: str, prefix=None, min_topics=19,max_topics=19,step=2) -> None:
    """Train topic models over a range of topic counts and record their coherence.

    For column ``col`` (file names optionally prefixed with ``prefix``) this:

    1. reads ``{prefix}{col}_transformed_docs_all.txt`` from
       ``data_dir_processed``, learns a gensim ``Dictionary`` from it, filters
       and compactifies it, and saves it;
    2. serialises a bag-of-words ``MmCorpus`` built with that dictionary;
    3. for every topic count in ``range(min_topics, max_topics + 1, step)``
       trains a model, saves it, logs its topics and computes ``c_v`` and
       ``u_mass`` coherence;
    4. writes the scores to ``coherence_perplexity.csv`` in the model directory.

    With the default ``min_topics == max_topics == 19`` exactly one model is
    trained.  Nothing is returned.

    The function reads several names that are not defined here and must exist
    at module level: ``data_dir_processed``, ``verbose``, ``multicore``,
    ``mallet_path``, ``min_absolute_frequency``, ``max_relative_frequency``,
    ``max_features`` and ``logger``.

    NOTE(review): ``alg`` is hard-coded to ``'Mallet'`` below, so the
    ``LdaModel`` branch is currently never executed and the ``perp`` column is
    always 0.
    """
    def trigram_bow_generator(filepath: str):
        '''
        generator function to read docs from a file
        and yield a bag-of-words representation
        '''
        # Uses trigram_dictionary from the enclosing scope at iteration time.
        for doc in LineSentence(filepath):
            yield trigram_dictionary.doc2bow(doc)

    if prefix is None:
        prefix = ''
    # for topic modeling

    trigram_docs_filepath = data_dir_processed / f'{prefix}{col}_transformed_docs_all.txt'
    print(f'Loading input file {trigram_docs_filepath}')
    trigram_dictionary_filepath = data_dir_processed / f'{prefix}{col}_trigram_dict_all.dict'
    trigram_bow_filepath = data_dir_processed / f'{prefix}{col}_trigram_bow_corpus_all.mm'

    # example input file name: resp_whytfa_trigram_transformed_docs_all.txt

    # turn to posix filepaths until gensim supports this
    # trigram_docs_filepath = trigram_docs_filepath.as_posix()
    trigram_docs_filepath =  trigram_docs_filepath.as_posix()
    trigram_dictionary_filepath = trigram_dictionary_filepath.as_posix()
    trigram_bow_filepath = trigram_bow_filepath.as_posix()

    # TODO - change 1 == 1 lines to overwrite_interim

    # this is a bit time consuming - make the if statement True
    # if you want to learn the dictionary yourself.
    # NOTE(review): trigram_docs is only bound inside this block but is used
    # later for c_v coherence, so the condition cannot simply be switched off.
    if 1 == 1:
        trigram_docs = LineSentence(trigram_docs_filepath)
        # learn the dictionary by iterating over all of the docs
        trigram_dictionary = Dictionary(trigram_docs)
        print(trigram_dictionary)
        #for k, v in trigram_dictionary.iteritems():
        #    print (f'{k}, {v}')


        # filter tokens that are very rare or too common from
        # the dictionary (filter_extremes) and reassign integer ids (compactify)
        trigram_dictionary.filter_extremes(no_below=min_absolute_frequency,
                                           no_above=max_relative_frequency,
                                           keep_n=max_features,
                                           )
        trigram_dictionary.compactify()
        print(trigram_dictionary)
        #for k, v in trigram_dictionary.iteritems():
        #    print (f'{k}, {v}')

        if verbose:
            logger.info(f'Saving trigram dictionary: {trigram_dictionary_filepath} {len(trigram_dictionary)}')
        trigram_dictionary.save(trigram_dictionary_filepath)

    # load the finished dictionary from disk
    if verbose:
        logger.info(f'Loading trigram dictionary: {trigram_dictionary_filepath}')
    trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

    # this is a bit time consuming - make the if statement True
    # if you want to build the bag-of-words corpus yourself.
    if 1 == 1:
        # generate bag-of-words representations for
        # all docs and save them as a matrix
        if verbose:
            print(f'Saving corpus: {trigram_bow_filepath}')
        MmCorpus.serialize(trigram_bow_filepath,
                           trigram_bow_generator(trigram_docs_filepath))
    # load the finished bag-of-words corpus from disk
    if verbose:
        print(f'Loading corpus: {trigram_bow_filepath}')
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
    # max_topics is inclusive
    num_topics_range = range(min_topics, max_topics + 1, step)

    #iterations = 2000
    #chunksize = 100  # more than the number of docs?

    # Training hyper-parameters (passes/chunksize/eta/alpha are only used by
    # the LdaModel branch; iterations and workers are also passed to LdaMallet).
    passes = 10
    # iterations = 400
    iterations = 100
    # chunksize = len(trigram_bow_corpus)
    chunksize = 100  # more than the number of docs?
    eta = 'auto'
    #eval_every = None  # Don't evaluate model perplexity, takes too much time.
    workers=1
    print(f'cpu_count:{cpu_count()}')
    alpha='auto'
    if multicore:
        # for multicore; one fewer than the number of cores
        workers = cpu_count() - 1
        if verbose:
            print(f'Multiprocessing with {workers} cores (one fewer than the number of cores)')
    else:
        # for single core; cannot use in multicore
        # (re-assigns the value alpha already has)
        alpha = 'auto'

    # now_str = datetime.now(timezone('US/Pacific')).strftime('%Y-%m-%d-%H-%M-%S')
    # Timestamp suffix is disabled, so the directory name ends with '_models_'.
    now_str = ''#datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    save_dir = data_dir_processed / f'{prefix}{col}_gensim_lda_models_{now_str}'
    if not save_dir.exists():
        save_dir.mkdir(parents=True, exist_ok=True)
    # save_dir_s3 = f'{data_dir_processed_s3}/{prefix}{col}_gensim_lda_models_{now_str}'

    # Per-topic-count scores, appended in num_topics_range order.
    # lm_list = []
    c_v = []
    u_mass = []
    perp = []
    #alg='LDA'
    alg='Mallet'

    for num_topics in num_topics_range:

        if(alg == 'Mallet'):
            logger.info('Using Mallet...')
            #try the Mallet implementation
            ldamallet = LdaMallet(mallet_path, corpus=trigram_bow_corpus, num_topics=num_topics, id2word=trigram_dictionary,workers=workers,iterations=iterations)

            ldamallet_filepath = (save_dir / f'gensim_ldamallet_{num_topics}_topics').as_posix()
            ldamallet.save(ldamallet_filepath)

            # Log the first 10 words of every topic.
            for t in ldamallet.show_topics(num_topics=-1, num_words=10, formatted=False):
                words = [w[0] for w in t[1]]
                logger.info('topic {:2d}\t{}'.format(t[0], ' '.join(words)))

            # Show Topics
            #print(ldamallet.show_topics(formatted=False))

            # Compute Coherence Score
            cm = CoherenceModel(model=ldamallet, texts=trigram_docs, dictionary=trigram_dictionary, coherence='c_v')
            c_v.append(cm.get_coherence())
            cm = CoherenceModel(model=ldamallet, corpus=trigram_bow_corpus,
                            dictionary=trigram_dictionary, coherence='u_mass')#, processes=workers)
            u_mass.append(cm.get_coherence())
            #perp_lower_bound = ldamallet.log_perplexity(trigram_bow_corpus)
            #perp.append(2**(-perp_lower_bound))
            # Perplexity is not computed in this branch; 0 is a placeholder
            # that keeps the three score lists the same length.
            perp.append(0)

        else:
            logger.info('Using LDA...')
            #TODO: try with and without alpha
            ldamodel = LdaModel(corpus=trigram_bow_corpus, id2word=trigram_dictionary,
                                num_topics=num_topics, passes=passes, iterations=iterations,
                                chunksize=chunksize, eta=eta, #eval_every=eval_every,
                                alpha=alpha,
                                random_state=np.random.RandomState(seed=10101010),
                                )
            #ldamodel = LdaMulticore(corpus=trigram_bow_corpus, id2word=trigram_dictionary,
            #                     num_topics=num_topics, passes=passes, iterations=iterations,
            #                     chunksize=chunksize, eta=eta, #eval_every=eval_every,
            #                     random_state=np.random.RandomState(seed=10101010),
            #                     workers=workers
            #                     )

            ldamodel_filepath = (save_dir / f'gensim_lda_{num_topics}_topics').as_posix()
            ldamodel.save(ldamodel_filepath)

            # Log the first 50 words of every topic.
            for t in ldamodel.show_topics(num_topics=-1, num_words=50, formatted=False):
                words = [w[0] for w in t[1]]
                logger.info('topic {:2d}\t{}'.format(t[0], ' '.join(words)))

            cm = CoherenceModel(model=ldamodel, texts=trigram_docs,
                            dictionary=trigram_dictionary, coherence='c_v')#, processes=workers)
            c_v.append(cm.get_coherence())
            cm = CoherenceModel(model=ldamodel, corpus=trigram_bow_corpus,
                            dictionary=trigram_dictionary, coherence='u_mass') #, processes=workers)
            u_mass.append(cm.get_coherence())
            # Convert the value returned by log_perplexity into 2 ** (-value).
            perp_lower_bound = ldamodel.log_perplexity(trigram_bow_corpus)
            perp.append(2**(-perp_lower_bound))

    # One row per topic count, one column per score.
    coh_perp = pd.DataFrame(
        data=np.array([c_v, u_mass, perp]).T,
        columns=['c_v', 'u_mass', 'perp'],
        index=list(num_topics_range))
    coh_perp.index.name = 'num_topics'
    coh_perp_filepath = save_dir / 'coherence_perplexity.csv'
    coh_perp.to_csv(coh_perp_filepath)
    logger.info('coherence_docs={0}, coherence_corpus={1}, perplexity={2}'.format(c_v, u_mass, perp))
Exemplo n.º 16
0
# Build a bag-of-words corpus, train LDA on the first 90% of the documents and
# report perplexity on the remaining 10%.  Written with print() calls and
# floor division so the script runs on both Python 2 and Python 3.
print('Building bag-of-words corpus ...')
bow_corpus = [dictionary.doc2bow(t) for t in texts]

print('Serializing corpus (%s) ...' % BOW)
MmCorpus.serialize(BOW, bow_corpus)

# Floor division keeps ``size`` an int; with ``/`` it becomes a float on
# Python 3 and the slices below raise TypeError.
size = len(bow_corpus) * 9 // 10
training = bow_corpus[:size]
testing = bow_corpus[size:]
t0 = time()
print('Training LDA w/ %d topics on first %d texts ...' % (Num_Topics,
                                                           len(training)))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5)
print("done in %0.3fs." % (time() - t0))
print('Saving LDA model (%s) ...' % NSFLDA)
lda.save(NSFLDA)

print('Random subset of topics:')
# NOTE(review): assumes print_topics() returns plain strings, as older gensim
# releases did -- confirm against the installed version.
print('\n'.join(lda.print_topics()))

print('Computing perplexity on %d held-out documents ...' % len(testing))
# log_perplexity returns a per-word bound; perplexity = 2 ** (-bound).
perplexity = 2**-(lda.log_perplexity(testing))
print('Perplexity: %.2f' % perplexity)

for i in range(0, Num_Topics):
    temp = lda.show_topic(i, 10)
    # NOTE(review): term[1] is the word only if show_topic yields
    # (probability, word) pairs -- confirm against the installed gensim.
    terms = [term[1] for term in temp]
    print("Top 10 terms for topic #" + str(i) + ": " + ", ".join(terms))
Exemplo n.º 17
0
# Save the dictionary, build a bag-of-words corpus, train LDA on the first 80%
# of the documents and report perplexity on the remaining 20%.  Written with
# print() calls and floor division so it runs on both Python 2 and Python 3.
print('Saving dictionary (%s)...' % DICT)
dictionary.save(DICT)

print('Building bag-of-words corpus ...')
bow_corpus = [dictionary.doc2bow(t) for t in texts]

print('Serializing corpus (%s) ...' % BOW)
MmCorpus.serialize(BOW, bow_corpus)

# Floor division keeps ``size`` an int; with ``/`` it becomes a float on
# Python 3 and the slices below raise TypeError.
size = len(bow_corpus) * 4 // 5
training = bow_corpus[:size]
testing = bow_corpus[size:]

print('Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training)))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5, iterations=1000)

print('Saving LDA model (%s) ...' % NSFLDA)
lda.save(NSFLDA)

print('Random subset of topics:')
# NOTE(review): assumes print_topics() returns plain strings, as older gensim
# releases did -- confirm against the installed version.
print('\n'.join(lda.print_topics()))

print('Computing perplexity on %d held-out documents ...' % len(testing))
# log_perplexity returns a per-word bound; perplexity = 2 ** (-bound).
perplexity = 2 ** -(lda.log_perplexity(testing))
print('Perplexity: %.2f' % perplexity)




def takeTokenList_ReturnModel(tokenList, dictionaryForLDA, corpus, baseFolder, topicList, passList, loadTrainedLDAIfExists):
    """Grid-search LDA over topic counts and pass counts; return the most coherent model.

    If ``loadTrainedLDAIfExists`` is truthy and both the saved model
    (``<baseFolder>/Winning LDA Model``) and the results table
    (``<baseFolder>/LDA_LTrainingOutput.csv``) exist, they are loaded instead
    of training.  Otherwise one model is trained per (topics, passes)
    combination, scored by ``c_v`` coherence and ``log_perplexity``, and the
    model with the highest coherence is saved and returned.

    Parameters
    ----------
    tokenList : tokenised texts, used for coherence.
    dictionaryForLDA : gensim dictionary.
    corpus : bag-of-words corpus.
    baseFolder : directory holding the saved model / results table.
    topicList, passList : an int or a list of ints; any other type falls back
        to ``[7]`` topics / ``[10]`` passes.
    loadTrainedLDAIfExists : reuse a previously saved winner when possible.

    Returns
    -------
    (winningLDAModel, ldaResultOutput_df, numberOfTopics, Coherence, Perplexity)

    NOTE(review): this function never writes ``LDA_LTrainingOutput.csv``; the
    load branch relies on the caller having saved the returned table there.
    """

    def _as_candidates(value, default):
        # An int becomes a one-element list, a list is used as is, and anything
        # else (including bool) falls back to ``default``.
        if isinstance(value, int) and not isinstance(value, bool):
            return [value]
        if isinstance(value, list):
            return value
        return default

    winningModel_SavePath = os.path.join(baseFolder,'Winning LDA Model')
    path_LDA_LTrainingOutput = os.path.join(baseFolder, "LDA_LTrainingOutput.csv")
    if loadTrainedLDAIfExists and os.path.exists(winningModel_SavePath) and os.path.exists(path_LDA_LTrainingOutput):
        print ("Loading pre-trained LDA model from %s"%(winningModel_SavePath))
        # Load through the same ``LdaModel`` class used for training below.
        winningLDAModel = LdaModel.load(winningModel_SavePath)
        ldaResultOutput_df = pd.read_csv(path_LDA_LTrainingOutput, header=0, index_col=0)
        # Read every returned score from the best row.  Coherence and
        # Perplexity used to be left unbound on this path, so the final
        # ``return`` raised UnboundLocalError.
        best_row = ldaResultOutput_df.sort_values(by=['Coherence'], ascending=False).iloc[0]
        numberOfTopics = int(best_row['TopicNum'])
        Coherence = float(best_row['Coherence'])
        Perplexity = float(best_row['Perplexity'])

    else:
        topicList = _as_candidates(topicList, [7])
        print ('LDA Topis to check: %s'%str(topicList))

        passList = _as_candidates(passList, [10])
        print('LDA Passes to check: %s' % str(passList))

        ldaResultOutput={}

        for top in topicList:
            for passN in passList:

                ldaModelTitle = '\nLDA_%s_Topics_%s_Passes' % (top, passN)
                start_time = time()
                print("Training LDA Model: %s - StartTime: %s"%(ldaModelTitle,start_time))

                ldaTest = LdaModel(corpus=corpus, id2word=dictionaryForLDA, iterations=100, num_topics=top, passes=passN)
                Perplexity = ldaTest.log_perplexity(corpus)

                cohrM = CoherenceModel(model=ldaTest, texts=tokenList, corpus=corpus,  dictionary=dictionaryForLDA, coherence='c_v', processes=1)
                cohrScore = cohrM.get_coherence()

                timeInSeconds = time() - start_time

                print("Coherence: %s"%(round(cohrScore,3)))

                ldaResultOutput[ldaModelTitle] = {
                    'TopicNum': top,
                    'PassNum': passN,
                    'Perplexity': round(Perplexity,3),
                    'Coherence': round(cohrScore,3),
                    'TimeInSec': round(timeInSeconds,3),
                    'ActualModel': ldaTest,
                }

        # One row per trained model, best coherence first.
        ldaResultOutput_df = pd.DataFrame(ldaResultOutput).T.sort_values(by=['Coherence'], ascending=False).copy()
        print(ldaResultOutput_df)

        best_row = ldaResultOutput_df.iloc[0]
        winningLDAModel = best_row['ActualModel']
        numberOfTopics = best_row['TopicNum']
        Coherence = best_row['Coherence']
        Perplexity = best_row['Perplexity']

        winningLDAModel.save(winningModel_SavePath)
        print("Winning Model Details:")
        print(ldaResultOutput_df.head(1).values)

    return winningLDAModel,ldaResultOutput_df,numberOfTopics,Coherence,Perplexity
Exemplo n.º 19
0
    passes=5,
    chunksize=10000,
    alpha='asymmetric',
    decay=0.5,
    offset=64,
    eta=None,
    eval_every=0,
    iterations=100,
    gamma_threshold=0.001,
    per_word_topics=True)

## Inspect the trained model
# NOTE(review): notebook-style expressions; their results are not stored, so
# they are only visible in an interactive session.
lda_model.print_topics(-1)  # show the topics
lda_model.get_topic_terms(0,
                          topn=10)  # top 10 terms of topic 0
lda_model.log_perplexity(corpus)  # compute the log perplexity on the corpus
lda_model.get_document_topics(
    corpus[0]
)  # topic distribution of the first document; per the original note, low-probability topics are omitted by default
lda_model.get_document_topics(
    corpus[0], minimum_probability=0
)  # same, with minimum_probability=0 so every topic and its probability is reported
### Document topic
####
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
# Build the interactive pyLDAvis view of the model and display it.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.show(vis)
8
Exemplo n.º 20
0
def train(args):
    """Load or train one LDA model per topic count and record its metrics.

    For each topic count a saved model is loaded from
    ``./model/<num_topics>/<target_date>/topic_<num_topics>.model``. When the
    model cannot be loaded, a new one is trained on the corpus returned by
    ``load_corpus_dictionary`` and saved to that path. Perplexity
    (``2 ** -log_perplexity``) and u_mass coherence are then computed on the
    same corpus.

    Args:
        args: Mapping of command-line option names to values. Keys read here
            are ``--all``, ``--num_topics``, ``--proportion``, ``--days``
            (only without ``--all``) and the ``LdaModel`` hyper-parameter
            options ``--chunk_size``, ``--passes``, ``--alpha``, ``--eta``,
            ``--decay``, ``--offset``, ``--eval_every``, ``--iterations``,
            ``--gamma_threshold``, ``--minimum_probability``,
            ``--random_state`` and ``--per_word_topics``.

    With ``--all`` set, the topic counts 2, 4, ... up to ``--num_topics`` are
    evaluated and both metric series are passed to
    ``draw_graph_perplexity`` / ``draw_graph_coherence``. Otherwise only
    ``--num_topics`` is evaluated and its metrics go to ``save_ppl_coh``.
    """
    requested_topics = int(args['--num_topics'])
    proportion = float(args['--proportion'])

    if args['--all']:
        topics = range(2, requested_topics + 1, 2)
        target_date = calculate_target_date(proportion) + timedelta(days=-1)
    else:
        topics = range(requested_topics, requested_topics + 1)
        target_date = datetime.now().date() + timedelta(
            days=-int(args['--days']))

    perplexities = []
    coherence_values = []
    for num_topics in topics:
        model_dir = './model/{}/{}'.format(num_topics, target_date)
        model_path = '{}/topic_{}.model'.format(model_dir, num_topics)
        os.makedirs(name=model_dir, exist_ok=True)

        # The corpus is needed for evaluation whether the model is loaded or
        # trained, so it is fetched once per iteration up front.
        corpus, dictionary = load_corpus_dictionary(proportion)

        try:
            logger.info("loading model")
            lda_model = LdaModel.load(model_path)
        except Exception as err:
            # Best-effort cache: any load failure falls back to training, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            logger.info("not found model saved")
            logger.debug("model load failed: %s", err)
            lda_model = None

        if lda_model is None:
            # Training happens outside the handler so that a training error
            # propagates as-is instead of being masked by the evaluation step.
            logger.info("training model")
            lda_model = LdaModel(
                corpus=corpus,
                num_topics=num_topics,
                id2word=dictionary,  # dictionary returned by load_corpus_dictionary
                chunksize=int(args['--chunk_size']),
                passes=int(args['--passes']),
                alpha='symmetric' if args['--alpha'] else 'auto',
                eta=None if args['--eta'] else 'auto',
                decay=float(args['--decay']),
                offset=float(args['--offset']),
                eval_every=int(args['--eval_every']),
                iterations=int(args['--iterations']),
                gamma_threshold=float(args['--gamma_threshold']),
                minimum_probability=float(args['--minimum_probability']),
                random_state=int(args['--random_state']),
                per_word_topics=bool(args['--per_word_topics']))
            logger.info("saving trained model")
            lda_model.save(model_path)

        # log_perplexity returns a per-word bound; 2 ** -bound is perplexity.
        perplexities.append(np.exp2(-lda_model.log_perplexity(corpus)))
        # NOTE(review): the original comment stated that a lower u_mass score
        # is better; confirm the intended reading against the gensim docs.
        coherence_values.append(
            CoherenceModel(model=lda_model,
                           corpus=corpus,
                           coherence='u_mass').get_coherence())
        lda_model.print_topics(5, 5)

    if args['--all']:
        os.makedirs(name='./pic/{}/'.format(target_date), exist_ok=True)
        draw_graph_perplexity(args, perplexities, topics, target_date)
        draw_graph_coherence(args, coherence_values, topics, target_date)
    else:
        save_ppl_coh(perplexities[0], coherence_values[0], int(args['--days']))
        logger.info("perplexity: {}; coherence: {}.".format(
            perplexities[0], coherence_values[0]))
Exemplo n.º 21
0
# %%
## Search over the number of topics
# Candidate topic counts are range(start, limit, step); `limit` is exclusive.
start = 2
limit = 10
step = 1

# One entry per candidate topic count, in the same order as the range above.
coherence_vals = []
perplexity_vals = []

for n_topic in tqdm(range(start, limit, step)):
    # Fixed random_state so that runs with the same inputs are comparable.
    lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         num_topics=n_topic,
                         random_state=0)
    # log_perplexity returns a per-word bound; 2 ** -bound turns it into
    # perplexity. It is evaluated on the training corpus itself.
    perplexity_vals.append(np.exp2(-lda_model.log_perplexity(corpus)))
    # c_v coherence is computed from the tokenised texts in df['words'].
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=df['words'],
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence_vals.append(coherence_model_lda.get_coherence())

# %%
# evaluation
# x axis: the same topic counts that were iterated over in the search cell.
x = range(start, limit, step)

fig, ax1 = plt.subplots(figsize=(12, 5))

# coherence
c1 = 'darkturquoise'
ax1.plot(x, coherence_vals, 'o-', color=c1)
Exemplo n.º 22
0
# Train a 10-topic LDA model; random_state is fixed so reruns are repeatable.
lda_model = LdaModel(corpus=corpus,
                       id2word=id2word,
                       num_topics=10,
                       random_state=100,
                       update_every=1,
                       chunksize=100,
                       passes=10,
                       alpha='auto',
                       per_word_topics=True)

# Print the keywords of the 10 topics
print(lda_model.print_topics())
# Indexing the model with the corpus applies it to the corpus' documents.
doc_lda = lda_model[corpus]

# Compute Perplexity
# NOTE: log_perplexity returns the per-word likelihood bound, not perplexity
# itself; perplexity is 2 ** (-bound), and lower perplexity is better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # printed value is the bound, evaluated on the training corpus

# Compute Coherence Score (currently disabled)
#coherence_model_lda = CoherenceModel(model=lda_model, texts=nps_comment_filtered, dictionary=id2word, coherence='c_v')
#coherence_lda = coherence_model_lda.get_coherence()
#print('\nCoherence Score: ', coherence_lda)

# Colour values of matplotlib's Tableau palette, in palette order.
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
Exemplo n.º 23
0
    def gen_glda_model_sv(self,
                          vectorizer,
                          n_topics,
                          finalCorpus,
                          update_mat=False,
                          alpha='auto',
                          eta="auto",
                          ittrs=300):
        """Fit a gensim ``LdaModel`` on the L1-normalised term matrix.

        The term matrix is cached on ``self.tfMatrix`` and is only rebuilt
        with ``vectorizer.fit_transform(finalCorpus)`` when ``update_mat`` is
        true or no matrix is cached yet. Its columns are reordered to follow
        the alphabetically sorted vocabulary, and ``self.ind2vocab`` maps each
        resulting column index to its term.

        Args:
            vectorizer: Object exposing ``fit_transform`` and ``vocabulary_``.
                It must already be fitted when the cached matrix is reused.
            n_topics: Number of topics for the model.
            finalCorpus: Documents passed to ``vectorizer.fit_transform``.
            update_mat: Force the cached matrix to be rebuilt.
            alpha: Forwarded to ``LdaModel`` as ``alpha``.
            eta: Currently ignored; the model is always built with
                ``eta=1.0 / n_topics``.
            ittrs: Forwarded to ``LdaModel`` as ``iterations``.

        Returns:
            Tuple ``(lda, vocab_key)``: the trained model and the list of
            original vectorizer column indices in sorted-term order.
        """
        # Identity test on purpose: `== None` on a matrix is not a reliable
        # "nothing cached yet" check.
        rebuild_matrix = update_mat or self.tfMatrix is None
        if rebuild_matrix:
            t0 = time()
            self.tfMatrix = vectorizer.fit_transform(finalCorpus)
            print("[tffeature]: gen_lda_model: transform done in %0.3fs." %
                  (time() - t0))
            print("[tffeature]: gen_lda_model: tfMatrix shape:",
                  self.tfMatrix.shape)

            self.tfMatrix = normalize(self.tfMatrix, norm='l1', axis=1)
            print('[tffeature]: type after normalize: ', type(self.tfMatrix))

        # vocab_key is always computed because it is part of the return value.
        sorted_terms = sorted(vectorizer.vocabulary_)
        vocab_key = [vectorizer.vocabulary_[term] for term in sorted_terms]

        # Rebuild the index -> term map and reorder the columns only for a
        # fresh matrix or a missing map. A cached, already reordered matrix
        # must not be permuted a second time.
        if rebuild_matrix or self.ind2vocab is None:
            self.ind2vocab = dict(enumerate(sorted_terms))
            self.tfMatrix = self.tfMatrix[:, vocab_key]

        # Debug output: the first (up to) three terms of the mapping.
        preview = [
            self.ind2vocab[idx] for idx in range(min(3, len(self.ind2vocab)))
        ]
        print(*preview)

        corpus = gensim.matutils.Sparse2Corpus(self.tfMatrix,
                                               documents_columns=False)

        # Seed both global generators and the model's own RandomState.
        np.random.seed(self.RSEED)
        random.seed(self.RSEED)
        lda = LdaModel(corpus,
                       num_topics=n_topics,
                       id2word=self.ind2vocab,
                       alpha=alpha,
                       eta=1.0 / n_topics,
                       random_state=np.random.RandomState(self.RSEED),
                       iterations=ittrs,
                       minimum_probability=0.001,
                       minimum_phi_value=0.001)
        print('[tffeature]: lda perplexity:', lda.log_perplexity(corpus))
        return lda, vocab_key
        for i, topics in enumerate(lda.get_document_topics(corpus)):
            doc_topics = pd.concat([
                doc_topics,
                pd.DataFrame(topics, columns=['topic', 'value']).assign(doc=i)
            ])
        doc_topics.to_csv(model_path / f'doc_topics_{key}_{n_topics}.csv',
                          index=False)

        model_file = datapath((model_path / f'{key}_{n_topics}').resolve())
        lda.save(model_file)
        train_lda = LdaModel(corpus=train_corpus,
                             num_topics=n_topics,
                             id2word=pd.Series(train_tokens).to_dict())

        # see https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.log_perplexity
        test_perplexity = 2**(-train_lda.log_perplexity(test_corpus))

        # https://markroxor.github.io/gensim/static/notebooks/topic_coherence_tutorial.html
        u_mass = np.mean([
            c[1] for c in lda.top_topics(
                corpus=corpus, coherence='u_mass', topn=n_topics)
        ])

        # extrinsic - need to provide external corpus
        # cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_uci')
        # uci = cm.get_coherence()

        result_ = [
            vocab_size, test_vocab, max_features, n_topics, test_perplexity,
            u_mass
        ]
Exemplo n.º 25
0
def grid_lda(dictionary,
             corpus,
             texts,
             max_topics,
             min_topics=5,
             step=5,
             save=True,
             plot=True):
    """Train one LDA model per topic count and collect coherence/perplexity.

    Topic counts are ``range(min_topics, max_topics + 1, step)``. Each model
    is trained with the ``LdaModel`` defaults, scored with c_v coherence on
    ``texts`` and with ``log_perplexity`` on ``corpus``. A summary per model
    is printed and written to ``log_LDA.txt`` (overwritten on every call).

    Args:
        dictionary: Passed to ``LdaModel`` as ``id2word`` and to
            ``CoherenceModel`` as ``dictionary``.
        corpus: Training corpus, also used for the perplexity bound.
        texts: Tokenised texts used for c_v coherence.
        max_topics: Largest topic count to try (inclusive).
        min_topics: Smallest topic count to try.
        step: Increment between topic counts.
        save: When true, each model is saved under ``../Models/grid/``.
        plot: When true, both metrics are plotted against the topic count and
            written to ``Coherence.png`` / ``Perplexity.png``.

    Returns:
        Tuple ``(lda_list, coherence_scores, perplexity)`` with one entry per
        topic count. ``perplexity`` holds the raw ``log_perplexity`` values.
    """
    # Function-scope import, as in the original block.
    import time

    np.random.seed(49)
    # Installed once; it does not depend on the loop variable.
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    topic_counts = range(min_topics, max_topics + 1, step)
    coherence_scores = []
    lda_list = []
    perplexity = []

    if save:
        # Make sure the target directory exists so that a finished training
        # run is not lost to a failing save.
        import os
        os.makedirs('../Models/grid', exist_ok=True)

    with open('log_LDA.txt', 'w') as f:
        for num_topics in topic_counts:
            print('#' * 100)
            print('Training LDA with {} Topics'.format(num_topics))
            print()

            start = time.time()
            lda = LdaModel(corpus=corpus,
                           num_topics=num_topics,
                           id2word=dictionary)
            lda_list.append(lda)
            coherencemodel = CoherenceModel(model=lda,
                                            texts=texts,
                                            dictionary=dictionary,
                                            coherence='c_v')
            coherence_score = coherencemodel.get_coherence()
            print('Coherence Score: ', coherence_score)
            coherence_scores.append(coherence_score)
            perplexity.append(lda.log_perplexity(corpus))
            print('Perplexity: ', perplexity[-1])
            # Measured once so console and log file report the same figure.
            elapsed = time.time() - start
            print('Trained in {:0.3f}s'.format(elapsed))

            f.write('#' * 100 + ' \n')
            f.write('Training LDA with {} Topics'.format(num_topics) + ' \n')
            f.write('Coherence Score: {}'.format(coherence_score) + ' \n')
            f.write('Perplexity: {}'.format(perplexity[-1]) + ' \n')
            f.write('Trained in {:0.3f}s'.format(elapsed) + ' \n')

            if save:
                model_path = (
                    '../Models/grid/{}_clusters_full_grid_active_score{:0.3f}.model'
                    .format(num_topics, coherence_score))
                lda.save(model_path)
                print('Model Saved under : ' + model_path)
                print()

    if plot:
        x = topic_counts
        series = (
            (coherence_scores, "Coherence score", 'Coherence.png'),
            (perplexity, "Log Perplexity", 'Perplexity.png'),
        )
        for values, ylabel, filename in series:
            # A dedicated figure per metric: on non-interactive backends
            # plt.show() does not clear the canvas, so the second curve would
            # otherwise be drawn on top of the first one.
            fig = plt.figure()
            plt.plot(x, values)
            plt.xlabel("Num Topics")
            plt.ylabel(ylabel)
            plt.savefig(filename)
            plt.show()
            plt.close(fig)

    return lda_list, coherence_scores, perplexity