Example #1
def runlda(filetopicwords, fileinput, NUMTOPICS=30, NUMPASSES=10, NUMITERATIONS=10):
    print('runlda...')
    from gensim.corpora import Dictionary
    from gensim.models.ldamodel import LdaModel
    import numpy as np
    docs,word2freqtopics = [],{}
    fr = open(fileinput,'r')
    for line in fr:
        words = line.strip('\r\n').split(' ')
        docs.append(words)
        for word in words:
            if word not in word2freqtopics:
                word2freqtopics[word] = [0, [0. for i in range(NUMTOPICS)]]
            word2freqtopics[word][0] += 1
    fr.close()
    V = len(word2freqtopics)
    dct = Dictionary(docs)
    model = LdaModel(corpus=[dct.doc2bow(doc) for doc in docs],id2word=dct, \
            num_topics=NUMTOPICS,passes=NUMPASSES,iterations=NUMITERATIONS) 
    fw = open(filetopicwords,'w')
    for topicid in range(NUMTOPICS):
        s = 'topic '+str(topicid)
        wordscores = []
        for (wordid,score) in model.get_topic_terms(topicid,topn=V):
            if score < 1e-6: break
            wordscores.append([dct[wordid],score])
        scoresum = sum([x[1] for x in wordscores])
        for [word,score] in wordscores:
            s += ','+word+':'+str(np.round(score/scoresum,6))
            word2freqtopics[word][1][topicid] = score
        fw.write(s+'\n')
    fw.close()
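A minimal, hedged way to exercise runlda; the input format is inferred from the
code above (one document per line, tokens separated by single spaces):

with open('docs.txt', 'w') as f:
    f.write('human interface computer\n')
    f.write('graph trees minors\n')
runlda('topicwords.csv', 'docs.txt', NUMTOPICS=2, NUMPASSES=2, NUMITERATIONS=5)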
Example #2
# Imports assumed by this snippet (not shown in the original); the helpers
# getarticlesbyyear, replacesymbols, normalaisestr and the stoplist are defined
# elsewhere in the source project.
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import matplotlib.pyplot as plt

def plottopicpop():
    internet = [0 for i in range(10)]
    developing = [0 for i in range(10)]
    habr = [0 for i in range(10)]
    n = 0
    for year in range(2006, 2016):
        articles, numberofarticles = getarticlesbyyear(year)
        print("Got articles for", str(year))
        # Normalize texts
        i = 0
        for article in articles:
            article = replacesymbols(article)
            articles[i] = normalaisestr(article.lower())
            i += 1
        print('Normalized')
        
        # Remove unnecessary words
        texts = [[word for word in article if word not in stoplist]
                 for article in articles]
        print('Deleted stopwords')
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        print('Starting training')
        # RAM-friendly mode: train on batches of 100 articles at a time
        for i in range((numberofarticles + 99) // 100):
            begin = 100 * i
            end = min(100 * (i + 1), numberofarticles)
            lda = LdaModel(corpus[begin:end], id2word=dictionary, num_topics=end - begin)

            for j in range(lda.num_topics):
                # get_topic_terms returns a list of (word_id, probability) pairs
                topics = lda.get_topic_terms(j, 15)
                for word_id, _ in topics:
                    top = dictionary.get(word_id)
                    if "интернет" == top:
                        internet[n] += 1
                    if "разработка" == top:
                        developing[n] += 1
                    if "хабра" == top:
                        habr[n] += 1
            del lda
        n += 1

        print(internet,'\n', developing, '\n', habr)

    plt.title('Popularity of 3 topics')
    plt.xlabel('Year 2006 - 2015')
    plt.ylabel('Number of articles')
    plt.plot(internet, label="Интернет")
    plt.plot(developing, label="Разработка")
    plt.plot(habr, label="Хабра")
    plt.legend()
    plt.show()
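The plot above uses list indices 0-9 on the x axis; a hedged variant that puts
the actual years there instead:

years = list(range(2006, 2016))
plt.plot(years, internet, label="Интернет")
plt.plot(years, developing, label="Разработка")
plt.plot(years, habr, label="Хабра")
plt.legend()
plt.show()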
Example #3
# Imports assumed by this snippet (not shown in the original); getarticles,
# replacesymbols, normalaisestr and stoplist come from the source project.
from gensim import corpora
from gensim.models.ldamodel import LdaModel

def ldaforhabr():
    numberofarticles = 0
    articles, numberofarticles = getarticles()
    print("Got articles")
    # Normalize texts
    i = 0
    for article in articles:
        article = replacesymbols(article)
        articles[i] = normalaisestr(article.lower())
        i += 1
    print('Normalized')
    # Remove unnecessary words
    texts = [[word for word in article if word not in stoplist]
             for article in articles]
    print('Deleted stopwords')
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    print('Starting training')
    f = open('lda.log', 'w')
    # RAM-friendly mode: train on batches of 100 articles at a time
    for i in range((numberofarticles + 99) // 100):
            begin = 100 * i
            end = min(100 * (i + 1), numberofarticles)
            lda = LdaModel(corpus[begin:end], id2word=dictionary, num_topics=end - begin)

            for j in range(lda.num_topics):
                # get_topic_terms returns a list of (word_id, probability) pairs
                topics = lda.get_topic_terms(j, 15)
                f.write(str(begin + j) + ": ")
                for word_id, _ in topics:
                    top = dictionary.get(word_id)
                    if top is not None:
                        f.write(top + '\n')

                f.write('-----------\n')
            del lda
    f.close()
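Training a fresh LdaModel per 100-article batch, as the two snippets above do,
produces an unrelated topic space for every batch. gensim also supports true
online training of a single model, which is the more RAM-friendly idiom; a
hedged sketch reusing the dictionary and corpus built above (num_topics is
assumed here):

from gensim.models.ldamodel import LdaModel

lda = LdaModel(id2word=dictionary, num_topics=50)  # 50 topics assumed
for i in range((numberofarticles + 99) // 100):
    lda.update(corpus[100 * i:100 * (i + 1)])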
Example #4
# Imports assumed (not shown in the original snippet); cut, split_sentences and
# get_corr_sif are helpers from the source project.
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

def lda_score(text, sub_sen_vec):
    plain_text = list(
        map(lambda x: cut(x).split(),
            filter(lambda x: x != '',
                   split_sentences(text)[::2])))
    common_dict = Dictionary(plain_text)
    common_corpus = [common_dict.doc2bow(t) for t in plain_text]

    no_topics = max(1, len(plain_text) // 2)  # guard against tiny inputs
    no_words = max(1, len(text) // no_topics // 2)

    lda = LdaModel(common_corpus, num_topics=no_topics)

    topic_list = []
    for i in range(no_topics):
        topic_list.append(
            [common_dict[t[0]] for t in lda.get_topic_terms(i, no_words)])

    score_list = []
    for topic in topic_list:
        score_list.append(get_corr_sif(topic, sub_sen_vec))
    return max(score_list)
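The helpers lda_score assumes are not part of the snippet; below are minimal,
hypothetical stand-ins for experimentation. get_corr_sif is deliberately not
stubbed, since it scores a topic word list against the sentence vector and has
no meaningful generic stand-in:

import re
import jieba

def split_sentences(text):
    # keep delimiters at odd indices, which is why lda_score slices [::2]
    return re.split(r'([。!?.!?])', text)

def cut(sentence):
    # space-join the segmentation so lda_score can .split() it back
    return ' '.join(jieba.cut(sentence))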
Example #5
from itertools import chain
from math import log

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel


def saliency_index(lda: LdaModel, corpus, words: Dictionary):

    full_corpus = list(chain(*corpus))

    N = len(words)
    total = sum(words.cfs[i] for i in range(N))
    frequencies = [words.cfs[i] / total for i in range(N)]

    # per-term distinctiveness, accumulated as p(w|T) * log(p(w|T) / p(T_doc))
    relative_likelihood = [0. for _ in range(N)]

    for topic_id, topic_prob in lda.get_document_topics(
            full_corpus, minimum_probability=0.):
        for term, cond_prob in lda.get_topic_terms(topic_id, topn=None):

            relative_likelihood[term] += cond_prob * log(
                cond_prob / topic_prob)

    saliencies = [f * l for f, l in zip(frequencies, relative_likelihood)]

    return {words[i]: s for i, s in enumerate(saliencies)}
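A hedged sketch of saliency_index on gensim's bundled toy corpus:

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

dct = Dictionary(common_texts)
corpus = [dct.doc2bow(t) for t in common_texts]
lda = LdaModel(corpus, id2word=dct, num_topics=2, random_state=1)
sal = saliency_index(lda, corpus, dct)
print(sorted(sal, key=sal.get, reverse=True)[:5])  # five most salient words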
Example #6
# Fragment: the opening of the LdaModel call was lost in extraction; a plausible
# reconstruction, assuming corpus, dic, topic_num and alpha are defined upstream.
from statistics import mean

import pyLDAvis.gensim
from gensim.models.ldamodel import LdaModel

lda = LdaModel(corpus=corpus,
               id2word=dic,
               num_topics=topic_num,
               alpha=alpha,
               random_state=1)

doc_topics = [lda[c] for c in corpus]

avg_doc_topics = mean([len(t) for t in doc_topics])

print(f"topics num of doc = {avg_doc_topics}")

topic_freq = frequencies([t[0] for dt in doc_topics for t in dt])  # frequencies: project helper, acts like collections.Counter

print('----------')

for i in range(topic_num):
    items = [(dic[t[0]], t[1]) for t in lda.get_topic_terms(i, topn=5)]
    freq = topic_freq[i] if i in topic_freq else 0

    print(f"topic_id = {i}, freq = {freq}, items = {items}")

print('----------')

for i in range(len(corpus)):
    dts = lda.get_document_topics(corpus[i], per_word_topics=True)

    # dts = (doc_topics, word_topics, phi_values); dts[2] pairs each word id
    # with its per-topic phi values
    for dt in dts[2]:
        item = dic[dt[0]]
        print(f"corpus = {i}, item = {item}, topic_id = {dt[1]}")

vis = pyLDAvis.gensim.prepare(lda, corpus, dic, n_jobs=1, sort_topics=False)
Example #7
import sys
import warnings
from collections import Counter
from statistics import mean

from gensim.corpora import Dictionary
from gensim.models import word2vec
from gensim.models.ldamodel import LdaModel

data_file = sys.argv[1]
topic_num = int(sys.argv[2])
limit_topics = 3

sentences = list(word2vec.LineSentence(data_file))

dic = Dictionary(sentences)

corpus = [dic.doc2bow(s) for s in sentences]

lda = LdaModel(corpus = corpus, id2word = dic, num_topics = topic_num)

doc_topics = [lda[c] for c in corpus]

avg_doc_topics = mean([len(t) for t in doc_topics])

if avg_doc_topics > limit_topics:
  warnings.warn(f'topic_num is small. topics num of doc = {avg_doc_topics}')

flatten = lambda x: sum(x, [])

topic_freq = Counter(flatten([[x[0] for x in t] for t in doc_topics]))

print('topic,freq,item,prob')

for i in range(topic_num):
  for t in lda.get_topic_terms(i):
    item = dic[t[0]]

    print(f'{i},{topic_freq[i]},{item},{t[1]}')
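A minimal way to try the script above (hedged; assume it is saved under the
hypothetical name lda_topics.py). The data file is one whitespace-tokenized
sentence per line, the format word2vec.LineSentence expects:

with open('sample.txt', 'w') as f:
    f.write('human interface computer\n')
    f.write('graph minors trees\n')
# then run: python lda_topics.py sample.txt 2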
Example #8
import pickle

import pyLDAvis
import pyLDAvis.gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, LdaMulticore


class CustomLda(object):
    def __init__(self, data=None, dictionary=None):
        """ initialize, data should be provided, only when unpickling class object it is not needed!"""
        self.data = data
        self.model = None
        self.num_topics = None
        self.iterations = None
        self.random_state = None
        self.dictionary = dictionary
        if self.data is not None:
            if self.dictionary is None:
                self.dictionary = Dictionary(self.data)
            self.corpus = [self.dictionary.doc2bow(text) for text in self.data]
        else:
            self.dictionary = None
            self.corpus = None
        self.distributed = None
        self.chunksize = None
        self.passes = None
        self.update_every = None
        self.alpha = None
        self.eta = None
        self.decay = None
        self.offset = None
        self.eval_every = None
        self.gamma_threshold = None
        self.minimum_probability = None
        self.ns_conf = None
        self.minimum_phi_value = None
        self.per_word_topics = None
        self.coherence_model = None
        self.coherence = None
        self.coherence_type = None

    def train(self,
              num_topics,
              iterations=1500,
              random_state=1,
              distributed=False,
              chunksize=2000,
              passes=1,
              update_every=1,
              alpha='symmetric',
              eta=None,
              decay=0.5,
              offset=1.0,
              eval_every=10,
              gamma_threshold=0.001,
              minimum_probability=0.01,
              ns_conf=None,
              minimum_phi_value=0.01,
              per_word_topics=False,
              workers=1):
        """train lda model. If workers >1, goes multicore"""

        self.distributed = distributed
        self.chunksize = chunksize
        self.passes = passes
        self.update_every = update_every
        self.alpha = alpha
        self.eta = eta
        self.decay = decay
        self.offset = offset
        self.eval_every = eval_every
        self.gamma_threshold = gamma_threshold
        self.minimum_probability = minimum_probability
        self.ns_conf = ns_conf
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics
        self.num_topics = num_topics
        self.iterations = iterations
        self.random_state = random_state
        self.workers = workers

        if self.workers > 1:
            self.model = LdaMulticore(
                workers=self.workers,
                corpus=self.corpus,
                id2word=self.dictionary,
                iterations=self.iterations,
                num_topics=self.num_topics,
                random_state=self.random_state,  # distributed=self.distributed,
                chunksize=self.chunksize,
                passes=self.passes,  # update_every=self.update_every,
                alpha=self.alpha,
                eta=self.eta,
                decay=self.decay,
                offset=self.offset,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                minimum_probability=self.minimum_probability,  # ns_conf=self.ns_conf,
                minimum_phi_value=self.minimum_phi_value,
                per_word_topics=self.per_word_topics)
        else:
            self.model = LdaModel(corpus=self.corpus,
                                  id2word=self.dictionary,
                                  iterations=self.iterations,
                                  num_topics=self.num_topics,
                                  random_state=self.random_state,
                                  distributed=self.distributed,
                                  chunksize=self.chunksize,
                                  passes=self.passes,
                                  update_every=self.update_every,
                                  alpha=self.alpha,
                                  eta=self.eta,
                                  decay=self.decay,
                                  offset=self.offset,
                                  eval_every=self.eval_every,
                                  gamma_threshold=self.gamma_threshold,
                                  minimum_probability=self.minimum_probability,
                                  ns_conf=self.ns_conf,
                                  minimum_phi_value=self.minimum_phi_value,
                                  per_word_topics=self.per_word_topics)
        print('Trained!')

    def _train_coherence_model(self, coherence_type='u_mass'):
        """could be made on top of model to get coherence, type could be 'u_mass' or 'c_v'"""
        self.coherence_model = CoherenceModel(model=self.model,
                                              texts=self.data,
                                              dictionary=self.dictionary,
                                              coherence=coherence_type)

    def _calculate_coherence(self, coherence_type='u_mass'):
        self._train_coherence_model(coherence_type=coherence_type)
        self.coherence_type = coherence_type  # remember the type so get_coherence can cache
        self.coherence = self.coherence_model.get_coherence()

    def get_coherence(self, coherence_type='u_mass'):
        if coherence_type != self.coherence_type:
            self._calculate_coherence(coherence_type=coherence_type)
        return self.coherence

    def get_topic_terms(self, num, topn=10):
        return self.model.get_topic_terms(num, topn=topn)

    def get_perplexity(self):
        return self.model.log_perplexity(self.corpus)

    def get_topics(self, num):
        return self.model.show_topics(num)

    def _make_visualization(self):
        """prepare visualisation for display/saving"""
        return pyLDAvis.gensim.prepare(self.model,
                                       self.corpus,
                                       self.dictionary,
                                       sort_topics=False)

    def display(self):
        """display LDAvis in notebook"""
        visualisation = self._make_visualization()
        return pyLDAvis.display(visualisation)

    def save_ldavis(self, filename='topic.html'):
        """save LDAvis to .html"""
        ldavis = self._make_visualization()
        pyLDAvis.save_html(ldavis, filename)

    def save_lda(self, filename):
        """save lda model only"""
        self.model.save(filename)

    def pickle(self, filename):
        """save class instance to file"""
        with open(filename, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def unpickle(filename):
        """read class instance from file"""
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def predict_topic(self, doc_list):
        """predict topic of document list (consists of strings"""
        topic_list = []
        for doc in doc_list:
            bow = self.dictionary.doc2bow(str(doc).split())
            topics_probs = self.model.get_document_topics(bow)
            topics_probs.sort(key=lambda tup: tup[1], reverse=True)
            topic_list.append(topics_probs)
        return topic_list
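A hedged usage sketch for CustomLda on a toy corpus:

texts = [['human', 'interface', 'computer'],
         ['graph', 'trees', 'minors'],
         ['human', 'graph', 'computer']]
clda = CustomLda(data=texts)
clda.train(num_topics=2, iterations=100, workers=1)
print(clda.get_topic_terms(0, topn=5))
print('u_mass coherence:', clda.get_coherence('u_mass'))
clda.save_lda('lda.model')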
Example #9
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import datapath

num_topics = 4
# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

print(dict(common_dictionary.items()))
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=num_topics)

# datapath() is meant for gensim's bundled test data; with an absolute path it
# returns the path unchanged, so a plain string would do here.
temp_file = datapath(
    "/Users/wanghaoxian/Documents/GitHub/recommend/dataContest/model")
lda.save(temp_file)
doc_topics = lda.get_document_topics(common_corpus)

for topic in doc_topics:
    print(topic)
for i in range(num_topics):
    print(i, lda.get_topic_terms(i, 3))
Example #10
# Fragment: the opening of this example was lost in extraction; a plausible
# reconstruction, assuming `lines` holds the raw documents and `stoplist` is
# defined upstream.
import jieba
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from wordcloud import WordCloud

num_topics = 10  # assumed; the value was not part of the recovered fragment

segtexts = []
for line in lines:
    doc = []
    for w in jieba.cut(line, cut_all=True):
        if len(w) > 1 and w not in stoplist:
            doc.append(w)
    segtexts.append(doc)

dictionary = Dictionary(segtexts)
dictionary.filter_extremes(2, 1.0, keep_n=1000)  # prune the dictionary, keeping at most 1000 tokens
corpus = [dictionary.doc2bow(text) for text in segtexts]
lda = LdaModel(corpus, id2word=dictionary,
               num_topics=num_topics)  # with id2word set, topics display words rather than ids
topics = lda.print_topics(num_topics=num_topics,
                          num_words=10)  # list of (topic_id, [(word, value), ...])
print(topics)
# Visualization: draw a word cloud per topic
font = r'C:\Windows\Fonts\simfang.ttf'
wc = WordCloud(collocations=False,
               font_path=font,
               width=2800,
               height=2800,
               max_words=20,
               margin=2)
for topicid in range(0, num_topics):
    tlist = lda.get_topic_terms(topicid, topn=1000)  # number of words feeding the word cloud, p(w|z)
    #print(tlist)
    wdict = {}  # e.g. {'word_a': 100, 'word_b': 90, 'word_c': 80}
    for wv in tlist:
        wdict[dictionary[wv[0]]] = wv[1]
    print(wdict)
    wordcloud = wc.generate_from_frequencies(wdict)
    wordcloud.to_file('topic_' + str(topicid) + '.png')  # save the image
Example #11
#     return [(i, model_lda.print_topic(i)) for i in top_k_topics]


# In[116]:


# `get_document_topics()` returns the topic probability distribution for a given document
topic_dist_15_a = model_lda.get_document_topics(corpus_train[15])
pprint(sorted(topic_dist_15_a))


# In[117]:


topicid = 3
model_lda.get_topic_terms(topicid, topn=10)


# In[118]:


doc_id = 15
text_train[doc_id]


# In[119]:


doc_id = 15
topic_dist_15_b = sorted(get_topics(corpus_train[doc_id], k=10)), text_train[doc_id]
pprint(topic_dist_15_b)
Example #12
# Fragment: the opening was lost in extraction; a plausible reconstruction,
# assuming `file` is an open text file and that num_topics and num_topic_terms
# are set upstream.
import jieba.analyse
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import datapath

texts = []
for line in file:
    #seg_list = jieba.cut(line, cut_all=True)
    seg_list = jieba.analyse.extract_tags(line, topK=40, withWeight=True)
    words = []
    for word, w in seg_list:
        if len(word) < 2:
            continue
        words.append(word)
    texts.append(words)

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=num_topics)

temp_file = datapath(
    "/Users/wanghaoxian/Documents/GitHub/recommend/dataContest/model")
lda.save(temp_file)
documentTopics = lda.get_document_topics(common_corpus)

for doc in documentTopics:
    print(doc)

for i in range(num_topics):
    print("topic", i)
    terms = lda.get_topic_terms(i, num_topic_terms)
    for term in terms:
        print(common_dictionary[term[0]], term[1])
Example #13
# Imports assumed (not shown in the original snippet); logger, TOPIC_NUM,
# COMMON_TOPIC_WORDS_NUM and LOG_LEVEL are module-level names from the source
# project.
import os
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
from gensim.models.ldamodel import LdaModel
def get_most_common(title_list,
                    dic,
                    num=COMMON_TOPIC_WORDS_NUM,
                    random_state=None):
    '''Return the num most important words of the most frequent topic.'''

    bow = [dic.doc2bow(title) for title in title_list]
    # TODO: determine and set an appropriate number of topics
    if LOG_LEVEL == 'DEBUG':
        random_state = 123
    model = LdaModel(bow,
                     id2word=dic,
                     num_topics=TOPIC_NUM,
                     random_state=random_state)
    # Classify each title
    topic_id_list = []
    for idx, title in enumerate(title_list):
        logger.debug('title')
        logger.debug(title)
        doc_topics_tuple = model.get_document_topics(dic.doc2bow(title),
                                                     minimum_probability=0.0)
        doc_topic_dist = [[val[0], val[1]] for val in doc_topics_tuple]
        doc_topic_dist = np.array(doc_topic_dist)
        if idx == 0:
            topic_dist_arr = doc_topic_dist
        else:
            topic_dist_arr = np.vstack([topic_dist_arr, doc_topic_dist])
        topic_id = int(
            sorted(doc_topic_dist, key=lambda x: x[1], reverse=True)[0][0])
        topic_id_list.append(topic_id)
    if LOG_LEVEL == 'DEBUG':
        # Topic distribution per title
        df_topic_dist = pd.DataFrame({
            'title': title_list,
            'topic_id': topic_id_list
        })
        # Word distribution per topic
        cols = ['{}_{}'.format(word_no, elem) \
                for word_no in range(10) \
                    for elem in range(2)]
        df_word_dist = pd.DataFrame()
        arr_dist = topic_dist_arr.reshape(-1, model.get_topics().shape[0], 2)
        for topic_id in range(model.get_topics().shape[0]):
            df_topic_dist['topic_{}'.format(topic_id)] = arr_dist[:, topic_id,
                                                                  1]
            topic_terms = model.get_topic_terms(topic_id,
                                                topn=int(len(cols) / 2))
            topic_terms_2 = []
            for term in topic_terms:
                topic_terms_2 = topic_terms_2 + [
                    dic.id2token[term[0]], term[1]
                ]
            df_word_dist = pd.concat([
                df_word_dist,
                pd.Series(topic_terms_2,
                          name='topic_{}'.format(topic_id)).to_frame().T
            ])
        df_topic_dist.to_csv(
            os.path.join('test', 'classified_topic_{}.csv' \
                .format(datetime.today().strftime(format='%Y%m%d'))),
            index=False,
            encoding='cp932'
        )
        df_word_dist.columns = cols
        df_word_dist.to_csv(
            os.path.join('test', 'word_distribution_per_topic_{}.csv' \
                .format(datetime.today().strftime(format='%Y%m%d'))),
            encoding='cp932'
        )
    # Get the most frequent topic
    topic_id_counter = Counter(topic_id_list)
    most_common_topic_id = topic_id_counter.most_common(1)[0][0]
    topic_terms = model.get_topic_terms(most_common_topic_id)
    logger.debug('')
    logger.debug('topic_id_counter: ' + str(topic_id_counter))
    logger.debug('most_common_topic_id: ' + str(most_common_topic_id))
    logger.debug(topic_terms)
    # Get the num most important words of the most frequent topic
    important_word_list = [
        dic.id2token[topic_tuple[0]] for topic_tuple in topic_terms[:num]
    ]
    logger.debug(important_word_list)
    return important_word_list
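A hedged usage sketch for get_most_common (title_list is a list of tokenized
titles). Note that gensim builds Dictionary.id2token lazily, so the dic[0]
access below is what makes the dic.id2token lookups inside the function work:

from gensim.corpora.dictionary import Dictionary

title_list = [['python', 'lda', 'tutorial'],
              ['python', 'topic', 'model'],
              ['gensim', 'lda', 'example']]
dic = Dictionary(title_list)
_ = dic[0]  # force id2token to be populated
print(get_most_common(title_list, dic, num=3))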
Example #14
import numpy as np
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel


class GensimLDA:
    def __init__(self, texts):
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]

        self.k_topics = None
        self.model = None

    def fit(self, k_topics, iterations=50):
        '''Fit an LDA model with k_topics topics on the stored corpus.'''
        self.k_topics = k_topics
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, \
            num_topics=k_topics, iterations=iterations)

    def get_document_topic_matrix(self, X=None):
        '''Returns an n_docs x k_topics array of probabilities
        of a topic in a given document.'''
        if X is None:
            X = self.corpus
        else:
            X = [self.dictionary.doc2bow(text) for text in X]

        n_docs = len(X)
        V = np.zeros((n_docs, self.k_topics))

        # Extract assignments
        some_iterable = self.model.get_document_topics(
            X)  ## equiv: self.model[X]
        for i, doc_topic in enumerate(some_iterable):
            for topic_id, prob in doc_topic:
                V[i, topic_id] = prob
        return V

    def get_topic_term_matrix(self):
        '''Returns an k_topics x m_words array of probabilities
        of a word in a given topic.'''
        return self.model.get_topics()

    def print_topics(self, top_n=10):
        '''Prints the top_n words in each topic'''
        for row in self.get_topic_term_matrix():
            # word ids ranked by descending probability
            ranking = np.argsort(row)[::-1]
            for k in ranking[:top_n]:
                weight = row[k]
                word = self.dictionary[k]  # __getitem__ populates id2token lazily
                print(k, word, weight)
            print()

    def print_topic_words(self, topic_num, topn=None):
        '''Prints the top words and probabilities of a given topic in
        descending probability.'''
        for tok_id, prob in self.model.get_topic_terms(topic_num, topn=topn):
            word = self.dictionary[tok_id]  # __getitem__ populates id2token lazily
            print(word, prob)

    def get_topic_bows(self, num_words=10):
        '''Returns a list (for each topic) containing a list of the top num_words'''
        q = self.model.show_topics(num_topics=self.k_topics,
                                   num_words=num_words,
                                   formatted=False)
        topics = []
        for _topic_id, topic in q:
            topics.append([w for w, p in topic])
        return topics
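A hedged usage sketch for GensimLDA on a toy corpus:

texts = [['human', 'interface', 'computer'],
         ['graph', 'trees', 'minors'],
         ['human', 'graph', 'computer']]
g = GensimLDA(texts)
g.fit(k_topics=2, iterations=100)
print(g.get_document_topic_matrix().shape)  # (3, 2)
g.print_topics(top_n=3)
print(g.get_topic_bows(num_words=3))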
Example #15
# Imports assumed (not shown in the original snippet); `mysql` is the source
# project's own database wrapper module.
import gensim
import mysql
from pathlib import Path
from gensim import corpora
from gensim.models.ldamodel import LdaModel

class LDAWDF:
    mysql: mysql.MySQL
    ldamodel: LdaModel
    dictionary = None
    corpus = None

    def __init__(self, mysql):
        self.mysql = mysql
        self.dataFolder = './data/'
        self.saveFile = 'lda_model'
        self.saveFileDict = 'lda_model_dict'

    def trainFromStart(self):
        with self.mysql as db:
            content = db.getContentsText()
        documents = []
        for item in content:
            documents.append(item['content'].split())

        self.dictionary = corpora.Dictionary(documents)

        self.dictionary.filter_extremes(no_below=5, no_above=0.5)

        doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in documents]

        self.corpus = doc_term_matrix

        # Running and Training LDA model on the document term matrix.
        print("Starting to train LDA Model...")
        self.ldamodel = LdaModel(
            doc_term_matrix,
            num_topics=200,
            id2word=self.dictionary,
            passes=100)

    def printTest(self):
        print(self.ldamodel.print_topics(num_topics=10, num_words=5))

    def save(self):
        self.ldamodel.save(self.dataFolder + self.saveFile)
        self.dictionary.save(self.dataFolder + self.saveFileDict)

    def canLoad(self):
        my_file = Path(self.dataFolder + self.saveFile)
        my_file_dict = Path(self.dataFolder + self.saveFileDict)
        return my_file.is_file() and my_file_dict.is_file()

    def update(self, corpus):
        self.ldamodel.update(corpus)

    def load(self, subfolder=None):
        if subfolder:
            sf = subfolder + '/'
        else:
            sf = ''
        self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
        self.dictionary = gensim.corpora.Dictionary.load(self.dataFolder + sf + self.saveFileDict)

    def fillDb(self):
        topics = {}
        result = []
        result2 = []
        nbTopics = self.ldamodel.get_topics().shape[0]
        # "Old"
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 3)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            words = []
            for topicTerm in topicTerms:
                words.append(self.dictionary.get(topicTerm[0]))
            topics[topicId] = ' '.join(words)
        with self.mysql as db:
            contentsText = db.getContentsText()
            for element in contentsText:
                bow = self.dictionary.doc2bow(element['content'].split())
                docTopics = self.ldamodel.get_document_topics(bow, minimum_probability=0.05)
                if len(docTopics) > 0:
                    docTopics.sort(key=lambda x: x[1], reverse=True)
                    result.append((element['url'], topics[docTopics[0][0]]))
                    for docTopic in docTopics:
                        result2.append((element['url'], docTopic[0], str(docTopic[1])))
            db.emptyUrlsTopic()
            db.emptyCurrentUrlsTopic()
            db.emptyCurrentUserTags()
            db.setCurrentUrlsTopic(result2)
            db.setPrecalcTopics()
        # "New"
        terms = []
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 5)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            for topicTerm in topicTerms:
                terms.append((topicId, self.dictionary.get(topicTerm[0]), str(topicTerm[1])))
        with self.mysql as db:
            db.emptyLdaTopics()
            db.setLdaTopics(terms)


    def get_terms_topics(self, keywords):
        bow = self.dictionary.doc2bow(keywords[:30])
        topics = {}
        keywordsResult = {}
        for word in bow:
            wordTopics = self.ldamodel.get_term_topics(word[0], 0.05)
            keywordsResult[word[0]] = {'word': self.dictionary.get(word[0]), 'topics': wordTopics}
            for wordTopic in wordTopics:
                wordTopicId = wordTopic[0]
                if wordTopicId not in topics:
                    topics[wordTopicId] = self.ldamodel.show_topic(wordTopicId)
        return {'topics': topics, 'keywords': keywordsResult}
Example #16
from gensim.models.ldamodel import LdaModel


def print_topic_terms(model: LdaModel):
    for topic_id in range(model.num_topics):
        top_list = model.get_topic_terms(topic_id)
        # render word ids via the model's own id2word mapping
        print(topic_id, [model.id2word[idx] for idx, _ in top_list])
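A hedged usage sketch on gensim's bundled toy corpus:

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

dct = Dictionary(common_texts)
corpus = [dct.doc2bow(t) for t in common_texts]
lda = LdaModel(corpus, id2word=dct, num_topics=2, random_state=1)
print_topic_terms(lda)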