Example #1
def LDA_model(corpus_Quran, corpus_NT, corpus_OT):
    # run LDA on the entire set of verses from all corpora
    total_corpus = corpus_Quran + corpus_NT + corpus_OT
    dictionary = Dictionary(total_corpus)
    dictionary.filter_extremes(no_below=50, no_above=0.1)
    corpus = [dictionary.doc2bow(text) for text in total_corpus]
    lda = LdaModel(corpus, num_topics=20, id2word=dictionary, random_state=1)
    # compute document-topic probability for the Quran
    # (use the dictionary the model was trained with, so token ids match)
    corpus1 = [dictionary.doc2bow(text) for text in corpus_Quran]
    topics_Quran = lda.get_document_topics(corpus1)
    topic_dic_Quran = {}
    for doc in topics_Quran:
        for topic in doc:
            if topic[0] not in topic_dic_Quran:
                topic_dic_Quran[topic[0]] = topic[1]
            else:
                topic_dic_Quran[topic[0]] += topic[1]
    # compute document-topic probability for the OT
    corpus2 = [dictionary.doc2bow(text) for text in corpus_OT]
    topics_OT = lda.get_document_topics(corpus2)
    topic_dic_OT = {}
    for doc in topics_OT:
        for topic in doc:
            if topic[0] not in topic_dic_OT:
                topic_dic_OT[topic[0]] = topic[1]
            else:
                topic_dic_OT[topic[0]] += topic[1]
    # compute document-topic probability for the NT
    corpus3 = [dictionary.doc2bow(text) for text in corpus_NT]
    topics_NT = lda.get_document_topics(corpus3)
    topic_dic_NT = {}
    for doc in topics_NT:
        for topic in doc:
            if topic[0] not in topic_dic_NT:
                topic_dic_NT[topic[0]] = topic[1]
            else:
                topic_dic_NT[topic[0]] += topic[1]
    for k, v in topic_dic_Quran.items():
        topic_dic_Quran[k] = v / len(corpus_Quran)
    for k, v in topic_dic_OT.items():
        topic_dic_OT[k] = v / len(corpus_OT)
    for k, v in topic_dic_NT.items():
        topic_dic_NT[k] = v / len(corpus_NT)
    return lda, topic_dic_Quran, topic_dic_NT, topic_dic_OT
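
A smoke-test sketch for the function above. It assumes the gensim imports the snippet itself relies on (Dictionary, LdaModel) are in scope; the synthetic corpora are hypothetical stand-ins for the real tokenized verse lists, sized so that the no_below=50 / no_above=0.1 filter keeps a non-empty vocabulary:

import random

random.seed(0)
vocab = ['w%d' % i for i in range(120)]

def fake_corpus(n_docs):
    # hypothetical data: each "verse" is 8 distinct tokens from a 120-word vocabulary
    return [random.sample(vocab, 8) for _ in range(n_docs)]

lda, dist_Quran, dist_NT, dist_OT = LDA_model(fake_corpus(400), fake_corpus(300), fake_corpus(300))
print(sorted(dist_Quran.items(), key=lambda kv: kv[1], reverse=True)[:3])  # 3 strongest topics on average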
Example #2
def getLdaFeature(documents, topicNum):
    '''
     Function:
         generate lda features by training lda model
     Input:
         documents: list of preprocessed sentences
         topicNum: output vector dimension
     Output:
         lda features(DataFrame format)
    '''
    # get corpus
#     LogInfo(' Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)    
    corpusD = [dictionary.doc2bow(text) for text in texts]

    # train lda model
#     LogInfo(' Train LDA model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
#     ldaModel = gensim.models.ldamulticore.LdaMulticore(corpus_tfidf, workers = 8, num_topics=topicNum, chunksize=8000, passes=10, random_state = 12)
    ldaModel = LdaModel(corpus_tfidf, num_topics=topicNum, chunksize=8000, passes=10, random_state = 12)
    # generate lda features
    LogInfo(' Generate LDA features...')
    ldaFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpus_tfidf:
        topic = ldaModel.get_document_topics(doc, minimum_probability = 0.01)
        for t in topic:
            ldaFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
    colName = getColName(topicNum, "qlda")
    ldaFeature = pd.DataFrame(ldaFeature, columns = colName)
    return ldaFeature
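
A hedged usage sketch for getLdaFeature. LogInfo and getColName are project helpers that are not shown here, so the stubs below are stand-in assumptions, as are the gensim/numpy/pandas imports the function relies on:

import numpy as np
import pandas as pd
from gensim import corpora
from gensim.models import TfidfModel, LdaModel

def LogInfo(msg):  # stand-in for the project's logger
    print(msg)

def getColName(n, prefix):  # stand-in: qlda0, qlda1, ...
    return ['%s%d' % (prefix, i) for i in range(n)]

features = getLdaFeature(['apple banana apple', 'banana cherry', 'cherry apple banana'], 2)
print(features.shape)  # (3, 2): one row per document, one column per topic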
Example #3
class LMDL_LDA():
    def __init__(self):
        self.lmdl = LMDL_Corpus()
        self.texts = self.lmdl.get_corpus_texts_words()
        self.dictionary = Dictionary(self.texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
        self.lda = LdaModel(self.corpus,
                            num_topics=LDA_NUM_TOPICS,
                            id2word=self.dictionary)

    def print_topics(self):
        return self.lda.print_topics(LDA_NUM_TOPICS)

    def get_document_topics(self, document_name):
        document_tokens = self.lmdl.token_list_processed(document_name)
        topics = self.lda.get_document_topics(
            self.dictionary.doc2bow(document_tokens),
            minimum_probability=None,
            minimum_phi_value=None,
            per_word_topics=False)
        show_topics_list = []
        for topic in topics:
            lda_topic = self.lda.show_topic(topic[0], topn=10)
            show_topics_list.append(lda_topic)
        return show_topics_list

    def top_topics(self):
        return self.lda.top_topics(corpus=self.corpus,
                                   texts=self.texts,
                                   dictionary=self.dictionary,
                                   window_size=None,
                                   coherence='u_mass',
                                   topn=20,
                                   processes=-1)
Example #4
def get_lda_feature():
    doc_train = pd.read_csv(id_content_path)
    documents = doc_train['content'].apply(lambda x: x.split(' '))
    #    build the word<->ID mapping dictionary (id: word)
    dictionary = corpora.Dictionary(documents)
    #    convert each document into bag-of-words: a list of (id, count) tuples
    ds_df = [dictionary.doc2bow(document) for document in documents]
    #    build the TF-IDF model from the corpus term frequencies
    tfidf_model = TfidfModel(ds_df)
    #    apply TF-IDF weighting to the corpus
    ds_tfidf = tfidf_model[ds_df]
    #    number of topics
    n = 60
    #    train the LDA model on the TF-IDF corpus with the given number of topics
    lda_model = LdaModel(ds_tfidf, num_topics=n, passes=10, random_state=12)
    vec_size = (len(documents), n)
    lda_feature = np.zeros(vec_size)
    i = 0

    for doc in ds_tfidf:
        topics = lda_model.get_document_topics(doc, minimum_probability=0.01)
        for topic in topics:
            num_topic = topic[0]
            prob = round(topic[1], 5)
            lda_feature[i, num_topic] = prob
        i += 1

    f_names = get_lda_feacture_name(n)
    pd.DataFrame(lda_feature, columns=f_names).to_csv(id_content_lda_path,
                                                      index=0)
Example #5
def cluster(doc_term_matrix, num, word_dict):
    ldamodel = Lda(doc_term_matrix, num_topics=num, id2word=word_dict)
    doc_topics = ldamodel.get_document_topics(
        doc_term_matrix, minimum_probability=0.20)  # needs tuning
    result = [[] for i in range(num)]
    for k, topic in enumerate(doc_topics):
        # Some articles do not have a topic
        if topic:
            topic.sort(key=itemgetter(1), reverse=True)
            result[topic[0][0]].append(k)
    return [[titles[x] for x in result[k]] for k in range(len(result))]
Example #6
def gen_ldamodel(self):
    mdf = MyDataFrame()
    df = mdf.new_DataFrame()
    df2 = mdf.m_cut(df)
    filelist = []
    for i in range(len(df2)):
        filelist.append(df2['fenci'][i])
    # build the dictionary and the bow sparse matrix for the documents

    dictionary = corpora.Dictionary(filelist)
    corpus = [dictionary.doc2bow(text) for text in filelist]  # still a list of lists

    tfidf_model = models.TfidfModel(corpus)  # build the TF-IDF model
    corpus_tfidf = tfidf_model[corpus]  # TF-IDF weights of the documents

    # fit the LDA model
    from gensim.models.ldamodel import LdaModel
    # note the elapsed time for reference
    ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=10, passes=10)
    # list the most important topics
    ldamodel.print_topics(num_topics=20, num_words=10)

    # compute the LDA representation of each document
    # (use the same kind of matrix the model was trained on, i.e. the bow corpus)
    corpus_lda = ldamodel[corpus]
    for doc in corpus_lda:
        print(doc)
    ldamodel.get_topics()  # list of lists: probability of each word in each topic

    # find the topic closest to a given text
    # e.g. the topic closest to 0.txt
    query_bow = dictionary.doc2bow(df2['fenci'][0])  # term-frequency vector
    query_tfidf = tfidf_model[query_bow]  # TF-IDF vector
    print("after transformation:", query_tfidf[:10])
    ldamodel.get_document_topics(query_bow)  # expects the bow vector of the document
    # list of the topics closest to the text
    ldamodel[query_bow]
Example #7
from itertools import chain
from math import log


def saliency_index(lda: LdaModel, corpus, words: Dictionary):

    full_corpus = list(chain(*corpus))

    N = len(words)
    total = sum(words.cfs[i] for i in range(N))
    frequencies = [words.cfs[i] / total for i in range(N)]

    topics = lda.print_topics()

    relative_likelihood = [0. for _ in range(N)]

    for topic_id, topic_prob in lda.get_document_topics(
            full_corpus, minimum_probability=0.):
        for term, cond_prob in lda.get_topic_terms(topic_id, topn=None):

            relative_likelihood[term] += cond_prob * log(
                cond_prob / topic_prob)

    saliencies = [f * l for f, l in zip(frequencies, relative_likelihood)]

    return {words[i]: s for i, s in enumerate(saliencies)}
Example #8
# In[115]:


# # extracts topics for given document from Gensim
# def get_topics(doc, k=5, model_lda=model_lda):
#     topic_id = sorted(model_lda[doc][0], key=lambda x: -x[1])
#     top_k_topics = [x[0] for x in topic_id[:k]]
#     return [(i, model_lda.print_topic(i)) for i in top_k_topics]


# In[116]:


# `get_document_topics()` returns topic probability distribution for given document
topic_dist_675_a = model_lda.get_document_topics(corpus_train[15])
pprint(sorted(topic_dist_675_a))


# In[117]:


topicid = 3
model_lda.get_topic_terms(topicid, topn=10)


# In[118]:


text_train[doc_id]
Example #9
class LDA(object):
    def __init__(self,
                 topics=10,
                 worker=3,
                 pretrained_model=None,
                 dictionary=None):
        """
        lda模型训练初始化。
        Args:
            topics -- 指定主题个数
            worker -- 并行化参数,一般为core数量减一
            pretrained_model -- 预训练的模型,由于支持在线更新,所以可以加载上次训练的模型
            dictionary -- 训练时词需要转换成ID,所以跟模型配套有一个ID映射的词典
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """
        保存训练的模型,同时保存对应的词典
        Args:
            model_file -- 模型文件
            dictionary_file -- 词典文件
        Returns:
            无
        """

        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus=[[]]):
        """
        在线更新,在已有模型的基础上在线更新
        Args:
            corpus -- 用于更新的文档列表
        """

        if not self._model and len(corpus) > 0:
            self._common_dictionary = Dictionary(corpus)
            corpus_data = [
                self._common_dictionary.doc2bow(sentence)
                for sentence in corpus
            ]
            self._model = LdaModel(corpus_data, self._topics)
        elif self._model and len(corpus) > 0:
            self._common_dictionary.add_documents(corpus)
            new_corpus_data = [
                self._common_dictionary.doc2bow(sentence)
                for sentence in corpus
            ]
            self._model.update(new_corpus_data)

    def inference(self, document=[]):
        """
        对新文档推断其话题分布
        Args:
            document -- 文档,其实是词列表
        Returns:
            话题分布列表        
        """
        if self._model:
            doc = [self._common_dictionary.doc2bow(document)]
            return self._model.get_document_topics(doc)
        return []

    @property
    def model(self):
        return self._model

    @property
    def dictionary(self):
        return self._common_dictionary
Example #11
def main():
    # connect to SQLite
    datastore.connect()

    # for each doc_id, collect into sentences the lemmas of the words of every sentence in that doc_id
    sentences = []

    for doc_id in datastore.get_all_ids(limit=-1):
        all_tokens = datastore.get_annotation(doc_id, "token")

        for sentence in datastore.get_annotation(doc_id, "sentence"):
            tokens = find_xs_in_y(all_tokens, sentence)

            sentences.append(
                [token["lemma"] for token in tokens if token.get("NE") == "O"])

    # since few articles are available for the analysis, join every 20 sentences into one document
    n_sent = 20

    docs = [
        list(itertools.chain.from_iterable(sentences[i:i + n_sent]))
        for i in range(0, len(sentences), n_sent)
    ]

    # select the words used in the LDA computation
    # - words that appear in fewer than 2 documents are excluded ( no_below=2 )
    # - words that appear in 30% or more of the documents are excluded ( no_above=0.3 )
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=2, no_above=0.3)

    # convert the bags of words into the expected data type with the doc2bow method
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # build the LDA model
    lda = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)

    # inspect the topics
    # print the list of topics
    # for each topic, also print its 10 highest-probability words
    for topic in lda.show_topics(num_topics=-1, num_words=10):
        print(f"Topic id: {topic[0]}    Word: {topic[1]}")

    # estimate the topic distribution of each article
    # print the highest-probability topics for each doc_id
    for doc_id in datastore.get_all_ids(limit=-1):
        meta_info = json.loads(
            datastore.get(doc_id=doc_id, fl=["meta_info"])["meta_info"])

        title = meta_info["title"]
        print(title)

        doc = [
            token["lemma"]
            for token in datastore.get_annotation(doc_id, "token")
            if token.get("NE") == "O"
        ]

        topics = sorted(lda.get_document_topics(dictionary.doc2bow(doc)),
                        key=lambda x: x[1],
                        reverse=True)

        for topic in topics:
            print(f"    Topic id: {topic[0]}    Prob: {topic[1]}")

    datastore.close()

    return
Example #12
class LDA_parser():
    """
    This class implements a wrapper pipeline for text preprocessing and LDA parsing of an input corpus 
    in the form ['str','str','str', ... ]. 
    """
    def __init__(self,
                 corpus='',
                 language='english',
                 preprocessor_type="spacy",
                 tags=["DET", "PUNCT", "NUM", "SYM", "SPACE"],
                 custom_filter=[],
                 lemmatize=False,
                 stem=False,
                 min_len=2,
                 num_topics=10,
                 passes=100):
        """ 
        Parses the input text into a suitable format, then performs all LDA extraction tasks. 
        It expects the input corpus to be a list of texts. If the input is a long string, it will 
        attempt to create documents by sentence-tokenizing it. 
        @ params: 
            @ corpus: Input corpus in str or ['str','str','str', ... ] format, where each entry
                      is a document of type str. Alternatively, a str format input (not recommended).
            @ preprocessor_type: Use the nltk-based or spaCy-based preprocessor 
            @ language: language to use in the preprocessor 
            @ tags: if spaCy is selected, will filter words with input POS tags 
            @ custom_filter: filter words in this input list in the preprocessing step 
            @ lemmatize: use lemmatization in the preprocessing 
            @ stem: use stemming in the preprocessing  
            @ num_topics: maximum number of topics in the LDA algorithm 
            @ passes: number of training epochs in the LDA 
        """

        print("Initializing model...\n")
        if preprocessor_type == "nltk":
            print("NLTK preprocessor selected.")
            self.preprocessor = nltk_preprocessor(language=language)
        if preprocessor_type == "spacy":
            print("spaCy preprocessor selected.")
            self.preprocessor = spacy_preprocessor(language=language)

        self.language = language  # input language
        self.raw_corpus = ""  # simply stores the input if in str type
        self.clean_corpus = [
        ]  # [doc, doc, ..., doc] = [[sent, sent, ...], ... ,[sent, sent, ...]]
        self.dictionary = None  # holds a corpora.Dictionary representation of corpus
        self.doc2bow_corpus = None  # contains doc2bow vector representations of each document in the corpus
        self.lda_model = None  # LDA model trained on the input corpus
        self.topic_mixtures = [
        ]  # contains str representations of mixtures of words with their probabilities
        self.topics = {
        }  # Contains a dictionary of topics with words and respective mix probabilities once "extract topics" is called.
        self.topic_words = {
        }  # As above, but only contains the respective words of the topic

        # check for raw str corpus format
        if isinstance(corpus, str):
            print(
                "***WARNING***\nRaw input (str) received. Text will be sentence-tokenized and parsed accordingly."
            )
            print("Make sure this is intended. \n")
            self.raw_corpus = str(corpus)  # transform input to string
            self.fit(corpus,
                     raw=True,
                     language=language,
                     stem=stem,
                     lemmatize=lemmatize,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)  # fit corpus as raw

        elif corpus == '':
            print("***WARNING***\nNull Corpus")
        # assume input corpus is in the right format
        else:
            self.fit(corpus,
                     language=language,
                     stem=stem,
                     lemmatize=lemmatize,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)

    def fit(self,
            corpus,
            raw=False,
            language='english',
            stem=False,
            lemmatize=False,
            num_topics=10,
            passes=100,
            min_len=2,
            echo_corpus=False):
        """ 
        Assumes input corpus is in the right format. 
        @args: 
            @ corpus = input corpus  
            @ language = input language  
            @ stem/lemmatize = if true, stem or lemmatize input corpus
            @ num_topics = number of topics to choose in the algorithm 
            @ passes = number of epochs of the LDA 
            @ min_len = minimum length of words to consider when preprocessing words
        """

        if echo_corpus:
            print("CORPUS: {}".format(corpus))

        t0 = time.time()

        print("Fitting LDA topic modelling...")
        self.raw_corpus = corpus  # input corpus as is
        self.language = language  # in case initial language changed

        if raw:
            print("Preprocessing corpus...(raw)")
            self.clean_corpus = self.preprocessor.preprocess_str_corpus(
                corpus, stem=stem, lemmatize=lemmatize, min_len=min_len)
        else:
            print("Preprocessing corpus...")
            self.clean_corpus = self.preprocessor.preprocess_texts(
                self.raw_corpus, min_len=min_len)  # preprocess text list

        print("Creating corpora dictionary...")
        self.dictionary = corpora.Dictionary(
            self.clean_corpus)  # create corpora.Dictionary mapping
        print("Translating doc2bow corpus...")
        self.doc2bow_corpus = [
            self.dictionary.doc2bow(text) for text in self.clean_corpus
        ]  # doc2bow corpus representation
        print("Running LDA...")
        self.lda_model = LdaModel(self.doc2bow_corpus,
                                  num_topics=num_topics,
                                  id2word=self.dictionary,
                                  passes=passes)
        self.topic_mixtures = self.lda_model.show_topics(
            num_topics=-1,
            num_words=10)  # string representation of topics mixtures

        t1 = time.time()
        print("\nDone in {:.3f} seconds.".format(t1 - t0))

    def print_topics(self, words_per_topic=5):
        """
        Displays the topics in string format
        """
        topics = self.lda_model.print_topics(num_words=words_per_topic)
        for topic in topics:
            print(topic)

    def extract_topics(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary of tuples, where the key is the topic 
        number, and the value is a list of tuples of words_per_topic many words with 
        probability at least as high as threshold, where the second value is the density 
        for the topic. 
        @params: 
            @ max_words_per_topic: Maximum topic mixture component words to consider. 
            @ threshold: select words whose density is at least this value
        """
        topics = {}  # to store the topics
        indexes = [tup[0]
                   for tup in self.topic_mixtures]  # topic indexes present in the mixtures

        # assign the topics mixtures
        for i in indexes:
            topics[i] = [
                tup
                for tup in self.lda_model.show_topic(i,
                                                     topn=max_words_per_topic)
                if tup[1] >= threshold
            ]  # extract most probable words for topic i

        self.topics = topics  # update attribute

        return topics

    def extract_topic_words(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary of tuples, where the key is the topic 
        number, and the value is a list of words_per_topic many words with 
        probability at least as high as threshold. 
        """
        topics = {}  # to store the topics
        indexes = [tup[0]
                   for tup in self.topic_mixtures]  # topic indexes present in the mixtures

        # assign the topics mixtures
        for i in indexes:
            topics[i] = [
                tup[0]
                for tup in self.lda_model.show_topic(i,
                                                     topn=max_words_per_topic)
                if tup[1] >= threshold
            ]  # extract most probable words for topic i

        self.topic_words = topics  # update attribute

        return topics

    def parse_new(self,
                  new_text,
                  top_n_topics=100,
                  top_n_w=30,
                  max_words_per_topic=50,
                  threshold=0.005,
                  verbose=True):
        """
        Parses a new text by obtaining the most likely topics for the new input, 
        as well as the respective words. This function should be used only after 
        the LDA parser has been fitted. 
        @params: 
            @ new_text: new input text 
            @ top_n_topics: top n topics with largest densities p(topic)
            @ top_n_w: top n words with largest densities p(word) = p(word|topic)*p(topic)
            @ verbose: display information
            @ max_words_per_topic: maximum words per topic  
            @ threshold: only consider words with density greater than threshold 
        @returns: 
            @ max_topic: most likely topic for the document 
            @ doc_max_topic_words: words associated with the most likely topic 
            @ doc_topics: all topics related to the document 
            @ doc_topic_words: all words from all topics associated with the document 
        """

        self.extract_topic_words(
            max_words_per_topic,
            threshold)  # extract topics to ensure they are there

        new_text_clean = self.preprocessor.preprocess_sentence(
            new_text)  # preprocess input text
        new_doc_bow = self.dictionary.doc2bow(
            new_text_clean)  # convert to doc2bow

        doc_topics = self.lda_model.get_document_topics(
            new_doc_bow)  # obtain topics for input document
        topic_idx = [tup[0] for tup in doc_topics]  # topic indices

        doc_topic_words = [
            word for idx in topic_idx for word in self.topic_words[idx]
        ]  # extract all words from every topic
        top_n_topics = nlargest(top_n_topics,
                                list(doc_topics),
                                key=lambda x: x[1])  # extract top n topics

        top_n_words = list(
            set([
                word for idx in [tup[0] for tup in top_n_topics]
                for word in self.topic_words[idx]
            ]))  # extract the words for the top topics

        # Currently, we have access to the top n topics and their actual probabilities.
        # We want to collect all the words for those topics, and multiply them with their probabilities

        words_with_probs = [
        ]  # will store words with their actual probabilities:

        for topic_tup in doc_topics:
            topic_idx = topic_tup[0]  # obtain topic index
            topic_prob = topic_tup[1]  # obtain topic probability p(topic)
            for word_tup in self.lda_model.show_topic(topic_idx, topn=10):
                word_probability = word_tup[
                    1] * topic_prob  # p(w) = p(w|topic)p(topic)
                words_with_probs.append(
                    (word_tup[0], word_probability))  # (word, p(w))

        # obtain the n most likely words according to their individual probabilities
        n_most_likely_words = [
            tup[0] for tup in nlargest(
                top_n_w, list(words_with_probs), key=lambda x: x[1])
        ]

        if verbose:
            print("\nLOGS: \n")
            print("*** Most likely topic: ***\n", top_n_topics)
            print("*** Words for most likely topic: ***\n", top_n_words)
            print("*** All topics: ***\n", doc_topics)
            print("*** All topics words: ***\n", doc_topic_words)

        return n_most_likely_words, top_n_topics, top_n_words, doc_topics, doc_topic_words

    def pickle_save(self, savename="full_LDA_parser.pkl"):
        """ 
        Saves the full model object in pkl format
        """
        pickle.dump(self, open(savename, 'wb'))

    def save_model(self, name="LDA_model"):
        """ 
        Saves the LDA model, doc2bow_corpus and dictionary.
        These parameters can be used to instantiate a gensim 
        model, so there is no load in this class. 
        """
        dictionary_name = name + "_dictionary.gensim"
        corpus_name = name + "_doc2bow_corpus.pkl"
        model_name = name + ".gensim"

        pickle.dump(self.doc2bow_corpus, open(corpus_name,
                                              'wb'))  # save the doc2bow_corpus
        self.dictionary.save(dictionary_name)  # save corpus dictionary mapping
        self.lda_model.save(model_name)  # save the full model
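
A usage sketch for the class above, assuming the companion spacy_preprocessor / nltk_preprocessor classes referenced in __init__ (not shown here) are importable, along with the gensim, pickle, time and nlargest imports the class relies on; the documents are hypothetical:

docs = ["Cats chase mice in the garden.",
        "Dogs chase cats around the yard.",
        "Markets fell sharply after the announcement."]
parser = LDA_parser(docs, language='english', preprocessor_type='spacy',
                    num_topics=3, passes=10)
parser.print_topics(words_per_topic=5)
print(parser.extract_topic_words(max_words_per_topic=10, threshold=0.001))
words, top_topics, top_words, doc_topics, doc_topic_words = parser.parse_new("a story about cats")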
Example #13
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import datapath

num_topics = 4
# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

print(common_dictionary.items())
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=num_topics)

temp_file = datapath(
    "/Users/wanghaoxian/Documents/GitHub/recommend/dataContest/model")
lda.save(temp_file)
doc_topics = lda.get_document_topics(common_corpus)

for topic in doc_topics:
    print(topic)
for i in range(0, num_topics, 1):
    print(i, lda.get_topic_terms(i, 3))
Example #14
class CustomLda(object):
    def __init__(self, data=None, dictionary=None):
        """ initialize, data should be provided, only when unpickling class object it is not needed!"""
        self.data = data
        self.model = None
        self.num_topics = None
        self.iterations = None
        self.random_state = None
        self.dictionary = dictionary
        if self.data is not None:
            if self.dictionary is None:
                self.dictionary = Dictionary(self.data)
            self.corpus = [self.dictionary.doc2bow(text) for text in self.data]
        else:
            self.dictionary = None
            self.corpus = None
        self.distributed = None
        self.chuncksize = None
        self.passes = None
        self.update_every = None
        self.alpha = None
        self.eta = None
        self.decay = None
        self.offset = None
        self.eval_every = None
        self.gamma_threshold = None
        self.minimum_probability = None
        self.ns_conf = None
        self.minimum_phi_value = None
        self.per_word_topics = None
        self.num_topics = None
        self.iterations = None
        self.random_state = None
        self.model = None
        self.coherence_model = None
        self.coherence = None
        self.coherence_type = None

    def train(self,
              num_topics,
              iterations=1500,
              random_state=1,
              distributed=False,
              chunksize=2000,
              passes=1,
              update_every=1,
              alpha='symmetric',
              eta=None,
              decay=0.5,
              offset=1.0,
              eval_every=10,
              gamma_threshold=0.001,
              minimum_probability=0.01,
              ns_conf=None,
              minimum_phi_value=0.01,
              per_word_topics=False,
              workers=1):
        """train lda model. If workers >1, goes multicore"""

        self.distributed = distributed
        self.chuncksize = chunksize
        self.passes = passes
        self.update_every = update_every
        self.alpha = alpha
        self.eta = eta
        self.decay = decay
        self.offset = offset
        self.eval_every = eval_every
        self.gamma_threshold = gamma_threshold
        self.minimum_probability = minimum_probability
        self.ns_conf = ns_conf
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics
        self.num_topics = num_topics
        self.iterations = iterations
        self.random_state = random_state
        self.workers = workers

        if self.workers > 1:
            self.model = LdaMulticore(
                workers=3,
                corpus=self.corpus,
                id2word=self.dictionary,
                iterations=self.iterations,
                num_topics=self.num_topics,
                random_state=self.
                random_state,  # distributed=self.distributed,
                chunksize=self.chuncksize,
                passes=self.passes,  # update_every= self.update_every,
                alpha=self.alpha,
                eta=self.eta,
                decay=self.decay,
                offset=self.offset,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                minimum_probability=self.
                minimum_probability,  # ns_conf=self.ns_conf,
                minimum_phi_value=self.minimum_phi_value,
                per_word_topics=self.per_word_topics)
        else:
            self.model = LdaModel(corpus=self.corpus,
                                  id2word=self.dictionary,
                                  iterations=self.iterations,
                                  num_topics=self.num_topics,
                                  random_state=self.random_state,
                                  distributed=self.distributed,
                                  chunksize=self.chuncksize,
                                  passes=self.passes,
                                  update_every=self.update_every,
                                  alpha=self.alpha,
                                  eta=self.eta,
                                  decay=self.decay,
                                  offset=self.offset,
                                  eval_every=self.eval_every,
                                  gamma_threshold=self.gamma_threshold,
                                  minimum_probability=self.minimum_probability,
                                  ns_conf=self.ns_conf,
                                  minimum_phi_value=self.minimum_phi_value,
                                  per_word_topics=self.per_word_topics)
        print('Trained!')

    def _train_coherence_model(self, coherence_type='u_mass'):
        """could be made on top of model to get coherence, type could be 'u_mass' or 'c_v'"""
        self.coherence_model = CoherenceModel(model=self.model,
                                              texts=self.data,
                                              dictionary=self.dictionary,
                                              coherence=coherence_type)

    def _calculate_coherence(self, coherence_type='u_mass'):
        self._train_coherence_model(coherence_type=coherence_type)
        self.coherence = self.coherence_model.get_coherence()

    def get_coherence(self, coherence_type='u_mass'):
        if coherence_type != self.coherence_type:
            self._calculate_coherence(coherence_type=coherence_type)
        return self.coherence

    def get_topic_terms(self, num, topn=10):
        return self.model.get_topic_terms(num, topn=topn)

    def get_perplexity(self):
        return self.model.log_perplexity(self.corpus)

    def get_topics(self, num):
        return self.model.show_topics(num)

    def _make_visualization(self):
        """prepare visualisation for display/saving"""
        return pyLDAvis.gensim.prepare(self.model,
                                       self.corpus,
                                       self.dictionary,
                                       sort_topics=False)

    def display(self):
        """display LDAvis in notebook"""
        visualisation = self._make_visualization()
        return pyLDAvis.display(visualisation)

    def save_ldavis(self, filename='topic.html'):
        """save LDAvis to .html"""
        ldavis = self._make_visualization()
        pyLDAvis.save_html(ldavis, filename)

    def save_lda(self, filename):
        """save lda model only"""
        self.model.save(filename)

    def pickle(self, filename):
        """save class instance to file"""
        f = open(filename, 'wb')
        pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
        f.close()

    @staticmethod
    def unpickle(filename):
        """read class instance from file"""
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def predict_topic(self, doc_list):
        """predict topic of document list (consists of strings"""
        topic_list = []
        for doc in doc_list:
            bow = self.dictionary.doc2bow(str(doc).split())
            topics_probs = self.model.get_document_topics(bow)
            topics_probs.sort(key=lambda tup: tup[1], reverse=True)
            topic_list.append(topics_probs)
        return topic_list
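
A usage sketch for the class above with toy token lists. It assumes the imports the class relies on (Dictionary from gensim.corpora, LdaModel/LdaMulticore from gensim.models, CoherenceModel, pyLDAvis, pickle) and a gensim version that still accepts the ns_conf/distributed arguments:

docs = [['human', 'interface', 'computer'],
        ['survey', 'user', 'computer', 'system'],
        ['eps', 'user', 'interface', 'system']]
clda = CustomLda(data=docs)
clda.train(num_topics=2, iterations=50, passes=2)
print(clda.get_topic_terms(0, topn=3))                   # top words of topic 0
print(clda.predict_topic(['human computer interface']))  # sorted (topic, prob) list per doc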
Example #15
        lda_voc.doc2bow(token_list) for token_list in token_list_list
    ]

    nmi_vec = []
    for _ in tqdm(range(n_test)):

        # LDA
        if prior_distrib:
            lda = LdaModel(lda_corpus, num_topics=n_group, alpha=topic_distrib)
        else:
            lda = LdaModel(lda_corpus, num_topics=n_group)

        # Id doc
        algo_group_vec = []
        for id_doc in range(len(token_list_list)):
            topic_per_type = lda.get_document_topics(lda_corpus[id_doc],
                                                     per_word_topics=True)[1]
            type_list = []
            topic_list = []
            for type_topic_elem in topic_per_type:
                type_list.append(lda_voc.get(type_topic_elem[0]))
                topic_list.append(type_topic_elem[1][0])

            algo_group_vec.extend([
                topic_list[type_list.index(token)]
                for token in token_list_list[id_doc]
            ])

        nmi_vec.append(
            normalized_mutual_info_score(real_group_vec, algo_group_vec))

    # Writing results
Example #16
documents_list=import_docs()
dictionary=corpora.Dictionary(documents_list)
# Converting list of documents (corpus) into Document Term Matrix using the dictionary

doc_term_matrix = [dictionary.doc2bow(doc) for doc in documents_list]

corpora.MmCorpus.serialize('corpus.mm', doc_term_matrix)
nbTopics=33
nbPasses=5
# LDA model by Gensim with prior parameter (eta)
# TO BE DONE

####################
ldamodel = LdaModel(doc_term_matrix, num_topics=nbTopics, id2word = dictionary, passes=nbPasses)
# to have the topic of a document : ldamodel[doc]
topics = ldamodel.get_document_topics(doc_term_matrix, per_word_topics=True)


all_topics = [(doc_topics, word_topics, word_phis) for doc_topics, word_topics, word_phis in topics]
doc_topics, word_topics, word_phis = all_topics[1]
f_res=open("resultsComs.csv", "w")

count=0
for d in all_topics:
	for i in range(len(d[0])):
		f_res.write("%d;%d;%f;"%(count, d[0][i][0], d[0][i][1]))
	f_res.write("\n")
	count+=1
f_res.close()

Example #17
                csvData.append(row)
    with open(filename, 'w', encoding='utf-8') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows(csvData)
        print("Write to database successfully")


print("corpus length:", len(pubs_corpus))
print("dict length:", len(pubs_dictionary.keys()))
topic_word_list_result = dict()
topic_dict = topic_to_lemmatized_word_list(lda)

for id in pubs_eids:
    cur_corpus = id_to_corpus.get(id, None)
    if cur_corpus != None:
        candidate_topics = lda.get_document_topics(
            cur_corpus)  # list all topic index
        best_topic_index = select_highest_prob_topic(
            candidate_topics)  # select the index with highest prob
        if best_topic_index == -1:
            print("no topic document:", id)
            topic_word_list_result[id] = ["unknown"]
        else:
            topic_word_list_result[id] = topic_dict[
                best_topic_index][:
                                  3]  # get corresponding topic word list, and store top 3 words
    else:
        topic_word_list_result[id] = ["unknown"]
write_to_table_file('pubs_metadata_by_scopus.csv')

####################### other tried method #######################
# fixme: coherence is useless, cuz coherence always drop
Example #18
class GensimLDA:
    def __init__(self, texts):
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]

        self.k_topics = None
        self.model = None

    def fit(self, k_topics, iterations=50):
        ''''''
        self.k_topics = k_topics
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, \
            num_topics=k_topics, iterations=iterations)

    def get_document_topic_matrix(self, X=None):
        '''Returns an n_docs x k_topics array of probabilities
        of a topic in a given document.'''
        if X is None:
            X = self.corpus
        else:
            X = [self.dictionary.doc2bow(text) for text in X]

        n_docs = len(X)
        V = np.zeros((n_docs, self.k_topics))

        # Extract assignments
        some_iterable = self.model.get_document_topics(
            X)  ## equiv: self.model[X]
        for i, doc_topic in enumerate(some_iterable):
            for topic_id, prob in doc_topic:
                V[i, topic_id] = prob
        return V

    def get_topic_term_matrix(self):
        '''Returns an k_topics x m_words array of probabilities
        of a word in a given topic.'''
        return self.model.get_topics()

    def print_topics(self, top_n=10):
        '''Prints the top_n words in a topic'''
        for row in self.get_topic_term_matrix():
            ranking = np.argsort(row)
            ids = np.arange(len(ranking))[ranking]

            for k in ids[:-top_n:-1]:
                weight = row[k]
                word = self.dictionary.id2token[k]
                print(k, word, weight)
            print()

    def print_topic_words(self, topic_num, topn=None):
        '''Prints the top words and probabilities of a given topic in
        descending probability.'''
        for tok_id, prob in self.model.get_topic_terms(topic_num, topn=topn):
            word = self.dictionary.id2token[tok_id]
            print(word, prob)

    def get_topic_bows(self, num_words=10):
        '''Returns a list (for each topic) containing a list of the top num_words'''
        q = self.model.show_topics(num_topics=self.k_topics,
                                   num_words=num_words,
                                   formatted=False)
        topics = []
        for id, topic in q:
            words = []
            for w, p in topic:
                words.append(w)
            topics.append(words)
        return topics
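
A short usage sketch for the class above with hypothetical toy texts; it assumes numpy as np and the gensim Dictionary/LdaModel imports the class relies on:

texts = [['cat', 'dog', 'mouse'], ['dog', 'bone'], ['cat', 'fish', 'dog']]
glda = GensimLDA(texts)
glda.fit(k_topics=2, iterations=20)
V = glda.get_document_topic_matrix()
print(V.shape)        # (3, 2)
print(V.sum(axis=1))  # each row sums to roughly 1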
Example #19
def train_model(documents, onehot_enc, labels):
    """

    :param documents:
    :param onehot_enc:
    :param labels:
    :return:
    """
    # Configuration variables, how many topics will we attempt to extract from
    # our documents.
    num_topics = 400

    # Start
    print('number of documents: ', len(documents))

    id2word = corpora.Dictionary(documents)

    corpus = [id2word.doc2bow(doc) for doc in documents]
    onehot_labels = onehot_enc.transform(labels)

    print("starting LDA model")
    # plug into LDA model.
    # this can take a while with larger number of documents
    lda = LdaModel(num_topics=num_topics,
                   id2word=id2word,
                   corpus=corpus,
                   passes=50,
                   eval_every=1)
    print("topics:")
    for topic in lda.show_topics(num_topics=num_topics,
                                 num_words=20):  # print_topics():
        print(topic)
    lda.save("trained_ldamodel.model")

    # print("getting topics for testing document")
    # topic_prediction = lda.get_document_topics(bow=corpus[0])

    # print(testing_text_raw)
    # print(topic_prediction)

    print("")
    print(
        "starting setup to train a classifier based on LDA topics for each document"
    )

    topic_vecs = []

    # get topic matches and put them into vectors
    for i in range(len(documents)):
        top_topics = lda.get_document_topics(corpus[i], minimum_probability=0)

        #print(len(top_topics))
        topic_vec = [top_topics[i][1] for i in range(num_topics)]
        topic_vecs.append(topic_vec)

    # train basic logistic regression
    model = LogisticRegression(class_weight='balanced').fit(topic_vecs, labels)
    with open('trained_logreg_model.pkl', 'wb') as f:
        pickle.dump(model, f)

    return model, topic_vecs
Example #20
    print (i, topic)


# visualization of topics
#vis_data = gensimvis.prepare(speeches_topics, lda_corpus, dct)
#pyLDAvis.display(vis_data)




# extract all document-topic distributions to a dictionary
document_key = list(speeches.index)
document_topic = {}
for doc_id in range(len(lda_corpus)):
    docbow = lda_corpus[doc_id]
    doc_topics = speeches_topics.get_document_topics(docbow, 0)
    tmp = []
    for topic_id, topic_prob in doc_topics:
        tmp.append(topic_prob)
    document_topic[document_key[doc_id]] = tmp


column_names = ['topic_'+str(i) for i in range(1, num_topics + 1)]


# Topic Distribution over time
topic_df = pd.DataFrame(document_topic)
topic_df_T =  topic_df.transpose()
topic_df_T["Date"] = transcript_date
sorted_df = topic_df_T.sort_values(by = "Date")
sorted_df['datetime'] = pd.to_datetime(sorted_df['Date'])
Example #21
class LDAWDF:
    mysql: mysql.MySQL
    ldamodel: LdaModel
    dictionary = None
    corpus = None

    def __init__(self, mysql):
        self.mysql = mysql
        self.dataFolder = './data/'
        self.saveFile = 'lda_model'
        self.saveFileDict = 'lda_model_dict'

    def trainFromStart(self):
        with self.mysql as db:
            content = db.getContentsText()
        documents = []
        for item in content:
            documents.append(item['content'].split())

        self.dictionary = corpora.Dictionary(documents)

        self.dictionary.filter_extremes(no_below=5, no_above=0.5)

        doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in documents]

        self.corpus = doc_term_matrix

        # Running and Training LDA model on the document term matrix.
        print("Starting to train LDA Model...")
        self.ldamodel = LdaModel(
            doc_term_matrix,
            num_topics=200,
            id2word=self.dictionary,
            passes=100)

    def printTest(self):
        print(self.ldamodel.print_topics(num_topics=10, num_words=5))

    def save(self):
        self.ldamodel.save(self.dataFolder + self.saveFile)
        self.dictionary.save(self.dataFolder + self.saveFileDict)

    def canLoad(self):
        my_file = Path(self.dataFolder + self.saveFile)
        my_file_dict = Path(self.dataFolder + self.saveFileDict)
        return my_file.is_file() and my_file_dict.is_file()

    def update(self, corpus):
        self.ldamodel.update(corpus)

    def load(self, subfolder=None):
        if subfolder:
            sf = subfolder + '/'
        else:
            sf = ''
        self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
        self.dictionary = gensim.corpora.Dictionary.load(self.dataFolder + sf + self.saveFileDict)

    def fillDb(self):
        topics = {}
        result = []
        result2 = []
        nbTopics = self.ldamodel.get_topics().shape[0]
        # "Old"
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 3)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            words = []
            for topicTerm in topicTerms:
                words.append(self.dictionary.get(topicTerm[0]))
            topics[topicId] = ' '.join(words)
        with self.mysql as db:
            contentsText = db.getContentsText()
            for element in contentsText:
                bow = self.dictionary.doc2bow(element['content'].split())
                docTopics = self.ldamodel.get_document_topics(bow, minimum_probability=0.05)
                if len(docTopics) > 0:
                    docTopics.sort(key=lambda x: x[1], reverse=True)
                    result.append((element['url'], topics[docTopics[0][0]]))
                    for docTopic in docTopics:
                        result2.append((element['url'], docTopic[0], str(docTopic[1])))
            db.emptyUrlsTopic()
            db.emptyCurrentUrlsTopic()
            db.emptyCurrentUserTags()
            db.setCurrentUrlsTopic(result2)
            db.setPrecalcTopics()
        # "New"
        terms = []
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 5)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            for topicTerm in topicTerms:
                terms.append((topicId, self.dictionary.get(topicTerm[0]), str(topicTerm[1])))
        with self.mysql as db:
            db.emptyLdaTopics()
            db.setLdaTopics(terms)


    def get_terms_topics(self, keywords):
        bow = self.dictionary.doc2bow(keywords[:30])
        topics = {}
        keywordsResult = {}
        for word in bow:
            wordTopics = self.ldamodel.get_term_topics(word[0], 0.05)
            keywordsResult[word[0]] = {'word': self.dictionary.get(word[0]), 'topics': wordTopics}
            for wordTopic in wordTopics:
                wordTopicId = wordTopic[0]
                if wordTopicId not in topics:
                    topics[wordTopicId] = self.ldamodel.show_topic(wordTopicId)
        return {'topics': topics, 'keywords': keywordsResult}
Example #22
        for i in range(0, len(sentences), n_sent)
    ]

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=2, no_above=0.3)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    lda = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)

    # inspect the topics
    for topic in lda.show_topics(num_topics=-1, num_words=10):
        print('topic id:{0[0]:d}, words={0[1]:s}'.format(topic))

    # estimate the topic distribution of each article
    for doc_id in datastore.get_all_ids(limit=-1):
        meta_info = json.loads(
            datastore.get(doc_id, ['meta_info'])['meta_info'])
        title = meta_info['title']
        print(title)

        doc = [
            token['lemma']
            for token in datastore.get_annotation(doc_id, 'token')
            if token.get('NE') == 'O'
        ]
        for topic in sorted(lda.get_document_topics(dictionary.doc2bow(doc)),
                            key=lambda x: x[1],
                            reverse=True):
            print('\ttopic id:{0[0]:d}, prob={0[1]:f}'.format(topic))
    datastore.close()
Example #23
    df = pd.read_csv("./total_info.txt",
                     sep=',',
                     header=0,
                     names=['A', 'B', 'C', 'D', 'E', 'F'])
    data = df['B']
    data = data.apply(lambda s: clean_text(s))
    datalist = data.values
    print(datalist)
    # tokenize
    texts = [[word for word in doc.lower().split()] for doc in datalist]
    print(texts[0])

    common_dictionary = Dictionary(texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=20)
    print(lda.print_topic(10, topn=5))

    lda.save('lda.model')
    lda = LdaModel.load('lda.model')

    tryTxt = "while i be suffer i be able to press and go in subscribe but when i press the video it keep show no connection ."
    trylist = [word for word in tryTxt.lower().split()]
    bow = common_dictionary.doc2bow(trylist)
    print(lda.get_document_topics(bow))

    import pyLDAvis.gensim
    # open http://127.0.0.1:8888/ in a browser
    vis = pyLDAvis.gensim.prepare(lda, common_corpus, common_dictionary)
    pyLDAvis.show(vis)
Example #24
num_topics = 3
dictionary = corpora.Dictionary(words_list)
corpus = [dictionary.doc2bow(words) for words in words_list]
lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

###output1: topics and corresponding words
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lda.print_topics(num_words=10))

###output2: 2 ways of showing one topic and corresponding words
lda.print_topic(topicno=0)
lda.show_topic(1)

### output3: show the topics of one user (even a new user)
sorted(lda.get_document_topics(corpus[100],
                               minimum_probability=0,
                               per_word_topics=False),
       key=lambda x: x[1],
       reverse=True)

### output4: visualize LDA
lda_display = pyLDAvis.gensim.prepare(lda,
                                      corpus,
                                      dictionary,
                                      R=15,
                                      sort_topics=False)
pyLDAvis.display(lda_display)

##### Text Similarities
doc = tweets[list(tweets.keys())[2]].replace("|||", "")
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=3)
Example #25
def get_document_topic_weights(lda_model: LdaModel, bow) -> list:
    """Returns the topic/weights matrix of the topic model"""
    return [
        weight for topic, weight in lda_model.get_document_topics(
            bow, minimum_probability=0)
    ]
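
A self-contained usage sketch for the helper above, built on gensim's bundled toy corpus (all names below are local to the sketch):

from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import common_texts

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]
lda = LdaModel(corpus, num_topics=4, id2word=dictionary, random_state=0)

weights = get_document_topic_weights(lda, corpus[0])
print(len(weights))  # == num_topics, since minimum_probability=0 keeps every topic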
Example #26
    topics_keywords_weight_map_equalnum[topic_id] = temp_keyword_weight_map
topics_words = {}
for topic_id, topic_keywords in topics_keywords_weight_map_equalnum.items():
    topic_words = []
    for keyword, weight in topic_keywords.items():
        topic_words.append(keyword)
    topics_words[topic_id] = topic_words 
timer(global_time,local_time)

################################################################################################################
# Calculate the LDA topic scores
# Match each company to only one cluster using the highest topic score
################################################################################################################
print('Calculating LDA topic scores, and matching each company to one cluster using the highest topic score...')
local_time = time.time()
content_scores = lda.get_document_topics(word_corpus)
contentidx_topicidx_map = {}
contentidx_topicscore_map = {}
for contentidx, scores in enumerate(content_scores):
    topicid_topicscore_dict = {}
    for topicid, topicscore in scores:
        topicid_topicscore_dict[topicid] = topicscore
    topicid_topicscore_dict_sorted = {k: v for k, v in sorted(topicid_topicscore_dict.items(), key=lambda item: item[1], reverse=True)}
    # print('topicid_topicscore_dict_sorted:', topicid_topicscore_dict_sorted, list(topicid_topicscore_dict_sorted.keys())[0], list(topicid_topicscore_dict_sorted.values())[0])
    contentidx_topicidx_map[contentidx] = list(topicid_topicscore_dict_sorted.keys())[0]
    score = list(topicid_topicscore_dict_sorted.values())[0]
    contentidx_topicscore_map[contentidx] = score
timer(global_time,local_time)

################################################################################################################
# Store all the results of every corpus as 
Example #27
     ldamodel.save('topic_articles.model')
#print(ldamodel.print_topics(num_topics=2, num_words=4))

ii = ldamodel.print_topics(num_topics=50, num_words=30)
df = pd.DataFrame(ii, columns=['id_topics', 'words']).set_index('id_topics')
df1 = df.to_csv('50_topics_on_articles.csv')
df2 = df.to_excel('50_topics_on_articles.xlsx')

#MAIN PLOT
viz = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
pyLDAvis.save_html(viz, '50t_articles.html')

#Load a model
#ldamodel.load('topic_articles.model')

yo = ldamodel.get_document_topics(
    doc_term_matrix)  #get topics on all 10339 articles
li = []
for i in range(len(comments)):

    new = pd.DataFrame(yo[i],
                       columns=['id', 'prob'
                                ]).sort_values('prob',
                                               ascending=False).drop(['prob'],
                                                                     axis=1)
    p = new.head(1).values.T.flatten().tolist()
    li.append(p)

df_topic_id = pd.DataFrame(li, columns=['topics_id'])
df_topic_id.index.name = 'article_text_id'
#df_topic_id.head(5) #topic_ids for all authors' articles
#len(df_topic_id.index)
Example #28
#print("getting topics for testing document")
#topic_prediction = lda.get_document_topics(bow=corpus[0])

#print(testing_text_raw)
#print(topic_prediction)

print("")
print(
    "starting setup to train a classifier based on LDA topics for each document"
)

topic_vecs = []

# get topic matches and put them into vectors
for i in range(len(documents)):
    top_topics = lda.get_document_topics(corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(20)]
    topic_vecs.append(topic_vec)

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import f1_score

# train basic logistic regression
sgd_model = LogisticRegression(class_weight='balanced').fit(topic_vecs, labels)

pred_labels = sgd_model.predict(topic_vecs)

# get accuracy from the training data, just to look at whether this even seems feasible...
# 0.3 f1 score on the training, using 12123 documents. not great results for now.
print("accuracy on training data: ",
      f1_score(labels, pred_labels, average='weighted'))
Example #29
#
# count = 0
# for i in range(len(abs_topics_prob)):
#     if get_doc_topic_id(i, abs_topics_prob) == get_doc_topic_id(i, title_topics_prob):
#         count += 1
# count = 27514
# count / len(abs_topics_prob) = 0.5229406621811685


def get_doc_topic_id(doc_id, docs_topics_prob):
    result = np.where(
        docs_topics_prob[doc_id] == docs_topics_prob[doc_id].max())
    return result[0][0]


doc_topics = lda.get_document_topics(corpus, 0)
probs = [[entry[1] for entry in doc] for doc in doc_topics]
docs_topics_prob = np.array(probs)

topic_doc_year_num = np.zeros((50, 10))
for i in range(len(docs_topics_prob)):
    y = int(nat_data.year[i] - 1971)
    t = get_doc_topic_id(i, docs_topics_prob)
    topic_doc_year_num[y][t] += 1

colors = [
    'rosybrown', 'lightcoral', 'indianred', 'brown', 'peru', 'darkorange',
    'gold', 'yellow', 'green', 'darkgoldenrod'
]

fig = plt.figure(figsize=(30, 15))
Example #30
               random_state=1)

doc_topics = [lda[c] for c in corpus]

avg_doc_topics = mean([len(t) for t in doc_topics])

print(f"topics num of doc = {avg_doc_topics}")

topic_freq = frequencies([t[0] for dt in doc_topics for t in dt])

print('----------')

for i in range(topic_num):
    items = [(dic[t[0]], t[1]) for t in lda.get_topic_terms(i, topn=5)]
    freq = topic_freq[i] if i in topic_freq else 0

    print(f"topic_id = {i}, freq = {freq}, items = {items}")

print('----------')

for i in range(len(corpus)):
    dts = lda.get_document_topics(corpus[i], per_word_topics=True)

    for dt in dts[2]:
        item = dic[dt[0]]
        print(f"corpus = {i}, item = {item}, topic_id = {dt[1]}")

vis = pyLDAvis.gensim.prepare(lda, corpus, dic, n_jobs=1, sort_topics=False)

pyLDAvis.save_html(vis, dest_file)
Example #31
def get_most_common(title_list,
                    dic,
                    num=COMMON_TOPIC_WORDS_NUM,
                    random_state=None):
    '''Get the set of num words for the most frequent topic'''

    bow = [dic.doc2bow(title) for title in title_list]
    # TODO: determine and set an appropriate number of topics
    if LOG_LEVEL == 'DEBUG':
        random_state = 123
    model = LdaModel(bow,
                     id2word=dic,
                     num_topics=TOPIC_NUM,
                     random_state=random_state)
    # classify each title
    topic_id_list = []
    for idx, title in enumerate(title_list):
        logger.debug('title')
        logger.debug(title)
        doc_topics_tuple = model.get_document_topics(dic.doc2bow(title),
                                                     minimum_probability=0.0)
        doc_topic_dist = [[val[0], val[1]] for val in doc_topics_tuple]
        doc_topic_dist = np.array(doc_topic_dist)
        if idx == 0:
            topic_dist_arr = doc_topic_dist
        else:
            topic_dist_arr = np.vstack([topic_dist_arr, doc_topic_dist])
        topic_id = int(
            sorted(doc_topic_dist, key=lambda x: x[1], reverse=True)[0][0])
        topic_id_list.append(topic_id)
    if LOG_LEVEL == 'DEBUG':
        # topic distribution per title
        df_topic_dist = pd.DataFrame({
            'title': title_list,
            'topic_id': topic_id_list
        })
        # word distribution per topic
        cols = ['{}_{}'.format(word_no, elem) \
                for word_no in range(10) \
                    for elem in range(2)]
        df_word_dist = pd.DataFrame()
        arr_dist = topic_dist_arr.reshape(-1, model.get_topics().shape[0], 2)
        for topic_id in range(model.get_topics().shape[0]):
            df_topic_dist['topic_{}'.format(topic_id)] = arr_dist[:, topic_id,
                                                                  1]
            topic_terms = model.get_topic_terms(topic_id,
                                                topn=int(len(cols) / 2))
            topic_terms_2 = []
            for term in topic_terms:
                topic_terms_2 = topic_terms_2 + [
                    dic.id2token[term[0]], term[1]
                ]
            df_word_dist = df_word_dist.append(
                pd.Series(topic_terms_2, name='topic_{}'.format(topic_id)))
        df_topic_dist.to_csv(
            os.path.join('test', 'classified_topic_{}.csv' \
                .format(datetime.today().strftime(format='%Y%m%d'))),
            index=False,
            encoding='cp932'
        )
        df_word_dist.columns = cols
        df_word_dist.to_csv(
            os.path.join('test', 'word_distribution_per_topic_{}.csv' \
                .format(datetime.today().strftime(format='%Y%m%d'))),
            encoding='cp932'
        )
    # get the most frequent topic
    topic_id_counter = Counter(topic_id_list)
    most_common_topic_id = topic_id_counter.most_common(1)[0][0]
    topic_terms = model.get_topic_terms(most_common_topic_id)
    logger.debug('')
    logger.debug('topic_id_counter: ' + str(topic_id_counter))
    logger.debug('most_common_topic_id: ' + str(most_common_topic_id))
    logger.debug(topic_terms)
    # get the num most important words of the most frequent topic
    important_word_list = [
        dic.id2token[topic_tuple[0]] for topic_tuple in topic_terms[:num]
    ]
    logger.debug(important_word_list)
    return important_word_list