Example #1
def train_lda(recipe_file,num_topics,output_file):
    corpus = RecipeCorpus(recipe_file)
    
    corpora.MmCorpus.serialize(output_file+'.corpus.mm', corpus)
    lda = LdaModel(corpus, id2word=corpus.dictionary, num_topics=int(num_topics), distributed=False)
    lda.save(output_file)
    return lda
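RecipeCorpus is defined elsewhere in that project; a minimal sketch of a compatible streaming corpus, assuming a one-recipe-per-line, whitespace-tokenized file (the class name and file format are assumptions):

from gensim import corpora

class RecipeCorpus(object):
    # Hypothetical streaming corpus: one recipe per line, whitespace-tokenized.
    def __init__(self, recipe_file):
        self.recipe_file = recipe_file
        # First pass over the file builds the id<->word mapping.
        self.dictionary = corpora.Dictionary(
            line.split() for line in open(recipe_file, encoding='utf-8'))

    def __iter__(self):
        # Stream bag-of-words vectors without holding all recipes in memory.
        for line in open(self.recipe_file, encoding='utf-8'):
            yield self.dictionary.doc2bow(line.split())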
Example #2
def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """
    Function to convert mallet model to gensim LdaModel. This works by copying the
    training model weights (alpha, beta...) from a trained mallet model into the
    gensim model.

    Args:
        mallet_model : Trained mallet model
        gamma_threshold : To be used for inference in the new LdaModel.
        iterations : number of iterations to be used for inference in the new LdaModel.

    Returns:
        model_gensim : LdaModel instance; copied gensim LdaModel
    """
    model_gensim = LdaModel(
        id2word=mallet_model.id2word,
        num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha,
        iterations=iterations,
        gamma_threshold=gamma_threshold,
        dtype=numpy.float64  # don't lose precision when converting from MALLET
    )
    model_gensim.expElogbeta[:] = mallet_model.wordtopics
    return model_gensim
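A hedged usage sketch for the converter above, using gensim's Mallet wrapper (available in gensim < 4.0; mallet_path and the corpus/dictionary variables are assumptions):

from gensim.models.wrappers import LdaMallet

mallet_path = '/path/to/mallet-2.0.8/bin/mallet'  # local Mallet install (assumed)
mallet_lda = LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=dictionary)

gensim_lda = malletmodel2ldamodel(mallet_lda)
print(gensim_lda.print_topics(5))  # the native LdaModel can now infer topics for new docs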
Example #3
class LDA(object):
    def __init__(self, model, vocab, corpus=None, topics=200, passes=1):
        self._model_file = model
        self._dict_file = vocab
        self._corpus_file = corpus
        self._topics = topics
        self._passes = passes

    def train(self):
        self._corpus = SentenceDocCorpus(self._corpus_file)
        self._lda = LdaModel(self._corpus,
                             num_topics=self._topics,
                             id2word=self._corpus.dictionary,
                             passes=self._passes)
        self._dictionary = self._corpus.dictionary

        self._lda.save(self._model_file)
        self._dictionary.save(self._dict_file)

    def load(self):
        self._lda = LdaModel.load(self._model_file)
        self._dictionary = Dictionary.load(self._dict_file)

    def topics(self, words):
        return self._lda[self._dictionary.doc2bow(common.filter(words))]

    def topic_vector(self, words):
        return np.array([
            v for k, v in self._lda.__getitem__(
                self._dictionary.doc2bow(common.filter(words)), eps=0)
        ])
Example #4
 def update(self, corpus=[[]]):
     """
     Online update: refresh an existing model with new documents.
     Args:
         corpus - list of documents used for the update
     """
     if not self._model and len(corpus) > 0:
         # Build the dictionary, assigning an index to every word
         self._common_dictionary = Dictionary(corpus)
         corpus_data = [
             self._common_dictionary.doc2bow(sentence)
             for sentence in corpus
         ]
         # corpus_data: bag-of-words matrix; topics: number of topics;
         # id2word: maps indices back to words; passes: training epochs
         self._model = LdaModel(corpus_data,
                                self._topics,
                                id2word=self._common_dictionary,
                                passes=50)
         #self._model = LdaModel(corpus_data, self._topics)
     elif self._model and len(corpus) > 0:
         self._common_dictionary.add_documents(corpus)
         new_corpus_data = [
             self._common_dictionary.doc2bow(sentence)
             for sentence in corpus
         ]
         self._model.update(new_corpus_data)
Example #5
def main():
    docs = get_train(
        'D:/ByResearch/基于文本的原油油价预测/20200615code/code/SeaNMF-master/data/wedata.txt'
    )
    docs = [s.strip().split() for s in docs]

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=10, no_above=0.2)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # only to "load" the dictionary.
    id2word = dictionary.id2token

    PMI = []
    for i in range(2, 11):
        print(i)
        lda_model = LdaModel(corpus=corpus,
                             id2word=id2word,
                             iterations=100,
                             num_topics=i)
        # Print the keywords in each topic
        print(lda_model.print_topics())

        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=docs,
                                             dictionary=dictionary,
                                             coherence='c_uci')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)
        del lda_model
        PMI.append(coherence_lda)
    print(PMI)
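A small follow-up sketch (matplotlib assumed) that turns the collected PMI list into a plot for choosing num_topics:

import matplotlib.pyplot as plt

def plot_coherence(pmi_scores, start=2, stop=11):
    # One c_uci score per candidate topic count from the loop above.
    plt.plot(range(start, stop), pmi_scores, marker='o')
    plt.xlabel('num_topics')
    plt.ylabel('c_uci coherence')
    plt.title('Coherence vs. number of topics')
    plt.show()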
Example #6
def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldamallet.LdaMallet` to :class:`~gensim.models.ldamodel.LdaModel`.

    This works by copying the training model weights (alpha, beta...) from a trained mallet model into the gensim model.

    Parameters
    ----------
    mallet_model : :class:`~gensim.models.wrappers.ldamallet.LdaMallet`
        Trained Mallet model
    gamma_threshold : float, optional
        To be used for inference in the new LdaModel.
    iterations : int, optional
        Number of iterations to be used for inference in the new LdaModel.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`
        Gensim native LDA.

    """
    model_gensim = LdaModel(
        id2word=mallet_model.id2word,
        num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha,
        iterations=iterations,
        gamma_threshold=gamma_threshold,
        dtype=numpy.float64  # don't lose precision when converting from MALLET
    )
    model_gensim.expElogbeta[:] = mallet_model.wordtopics
    return model_gensim
Example #7
def train_lda(n_topics, id2word_dictionary=None, documents=None, corpus=None):
    """
    Training method for LDA. documents is a list of lists of words/tokens
    documents is used to construct a dictionary and a corpus from which the
    topics for LDA are inferred
    """
    # Construct dictionary of words if it's not passed
    if not id2word_dictionary:
        id2word_dictionary = corpora.Dictionary(documents)

    word2idx_dictionary = dict([(w, idx) for (idx, w) in id2word_dictionary.items()])

    # Construct corpus for model
    if documents and not corpus:
        corpus = [id2word_dictionary.doc2bow(document) for document in documents]

    # Cluster the documents into topics using LDA. number of topics is given
    # by n_topics
    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word_dictionary,
                         num_topics=n_topics,
                         update_every=1,
                         chunksize=10000,
                         passes=1)

    """
    Default value for topn (number of top words to show by probability) is 10.
    A high enough value should return the words covering most or all of the
    probability mass
    """
    topics = [lda_model.show_topic(idx, topn=50000)
              for idx in range(0, n_topics)]

    return lda_model, id2word_dictionary, word2idx_dictionary, topics
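A toy invocation of train_lda, as a sketch (assumes the module's gensim imports are in scope):

docs = [['milk', 'bread', 'eggs'],
        ['bread', 'butter'],
        ['milk', 'butter', 'eggs', 'bread']]
model, id2word, word2idx, topics = train_lda(n_topics=2, documents=docs)
print(topics[0][:5])  # top (word, probability) pairs of topic 0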
Example #8
def runlda(filetopicwords,fileinput,NUMTOPICS=30,NUMPASSES=10,NUMITERATIONS=10):
    print('runlda...')
    from gensim.corpora import Dictionary
    from gensim.models.ldamodel import LdaModel
    import numpy as np
    docs,word2freqtopics = [],{}
    fr = open(fileinput,'r')
    for line in fr:
        words = line.strip('\r\n').split(' ')
        docs.append(words)
        for word in words:
            if word not in word2freqtopics:
                word2freqtopics[word] = [0,[0. for i in range(NUMTOPICS)]]
            word2freqtopics[word][0] += 1
    fr.close()
    V = len(word2freqtopics)
    dct = Dictionary(docs)
    model = LdaModel(corpus=[dct.doc2bow(doc) for doc in docs],id2word=dct, \
            num_topics=NUMTOPICS,passes=NUMPASSES,iterations=NUMITERATIONS) 
    fw = open(filetopicwords,'w')
    for topicid in range(NUMTOPICS):
        s = 'topic '+str(topicid)
        wordscores = []
        for (wordid,score) in model.get_topic_terms(topicid,topn=V):
            if score < 1e-6: break
            wordscores.append([dct[wordid],score])
        scoresum = sum([x[1] for x in wordscores])
        for [word,score] in wordscores:
            s += ','+word+':'+str(np.round(score/scoresum,6))
            word2freqtopics[word][1][topicid] = score
        fw.write(s+'\n')
    fw.close()
Example #9
 def trainModel(self):
     if self.toweight:
         self.model = LdaModel(self.tfidf[self.corpus], num_topics=self.num_topics)
         self.index = MatrixSimilarity(self.model[self.tfidf[self.corpus]])
     else:
         self.model = LdaModel(self.corpus, num_topics=self.num_topics)
         self.index = MatrixSimilarity(self.model[self.corpus])
Example #10
    def lda_model(self,
                  num_topics: [int, None] = 10,
                  passes: [int, None] = 1,
                  seed: [int, None] = None):
        """
        Construct LDA topic models for each year in a
        corpus, given a set of parameters.
        """

        if self.word_to_id is None or self.corpora is None:
            self.build_dictionaries_and_corpora()

        results = num_dict(self.year_list)

        if seed is None:

            for year in self.year_list[:-1]:
                results[year] = \
                    LdaModel(corpus=self.corpora[year], id2word=self.word_to_id[year],
                             num_topics=num_topics, passes=passes)

        else:

            rand = RandomState(seed)
            for year in self.year_list[:-1]:
                results[year] = \
                    LdaModel(corpus=self.corpora[year], id2word=self.word_to_id[year],
                             num_topics=num_topics, passes=passes, random_state=rand)

        return TopicResults(results, self.num_docs, self.name)
Example #11
def vwmodel2ldamodel(vw_model, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` to
    :class:`~gensim.models.ldamodel.LdaModel`.

    This works by simply copying the training model weights (alpha, beta...) from a trained vwmodel
    into the gensim model.

    Parameters
    ----------
    vw_model : :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`
        Trained Vowpal Wabbit model.
    iterations : int
        Number of iterations to be used for inference of the new :class:`~gensim.models.ldamodel.LdaModel`.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`.
        Gensim native LDA.

    """
    model_gensim = LdaModel(num_topics=vw_model.num_topics,
                            id2word=vw_model.id2word,
                            chunksize=vw_model.chunksize,
                            passes=vw_model.passes,
                            alpha=vw_model.alpha,
                            eta=vw_model.eta,
                            decay=vw_model.decay,
                            offset=vw_model.offset,
                            iterations=iterations,
                            gamma_threshold=vw_model.gamma_threshold,
                            dtype=numpy.float32)
    model_gensim.expElogbeta[:] = vw_model._get_topics()
    return model_gensim
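A hedged usage sketch for this converter (gensim's Vowpal Wabbit wrapper, gensim < 4.0; vw_path and the corpus/dictionary variables are assumptions):

from gensim.models.wrappers import LdaVowpalWabbit

vw_path = '/usr/local/bin/vw'  # local vw binary (assumed)
vw_lda = LdaVowpalWabbit(vw_path, corpus=corpus, num_topics=20, id2word=dictionary)

gensim_lda = vwmodel2ldamodel(vw_lda)
print(gensim_lda.print_topics(5))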
Example #12
	def run(self):
		if self.clean_level in ('raw','clean','stopwords'):
			kind = self.clean_level
		else:
			kind = 'stopwords'

		for idioma in self.output()['langs'].iterkeys():
			dicc_path = self.input()['dict']['langs'][idioma].path
			corp_path = self.input()['corp']['langs'][idioma].path
			print '=============================='
			print 'Running LDA for %s with cleaning level %s' % (idioma, kind)
			print '=============================='

			# Load dictionary and corpus
			dicc = corpora.Dictionary.load(dicc_path)
			corpus = corpora.MmCorpus(corp_path)

			# Run LDA for this language for each number of topics
			for n_topics in self.output()['langs'][idioma].iterkeys():
				print 'Number of topics: ' + str(n_topics)
				if self.by_chunks:
					lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics, update_every=self.update_e, chunksize=self.chunk_size, passes=self.n_passes)
				else:
					lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics, passes=1)
				lda.save(self.output()['langs'][idioma][n_topics].path)
Example #13
def train_model(texts, **kwargs):

  # parse args
  filter_stopwords = kwargs.get('filter_stopwords', True)
  normalizer = kwargs.get('normalizer', 'porter')
  tfidf = kwargs.get('tfidf', True)
  num_topics = kwargs.get('num_topics', 20)
  min_freq = kwargs.get('min_freq', 2)
  use_pickle = kwargs.get('use_pickle', True)
  update_pickle = kwargs.get('update_pickle', True)
  report = kwargs.get('report', True)
  distributed = kwargs.get('distributed', False)
  
  # build corpus or read it in from pickle
  if use_pickle:
    print "INFO: loading pickled corpus and word hash"
    corpus = pickle.load( open( "pickles/corpus.p", "rb" ) )
    id2word = pickle.load( open( "pickles/id2word.p", "rb" ) )
            
  else:
    print "INFO: processing text and building corpus..."
    corpus, id2word = process_texts(
      texts = texts, 
      filter_stopwords = filter_stopwords,
      normalizer = normalizer,
      min_freq = min_freq
    )

    if update_pickle:
      # pickle files
      print "INFO: updating pickled coprus and word hash"
      pickle.dump(corpus, open( "pickles/corpus.p", "wb" ) )
      pickle.dump(id2word, open( "pickles/id2word.p", "wb" ) )

  # optional tfidf transformation
  if tfidf:
    print "INFO: applying tfidf transformation..."
    tfidf = TfidfModel(corpus)
    corpus = tfidf[corpus]

  # fit model
  print "INFO: fitting model..."
  lda = LdaModel(
    corpus = corpus, 
    id2word = id2word, 
    num_topics = num_topics,
    distributed = distributed
  )

  # report
  if report:
    # note: bound() returns the variational log-likelihood bound, not true perplexity
    perplexity = lda.bound(corpus)
    print "RESULTS:"
    print "\nperplexity: ", perplexity, "\n"
    topics = lda.show_topics(num_topics)
    for i, t in enumerate(topics):
      print "topic %d:" % i
      print t

  return lda, corpus, id2word
Example #14
def draw_cluster_key_word(cluster: list):
    """
    Extract the key words of one cluster
    :param cluster: list of tuple(7), a cluster produced in problem 2
    :return: list of words, the key-word list
    """
    stop = fetch_default_stop_words()  # stop-word list
    stop.extend(["", " ", "\n", "\t", "*"])  # a few extra stop words

    sents = [
        jieba.lcut(row[2] + "。" + row[4], cut_all=True) for row in cluster
    ]  # word segmentation
    sents = [[word for word in sent if word not in stop]
             for sent in sents]  # remove stop words

    dictionary = corpora.Dictionary(sents)  # build the dictionary
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in sents]  # document-term matrix

    # Train the LDA model
    lda_model = LdaModel(doc_term_matrix,
                         num_topics=1,
                         id2word=dictionary,
                         passes=1)

    # Parse out the 6 most probable words of the topic
    key_words = [
        word
        for index, word in enumerate(lda_model.show_topics()[0][1].split("\""))
        if index in [1, 3, 5, 7, 9, 11]
    ]
    return key_words
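The quote-splitting above is brittle because it depends on the exact string format of show_topics(). A sketch of an equivalent extraction via show_topic, which returns (word, probability) pairs directly:

def topic_key_words(lda_model, topic_id=0, topn=6):
    # No string parsing: show_topic already yields (word, probability) tuples.
    return [word for word, prob in lda_model.show_topic(topic_id, topn=topn)]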
Example #15
class LMDL_LDA():
    def __init__(self):
        self.lmdl = LMDL_Corpus()
        self.texts = self.lmdl.get_corpus_texts_words()
        self.dictionary = Dictionary(self.texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
        self.lda = LdaModel(self.corpus,
                            num_topics=LDA_NUM_TOPICS,
                            id2word=self.dictionary)

    def print_topics(self):
        return self.lda.print_topics(LDA_NUM_TOPICS)

    def get_document_topics(self, document_name):
        document_tokens = self.lmdl.token_list_processed(document_name)
        topics = self.lda.get_document_topics(
            self.dictionary.doc2bow(document_tokens),
            minimum_probability=None,
            minimum_phi_value=None,
            per_word_topics=False)
        show_topics_list = []
        for topic in topics:
            lda_topic = self.lda.show_topic(topic[0], topn=10)
            show_topics_list.append(lda_topic)
        return show_topics_list

    def top_topics(self):
        return self.lda.top_topics(corpus=self.corpus,
                                   texts=self.texts,
                                   dictionary=self.dictionary,
                                   window_size=None,
                                   coherence='u_mass',
                                   topn=20,
                                   processes=-1)
Example #16
def generate_topics(journal, num_topics, num_words, passes):
    # num_words: number of words we want to see from each topic (default is 10)
    # passes: times to go over the data. 1 can be used for large corpus

    filename = '{}_article_titles.txt'.format(journal)

    with open(filename) as f:
        documents = f.readlines()

    texts = [[
        word for word in document.lower().split() if word not in STOPWORDS
    ] for document in documents]

    stemmer = SnowballStemmer('english')
    texts_stemmed = [[stemmer.stem(word) for word in text] for text in texts]

    dictionary = corpora.Dictionary(texts_stemmed)
    corpus = [dictionary.doc2bow(text)
              for text in texts]  # bow means bag of words

    # LDA model
    lda = LdaModel(corpus,
                   id2word=dictionary,
                   num_topics=num_topics,
                   passes=passes)

    for topic in lda.print_topics(num_words=num_words):
        topicNumber = topic[0]
        print(topicNumber, ':', sep='')
        listOfTerms = topic[1].split('+')
        for term in listOfTerms:
            listItems = term.split('*')
            print('  ', listItems[1], '(', listItems[0], ')', sep='')
Example #17
def makeLDA(path, num_topics, num_words, passes):
    # num_topics: number of topics to look for in the model
    # num_words: how many words to show from each topic
    # passes: how many times to re-examine the data
    with open(path, encoding='utf-8') as f:
        documents = f.readlines()
        texts = [[
            word for word in document.lower().split()
            if word not in STOPWORDS and word.isalnum()
        ] for document in documents]

    # print(texts)
    # Create a dictionary and a corpus from the word lists
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(corpus,
                   id2word=dictionary,
                   num_topics=num_topics,
                   passes=passes)
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(lda.print_topics(num_words=num_words))
    unseen_text = '../../../data/LDA_data/lkmlSingleNewEmail.txt'
    with open(unseen_text, encoding='utf-8') as fenw:
        newdoc = fenw.read()
    new_corpus = dictionary.doc2bow(
        newword for newword in newdoc.lower().split()
        if newword not in STOPWORDS and newword.isalnum())

    # Feed the new document into the existing LDA model
    pp.pprint(lda[new_corpus])
Example #18
def get_topics(df, num_topics):

    df_temp = df.sample(frac=0.2)

    text_dict = corpora.Dictionary(df_temp['topic_modeling_text'])

    tweets_bow = [
        text_dict.doc2bow(tweet) for tweet in df_temp['topic_modeling_text']
    ]

    tweets_lda = LdaModel(tweets_bow,
                          num_topics=num_topics,
                          id2word=text_dict,
                          random_state=1,
                          passes=5)

    words = [re.findall(r'"([^"]*)"', t[1]) for t in tweets_lda.print_topics()]
    topics = [' '.join(t[0:10]) for t in words]

    # Compute the coherence score
    st.write(' ')
    coherence_model = CoherenceModel(model=tweets_lda,
                                     texts=df_temp['topic_modeling_text'],
                                     dictionary=text_dict,
                                     coherence='c_v')
    coherence_lda = coherence_model.get_coherence()

    return topics, coherence_lda
Example #19
def get_lda_feature():
    doc_train = pd.read_csv(id_content_path)
    documents = doc_train['content'].apply(lambda x: x.split(' '))
    #    build the word<->ID mapping dictionary (id:word)
    dictionary = corpora.Dictionary(documents)
    #    build the bag-of-words corpus: a list of list(tuple(id, count)) per document
    ds_df = [dictionary.doc2bow(document) for document in documents]
    #    build the tf-idf model from the corpus term frequencies
    tfidf_model = TfidfModel(ds_df)
    #    apply it to get each document's tf-idf vector
    ds_tfidf = tfidf_model[ds_df]
    #    number of topics for the documents
    n = 60
    #    build the LDA model on the tf-idf corpus with the given topic count
    lda_model = LdaModel(ds_tfidf, num_topics=n, passes=10, random_state=12)
    vec_size = (len(documents), n)
    lda_feature = np.zeros(vec_size)
    i = 0

    for doc in ds_tfidf:
        topics = lda_model.get_document_topics(doc, minimum_probability=0.01)
        for topic in topics:
            num_topic = topic[0]
            prob = round(topic[1], 5)
            lda_feature[i, num_topic] = prob
        i += 1

    f_names = get_lda_feacture_name(n)
    pd.DataFrame(lda_feature, columns=f_names).to_csv(id_content_lda_path,
                                                      index=0)
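get_lda_feacture_name is not shown in this example; a plausible sketch, with the column-name format being an assumption inferred from usage:

def get_lda_feacture_name(n):
    # Hypothetical helper: one column name per LDA topic.
    return ['lda_topic_{}'.format(i) for i in range(n)]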
Example #20
def main():
    collection_name = "nips"
    years = xrange(2010, 2015)  # 10 ~ 14
    n_topics = 10
    
    corpus_paths = map(lambda y: 
                       "data/{}-{}.dat".format(collection_name, y),
                       years)
    all_corpus = []
    year2corpus = {}
    for year, path in zip(years, corpus_paths):
        corpus = list(load_line_corpus(path))
        all_corpus.append(proc_corpus(corpus))
        year2corpus[year] = corpus

    all_corpus = list(itertools.chain.from_iterable(all_corpus))

    dictionary = Dictionary(all_corpus)
    all_corpus = [dictionary.doc2bow(doc)
                  for doc in all_corpus]

    # print all_corpus
    model = LdaModel(all_corpus, num_topics=n_topics,
                     id2word=dictionary,
                     eval_every=10, passes=100)
    print model.show_topics()
Example #21
def create_LDA(comment_dict,
               num_topics=20,
               chunk_size=50,
               max_iter=20,
               from_db=True,
               get_data_func=None):
    lda = None
    text_gen = data_preprocessor(max_iter=max_iter,
                                 from_db=from_db,
                                 get_data_func=get_data_func)
    corpus = []
    for _, stemmed_text, _ in text_gen:
        if len(stemmed_text) != 0:
            corpus.append(comment_dict.doc2bow(stemmed_text))
        if len(corpus) == chunk_size:
            if lda is None:
                lda = LdaModel(corpus=corpus,
                               num_topics=num_topics,
                               id2word=comment_dict,
                               per_word_topics=1,
                               passes=10)
            else:
                lda.update(corpus=corpus)
            corpus = []
    return lda
Example #22
    def create_model(self,
                     doc_matrix,
                     term_dictionary,
                     model_path,
                     save_model=True,
                     language='language_na'):
        """
        Creates an LDA model based on a set of documents
        :param model_path:
        :param doc_matrix:
        :param term_dictionary:
        :param save_model:
        :param language:
        :return LDA model:
        """
        self.language = language
        start = time()
        self.ldamodel = LdaModel(doc_matrix,
                                 num_topics=self.num_categories,
                                 id2word=term_dictionary,
                                 passes=50)

        if save_model:
            self.save_model(model_path=os.path.join(
                model_path, 'models', self.language,
                '%s_%s_category_lda.model' %
                (language, str(self.num_categories))))

        logging.info('Training lasted: {:.2f}s'.format(time() - start))
        return self.ldamodel
Example #23
 def find_topic(self,condition=None,n_topics=10,n_words=10,topic_model='lda',vec_model='tf',show=True,**kwargs):
     '''Topic model; overlaps with the function above, but prefer this one.
     parameter
     ---------
     condition: boolean mask over the corpus, e.g. to decompose topics for good/bad reviews separately
     n_topics: number of topics
     n_words: number of words to output per topic
     vec_model: vectorization method, 'tf' by default
     '''
     if condition is not None:
         texts=self.texts_seg[condition]
     else:
         texts=self.texts_seg
     if topic_model in ['lda','LDA']:
         dictionary = corpora.Dictionary([doc.split(' ') for doc in texts])
         corpus = [dictionary.doc2bow(text.split(' ')) for text in texts]
         if vec_model in ['idf','tfidf']:
             tfidf = models.TfidfModel(corpus)
             corpus = tfidf[corpus]
         lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics)
         topics_keywords=lda.show_topics(num_topics=n_topics, num_words=n_words,formatted=False)
         if show:
             print('\n'.join(['Topic {}: {}'.format(i,' | '.join([k[0] for k in \
             topic[1]])) for i,topic in enumerate(topics_keywords)]))
         return topics_keywords
Example #24
def vwmodel2ldamodel(vw_model, iterations=50):
    """
    Function to convert vowpal wabbit model to gensim LdaModel. This works by
    simply copying the training model weights (alpha, beta...) from a trained
    vwmodel into the gensim model.

    Args:
        vw_model : Trained vowpal wabbit model.
        iterations : Number of iterations to be used for inference of the new LdaModel.

    Returns:
        model_gensim : LdaModel instance; copied gensim LdaModel.
    """
    model_gensim = LdaModel(num_topics=vw_model.num_topics,
                            id2word=vw_model.id2word,
                            chunksize=vw_model.chunksize,
                            passes=vw_model.passes,
                            alpha=vw_model.alpha,
                            eta=vw_model.eta,
                            decay=vw_model.decay,
                            offset=vw_model.offset,
                            iterations=iterations,
                            gamma_threshold=vw_model.gamma_threshold,
                            dtype=numpy.float32)
    model_gensim.expElogbeta[:] = vw_model._get_topics()
    return model_gensim
Example #25
def trainModel():
    """ Train a model
    """
    if args.mode == 'Random':
        return args.topics, 0
    # need to train on dump
    files = [
        f"{args.input}/{f}" for f in os.listdir(args.input)
        if os.path.isfile(os.path.join(args.input, f))
    ]
    if args.mode == 'LDA':
        # create dictionary
        with open(files[0], "r", encoding='utf-8') as f:
            dct = Dictionary([' '.join(f.readlines()).split()])
        for filename in files[1:]:
            with open(filename, "r", encoding='utf-8') as f:
                dct.add_documents([' '.join(f.readlines()).split()])
        # create corpus
        corpus = []
        for filename in files:
            with open(filename, "r", encoding='utf-8') as f:
                corpus.append(dct.doc2bow(' '.join(f.readlines()).split()))
        lda = LdaModel(corpus, num_topics=args.topics)
        lda.save("./models/LDAdump.model")
        dct.save("./models/LDAdump.dct")
        return lda, dct
    if args.mode == 'loadLDA':
        return LdaModel.load("./models/LDAdump.model"), Dictionary.load(
            "./models/LDAdump.dct")
Example #26
def get_topics(candidate, day):
    start_time = datetime.strptime(day, "%Y-%m-%d").date()
    start_time = int(start_time.strftime('%s'))*1000
    end_time = start_time + 86399999
    try:
        client = MongoClient()
        tweets = client.fletcher.tweets
        tweets = tweets.aggregate([
            {"$match":{"$text":{"$search":candidate_search[candidate_slugs[candidate]]}}},
            {"$match":{"timestamp_ms":{"$gte":start_time,"$lt":end_time}}}])
        documents = []
        pattern = re.compile("[^a-zA-Z ]")
        for tweet in tweets:
            documents.append(pattern.sub('', tweet['text']))
        stoplist = set(candidate_stop_words[candidate_slugs[candidate]] + stopwords)
        texts = [[word for word in document.lower().split() if word not in stoplist]
                for document in documents]
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1]
                for text in texts]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=3, update_every=1, chunksize=10000, passes=10)
        return lda.print_topics(3)
    except Exception:
        return None
Example #27
def display_perplexity_on_topic_num(start, step, limit):
    model_list = []
    pplxty_list = []
    names = {}  # store each model under the key 'model<num_topics>'
    for num_topics in range(start, limit, step):
        print("############### current num:", num_topics, "###############")
        model_path = os.getcwd() + "\\Model\\topic_num_" + str(
            num_topics) + ".model"
        if not os.path.exists(model_path):
            # Modeling!!!!!
            print("Modeling in progress...")
            names['model' + str(num_topics)] = LdaModel(
                pubs_corpus,
                num_topics=num_topics,
                id2word=pubs_dictionary,
                passes=10,
                eval_every=1)
            names['model' + str(num_topics)].save(model_path)
        else:
            print("Model already exists.")
            names['model' + str(num_topics)] = LdaModel.load(model_path)
        model_list.append(names['model' + str(num_topics)])
        pplxty_value = perplexity(names['model' + str(num_topics)],
                                  pubs_corpus, pubs_dictionary,
                                  len(pubs_dictionary.keys()), num_topics)
        pplxty_list.append(pplxty_value)
    return model_list, pplxty_list
Example #28
def vwmodel2ldamodel(vw_model, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` to
    :class:`~gensim.models.ldamodel.LdaModel`.

    This works by simply copying the training model weights (alpha, beta...) from a trained vwmodel
    into the gensim model.

    Parameters
    ----------
    vw_model : :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`
        Trained Vowpal Wabbit model.
    iterations : int
        Number of iterations to be used for inference of the new :class:`~gensim.models.ldamodel.LdaModel`.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`.
        Gensim native LDA.

    """
    model_gensim = LdaModel(
        num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize,
        passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay,
        offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold,
        dtype=numpy.float32
    )
    model_gensim.expElogbeta[:] = vw_model._get_topics()
    return model_gensim
Example #29
class LDA(object):
    def __init__(self, model, vocab, corpus=None, topics=200, passes=1):
        self._model_file = model
        self._dict_file = vocab
        self._corpus_file = corpus
        self._topics = topics
        self._passes = passes

    def train(self):
        self._corpus = SentenceDocCorpus(self._corpus_file)
        self._lda = LdaModel(self._corpus, num_topics = self._topics, id2word = self._corpus.dictionary, passes = self._passes)
        self._dictionary = self._corpus.dictionary
        
        self._lda.save(self._model_file)
        self._dictionary.save(self._dict_file)

    def load(self):
        self._lda = LdaModel.load(self._model_file)
        self._dictionary = Dictionary.load(self._dict_file)

    def topics(self, words):
        return self._lda[self._dictionary.doc2bow(common.filter(words))]

    def topic_vector(self, words):
        return np.array([v for k, v in self._lda.__getitem__(self._dictionary.doc2bow(common.filter(words)), eps=0)])
Example #30
def getLdaFeature(documents, topicNum):
    '''
     Function:
         generate lda features by training lda model
     Input:
         documents: list of preprocessed sentences
         topicNum: output vector dimension
     Output:
         lda features(DataFrame format)
    '''
    # get corpus
#     LogInfo(' Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)    
    corpusD = [dictionary.doc2bow(text) for text in texts]

    # train lda model
#     LogInfo(' Train LDA model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
#     ldaModel = gensim.models.ldamulticore.LdaMulticore(corpus_tfidf, workers = 8, num_topics=topicNum, chunksize=8000, passes=10, random_state = 12)
    ldaModel = LdaModel(corpus_tfidf, num_topics=topicNum, chunksize=8000, passes=10, random_state = 12)
    # generate lda features
    LogInfo(' Generate LDA features...')
    ldaFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpus_tfidf:
        topic = ldaModel.get_document_topics(doc, minimum_probability = 0.01)
        for t in topic:
             ldaFeature[i, t[0]] = round(t[1],5)
        i = i + 1
    colName = getColName(topicNum, "qlda")
    ldaFeature = pd.DataFrame(ldaFeature, columns = colName)
    return ldaFeature
Example #31
def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldamallet.LdaMallet` to :class:`~gensim.models.ldamodel.LdaModel`.

    This works by copying the training model weights (alpha, beta...) from a trained mallet model into the gensim model.

    Parameters
    ----------
    mallet_model : :class:`~gensim.models.wrappers.ldamallet.LdaMallet`
        Trained Mallet model
    gamma_threshold : float, optional
        To be used for inference in the new LdaModel.
    iterations : int, optional
        Number of iterations to be used for inference in the new LdaModel.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`
        Gensim native LDA.

    """
    model_gensim = LdaModel(
        id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha, iterations=iterations,
        gamma_threshold=gamma_threshold,
        dtype=numpy.float64  # don't lose precision when converting from MALLET
    )
    model_gensim.expElogbeta[:] = mallet_model.wordtopics
    return model_gensim
Example #32
 def __init__(self):
     self.lmdl = LMDL_Corpus()
     self.texts = self.lmdl.get_corpus_texts_words()
     self.dictionary = Dictionary(self.texts)
     self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
     self.lda = LdaModel(self.corpus,
                         num_topics=LDA_NUM_TOPICS,
                         id2word=self.dictionary)
Example #33
def convertldaMalletToldaGen(mallet_model):
    model_gensim = LdaModel(
        id2word=mallet_model.id2word,
        num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha)  # original function has 'eta=0' argument
    model_gensim.state.sstats[...] = mallet_model.wordtopics
    model_gensim.sync_state()
    return model_gensim
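Unlike the converters above that overwrite expElogbeta directly, this variant copies the Mallet word-topic counts into state.sstats and calls sync_state(), which recomputes expElogbeta from the Dirichlet expectation of those sufficient statistics.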
Example #34
 def find_topic(self, num_topics, num_words=2, passes=20):
     dic = Dictionary(self.texts)
     corpus = [dic.doc2bow(text) for text in self.texts]
     lda = LdaModel(corpus,
                    num_topics=num_topics,
                    id2word=dic,
                    passes=passes)
     return lda.top_topics(topn=2, dictionary=dic, corpus=corpus)
Example #35
def ldacreator(t, topics=5, stopwords=[]):
    texts = [[
        word for word in t.lower().split()
        if word not in list(STOPWORDS) + stopwords and word.isalnum()
    ]]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(corpus, id2word=dictionary, num_topics=topics, passes=10)
    return (lda.print_topics())
Example #36
 def train(self, common_texts, num_topics):
     self.common_dictionary = Dictionary(common_texts)
     common_corpus = [
         self.common_dictionary.doc2bow(text) for text in common_texts
     ]
     self.model = LdaModel(common_corpus,
                           num_topics=num_topics,
                           alpha='auto',
                           eval_every=5)
Example #37
def lda_gensim(texts,num_topics=10):
    id2word = Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]  
    lda = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
    for top in lda.print_topics():
        print(top)
    lda_corpus = lda[corpus]
    X_lda = corpus2csc(lda_corpus).todense().T
    return X_lda
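A toy usage sketch; note that corpus2csc infers the column count from the topic ids actually present, so tiny corpora can yield fewer than num_topics columns:

texts = [['cat', 'dog'], ['dog', 'fish'], ['cat', 'fish', 'bird']]
X_lda = lda_gensim(texts, num_topics=3)
print(X_lda.shape)  # (3 documents, up to 3 topic columns)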
Example #38
def plottopicpop():
    internet = [0 for i in range(10)]
    developing = [0 for i in range(10)]
    habr = [0 for i in range(10)]
    n = 0
    for year in range(2006, 2016):
        articles, numberofarticles = getarticlesbyyear(year)
        print("Got articles for", str(year))
        # Normalize texts
        i = 0
        for article in articles:
            article = replacesymbols(article)
            articles[i] = normalaisestr(article.lower())
            i += 1
        print('Normalized')
        
        # Remove unnecessary words
        texts = [[word for word in article if word not in stoplist]
                 for article in articles]
        print('Deleted stopwords')
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        print('Starting training')
        # RAM-friendly mode: train on batches of 100 articles
        for i in range(numberofarticles // 100):
            begin = 100 * i
            end = 100 * (i + 1)
            if end > numberofarticles:
                end = numberofarticles
            lda = LdaModel(corpus[begin:end:], id2word=dictionary, num_topics=end - begin)

            for j in range(lda.num_topics):
                topics = lda.get_topic_terms(j, 15)
                # print(topics)
                for topic in topics[0]:
                    top = dictionary.get(topic)
                    # print(top)
                    if "интернет" == top:
                        internet[n] += 1
                    if "разработка" == top:
                        developing[n] += 1
                    if "хабра" == top:
                        habr[n] += 1
            del lda
        n += 1

        print(internet,'\n', developing, '\n', habr)

    plt.title('Population of 3 topics.')
    plt.xlabel('Year 2006 - 2015')
    plt.ylabel('Number of articles')
    plt.plot(internet, label="Интернет")
    plt.plot(developing, label="Разработка")
    plt.plot(habr, label="Хабра")
    plt.legend()
    plt.show()
Example #39
def getLdaModel(bow_corpus, dictionary, useSavedTill):
    if useSavedTill >= USESAVED.lda_model:
        common_logger.info("loading LDA model from file")
        return LdaModel.load(file_lda_model)
    else:
        common_logger.info("Training LDA model")
        num_topics = int(math.log(len(bow_corpus)) + 1)  # assumption: topic count scales with the log of the corpus size
        lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=numPasses)
        common_logger.info("Saving LDA model")
        lda_model.save(file_lda_model)
        common_logger.info("Done creating LDA model")
        return lda_model
Example #40
	def fetch_model(dictionary):
		print "Fetching LDA Model... ",
		try:
			lda = LdaModel.load('Topic/lda.tm')
			print "LDA Model loaded!"
		except IOError:
			print "Model not found, building LDA..."
			corpus=MyCorpus()
			#lda = LdaModel(corpus,num_topics=50,update_every=1,chunksize=1000,passes=15)
			lda = LdaModel(corpus,num_topics=50,id2word=dictionary,update_every=1,chunksize=1000,passes=50)
			print "LDA Built!"
			lda.save('Topic/lda.tm')
		return lda
Example #41
	def run(self):
		if self.clean_level in ('raw','clean','stopwords'):
			kind = self.clean_level
		else:
			kind = 'stopwords'

		if not os.path.exists(self.res_dir):
			print 'Creating results folder...'
			os.mkdir(self.res_dir)

		# Apply each model
		for idioma, modelos in self.input()['lda']['langs'].iteritems():
			corp_path = self.input()['corp']['langs'][idioma].path
			corpus = corpora.MmCorpus(corp_path)
			for n_topics, modelo in modelos.iteritems():
				model_path = modelo.path
				model = LdaModel.load(model_path)
				classification = []
				for doc in corpus:
					topic = model.get_document_topics(doc)
					classification.append(topic)
				print '--------------------------------------'
				print 'USER INFO: Classifying texts in %s with cleaning level "%s" and %d topics' % (idioma, kind, n_topics)
				model.print_topics(len(corpus),5)
				with self.output()['langs'][idioma][n_topics]['doc_topics'].open('w') as f:
					pickle.dump(classification, f)
				with self.output()['langs'][idioma][n_topics]['topics'].open('w') as f:
					pickle.dump(model.print_topics(n_topics,5), f) # the 5 is an editable parameter (number of topic words to show)
Example #42
 def train(self):
     self._corpus = SentenceDocCorpus(self._corpus_file)
     self._lda = LdaModel(self._corpus, num_topics = self._topics, id2word = self._corpus.dictionary, passes = self._passes)
     self._dictionary = self._corpus.dictionary
     
     self._lda.save(self._model_file)
     self._dictionary.save(self._dict_file)
Example #43
File: lda.py Project: freygit/36
    def __init__(self, topics = 10, 
                 worker = 3, 
                 pretrained_model = None, 
                 dictionary = None):
        """
        lda模型训练初始化。
        Args:
            topics -- 指定主题个数
            worker -- 并行化参数,一般为core数量减一
            pretrained_model -- 预训练的模型,由于支持在线更新,所以可以加载上次训练的模型
            dictionary -- 训练时词需要转换成ID,所以跟模型配套有一个ID映射的词典
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)
Example #44
def make_clouds(files, n_words=20):
    # set locations
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_d = '../browser/clouds/' + base_model_name + '/'
    if not os.path.exists(output_d):
        os.makedirs(output_d)
    # create wordcloud generator
    wc = WordCloud(width=1000, height=500, background_color='white')

    print('Loading model')
    model = LdaModel.load(files.model)
    beta = model.expElogbeta

    print('Normalizing by topics, and by words')
    pTW = normalize(beta, axis=0)
    pWT = normalize(beta, axis=1)

    # load bug<->id map, then invert to id<-> bug
    bug_to_id = json.loads(open(files.replacements).read())
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}

    for i in range(len(beta)):
        # compute RAR
        t_rar = np.sqrt(pTW[i] * pWT[i])
        top_word_ids = t_rar.argsort()[:-1 - n_words:-1]
        top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids]
        top_words = [id_to_bug[word] if word in id_to_bug else word for word in top_words]
        wc.fit_words(dict(zip(top_words, t_rar[top_word_ids])))  # fit_words expects a word->weight mapping
        wc.to_file(output_d + str(i) + '.png')
Example #45
    def __init__(self, fnames, model=None, corpus=None, dictionary=None):
        """`fnames` is an array of files for [lda_model, distribution]"""
        self.reviews = open('data/electronics_topics_in.txt').readlines()

        print "Loding topic model..."
        if model is not None:
            print "Using argument model"
            self.lda = model
        else:
            self.lda = LdaModel.load(fnames[0])

        if corpus is not None:
            print "Using argument corpus and dictionary"
            self.corpus = corpus
            self.dictionary = dictionary
        else:
            print "Loading corpus and dictionary from file"
            self.corpus = load("data/models/electronics_tfidf_corpus.pkl")
            self.dictionary = load("data/models/electronics_dict.pkl")

        print "Loading review-topic distribution..."
        self.review_dist = [l for l in self.lda[self.corpus]]
        tmp = lambda dist: sorted(dist, key=lambda arr: arr[1], reverse=True)
        self.review_dist = map(lambda dist: tmp(dist), self.review_dist)

        print "processing topics"
        tmp = map(lambda t: re.sub("(\d*\.\d*\*)", "", t), self.lda.show_topics(-1))
        self.topics = map(lambda ts: re.sub("\\s\+", ",", ts), tmp)
Example #46
	def train_lda (self, corpus, dictionary):
		"""
			PRIVATE: train_lda
			------------------
			given a corpus and a dictionary, this fits parameters for self.lda_model and
			fills self.lda_model_topics with the per-topic word distributions
		"""
		self.lda_model = LdaModel(corpus, id2word=dictionary, num_topics=self.num_topics_lda)
		self.lda_model_topics = self.find_per_topic_word_distributions ()
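find_per_topic_word_distributions is not shown; a hypothetical reconstruction based on how it is used (the topn value is an assumption):

	def find_per_topic_word_distributions (self):
		# Hypothetical: map each topic id to its top (word, probability) pairs.
		return {topic_id: self.lda_model.show_topic(topic_id, topn=20)
			for topic_id in range(self.num_topics_lda)}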
Example #47
	def analyze(self, docs):
		# load dictionary and model
		self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
		self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))

		# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
		docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs]

		docTopicDistr = self.getDocumentTopics(docTermMatrix)
		return docTopicDistr
Example #48
	def __init__(self, ac):
		with open('../TextMining/Topic/data.loc','rb') as f:
			load(f)
			self.data = load(f)
		with open('../TextMining/Topic/translator.loc','rb') as f:
			self.translator = load(f)
		self.index = similarities.MatrixSimilarity.load('../TextMining/Topic/index.loc')
		self.lda = LdaModel.load('../TextMining/Topic/lda.loc')
		self.dictionary = Dictionary().load("../TextMining/Topic/dic.loc")
		self.ac_terms = ac
Example #49
 def __init__(self, jobdesc_fname, jobtitle_fname):
     self.es = Elasticsearch([{'host': app.config['ES_HOST'], 'port': 9200, 'timeout': 120}])
     self.model = LdaModel.load(app.config['RCMDR_LDA_MODEL'])
     self.job_labels = {
         int(k):v
         for k, v in (line.split("=") for line in open(app.config['RCMDR_JOB_LABELS'])
                 .read().strip().split('\n'))
         }
     self.jobdesc_fname = jobdesc_fname
     self.jobtitle_fname = jobtitle_fname
Example #50
def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """
    Function to convert mallet model to gensim LdaModel. This works by copying the
    training model weights (alpha, beta...) from a trained mallet model into the
    gensim model.

    Args:
        mallet_model : Trained mallet model
        gamma_threshold : To be used for inference in the new LdaModel.
        iterations : number of iterations to be used for inference in the new LdaModel.

    Returns:
        model_gensim : LdaModel instance; copied gensim LdaModel
    """
    model_gensim = LdaModel(
        id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha, iterations=iterations,
        gamma_threshold=gamma_threshold)
    model_gensim.expElogbeta[:] = mallet_model.wordtopics
    return model_gensim
Example #51
def vwmodel2ldamodel(vw_model, iterations=50):
    """
    Function to convert vowpal wabbit model to gensim LdaModel. This works by
    simply copying the training model weights (alpha, beta...) from a trained
    vwmodel into the gensim model.

    Args:
        vw_model : Trained vowpal wabbit model.
        iterations : Number of iterations to be used for inference of the new LdaModel.

    Returns:
        model_gensim : LdaModel instance; copied gensim LdaModel.
    """
    model_gensim = LdaModel(
        num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize,
        passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay,
        offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold
    )
    model_gensim.expElogbeta[:] = vw_model._get_topics()
    return model_gensim
Example #52
 def __init__(self):
     self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
     self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
     self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
     self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
     self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
     self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
     self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
     self.job_labels = {
         int(k): v
         for k, v in (line.split("=") for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
     }
Example #53
def AuthorTopicStd():
    import nltk

    from gensim import corpora
    from gensim import matutils
    from gensim.models.ldamodel import LdaModel
    from nltk.corpus import stopwords
    from unidecode import unidecode

    TOPIC_FILE = './lda_topic.dump'
    LDA_FILE = './result.lda'
    DICTIONARY_FILE = './keywords.dict'

    with open(TOPIC_FILE, 'rb') as f:
        num_topics, topic_result = serializer.load(f)

    lda = LdaModel.load(LDA_FILE)

    dictionary = corpora.Dictionary.load(DICTIONARY_FILE)

    tokenizer = nltk.tokenize.RegexpTokenizer(r'[\w]{2,}')
    stopwords_set = set(stopwords.words())

    my_topic_cache_by_aid = [None, None]

    def calculator(aid, pid):
        if my_topic_cache_by_aid[0] == aid:
            my_topic = my_topic_cache_by_aid[1]
        else:
            my_keywords = []

            for ipid, iaid in paper_authors.get_by_aid(aid):
                paper = papers.get(ipid)
                if paper is None:
                    continue
                keywords = tokenizer.tokenize(unidecode(paper[Papers.IDX_TITLE]).lower())
                if not keywords:
                    continue
                my_keywords.extend(keywords)

            my_keywords = list(filter(lambda s: s not in stopwords_set, my_keywords))
            if not my_keywords:
                return np.nan

            my_topic = lda[dictionary.doc2bow(my_keywords)]

            my_topic_cache_by_aid[0] = aid
            my_topic_cache_by_aid[1] = my_topic

        my_topic_array = matutils.sparse2full(my_topic, num_topics)
        return np.std(my_topic_array)

    return calculator
Example #54
def ldaforhabr():
    numberofarticles = 0
    articles, numberofarticles = getarticles()
    print("Got articles")
    # Normalize texts
    i = 0
    for article in articles:
        article = replacesymbols(article)
        articles[i] = normalaisestr(article.lower())
        i += 1
    print('Normalized')
    # Remove unnecessary words
    texts = [[word for word in article if word not in stoplist]
             for article in articles]
    print('Deleted stopwords')
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    print('Starting training')
    f = open('lda.log', 'w')
    for i in range(numberofarticles // 100):  # process in batches of 100 articles
            begin = 100 * i
            end = 100 * (i + 1)
            if end > numberofarticles:
                end = numberofarticles
            lda = LdaModel(corpus[begin:end:], id2word=dictionary, num_topics=end - begin)

            for j in range(lda.num_topics):
                topics = lda.get_topic_terms(j, 15)
                f.write(str(begin + j) + ": ")
                # print(topics)
                for topic in topics[0]:

                    top = dictionary.get(topic)
                    if top is not None:
                        f.write(top + '\n')

                f.write('-----------\n')
            # i += 1
            del lda
    f.close()
Example #55
def lda_topic_model(data, is_clean=False, num_of_topics=10, num_of_pass=5):
    """do the topic model for the given dataset
    input:
        data: a documents or a list of words
        is_clean: Use this notation to pre-process the data.
        num_of_topics: An LDA model requires the user to determine how many
                        topics should be generated.
        num_of_pass: The greater the number of passes, the more accurate the
                    model will be.
                    A lot of passes can be slow on a very large corpus.
    """
    if not is_clean:
        stops = set(nltk.corpus.stopwords.words("english"))
        texts = prepare_for_lda(data, stops)
    else:
        texts = data
    dictionary = corpora.Dictionary(texts)
    print dictionary
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=num_of_topics, \
                        passes=num_of_pass)
    return ldamodel.print_topics(num_topics=num_of_topics, num_words=10)
Example #56
def generate_model():
    np.set_printoptions(precision=2)
    corpus = []
    corpus += load_expo_cdc()
    corpus += load_lago()
    corpus += load_news()
    corpus += load_news_ic()
    corpus += load_palestras()
    corpus = preprocessing(corpus)
    dictionary = corpora.Dictionary(corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in corpus]

    dictionary.save(DICT)
    corpora.MmCorpus.serialize(BOW_CORPUS, bow_corpus)

    bow2 = np.concatenate((bow_corpus, bow_corpus), axis=0)
    bow2 = np.concatenate((bow2, bow2), axis=0)
    bow2 = np.concatenate((bow2, bow2), axis=0)
    TOPICS = 20
    model = LdaModel(bow2, id2word=dictionary, num_topics=TOPICS, iterations=100, passes=15)
    model.save(MODEL)

    lda_corpus = [model[vector] for vector in bow2]
    lda_dense = gensim.matutils.corpus2dense(lda_corpus, num_terms=TOPICS).transpose()
    """
    tfidf = models.TfidfModel(bow_corpus)
    tfidf_corpus = [tfidf[vector] for vector in bow_corpus]
    tfidf_dense = gensim.matutils.corpus2dense(tfidf_corpus, num_terms=len(dictionary)).transpose()
    """
    classifier = LogisticRegression()
    labels = load_labels()
    labels2 = labels
    labels2 += labels2
    labels2 += labels2
    labels2 += labels2
    classifier.fit(lda_dense, labels2)
    joblib.dump(classifier, CLASSIFIER, compress=9)
    #print "LDA results"
    probs = classifier.predict_proba(lda_dense)
Example #57
 def SNAP_generateLDAForTopic(self, topic, numTopics = 5):
   if (topic == 'all'):
     topics = ['syria', 'ufo', 'movie', 'celebrity', 'russia'] # bieber, cyrus
     for t in topics:
       for nt in [5, 10]:
         self.SNAP_generateLDAForTopic(t, nt)
     return
   id2word = self.SNAP_id2word()
   mmPath = os.path.join(
     os.path.dirname(os.path.abspath(__file__)),
     'snap_data',
     "gensim_snap_mmcorpus_%s.mm" % topic
   )
   outPath = os.path.join(
     os.path.dirname(os.path.abspath(__file__)),
     'snap_data',
     "gensim_snap_lda_%s_%d" % (topic, numTopics)
   )
   mm = MmCorpus(mmPath)
   lda = LdaModel(corpus=mm, id2word=id2word, num_topics=numTopics, update_every=1, chunksize=10000, passes=1)
   lda.save(outPath)
   return
Example #58
	def update(self, docs):
		# load dictionary and model
		self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
		self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))

		# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
		docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs]

		numPasses = self.config.getIntConfig("train.num.pass")[0]
		self.ldaModel.update(docTermMatrix, passes=numPasses)

		docTopicDistr = self.getDocumentTopics(docTermMatrix)
		return docTopicDistr
Example #59
def build_lda_model(corpus, dictionary, num_topics=10):
    file_name = None

    if corpus is None:
        corpus = get_corpus()
    if dictionary is None:
        dictionary = get_dictionary()

    if num_topics == 10:
        file_name = LDA_FILE_10
    elif num_topics == 30:
        file_name = LDA_FILE_30
    elif num_topics == 60:
        file_name = LDA_FILE_60
    elif num_topics == 120:
        file_name = LDA_FILE_120
    else:
        raise ValueError("bad number of topics")
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, update_every=1, chunksize=100, passes=1)
    lda.save(file_name)
    for topic in range(10):
        print "Topic {0}: {1}".format(topic, lda.print_topic(topic))
    return lda
Example #60
class LDA(BaseEstimator, TransformerMixin):
    def __init__(self, **params):
        self.params = params

    def fit(self, X, y=None):
        corpus = Sparse2Corpus(X, documents_columns=False)
        self.lda = LdaModel(corpus, **self.params)
        return self

    def transform(self, X, y=None):
        corpus = Sparse2Corpus(X, documents_columns=False)
        # eps=0 keeps every topic, so each document yields a full-length topic vector
        topics = np.array([[prob for _, prob in self.lda.__getitem__(c, eps=0)] for c in corpus])
        print(topics.shape)
        return topics
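Because this wrapper implements fit/transform, it can slot into a scikit-learn Pipeline after a vectorizer; a minimal usage sketch (gensim infers the id-to-word mapping from the corpus here, which logs a warning but works):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

docs = ['milk bread eggs', 'bread butter', 'milk butter eggs bread']
pipe = Pipeline([
    ('bow', CountVectorizer()),
    ('lda', LDA(num_topics=2)),  # **params are forwarded to LdaModel
])
doc_topics = pipe.fit_transform(docs)
print(doc_topics.shape)  # (3 documents, 2 topics)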