Example #1
def lda2topicMap(lda, corpus, ids, name):
  # Map each evidence id to the topic distribution of its bag-of-words document.
  # `name` is unused here; it is kept from the original signature.
  print('>> generating topic map...')
  evidenceTopicMap = {}
  for i, c in enumerate(corpus):
    evidenceTopicMap[ids[i]] = lda.get_document_topics(c, minimum_probability=0.01)
  print(len(evidenceTopicMap))
  return evidenceTopicMap
Example #2
import gensim


def get_document_topics(doc, name):
  # `get_stopwords` and `singularize` are project helpers assumed to be imported elsewhere.
  lda = gensim.models.ldamodel.LdaModel.load(name + '.lda')
  englishStopWords = get_stopwords('english', name)
  text = [singularize(word) for word in doc.lower().split()
          if singularize(word) not in englishStopWords and word.isalpha() and len(word) > 1]
  dictionary = gensim.corpora.Dictionary.load(name + '.dict')
  document_topics = lda.get_document_topics(dictionary.doc2bow(text), minimum_probability=0.05)
  if len(document_topics) > 0:
    primary_topic_tuple = max(document_topics, key=lambda x: x[1])
    topic_terms = lda.show_topic(primary_topic_tuple[0])
    print(topic_terms)
    return document_topics, topic_terms
  else:
    # Return an empty list for both values so callers get consistent types.
    return [], []
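A minimal usage sketch for the function above, assuming a model and dictionary were previously saved under a common prefix (here 'reviews', giving 'reviews.lda' and 'reviews.dict') and that the get_stopwords/singularize helpers are available; the prefix and input text are invented for illustration:

# Hypothetical call; 'reviews.lda' and 'reviews.dict' must already exist on disk.
doc = "The pasta was excellent and the service was friendly"
document_topics, topic_terms = get_document_topics(doc, 'reviews')
print(document_topics)  # [(topic_id, probability), ...] for topics above 0.05
print(topic_terms)      # (word, probability) pairs of the most probable topic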
Example #3
from gensim.corpora import Dictionary
from gensim.models import LdaModel


def gensim_lda(pd_df_yelp, text_rev):  # gensim lda
    # text_rev: list of tokenized documents; pd_df_yelp is unused but kept from the original signature.
    common_dict = Dictionary(text_rev)
    common_corpus = [common_dict.doc2bow(text) for text in text_rev]
    # No id2word mapping is passed, so the model falls back to string token ids;
    # show_topic() therefore returns (token_id_as_string, probability) pairs.
    lda = LdaModel(common_corpus)
    topics = [lda.get_document_topics(doc) for doc in common_corpus]
    # Take the first topic reported for each document.
    topicIDs = [topic[0][0] for topic in topics]
    topic_prob_list = [lda.show_topic(topicID) for topicID in topicIDs]
    # Split each list of (token_id, probability) pairs into two parallel sequences.
    topic_prob_list_split = [list(zip(*item)) for item in topic_prob_list]
    # Map token ids back to the actual words through the dictionary.
    topic_prob_list_words = [[dict(common_dict)[int(topID)] for topID in item[0]]
                             for item in topic_prob_list_split]
    topic_prob_list_prob = [list(item[1]) for item in topic_prob_list_split]
    return (topic_prob_list_words, topic_prob_list_prob)
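A rough usage sketch for gensim_lda above; since pd_df_yelp is never read inside the function, a placeholder can be passed, and the tokenized reviews below are invented for illustration:

# Hypothetical tokenized reviews.
reviews = [['great', 'food', 'friendly', 'staff'],
           ['slow', 'service', 'cold', 'food'],
           ['great', 'atmosphere', 'slow', 'service']]
words, probs = gensim_lda(None, reviews)
print(words[0])  # top tokens of the leading topic for the first review
print(probs[0])  # the matching topic-word probabilities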
Example #4
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pandas as pd


def topic_extraction(corpus, ntopics):
    # gensim lda: build a document-topic probability matrix (one row per document).
    common_dictionary = Dictionary(corpus)
    common_corpus = [common_dictionary.doc2bow(text) for text in corpus]
    lda = LdaModel(common_corpus,
                   num_topics=ntopics,
                   iterations=800,
                   random_state=1)
    # Passing the whole corpus yields one topic distribution per document;
    # minimum_probability=0 keeps every topic, so each row has ntopics values.
    features = lda.get_document_topics(common_corpus, minimum_probability=0)
    lda_list = []
    for f in features:
        lda_list.append([b[1] for b in f])
    lda_df = pd.DataFrame(lda_list)
    lda_df = lda_df.reset_index(drop=True)
    return lda_df
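As a usage sketch for topic_extraction, with a toy tokenized corpus invented for illustration, the returned DataFrame has one row per document and ntopics columns of topic probabilities:

# Hypothetical toy corpus: each document is already tokenized.
docs = [['human', 'interface', 'computer'],
        ['survey', 'user', 'computer', 'system', 'response', 'time'],
        ['graph', 'minors', 'trees', 'survey']]
lda_df = topic_extraction(docs, ntopics=2)
print(lda_df.shape)  # (3, 2): one row per document, one column per topic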