Exemplo n.º 1
0
def create_online_lda(docs, ids, name, numTopics):
  """Train an online LDA model over *docs* and persist it as '<name>.lda'.

  Parameters:
    docs: tokenized documents, forwarded to docs2corpus.
    ids: document identifiers, forwarded to lda2topicMap.
    name: basename for the cached corpus/dictionary and the saved model file.
    numTopics: number of latent topics to fit.

  Returns a (topic_map, topics) pair: the per-document topic assignment from
  lda2topicMap and the unformatted topic/word lists from show_topics.
  """
  # True presumably asks docs2corpus to (re)build the corpus/dictionary
  # rather than load a cached one (load_online_lda passes False) — TODO confirm.
  corpus, dictionary = docs2corpus(docs, name, True)
  # print() form keeps this runnable under both Python 2 and Python 3.
  print('>> generating online lda model...')
  lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=numTopics, id2word=dictionary, passes=10)
  print(lda)
  lda.save(name + '.lda')
  return lda2topicMap(lda, corpus, ids, name), lda.show_topics(formatted=False)
def generateTopics(corpus, dictionary):
    """Fit a 100-topic LSI model over *corpus* and derive per-topic word clusters.

    Returns a 4-tuple (model, transformed_corpus, top_clusters, top_wordonly):
    top_clusters holds one list of (weight, word) string pairs per topic, and
    top_wordonly joins just the words of each topic with ':'.
    """
    lda = lsimodel.LsiModel(corpus, id2word=dictionary, num_topics=100)
    corpus_lda = lda[corpus]

    # show_topics() yields strings shaped like "0.5*word + 0.3*other"; keep
    # only distinct topic strings, then split each term into (weight, word).
    tops = set(lda.show_topics(num_topics=100, num_words=20))
    top_clusters = [
        [(term.split("*")[0], term.split("*")[1]) for term in topic.split(" + ")]
        for topic in tops
    ]

    # Word-only rendering: one ':'-joined string of words per topic.
    top_wordonly = [":".join(pair[1] for pair in cluster) for cluster in top_clusters]

    return lda, corpus_lda, top_clusters, top_wordonly
Exemplo n.º 3
0
'''

lda = models.LdaModel.load('model.lda')

# applying the LDA model to identify topic for each request using
# similarity queries
docs = request_text_list
lda_topics = []
for doc in docs:
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lda = lda[vec_bow]
    # sort (topic_id, probability) pairs by descending probability and keep
    # the id of the most likely topic for this document
    vec_lda.sort(key=lambda item: -item[1])
    lda_topics.append(vec_lda[0][0])

# printing the topics and the words associated with each topic
# (print() form is valid under both Python 2 and Python 3)
for i in lda.show_topics():
    print(i)

# generating a feature for the topics
df['lda_topics'] = pd.Series(lda_topics)

# generating dummies for each topic
topics = pd.get_dummies(df['lda_topics'], prefix='topic')
df = pd.concat([df, topics], axis=1)
'''
Logistic Regression
'''

import statsmodels.formula.api as smf
from sklearn.cross_validation import train_test_split
Exemplo n.º 4
0
def get_online_lda_topics(name, numTopics):
  """Load the persisted '<name>.lda' model and return its topics unformatted."""
  model_path = name + '.lda'
  model = gensim.models.ldamodel.LdaModel.load(model_path)
  return model.show_topics(num_topics=numTopics, formatted=False)
Exemplo n.º 5
0
def load_online_lda(docs, ids, name):
  """Load the persisted '<name>.lda' model and map *docs* to topics.

  Counterpart to create_online_lda: passes False to docs2corpus, presumably
  to reuse the cached corpus/dictionary instead of rebuilding — TODO confirm.

  Returns a (topic_map, topics) pair with the same shape as create_online_lda.
  """
  # print() form keeps this runnable under both Python 2 and Python 3.
  print('>> loading online lda model...')
  corpus, dictionary = docs2corpus(docs, name, False)
  lda = gensim.models.ldamodel.LdaModel.load(name + '.lda')
  # return a map from evidence to topic and a list of topics
  return lda2topicMap(lda, corpus, ids, name), lda.show_topics(formatted=False)
Exemplo n.º 6
0
# Per-word length statistics: length of each token in clean_all, then how
# many tokens share each length.
word_len = pd.Series(clean_all).map(len)
word_len = pd.Series(word_len, name='word_len')
cut_len = pd.concat([cut_result, word_len], axis=1)
group_by_len = cut_len.groupby('word_len')
len_count = group_by_len.count()
# Overall frequency ranking; `freq` is read from earlier in the script —
# presumably an nltk FreqDist (it has .hapaxes() below), TODO confirm.
group_by_cut = pd.Series(freq)
sort_freq = group_by_cut.sort_values(ascending=False)
appear_once = freq.hapaxes()  # words that occur exactly once (hapaxes)

# Top-10 most frequent words for each day (clean_day: one token list per day)
hot_day = []
for i in range(len(clean_day)):
    freq = FreqDist(clean_day[i])  # NOTE: rebinds the outer `freq` used above
    group_by_word = pd.Series(freq)
    sort_freq = group_by_word.sort_values(ascending=False)
    hot_day.append(sort_freq[:10])
hot_day_stat = pd.DataFrame(hot_day)
hot_day_stat.to_excel("D:\\data\\HotWords5.xlsx")

# Build a 10-topic LDA model over the per-day token lists
word_dict = corpora.Dictionary(clean_day)
corpus_list = [word_dict.doc2bow(doc) for doc in clean_day]
lda = models.ldamodel.LdaModel(corpus=corpus_list,
                               id2word=word_dict,
                               num_topics=10)

# Dump the topics to disk; note the patterns are written back-to-back with
# no separator between them.
output_file = 'D:\\data\\lda_output.txt'
with open(output_file, 'w') as f:
    for pattern in lda.show_topics():
        f.write("%s" % str(pattern))
Exemplo n.º 7
0
    vec_rp.sort(key=lambda item: -item[1])
    rp_topics.append(vec_rp[0][0])
    
# topic feature from the random-projection loop above (truncated in this view)
df['rp_topics'] = pd.Series(rp_topics)

# applying the LDA model to identify topic for each request using
# similarity queries
docs = request_text_list
lda_topics = []
for doc in docs:
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lda = lda[vec_bow]
    # keep only the id of the most probable topic for this document
    vec_lda.sort(key=lambda item: -item[1])
    lda_topics.append(vec_lda[0][0])

# print() form is valid under both Python 2 and Python 3
for i in lda.show_topics():
    print(i)

for i in lda.print_topics():
    print(i)

df['lda_topics'] = pd.Series(lda_topics)

# applying the HDP model to identify topic for each request using
# similarity queries
docs = request_text_list[:30]
hdp_topics = []
for doc in docs:
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_hdp = hdp[vec_bow]
    vec_hdp.sort(key=lambda item: -item[1])