# Convert every tokenised document into its gensim bag-of-words representation.
corpus = [dictionary.doc2bow(document) for document in texts]



# COMMAND ----------

# Keep just the first 100 records for a quick inspection.
data = to_list[:100]

# COMMAND ----------

# Evaluate the sample so the notebook cell displays it.
data

# COMMAND ----------

# Fit a 10-topic LDA model (at most 10 optimisation iterations).
lda = LDA(k=10, maxIter=10)
model = lda.fit(dataset)

# Goodness-of-fit diagnostics on the training corpus.
ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print(f"The lower bound on the log likelihood of the entire corpus: {ll}")
print(f"The upper bound on perplexity: {lp}")

# Show the top 3 weighted terms for each topic.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Per-document topic distributions.
transformed = model.transform(dataset)
transformed.show(truncate=False)
# --- Exemplo n.º 2 (score: 0) --- scraped-example separator, commented out so the file parses
df_comments = sqlContext.createDataFrame(comments, ["list_of_words", "index"])

# Term frequencies over a capped vocabulary; a term must appear in >= 10 documents.
vectorizer = CountVectorizer(
    inputCol="list_of_words",
    outputCol="raw_features",
    vocabSize=50000,
    minDF=10.0,
)
cvmodel = vectorizer.fit(df_comments)
tf_df = cvmodel.transform(df_comments)

# Rescale raw counts by inverse document frequency.
idf_stage = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf_stage.fit(tf_df)
result_tfidf = idf_model.transform(tf_df)

# Train a 3-topic LDA on the TF-IDF features.
lda = LDA(k=3, maxIter=50)
model = lda.fit(result_tfidf.select('index', 'features'))

# Topic mixture per document; show the top 8 terms of every topic.
transformed = model.transform(result_tfidf)
# transformed.show(truncate=False)
model.describeTopics(8).show()

# Optional diagnostics, left disabled:
# ll = model.logLikelihood(result_tfidf.select('index', 'features'))
# lp = model.logPerplexity(result_tfidf.select('index', 'features'))

# Map each term's position in the fitted CountVectorizer vocabulary to the
# UTF-8 encoded term itself, so topic term indices can be resolved to words.
# enumerate replaces the original hand-rolled `j` counter loop; the counter
# was write-only scratch state, so dropping it is safe.
vocabulary = {
    index: term.encode("utf-8")
    for index, term in enumerate(cvmodel.vocabulary)
}
# --- Exemplo n.º 3 (score: 0) --- scraped-example separator, commented out so the file parses
# Keep only the feature vectors, capped at 650k rows.
p = rescaledData.select('features')
p = p.limit(650000)  # you can choose number or comments you want to run LDA on
#p.count()
#p.show(3)

import logging
import threading

# Write progress to a log file so the long-running fit can be monitored.
logging.basicConfig(
    filename='running.log',
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Fit a 20-topic LDA (up to 500 iterations) and report the wall-clock time.
start = time()
lda = LDA(k=20, maxIter=500)
model = lda.fit(p)
print(f'used LDA: {time() - start:.2f}s')

#model.isDistributed()

# Diagnostics below are left disabled — presumably too expensive at this
# corpus size (NOTE(review): confirm before re-enabling on 650k rows).
#start = time()
#ll = model.logLikelihood(p)
#lp = model.logPerplexity(p)
#print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
#print("The upper bound on perplexity: " + str(lp))
#print ('used: {:.2f}s'.format(time()-start))

# Time the topic-description step: top 15 weighted terms per topic.
start = time()
# Describe topics.
topics = model.describeTopics(15)