# --- Spark SQL pipeline: tweets table -> LDA topics -> TF-IDF weighted CSV ---

# Select the text column of the `tweets` table.
tweet_texts = sqlContext.sql("SELECT text FROM tweets")

# Pre-process the texts with run() from processing.py; the result is the
# input for the Gensim dictionary and corpus builders below.
texts = twpr.run(tweet_texts)

# The dictionary is built first because the corpus builder takes it as input.
dictionary = twpr.buildDictionaryFromTexts(texts)
corpus = twpr.buildCorpusFromDictionaryAndTexts(texts, dictionary)

# Tweet IDs together with their texts, so that the LDA output can be mapped
# back to the actual tweets in later analysis.
tweet_ids = sqlContext.sql("SELECT id_str as id, text FROM tweets")
# Topic count handed to the LDA helper.
num_topics = 25

# All necessary pre-processed inputs are now available: run the LDA analysis.
distros = twpr.doLDA(
    corpus,
    dictionary,
    num_topics,
    tweet_ids,
)

# The LDA result is treated as a Spark RDD: .take(5) would fetch a sample,
# .collect() brings all topic probability distributions into memory.
distros_all = distros.collect()

# Run the HDP helper on the same corpus and dictionary.
# NOTE(review): `hdp` is not referenced again in this section - confirm that
# a later step uses it.
hdp = twpr.doHDP(corpus, dictionary)

# To inspect the LDA output, weight the topics with TF-IDF frequencies and
# write them to CSV.
# NOTE(review): the slice passes every collected distribution except the last
# one - confirm that dropping the final element is intentional.
topics = twpr.TFIDFsFromTopicDistributions(
    distros_all[:-1],
    sqlContext,
    corpus,
    dictionary,
)
twpr.writeTFIDFsToCSV(topics)
# Spark-based processing is done at this point.
from gensim import corpora, models, similarities # needed for text clustering
import sklearn
import nltk
import csv
# nltk.download() # ensure all the necessary corpora are present for lemmatization


# --- User-model pipeline: users -> tweet texts -> Gensim LDA -> CSV files ---

# Query the User model (presumably MongoDB-backed - TODO confirm). No ordering
# or limit is applied here, so the query is unfiltered rather than a "top N"
# selection; the disabled variant was .order_by('ratio_per_tweet').limit(100).
users = twpr.User.objects()

# Collect the tweet texts belonging to those users and pre-process them.
tweet_texts = twpr.User.getTextsFromUsers(users)
texts = twpr.runWithoutMap(tweet_texts)

# Gensim dictionary first, then the corpus that is built against it.
dictionary = twpr.buildDictionaryFromTexts(texts)
corpus = twpr.buildCorpusFromDictionaryAndTexts(texts, dictionary)

# Topic count for the LDA model; also passed to the distribution CSV writer.
num_topics = 25

# Train the LDA model with Gensim directly, making 20 passes over the corpus
# with update_every=0.
lda = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    update_every=0,
    passes=20,
)

# Compute the topic distribution of each tweet and store the result as CSV.
tweets_for_lda = twpr.User.getTweetsFromUsers(users)
distros = twpr.distrosForTweetsFromLDAModel(lda, dictionary, tweets_for_lda)
twpr.writeMongoDistrosIntoCSV(distros, num_topics, 'new_distros.csv')

# Write the topic keywords, weighted by TF-IDF frequencies, to a second CSV.
topics = twpr.TFIDFsFromMongoDBTopicDistributions(distros, corpus, dictionary)
twpr.writeTFIDFsToCSV(topics, 'new_tfidf.csv')