import logging
import os
import re

import numpy as np
from gensim.models.doc2vec import Doc2Vec, LabeledSentence  # pre-1.0 gensim API

import text_utils as tu  # project-local text utilities; module name assumed

# make_lda_model_name, build_lda, load_lda_model, apply_lda, date_topic_histogram,
# extract_topic_definitions and make_d2v_model_name are assumed to be defined
# elsewhere in this module.


def bin_tweets_by_date_and_lda(dataset, n_topics=10, mallet=False, dataname=""):
    # dataset: an instance of KenyaCSMessage -- a list of tweets, sorted by date.
    # Extract date and text, clean and tokenize the text, then build a BOW model.
    date_pos = dataset.date_pos
    text_pos = dataset.text_pos
    data = np.array(dataset.data)
    date_data = data[:, date_pos]
    lda_model_name = make_lda_model_name(dataname, n_topics=n_topics, mallet=mallet)
    text_data, text_dict, text_bow = tu.process_text(data[:, text_pos], stoplist=dataset.stoplist)
    logging.info("Text processed")

    # If an LDA model does not already exist, build it.
    if lda_model_name is None or not os.path.isfile(lda_model_name):
        logging.info("Building lda with %i topics" % int(n_topics))
        # The original hard-coded mallet=False here; forwarding the mallet flag
        # keeps the build consistent with load_lda_model() below.
        lda_model_name = build_lda(text_corpus=text_bow, dictionary=text_dict,
                                   n_topics=int(n_topics), mallet=mallet, dataname=dataname)
        logging.info("LDA model created in %s" % lda_model_name)

    # Load the LDA model.
    lda_model = load_lda_model(lda_model_name, mallet=mallet)

    # Create the histogram of counts per topic per date.
    topic_assignments = apply_lda(bow_text_data=text_bow, lda_model=lda_model)
    date_topic_histogram(date_data, topic_assignments, n_topics=n_topics, dataname=dataname)

    # Extract and process the topic definitions.
    extract_topic_definitions(lda_model=lda_model, n_topics=n_topics, dataname=dataname)
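
# A minimal, hypothetical sketch of the per-date binning that
# date_topic_histogram() is assumed to perform: for each calendar day, count
# how many tweets were assigned to each topic. The helper name, the date
# formatting, and the "one dominant topic per tweet" reading of
# topic_assignments are illustrative assumptions, not the project's API.
def _sketch_date_topic_histogram(date_data, topic_assignments, n_topics=10):
    from collections import defaultdict
    histogram = defaultdict(lambda: [0] * n_topics)  # day -> per-topic counts
    for date, topic in zip(date_data, topic_assignments):
        day = str(date)[:10]        # assumes dates render as "YYYY-MM-DD ..."
        histogram[day][topic] += 1  # assumes one dominant topic per tweet
    return dict(sorted(histogram.items()))
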
def extract_phrases(tweet_text_corpus, stoplist):
    # Sweep the bigram-score threshold and report which bigrams survive each setting.
    for thresh in [6000, 7000, 8000, 10000]:
        print("Threshold %i" % thresh)
        text_corpus, dictionary, bow = tu.process_text(tweet_text_corpus, stoplist=stoplist,
                                                       bigrams=thresh, trigrams=None, keep_all=False,
                                                       no_below=10, no_above=0.8)
        # Merged phrases are marked with an underscore, e.g. "word1_word2".
        bigrams = [word for word in dictionary.token2id.keys() if re.search("_", word)]
        print(len(bigrams))
        print(", ".join(bigrams))
        print("")
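
# Hypothetical sketch of the bigram detection that tu.process_text() is
# assumed to delegate to gensim's Phrases: token pairs whose collocation
# score exceeds `threshold` are merged into single "word1_word2" tokens.
# The threshold value mirrors the sweep above; min_count is an assumption.
def _sketch_detect_bigrams(tokenized_tweets, threshold=8000):
    from gensim.models import Phrases
    phrases = Phrases(tokenized_tweets, min_count=10, threshold=threshold)
    merged = [phrases[tweet] for tweet in tokenized_tweets]  # tweets with pairs merged
    found = sorted({tok for doc in merged for tok in doc if "_" in tok})
    return merged, found
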
def build_doc2vec(dataset, size=100, window=10, dataname="none"):
    """
    Given a text corpus, build doc2vec models (one DM, one DBOW).

    :param dataset: an instance of KenyaCSMessage -- a list of tweets, sorted by date.
    :param size: dimensionality of the document vectors.
    :param window: context window size.
    :param dataname: tag used to name the saved model files.
    :return: the trained DM and DBOW Doc2Vec models.
    """
    # Extract ids and text, clean and tokenize the text,
    # and label each tweet with its id.
    text_pos = dataset.text_pos
    id_pos = dataset.id_pos
    data = np.array(dataset.data)
    text_data, text_dict, text_bow = tu.process_text(data[:, text_pos],
                                                     stoplist=dataset.stoplist, keep_all=True)
    labeled_text_data = np.array([LabeledSentence(tweet, ["id_" + str(id_str)])
                                  for tweet, id_str in zip(text_data, data[:, id_pos])])
    logging.info("Text processed")
    logging.info("Building d2v")
    d2v_model_dm = Doc2Vec(min_count=1, window=window, size=size, sample=1e-3,
                           negative=5, workers=4)
    d2v_model_dbow = Doc2Vec(min_count=1, window=window, size=size, sample=1e-3,
                             negative=5, dm=0, workers=4)

    # Build the vocabulary over all tweets.
    d2v_model_dm.build_vocab(labeled_text_data)
    d2v_model_dbow.build_vocab(labeled_text_data)

    # Pass through the data set multiple times, shuffling the training tweets each
    # time to improve accuracy (pre-1.0 gensim API: train() performs one pass).
    for epoch in range(10):
        perm = np.random.permutation(labeled_text_data.shape[0])
        d2v_model_dm.train(labeled_text_data[perm])
        d2v_model_dbow.train(labeled_text_data[perm])

    d2v_model_dm_name = make_d2v_model_name(dataname, size, window, 'dm')
    d2v_model_dbow_name = make_d2v_model_name(dataname, size, window, 'dbow')
    d2v_model_dm.save(d2v_model_dm_name)
    d2v_model_dbow.save(d2v_model_dbow_name)
    return d2v_model_dm, d2v_model_dbow

#------------------------------
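
# Hedged usage sketch: reload a saved model and query its document vectors.
# Assumes the same pre-1.0 gensim API as above; the dataname "kenya" and the
# tweet id "12345" are hypothetical placeholders, not values from the project.
def _sketch_query_d2v(dataname="kenya", size=100, window=10, tweet_id="12345"):
    model = Doc2Vec.load(make_d2v_model_name(dataname, size, window, 'dm'))
    tag = "id_" + tweet_id
    vector = model.docvecs[tag]                            # learned vector for that tweet
    neighbours = model.docvecs.most_similar(tag, topn=5)   # closest tweets by cosine
    return vector, neighbours
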