Example #1
def bin_tweets_by_date_and_lda(dataset, n_topics=10, mallet=False, dataname=""):
    # dataset is an instance of KenyaCSMessage: a list of tweets, sorted by date.

    # Extract the date and text columns.
    # Clean and tokenize the text, then build a BOW model.
    date_pos = dataset.date_pos
    text_pos = dataset.text_pos
    data = np.array(dataset.data)
    date_data = data[:, date_pos]

    lda_model_name = make_lda_model_name(dataname, n_topics=n_topics, mallet=mallet)

    text_data, text_dict, text_bow = tu.process_text(data[:, text_pos], stoplist=dataset.stoplist)
    logging.info("Text processed")

    # If an LDA model does not already exist, build it.
    if lda_model_name is None or not os.path.isfile(lda_model_name):

        logging.info("Building lda with %i " % int(n_topics))
        lda_model_name = build_lda(text_corpus=text_bow, dictionary=text_dict, n_topics=int(n_topics), mallet=False,
                                   dataname=dataname)
        logging.info("Lda model created in %s " % lda_model_name)

    # Load the LDA model
    lda_model = load_lda_model(lda_model_name, mallet=mallet)

    # Create the histogram of counts per topic per date.
    topic_assignments = apply_lda(bow_text_data=text_bow, lda_model=lda_model)
    date_topic_histogram(date_data, topic_assignments, n_topics=n_topics, dataname=dataname)
    # Extract and process topic definitions
    extract_topic_definitions(lda_model=lda_model, n_topics=n_topics, dataname=dataname)
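
A minimal usage sketch (hypothetical wiring: the KenyaCSMessage constructor and its argument are assumptions, not shown in this listing):

dataset = KenyaCSMessage("kenya_tweets.csv")  # hypothetical loader
bin_tweets_by_date_and_lda(dataset, n_topics=10, mallet=False, dataname="kenya")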
Example #2
def extract_phrases(tweet_text_corpus, stoplist):
    # Scan several bigram thresholds and report the phrases detected at each.
    for thresh in [6000, 7000, 8000, 10000]:
        print("Threshold %i " % thresh)
        text_corpus, dictionary, bow = tu.process_text(tweet_text_corpus, stoplist=stoplist,
                                                       bigrams=thresh, trigrams=None, keep_all=False,
                                                       no_below=10, no_above=0.8)

        # Detected phrases are stored with "_" joining their tokens; list them.
        bigrams = [word for word in dictionary.token2id.keys() if re.search("_", word)]
        print(len(bigrams))
        print(", ".join(bigrams))

        print()
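
A usage sketch (hypothetical wiring; reuses the dataset object from Example #1):

data = np.array(dataset.data)
extract_phrases(data[:, dataset.text_pos], stoplist=dataset.stoplist)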
Example #3
def build_doc2vec(dataset, size=100, window=10, dataname="none"):
    """
    Given a text corpus build a word2vec model
    :param size:
    :param window:
    :param dataname:
    :return:
    """

    # dataset is an instance of KenyaCSMessage: a list of tweets, sorted by date.

    # Extract the tweet id and text columns.
    # Clean and tokenize the text, then build a BOW model.
    text_pos = dataset.text_pos
    id_pos = dataset.id_pos
    data = np.array(dataset.data)

    text_data, text_dict, text_bow = tu.process_text(data[:, text_pos], stoplist=dataset.stoplist, keep_all=True)

    # LabeledSentence is the older gensim API (later renamed TaggedDocument);
    # each tweet is tagged with its id so its vector can be looked up later.
    labeled_text_data = np.array([LabeledSentence(tweet, ["id_"+str(id_str)])
                                  for tweet, id_str in zip(text_data, data[:, id_pos])])

    logging.info("Text processed")
    logging.info("Building d2v ")

    d2v_model_dm = Doc2Vec(min_count=1, window=window, size=size, sample=1e-3, negative=5, workers=4)
    d2v_model_dbow = Doc2Vec(min_count=1, window=window, size=size, sample=1e-3, negative=5, dm=0, workers=4)

    # Build the vocabulary over all tweets.
    d2v_model_dm.build_vocab(labeled_text_data)
    d2v_model_dbow.build_vocab(labeled_text_data)

    # Pass through the data set multiple times, shuffling the training tweets each epoch to improve accuracy.
    for epoch in range(10):
        perm = np.random.permutation(labeled_text_data.shape[0])
        d2v_model_dm.train(labeled_text_data[perm])
        d2v_model_dbow.train(labeled_text_data[perm])

    d2v_model_dm_name = make_d2v_model_name(dataname, size, window, 'dm')
    d2v_model_dbow_name = make_d2v_model_name(dataname, size, window, 'dbow')
    d2v_model_dm.save(d2v_model_dm_name)
    d2v_model_dbow.save(d2v_model_dbow_name)

    return d2v_model_dm, d2v_model_dbow
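
A short usage sketch (hypothetical: assumes the same dataset object and a pre-4.0 gensim, where Doc2Vec takes size= and document vectors are indexed via model.docvecs):

d2v_dm, d2v_dbow = build_doc2vec(dataset, size=100, window=10, dataname="kenya")
# Look up the learned vector for one tweet by the tag assigned above.
vec = d2v_dm.docvecs["id_" + str(dataset.data[0][dataset.id_pos])]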

#------------------------------