# ------------------------------------------------------------------
# Raw data (i.e. without unicode clean-up etc.)
# ------------------------------------------------------------------
twitter_train_raw = prepareTwitterData(Globals.TWITTER_TRAIN, splitwords=False)
twitter_test_raw = prepareTwitterData(Globals.TWITTER_TEST, splitwords=False)
wiki_raw = prepareWikiData(Globals.WIKI_TRAIN, splitwords=False)
blog_raw = prepareBlogData(Globals.BLOG_DATA, splitwords=False)

# ------------------------------------------------------------------
# Raw POS data (presumably one POS-tag sequence per observation,
# aligned index-for-index with the raw text above — TODO confirm)
# ------------------------------------------------------------------
twitter_train_pos_raw = get_pos_data(Globals.TWITTER_TRAIN_POS)
twitter_test_pos_raw = get_pos_data(Globals.TWITTER_TEST_POS)
wiki_pos_raw = get_pos_data(Globals.WIKI_POS)
blog_pos_raw = get_pos_data(Globals.BLOG_POS)

# ------------------------------------------------------------------
# Find which observation indices survive cleaning for each source.
# Wiki goes through its own cleaner (drops nonsense observations);
# the others only need UTF-8 normalization.
# ------------------------------------------------------------------
twitter_train_indices = to_utf8(twitter_train_raw, return_indices=True)
twitter_test_indices = to_utf8(twitter_test_raw, return_indices=True)
wiki_indices = clean_wiki.clean_wiki(wiki_raw, return_indices=True)
blog_indices = to_utf8(blog_raw, return_indices=True)

# Keep only the clean observations in each text corpus.
twitter_train = get_elems_at(twitter_train_raw, twitter_train_indices)
twitter_test = get_elems_at(twitter_test_raw, twitter_test_indices)
wiki = get_elems_at(wiki_raw, wiki_indices)
blog = get_elems_at(blog_raw, blog_indices)

# Filter the POS data with the SAME indices so text and POS stay aligned.
twitter_train_pos = get_elems_at(twitter_train_pos_raw, twitter_train_indices)
twitter_test_pos = get_elems_at(twitter_test_pos_raw, twitter_test_indices)
wiki_pos = get_elems_at(wiki_pos_raw, wiki_indices)
blog_pos = get_elems_at(blog_pos_raw, blog_indices)

# Combined twitter-train + wiki POS data.
tw_pos = twitter_train_pos + wiki_pos
def histo_to_tuples(data, name): names = [name] * len(data[0]) cts = data[0] buckets = data[1][1:] if len(cts) != len(data[1]) else data[1] return zip(names, buckets, cts) # Preparing data # Possible training data twitter_train = to_utf8(prepareTwitterData(Globals.TWITTER_TRAIN, splitwords = False)) twitter_test = to_utf8(prepareTwitterData(Globals.TWITTER_TEST, splitwords = False)) wiki = prepareWikiData(Globals.WIKI_TRAIN, splitwords= False) # clean wiki data wiki = clean_wiki.clean_wiki(wiki) tw = twitter_train + wiki # blog data blog = to_utf8(prepareBlogData(Globals.BLOG_DATA, splitwords=False)) # Counts of unigrams -> data is sparse in all three sources def count_unigrams(outpath): tw_cter, twitter_cts = Features.wordCountsSkLearn(Features.getX(tw), stop_words = 'english') blog_cter, blog_cts = Features.wordCountsSkLearn(Features.getX(blog), stop_words = 'english') # Total number of non-stop-word unigrams unigrams = set(tw_cter.vocabulary_.keys() + blog_cter.vocabulary_.keys()) print "Data has %d distinct unigrams" % len(unigrams) # Distribution of unigram cts twitter_unigram_histo = histogram_cts(twitter_cts)
# ------------------------------------------------------------------
# Raw data (i.e. without unicode clean-up etc.)
# ------------------------------------------------------------------
twitter_train_raw = prepareTwitterData(Globals.TWITTER_TRAIN, splitwords=False)
twitter_test_raw = prepareTwitterData(Globals.TWITTER_TEST, splitwords=False)
wiki_raw = prepareWikiData(Globals.WIKI_TRAIN, splitwords=False)
blog_raw = prepareBlogData(Globals.BLOG_DATA, splitwords=False)

# ------------------------------------------------------------------
# Raw POS data (presumably one POS-tag sequence per observation,
# aligned index-for-index with the raw text above — TODO confirm)
# ------------------------------------------------------------------
twitter_train_pos_raw = get_pos_data(Globals.TWITTER_TRAIN_POS)
twitter_test_pos_raw = get_pos_data(Globals.TWITTER_TEST_POS)
wiki_pos_raw = get_pos_data(Globals.WIKI_POS)
blog_pos_raw = get_pos_data(Globals.BLOG_POS)

# ------------------------------------------------------------------
# Find which observation indices survive cleaning for each source.
# Wiki goes through its own cleaner (drops nonsense observations);
# the others only need UTF-8 normalization.
# ------------------------------------------------------------------
twitter_train_indices = to_utf8(twitter_train_raw, return_indices=True)
twitter_test_indices = to_utf8(twitter_test_raw, return_indices=True)
wiki_indices = clean_wiki.clean_wiki(wiki_raw, return_indices=True)
blog_indices = to_utf8(blog_raw, return_indices=True)

# Keep only the clean observations in each text corpus.
twitter_train = get_elems_at(twitter_train_raw, twitter_train_indices)
twitter_test = get_elems_at(twitter_test_raw, twitter_test_indices)
wiki = get_elems_at(wiki_raw, wiki_indices)
blog = get_elems_at(blog_raw, blog_indices)

# Filter the POS data with the SAME indices so text and POS stay aligned.
twitter_train_pos = get_elems_at(twitter_train_pos_raw, twitter_train_indices)
twitter_test_pos = get_elems_at(twitter_test_pos_raw, twitter_test_indices)
wiki_pos = get_elems_at(wiki_pos_raw, wiki_indices)
blog_pos = get_elems_at(blog_pos_raw, blog_indices)

# Combined twitter-train + wiki POS data.
tw_pos = twitter_train_pos + wiki_pos