# Load every corpus in its raw form (i.e. without unicode clean-up etc.)
twitter_train_raw, twitter_test_raw, wiki_raw, blog_raw = (
    prepareTwitterData(Globals.TWITTER_TRAIN, splitwords=False),
    prepareTwitterData(Globals.TWITTER_TEST, splitwords=False),
    prepareWikiData(Globals.WIKI_TRAIN, splitwords=False),
    prepareBlogData(Globals.BLOG_DATA, splitwords=False),
)

# Load the raw POS data for the same four sources
twitter_train_pos_raw, twitter_test_pos_raw, wiki_pos_raw, blog_pos_raw = (
    get_pos_data(Globals.TWITTER_TRAIN_POS),
    get_pos_data(Globals.TWITTER_TEST_POS),
    get_pos_data(Globals.WIKI_POS),
    get_pos_data(Globals.BLOG_POS),
)

# Ask each cleaner for the indices of the clean observations
# (wiki goes through clean_wiki, everything else through to_utf8)
twitter_train_indices, twitter_test_indices, wiki_indices, blog_indices = (
    to_utf8(twitter_train_raw, return_indices=True),
    to_utf8(twitter_test_raw, return_indices=True),
    clean_wiki.clean_wiki(wiki_raw, return_indices=True),
    to_utf8(blog_raw, return_indices=True),
)

# Drop observations with utf8 issues and nonsense observations (wiki)
twitter_train, twitter_test, wiki, blog = (
    get_elems_at(twitter_train_raw, twitter_train_indices),
    get_elems_at(twitter_test_raw, twitter_test_indices),
    get_elems_at(wiki_raw, wiki_indices),
    get_elems_at(blog_raw, blog_indices),
)

# Select the same indices from the POS data
twitter_train_pos, twitter_test_pos, wiki_pos, blog_pos = (
    get_elems_at(twitter_train_pos_raw, twitter_train_indices),
    get_elems_at(twitter_test_pos_raw, twitter_test_indices),
    get_elems_at(wiki_pos_raw, wiki_indices),
    get_elems_at(blog_pos_raw, blog_indices),
)

# Twitter-train and wiki POS data combined
tw_pos = twitter_train_pos + wiki_pos
Exemplo n.º 2
0
def histo_to_tuples(data, name):
    """Flatten a histogram into a list of ``(name, bucket, count)`` tuples.

    Args:
        data: Indexable pair ``(counts, buckets)``. When ``buckets`` and
            ``counts`` differ in length (presumably bin edges, which carry
            one extra entry), the first bucket entry is dropped.
        name: Label placed in the first position of every tuple.

    Returns:
        A list with one ``(name, bucket, count)`` tuple per count; pairs are
        truncated to the shorter of the two sequences.
    """
    cts, edges = data[0], data[1]
    # With one more edge than counts, skipping the first edge pairs count i
    # with edge i + 1.
    buckets = edges[1:] if len(cts) != len(edges) else edges
    labels = [name] * len(cts)
    # zip() is a one-shot lazy iterator on Python 3; materialise it so the
    # result is a real list on both Python 2 and Python 3.
    return list(zip(labels, buckets, cts))



# Data preparation
# Candidate training data: twitter train/test sets passed through to_utf8
twitter_train = to_utf8(
    prepareTwitterData(Globals.TWITTER_TRAIN, splitwords=False)
)
twitter_test = to_utf8(
    prepareTwitterData(Globals.TWITTER_TEST, splitwords=False)
)
# Wiki data, cleaned immediately after loading
wiki = clean_wiki.clean_wiki(
    prepareWikiData(Globals.WIKI_TRAIN, splitwords=False)
)
# Twitter training data and wiki data combined
tw = twitter_train + wiki
# Blog data
blog = to_utf8(prepareBlogData(Globals.BLOG_DATA, splitwords=False))

# Counts of unigrams -> data is sparse in all three sources
def count_unigrams(outpath):
    """Count unigrams in the module-level ``tw`` and ``blog`` data.

    Prints the number of distinct unigrams found across both sources (with
    the 'english' stop-word setting passed to the counter) and builds a
    histogram of the ``tw`` unigram counts.

    Args:
        outpath: NOTE(review): not referenced in the part of this function
            visible here -- confirm how it is used.
    """
    tw_cter, twitter_cts = Features.wordCountsSkLearn(Features.getX(tw), stop_words='english')
    blog_cter, blog_cts = Features.wordCountsSkLearn(Features.getX(blog), stop_words='english')

    # Total number of non-stop-word unigrams.
    # A set union is used instead of `keys() + keys()`, which only works
    # where keys() returns a list (Python 2) and raises TypeError on views.
    unigrams = set(tw_cter.vocabulary_.keys()).union(blog_cter.vocabulary_.keys())
    # Single parenthesised argument: prints the same text whether `print` is
    # the Python 2 statement or the Python 3 function.
    print("Data has %d distinct unigrams" % len(unigrams))

    # Distribution of unigram cts
    twitter_unigram_histo = histogram_cts(twitter_cts)
Exemplo n.º 3
0
# Load every corpus in its raw form (i.e. without unicode clean-up etc.)
twitter_train_raw, twitter_test_raw, wiki_raw, blog_raw = (
    prepareTwitterData(Globals.TWITTER_TRAIN, splitwords=False),
    prepareTwitterData(Globals.TWITTER_TEST, splitwords=False),
    prepareWikiData(Globals.WIKI_TRAIN, splitwords=False),
    prepareBlogData(Globals.BLOG_DATA, splitwords=False),
)

# Load the raw POS data for the same four sources
twitter_train_pos_raw, twitter_test_pos_raw, wiki_pos_raw, blog_pos_raw = (
    get_pos_data(Globals.TWITTER_TRAIN_POS),
    get_pos_data(Globals.TWITTER_TEST_POS),
    get_pos_data(Globals.WIKI_POS),
    get_pos_data(Globals.BLOG_POS),
)

# Ask each cleaner for the indices of the clean observations
# (wiki goes through clean_wiki, everything else through to_utf8)
twitter_train_indices, twitter_test_indices, wiki_indices, blog_indices = (
    to_utf8(twitter_train_raw, return_indices=True),
    to_utf8(twitter_test_raw, return_indices=True),
    clean_wiki.clean_wiki(wiki_raw, return_indices=True),
    to_utf8(blog_raw, return_indices=True),
)

# Drop observations with utf8 issues and nonsense observations (wiki)
twitter_train, twitter_test, wiki, blog = (
    get_elems_at(twitter_train_raw, twitter_train_indices),
    get_elems_at(twitter_test_raw, twitter_test_indices),
    get_elems_at(wiki_raw, wiki_indices),
    get_elems_at(blog_raw, blog_indices),
)

# Select the same indices from the POS data
twitter_train_pos, twitter_test_pos, wiki_pos, blog_pos = (
    get_elems_at(twitter_train_pos_raw, twitter_train_indices),
    get_elems_at(twitter_test_pos_raw, twitter_test_indices),
    get_elems_at(wiki_pos_raw, wiki_indices),
    get_elems_at(blog_pos_raw, blog_indices),
)

# Twitter-train and wiki POS data combined
tw_pos = twitter_train_pos + wiki_pos