def main(): HOME_DIR = "semeval_parsed" input_fname = '200M' outdir = HOME_DIR + '_' + input_fname print outdir if not os.path.exists(outdir): os.makedirs(outdir) ddir = 'semeval/binary' train16 = "task-BD-train-2016.tsv" dev2016 = "task-BD-dev-2016.tsv" devtest2016 = "task-BD-devtest-2016.tsv" test2016 = "SemEval2016-task4-test.subtask-BD.txt" fname_vocab = os.path.join(outdir, 'vocab.pickle') alphabet = cPickle.load(open(fname_vocab)) dummy_word_idx = alphabet.fid print "alphabet", len(alphabet) print 'dummy_word:', dummy_word_idx topic_alphabet = Alphabet(start_feature_id=0) topic_alphabet.add('UNKNOWN_TOPIC_IDX') dummy_topic_idx = topic_alphabet.fid print "Loading Semeval Data" #save semeval tweets seperate files = [train16, dev2016, devtest2016, test2016] for fname in files: fname_ext = os.path.join(ddir, fname) tid, topics, tweets, sentiments = load_data(fname_ext, topic_alphabet) print "Number of tweets:", len(tweets) tweet_idx = pts.convert2indices(tweets, alphabet, dummy_word_idx) topic_idx = get_topic_indices(tweets, topics, topic_alphabet) basename, _ = os.path.splitext(os.path.basename(fname)) np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid) np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)), tweet_idx) np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)), sentiments) np.save(os.path.join(outdir, '{}.topics.npy'.format(basename)), topic_idx) cPickle.dump( topic_alphabet, open(os.path.join(outdir, 'vocab_{}.pickle'.format('topic')), 'w'))
def main(): HOME_DIR = "semeval_parsed" input_fname = '200M' outdir = HOME_DIR + '_' + input_fname print outdir if not os.path.exists(outdir): os.makedirs(outdir) ddir = 'semeval/binary' train16 = "task-BD-train-2016.tsv" dev2016 = "task-BD-dev-2016.tsv" devtest2016 = "task-BD-devtest-2016.tsv" test2016 = "SemEval2016-task4-test.subtask-BD.txt" fname_vocab = os.path.join(outdir, 'vocab.pickle') alphabet = cPickle.load(open(fname_vocab)) dummy_word_idx = alphabet.fid print "alphabet", len(alphabet) print 'dummy_word:',dummy_word_idx topic_alphabet = Alphabet(start_feature_id=0) topic_alphabet.add('UNKNOWN_TOPIC_IDX') dummy_topic_idx = topic_alphabet.fid print "Loading Semeval Data" #save semeval tweets seperate files = [train16,dev2016,devtest2016,test2016] for fname in files: fname_ext = os.path.join(ddir,fname) tid,topics,tweets, sentiments = load_data(fname_ext,topic_alphabet) print "Number of tweets:",len(tweets) tweet_idx = pts.convert2indices(tweets, alphabet, dummy_word_idx) topic_idx = get_topic_indices(tweets,topics,topic_alphabet) basename, _ = os.path.splitext(os.path.basename(fname)) np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid) np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)), tweet_idx) np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)), sentiments) np.save(os.path.join(outdir, '{}.topics.npy'.format(basename)), topic_idx) cPickle.dump(topic_alphabet, open(os.path.join(outdir, 'vocab_{}.pickle'.format('topic')), 'w'))
def main(): HOME_DIR = "semeval_parsed" input_fname = '200M' outdir = HOME_DIR + '_' + input_fname print outdir if not os.path.exists(outdir): os.makedirs(outdir) parse_200M = True if len(sys.argv) > 1: parse_200M = False train2013 = "semeval/task-B-train.20140221.tsv" dev2013 = "semeval/task-B-dev.20140225.tsv" test2013_sms = "semeval/task-B-test2013-sms.tsv" test2013_twitter = "semeval/task-B-test2013-twitter.tsv" test2014_twitter = "semeval/task-B-test2014-twitter.tsv" test2014_livejournal = "semeval/task-B-test2014-livejournal.tsv" test2014_sarcasm = "semeval/task-B-test2014-twittersarcasm.tsv" test15 = "semeval/task-B-test2015-twitter.tsv" train16 = "semeval/task-A-train-2016.tsv" dev2016 = "semeval/task-A-dev-2016.tsv" devtest2016 = "semeval/task-A-devtest-2016.tsv" test2016 = "semeval/task-A-test2016.tsv" smiley_tweets = 'semeval/smiley_tweets_{}_balanced.gz'.format(input_fname) fname_vocab = os.path.join(outdir, 'vocab.pickle') alphabet = cPickle.load(open(fname_vocab)) dummy_word_idx = alphabet.fid print "alphabet", len(alphabet) print 'dummy_word:', dummy_word_idx print "Loading Semeval Data" #ncol is the number of columns iside the files in semeval files = [(train2013, 4), (dev2013, 4), (test2013_sms, 4), (test2013_twitter, 4), (test2014_twitter, 4), (test2014_livejournal, 4), (test2014_sarcasm, 4), (test15, 4), (train16, 3), (dev2016, 3), (devtest2016, 3), (test2016, 3)] for (fname, ncols) in files: tid, tweets, sentiments = load_data(fname, alphabet, ncols=ncols) print "Number of tweets:", len(tweets) tweet_idx = pts.convert2indices(tweets, alphabet, dummy_word_idx) basename, _ = os.path.splitext(os.path.basename(fname)) np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid) np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)), tweet_idx) np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)), sentiments) if parse_200M: print "Loading Smiley Data" basename, _ = os.path.splitext(os.path.basename('smiley_tweets')) nTweets = pts.store_file( smiley_tweets, os.path.join(outdir, '{}.tweets.npy'.format(basename)), alphabet, dummy_word_idx, sentiment_fname=os.path.join(outdir, '{}.sentiments.npy'.format(basename))) print "Number of tweets:", nTweets
def main(): HOME_DIR = "semeval_parsed" input_fname = '200M' outdir = HOME_DIR + '_' + input_fname print outdir if not os.path.exists(outdir): os.makedirs(outdir) parse_200M = True if len(sys.argv) > 1: parse_200M = False train2013 = "semeval/task-B-train.20140221.tsv" dev2013 = "semeval/task-B-dev.20140225.tsv" test2013_sms = "semeval/task-B-test2013-sms.tsv" test2013_twitter = "semeval/task-B-test2013-twitter.tsv" test2014_twitter = "semeval/task-B-test2014-twitter.tsv" test2014_livejournal = "semeval/task-B-test2014-livejournal.tsv" test2014_sarcasm = "semeval/test_2014_sarcasm.tsv" test15 = "semeval/task-B-test2015-twitter.tsv" train16 = "semeval/task-A-train-2016.tsv" dev2016 = "semeval/task-A-dev-2016.tsv" devtest2016 = "semeval/task-A-devtest-2016.tsv" test2016 = "semeval/SemEval2016-task4-test.subtask-A.tsv" rand_tweets = 'semeval/random_tweet_neut.tsv' smiley_tweets = 'semeval/smiley_tweets_{}_balanced.gz'.format(input_fname) fname_vocab = os.path.join(outdir, 'vocab.pickle') alphabet = cPickle.load(open(fname_vocab)) dummy_word_idx = alphabet.fid print "alphabet", len(alphabet) print 'dummy_word:',dummy_word_idx print "Loading Semeval Data" #ncol is the number of columns iside the files in semeval files = [(train2013,4), (dev2013,4), (test2013_sms,4), (test2013_twitter,4), (test2014_twitter,4), (test2014_livejournal,4), (test2014_sarcasm,4), (test15,4), (train16,3), (dev2016,3), (devtest2016,3), (test2016,3)] files = [(rand_tweets,3)] for line in open('semeval/phrases'): fname = 'semeval/random_tweet_{}.tsv'.format(line.replace(' ','_').replace('\r','').replace('\n','')) ncols = 3 tid, tweets, sentiments = load_data(fname,alphabet,ncols=ncols) print "Number of tweets:",len(tweets) tweet_idx = pts.convert2indices(tweets, alphabet, dummy_word_idx) basename, _ = os.path.splitext(os.path.basename(fname)) np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid) np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)), tweet_idx) np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)), sentiments) if parse_200M: print "Loading Smiley Data" basename, _ = os.path.splitext(os.path.basename('smiley_tweets')) nTweets = pts.store_file(smiley_tweets,os.path.join(outdir, '{}.tweets.npy'.format(basename)),alphabet,dummy_word_idx,sentiment_fname=os.path.join(outdir,'{}.sentiments.npy'.format(basename))) print "Number of tweets:", nTweets