Example #1
def __iter__(self):
    # stream one tokenized tweet at a time from the gzipped TSV files
    for (fname, pos) in self.files:
        for line in gzip.open(fname, 'rb'):
            tweet = line.split('\t')[pos]
            tweet = preprocess_tweet(tweet)
            tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
            # drop any tokens that still contain whitespace
            yield filter(lambda word: ' ' not in word, tweet)
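Since this __iter__ streams one tokenized tweet at a time, the enclosing object can be handed to anything that consumes an iterable of token lists without loading the corpus into memory. A minimal sketch of such a wrapper, assuming the class holds self.files and self.tknzr the way the method uses them; the TweetCorpus name, its constructor, the column index, and the preprocess_tweet stub are assumptions, not the project's actual code:

import gzip
from nltk.tokenize import TweetTokenizer

def preprocess_tweet(tweet):
    # stand-in for the project's real preprocessing helper
    return tweet

class TweetCorpus(object):
    def __init__(self, files):
        self.files = files  # list of (gzip_path, tweet_column) pairs
        self.tknzr = TweetTokenizer(reduce_len=True)

    def __iter__(self):
        for (fname, pos) in self.files:
            for line in gzip.open(fname, 'rb'):
                tweet = line.split('\t')[pos]
                tweet = preprocess_tweet(tweet)
                tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                yield filter(lambda word: ' ' not in word, tweet)

corpus = TweetCorpus([('semeval/smiley_tweets_small.gz', 0)])
for i, tokens in enumerate(corpus):
    print tokens
    if i == 2:
        break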
Example #2
def load_data(fname):
    tid, topics, tweets, sentiments = [], [], [], []
    tknzr = TweetTokenizer(reduce_len=True)
    n_not_available = 0
    with open(fname) as f:
        for line in f:
            # columns: id, topic, sentiment, tweet
            splits = line.split('\t')
            tweet = splits[3]
            sentiment = convertSentiment(splits[2])
            if tweet != "Not Available\n":
                tid.append(splits[0])
                topic = pts.preprocess_tweet(splits[1])
                topic_tok = tknzr.tokenize(topic.decode('utf-8'))
                # note: the raw topic column is stored; topic_tok is unused
                topics.append(splits[1])

                tweet = pts.preprocess_tweet(tweet)
                tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
                tweets.append(tweet_tok)
                sentiments.append(int(sentiment))
            else:
                n_not_available += 1

    print "Number of not available tweets:", n_not_available
    return tid, topics, tweets, sentiments
Example #3
def load_data(fname, pos):
    tid, tweets, sentiments = [], [], []
    tknzr = TweetTokenizer(reduce_len=True)
    n_not_available = 0
    with open(fname) as f:
        for line in f:
            # the sentiment label sits at column pos, the tweet right after it
            splits = line.split('\t')
            tweet = splits[pos + 1]
            sentiment = convertSentiment(splits[pos])

            tid.append(splits[0])
            tweet = pts.preprocess_tweet(tweet)
            tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
            tweets.append(tweet_tok)
            sentiments.append(int(sentiment))

    return tid, tweets, sentiments
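This variant drops the topic column and the availability filter: pos names the sentiment column, and the tweet is expected in the column after it. Assuming the 2016 files are laid out as id, sentiment, tweet (which matches the column indices used in Example #5), a call might be:

tid, tweets, sentiments = load_data('semeval/task-A-train-2016.tsv', 1)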
Example #4
def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]
        print input_fname

    input_file = 'semeval/smiley_tweets_{}.gz'.format(input_fname)
    output_file = 'semeval/smiley_tweets_{}_balanced.gz'.format(input_fname)
    read_emo('emoscores')

    counter = 0
    pos_counter = 0
    neg_counter = 0
    pos_queue = deque()
    neg_queue = deque()
    f_out = gzip.open(output_file, 'w')
    with gzip.open(input_file, 'r') as f:
        for tweet in f:
            tweet, sentiment = convert_sentiment(tweet, trim=False)
            tweet = preprocess_tweet(tweet)
            # sentiment 0 is positive, 1 is negative in this pipeline
            if sentiment == 0:
                pos_queue.append(tweet)
                pos_counter += 1
            if sentiment == 1:
                neg_queue.append(tweet)
                neg_counter += 1
            counter += 1
            # emit tweets strictly in positive/negative pairs; the surplus
            # class waits in its queue, keeping the output balanced
            while len(neg_queue) > 0 and len(pos_queue) > 0:
                pos_tweet = pos_queue.popleft()
                neg_tweet = neg_queue.popleft()
                f_out.write(pos_tweet)
                f_out.write(neg_tweet)

            if (counter % 100000) == 0:
                print "Elements processed:", counter
    print "Pos tweets:", pos_counter
    print "Neg tweets:", neg_counter
    f_out.close()
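The balancing trick is the paired drain of the two deques: a tweet is written only once a counterpart with the opposite label has been queued, so the output alternates positive/negative exactly and any class surplus simply waits for later matches. A toy run of the same loop on hypothetical data:

from collections import deque

pos_queue = deque(['p1', 'p2', 'p3'])  # surplus positives
neg_queue = deque(['n1'])
while len(neg_queue) > 0 and len(pos_queue) > 0:
    print pos_queue.popleft(), neg_queue.popleft()  # prints: p1 n1
# 'p2' and 'p3' stay queued until matching negatives arrive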
Example #5
def main():
    outdir = "semeval_parsed_200M"

    print outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # supervised data
    train = "semeval/task-B-train-plus-dev.tsv"
    test = "semeval/task-B-test2014-twitter.tsv"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv"
    test15 = "semeval/task-B-test2015-twitter.tsv"
    train16 = "semeval/task-A-train-2016.tsv"
    dev2016 = "semeval/task-A-dev-2016.tsv"
    devtest2016 = "semeval/task-A-devtest-2016.tsv"
    test2016 = "semeval/SemEval2016-task4-test.subtask-A.txt"

    # unsupervised data
    smiley_tweets_200M = 'semeval/smiley_tweets_200M.gz'

    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX')
    dummy_word_idx = alphabet.fid
    tknzr = TweetTokenizer(reduce_len=True)

    fnames = [  # (file, tweet-column) pairs
        (train, 3),
        (dev, 3),
        (test, 3),
        (test15, 3),
        (train16, 2),
        (dev2016, 2),
        (devtest2016, 2),
        (test2016, 2)
    ]

    fnames_gz = [smiley_tweets_200M]

    counter = 0

    # build the vocabulary from the supervised files
    for (fname, pos) in fnames:
        with open(fname, 'r') as f:
            for line in f:
                tweet = line.split('\t')[pos]
                tweet, _ = convert_sentiment(tweet)
                tweet = tknzr.tokenize(preprocess_tweet(tweet))
                for token in tweet:
                    alphabet.add(token)
        print len(alphabet)

    # extend the vocabulary with the unsupervised smiley tweets
    for fname in fnames_gz:
        with gzip.open(fname, 'r') as f:
            for tweet in f:
                tweet, _ = convert_sentiment(tweet)
                tweet = tknzr.tokenize(preprocess_tweet(tweet))
                for token in tweet:
                    alphabet.add(token)
                counter += 1
                if (counter % 1000000) == 0:
                    print 'Processed Tweets:', counter

        print len(alphabet)

    print 'Alphabet before purge:', len(alphabet)
    alphabet.purge_dict(input_fname=type, min_freq=10)
    print 'Alphabet after purge:', len(alphabet)
    cPickle.dump(alphabet, open(os.path.join(outdir, 'vocab.pickle'), 'w'))
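A downstream script would restore the vocabulary with the matching cPickle.load call; a sketch of that consumer side (not shown in the original):

import cPickle
import os

# requires the Alphabet class to be importable at load time
alphabet = cPickle.load(open(os.path.join('semeval_parsed_200M', 'vocab.pickle'), 'r'))
print 'Vocabulary size:', len(alphabet)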