def main():
    # sentiment = SentiWordNet('./SentiWordNet_3.0.0_20130122.txt')
    taggedTweets = getTaggedTweets('../splitTaggedTweets2.txt')
    taggedTweets = addTarget(taggedTweets, '../dataset_raw/semeval2016-task6-edited-trainingdata.txt')
    # dump each tweet and its stance label, tab-separated, one tweet per line
    writer = open('./final.txt', 'w')
    for tweet in taggedTweets:
        writer.write(' '.join(tweet['tweet']))
        writer.write('\t')
        writer.write(tweet['stance'])
        writer.write('\n')
    writer.close()
    word_vec_dict = oldWork.readGloveData('../glove.twitter.27B/glove.twitter.27B.25d.txt')
    tweets = [taggedTweet['tweet'] for taggedTweet in taggedTweets]
    # getFeatures below treats each tweet as a list of tokens (it indexes
    # individual words), so the tweets are kept as token lists instead of
    # being re-joined into strings:
    # for i in xrange(len(tweets)):
    #     tweets[i] = ' '.join(tweets[i])
    getFeatures(tweets, word_vec_dict)
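# Hedged usage sketch (an assumption, not part of the original project): one
# way the feature matrix returned by getFeatures could be fed to a classifier.
# scikit-learn, the helper name trainStanceClassifier, and the -1/0/+1 stance
# mapping are all assumed here, mirroring the commented-out Y code in
# getFeatures below.
def trainStanceClassifier(taggedTweets, X):
    from sklearn.svm import LinearSVC  # assumed dependency, not used elsewhere
    Y = []
    for tweet in taggedTweets:
        if tweet['stance'] == 'AGAINST':
            Y.append(-1)
        elif tweet['stance'] == 'FOR':
            Y.append(1)
        else:
            Y.append(0)
    # LinearSVC fits the three stance classes one-vs-rest
    return LinearSVC().fit(X, Y)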
def main():
    wordlist = InitializeWords()
    tweetFile = '../dataset_raw/semeval2016-task6-trainingdata.txt'
    tweetTags = '../processedTweets.txt'
    tweetWriter = open(tweetTags, 'w')
    tweets = oldWork.readTweets(tweetFile)
    # take the class labels before dropping the last element, matching the
    # module-level script below
    tweetClasses = tweets[len(tweets) - 1]
    tweets = tweets[:len(tweets) - 1]
    for tweet in tweets:
        sentence = ''
        for word in tweet:
            splitWord = word
            # print word
            # split hashtags (other than the #semst marker) into words
            if len(word) > 0 and word[0] == '#' and word != '#semst':
                splitWord = ParseSentence(splitWord, wordlist)
                # print splitWord
            sentence += splitWord
            sentence += ' '
        # sentence = ' '.join(tweet[1:len(tweet)])
        sentence = ' '.join(sentence.split())
        tweetWriter.write(sentence + '\n')
    tweetWriter.close()
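# Hedged sketch (an assumption): InitializeWords and ParseSentence are defined
# elsewhere in the repo. Judging by the call site above, ParseSentence takes a
# '#hashtag' token plus a word list and returns the hashtag split into words.
# A minimal greedy longest-match splitter, assuming wordlist is a set of
# lowercase words, might look like the hypothetical helper below.
def parseHashtagGreedy(hashtag, wordlist):
    text = hashtag.lstrip('#').lower()
    words = []
    start = 0
    while start < len(text):
        # take the longest dictionary word starting at `start`; fall back to a
        # single character so the scan always advances
        for end in xrange(len(text), start, -1):
            if text[start:end] in wordlist or end - start == 1:
                words.append(text[start:end])
                start = end
                break
    return ' '.join(words)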
import os
import sys

import IntermediateProjectWork as oldWork

origTweetFile = '../dataset_raw/semeval2016-task6-trainingdata.txt'
origTweets = oldWork.readTweets(origTweetFile)
origTweetClasses = origTweets[len(origTweets) - 1]
origTweets = origTweets[:len(origTweets) - 1]
origTweetReader = open(origTweetFile)
taggedTweetFile = '../tweetsTagged.txt'
taggedTweetReader = open(taggedTweetFile)
outputFile = '../dataset_raw/semeval2016-task6-edited-trainingdata.txt'
outputWriter = open(outputFile, 'w')
origLine = origTweetReader.readline()
outputWriter.write(origLine)
# print origTweets[0]
for taggedTweet in taggedTweetReader:
    print taggedTweet
    origLine = origTweetReader.readline()
    # print 'yo'
    origLineSplit = origLine.split('\t')
    # print origLineSplit
    outputWriter.write(origLineSplit[0] + '\t' + origLineSplit[1] + '\t')
    # print taggedTweet
    line = taggedTweet.strip().split('\t')
    tweetString = line[0]
def getFeatures(tweets, word_vec_dict):
    # Builds one feature vector per tweet: the summed GloVe embeddings of its
    # words, plus the embeddings of the strongest positive and strongest
    # negative lexicon word and the fractions of positive and negative words.
    X = []
    # Y = []
    # print tweets
    for tweet in tweets:
        # dummy_tweet = [tweet['target'].lower()]
        # dummy_tweet.extend(tweet['tweet'])
        # print oldWork.getSumVectors(dummy_tweet, word_vec_dict)
        x = word_vec_dict['hi']
        x = x * 0
        for word in tweet:
            x = x + oldWork.getWordVector(word, word_vec_dict)
        X.append(x)
        # if tweet['stance'] == 'AGAINST':
        #     Y.append(-1)
        # elif tweet['stance'] == 'FOR':
        #     Y.append(1)
        # else:
        #     Y.append(0)
    # print len(X)
    # print len(tweets)
    # print X
    nrc = readLexicon('../nrc_unigram.txt')
    # print nrc
    s140 = readLexicon('../s140_unigram.txt')
    # print s140
    sumPos = 0
    sumNeg = 0
    numPos = 0
    numNeg = 0
    maxPos = -1
    maxNeg = 1
    maxPosIndex = 0
    maxNegIndex = 0
    for i in xrange(len(tweets)):
        sumPos = 0
        sumNeg = 0
        numPos = 0
        numNeg = 0
        maxPos = -1
        maxNeg = 1
        maxPosIndex = 0
        maxNegIndex = 0
        tweet = tweets[i]
        # print tweet
        for j in xrange(len(tweet)):
            word = tweet[j]
            if word in nrc:
                if nrc[word] > 0:
                    numPos += 1
                    if nrc[word] > maxPos:
                        maxPos = nrc[word]
                        maxPosIndex = j
                elif nrc[word] < 0:
                    numNeg += 1
                    if nrc[word] < maxNeg:
                        maxNeg = nrc[word]
                        maxNegIndex = j
            elif word in s140:
                if s140[word] > 0:
                    numPos += 1
                    if s140[word] > maxPos:
                        maxPos = s140[word]
                        maxPosIndex = j
                elif s140[word] < 0:
                    numNeg += 1
                    if s140[word] < maxNeg:
                        maxNeg = s140[word]
                        maxNegIndex = j
        # np.concatenate(X[i], oldWork.getWordVector(tweet[maxPosIndex], word_vec_dict))
        # np.concatenate(X[i], oldWork.getWordVector(tweet[maxNegIndex], word_vec_dict))
        # np.concatenate(X[i], numPos/len(tweet))
        # np.concatenate(X[i], numNeg/len(tweet))
        X[i] = X[i].tolist()
        # print X[i]
        # print 'index ' + str(maxPosIndex)
        # print len(tweet)
        X[i].extend(oldWork.getWordVector(tweet[maxPosIndex], word_vec_dict))
        X[i].extend(oldWork.getWordVector(tweet[maxNegIndex], word_vec_dict))
        # float() avoids Python 2 integer division truncating these ratios to 0
        X[i].append(float(numPos) / len(tweet))
        X[i].append(float(numNeg) / len(tweet))
        # print len(X[i])
    return X
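# Hedged sketch (an assumption): readLexicon is defined elsewhere in the repo.
# getFeatures uses its result as a dict mapping each unigram to a signed
# sentiment score, and the NRC Hashtag / Sentiment140 unigram lexicons are
# commonly distributed as tab-separated "term<TAB>score<TAB>numPos<TAB>numNeg"
# lines, so a minimal stand-in (named differently to avoid clashing with the
# real helper) might look like this.
def readLexiconSketch(path):
    lexicon = {}
    reader = open(path)
    for line in reader:
        parts = line.strip().split('\t')
        if len(parts) >= 2:
            lexicon[parts[0]] = float(parts[1])  # signed sentiment score
    reader.close()
    return lexicon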