def main():
    # sentiment = SentiWordNet('./SentiWordNet_3.0.0_20130122.txt')
    taggedTweets = getTaggedTweets('../splitTaggedTweets2.txt')
    taggedTweets = addTarget(
        taggedTweets,
        '../dataset_raw/semeval2016-task6-edited-trainingdata.txt')
    # Dump each tweet with its stance label, tab-separated, one per line.
    with open('./final.txt', 'w') as writer:
        for tweet in taggedTweets:
            writer.write(' '.join(tweet['tweet']))
            writer.write('\t')
            writer.write(tweet['stance'])
            writer.write('\n')
    word_vec_dict = oldWork.readGloveData(
        '../glove.twitter.27B/glove.twitter.27B.25d.txt')
    # Re-join each token list into a single string before feature extraction.
    tweets = [' '.join(taggedTweet['tweet']) for taggedTweet in taggedTweets]
    getFeatures(tweets, word_vec_dict)
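# oldWork.readGloveData comes from IntermediateProjectWork and is not shown
# here. As a point of reference, a minimal loader sketch, assuming the
# standard GloVe text format (one "token v1 v2 ... vN" line per entry);
# the name below is illustrative, not the project's actual implementation.
import numpy as np

def read_glove_data_sketch(path):
    word_vec = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            # Token first, then its embedding values.
            word_vec[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return word_vec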
def main():
    wordlist = InitializeWords()
    tweetFile = '../dataset_raw/semeval2016-task6-trainingdata.txt'
    tweetTags = '../processedTweets.txt'
    tweets = oldWork.readTweets(tweetFile)
    # readTweets returns the class labels as its final element; grab them
    # before slicing them off. (The original sliced first, which dropped the
    # labels and made tweetClasses the second-to-last tweet instead.)
    tweetClasses = tweets[-1]
    tweets = tweets[:-1]
    with open(tweetTags, 'w') as tweetWriter:
        for tweet in tweets:
            words = []
            for word in tweet:
                # Segment hashtags (except the task marker #semst) into words.
                if len(word) > 0 and word[0] == '#' and word != '#semst':
                    word = ParseSentence(word, wordlist)
                words.append(word)
            # Re-split and re-join to normalize any extra whitespace that
            # hashtag segmentation introduced.
            sentence = ' '.join(' '.join(words).split())
            tweetWriter.write(sentence + '\n')
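# InitializeWords and ParseSentence are defined elsewhere in the project.
# Hashtag segmentation of this kind is commonly done by greedy longest-match
# against a wordlist; a minimal sketch under that assumption (illustrative
# only, not the project's actual implementation):
def segment_hashtag_sketch(tag, words):
    # Repeatedly peel the longest known word off the front of the hashtag body;
    # fall back to a single character so the loop always terminates.
    text, out = tag.lstrip('#').lower(), []
    while text:
        for end in range(len(text), 0, -1):
            if text[:end] in words or end == 1:
                out.append(text[:end])
                text = text[end:]
                break
    return ' '.join(out)

# e.g. segment_hashtag_sketch('#nomorewar', {'no', 'more', 'war'}) == 'no more war'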
import os
import sys
import IntermediateProjectWork as oldWork

origTweetFile = '../dataset_raw/semeval2016-task6-trainingdata.txt'

# readTweets returns the class labels as its final element.
origTweets = oldWork.readTweets(origTweetFile)
origTweetClasses = origTweets[-1]
origTweets = origTweets[:-1]

origTweetReader = open(origTweetFile)

taggedTweetFile = '../tweetsTagged.txt'
taggedTweetReader = open(taggedTweetFile)

outputFile = '../dataset_raw/semeval2016-task6-edited-trainingdata.txt'
outputWriter = open(outputFile, 'w')

# Copy the header line through unchanged.
origLine = origTweetReader.readline()
outputWriter.write(origLine)
for taggedTweet in taggedTweetReader:
    print(taggedTweet)
    origLine = origTweetReader.readline()
    origLineSplit = origLine.split('\t')
    # Keep the original ID and Target columns.
    outputWriter.write(origLineSplit[0] + '\t' + origLineSplit[1] + '\t')
    # Substitute the tagged tweet text for the original Tweet column.
    line = taggedTweet.strip().split('\t')
    tweetString = line[0]
    # The source snippet is truncated here; presumably the row is finished
    # with the tagged tweet and the original Stance column (which keeps its
    # trailing newline from readline):
    outputWriter.write(tweetString + '\t' + origLineSplit[3])
def getFeatures(tweets, word_vec_dict):
    X = []
    for tweet in tweets:
        # main() passes each tweet as a whitespace-joined string, so split it
        # back into tokens; iterating the string directly would walk characters.
        words = tweet.split()
        # Start from a zero vector of the embedding dimensionality, then sum
        # the word vectors of the tweet.
        x = word_vec_dict['hi'] * 0
        for word in words:
            x = x + oldWork.getWordVector(word, word_vec_dict)
        X.append(x)
    nrc = readLexicon('../nrc_unigram.txt')
    s140 = readLexicon('../s140_unigram.txt')
    for i in range(len(tweets)):
        numPos = 0
        numNeg = 0
        maxPos = -1   # strongest positive score seen so far
        maxNeg = 1    # strongest negative score seen so far
        maxPosIndex = 0
        maxNegIndex = 0

        words = tweets[i].split()

        for j in range(len(words)):
            word = words[j]
            # Prefer the NRC lexicon; fall back to Sentiment140.
            if word in nrc:
                if nrc[word] > 0:
                    numPos += 1
                    if nrc[word] > maxPos:
                        maxPos = nrc[word]
                        maxPosIndex = j
                elif nrc[word] < 0:
                    numNeg += 1
                    if nrc[word] < maxNeg:
                        maxNeg = nrc[word]
                        maxNegIndex = j
            elif word in s140:
                if s140[word] > 0:
                    numPos += 1
                    if s140[word] > maxPos:
                        maxPos = s140[word]
                        maxPosIndex = j
                elif s140[word] < 0:
                    numNeg += 1
                    if s140[word] < maxNeg:
                        maxNeg = s140[word]
                        maxNegIndex = j
        # Append the embeddings of the most positive and most negative words,
        # plus the positive/negative token ratios. Float division matters here:
        # under Python 2 integer division these ratios were always 0.
        X[i] = X[i].tolist()
        X[i].extend(oldWork.getWordVector(words[maxPosIndex], word_vec_dict))
        X[i].extend(oldWork.getWordVector(words[maxNegIndex], word_vec_dict))
        X[i].append(float(numPos) / len(words))
        X[i].append(float(numNeg) / len(words))

    return X
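# readLexicon is defined elsewhere in the project. The NRC Hashtag and
# Sentiment140 unigram lexicons are distributed as tab-separated lines,
# "term<TAB>score<TAB>numPositive<TAB>numNegative", with score > 0 positive
# and score < 0 negative; a minimal loader sketch under that assumption
# (illustrative, not the project's actual code):
def read_lexicon_sketch(path):
    scores = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) >= 2:
                # Map each term to its real-valued sentiment score.
                scores[parts[0]] = float(parts[1])
    return scores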