import nltk
import numpy as np
import readerAndWriter  # project module providing readFile


def extract_features(tweet):
    # mark, for each known word feature, whether it appears in this tweet
    word_features = get_word_features(tweet)
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

'''
Train on file "trainGold.tsv"
Test on file "devGold.tsv"
(attempted to view confusion matrix. bugs...)
'''

# TRAIN
read = readerAndWriter.readFile("data/cleaned/trainGold.tsv")
dataForTweets = readTweets(read)
training_set = nltk.classify.apply_features(extract_features, dataForTweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

# TEST
testFile = readerAndWriter.readFile("data/cleaned/devGold.tsv")
dataForTest = readTweets(testFile)
test_set = nltk.classify.apply_features(extract_features, dataForTest)

print "CLASSIFYING: ", nltk.classify.accuracy(classifier, test_set)
classifier.show_most_informative_features(10)

'''
cm = nltk.ConfusionMatrix(training_set, training_set)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))
'''
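# The commented-out ConfusionMatrix call above passes training_set twice, which
# is why it never worked: nltk.ConfusionMatrix expects two parallel lists of
# labels (gold vs. predicted). A minimal sketch of what that could look like,
# assuming each item in dataForTest is a (words, label) pair as readTweets
# appears to return:
gold = [label for (words, label) in dataForTest]
predicted = [classifier.classify(extract_features(words))
             for (words, label) in dataForTest]
cm = nltk.ConfusionMatrix(gold, predicted)
print cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9)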
#result = []
#for tweet in tagResults:
#    #w = word, t = tag, c = confidence level
#    cleanTweet = ""
#    for triple in tweet:
#        print triple
#        (w, t, c) = triple
#        #removing urls, user mentions, numbers, and hashtags from the tweet
#        if t != 'U' and t != '@' and t != '$' and t != '#':
#            cleanTweet += str(w) + " "
#        #print "cleantweet is:", cleanTweet
#        #print w
#    print cleanTweet
#    result.append(cleanTweet)
#print result

#readResults = readerAndWriter.readFile("cleaned/PFTweets.txt")
readTrain = readerAndWriter.readFile("cleaned/PFTweetsTrain.txt")
readTest = readerAndWriter.readFile("cleaned/PFTweetsTest.txt")
#print "read results\n", readResults
print "\n DONE READING \n"

#print "\nclean tweet\n", cleanTweet(tweets)
#replacedResults = replaceTaggedTweet(readResults)
taggedTrain = replaceTaggedTweet(readTrain)
taggedTest = replaceTaggedTweet(readTest)
print "\n DONE REPLACING \n"
#print "\nreplace tagged tweet\t", replacedResults

#np.savetxt("cleaned/testCleaned.txt", replacedResults)
#toFile("cleaned/PFTweetsCleaned.txt", replacedResults)
toFile("cleaned/PFTrainTag.txt", taggedTrain)
toFile("cleaned/PFTestTag.txt", taggedTest)
print "\n DONE WRITING \n"
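# The commented-out loop above shows the intended cleanup: drop tokens tagged
# as URLs ('U'), user mentions ('@'), numbers ('$'), or hashtags ('#') from
# each tagged tweet. A small helper capturing the same idea (the function name
# is hypothetical; it assumes each tweet is a list of (word, tag, confidence)
# triples, as in that loop):
def stripTaggedTokens(taggedTweets):
    cleaned = []
    for tweet in taggedTweets:
        kept = [str(w) for (w, t, c) in tweet if t not in ('U', '@', '$', '#')]
        cleaned.append(" ".join(kept))
    return cleaned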
def collapseScales(originalData):
    # newFile is an array of arrays where each element in newFile
    # is a line (as an array)
    newFile = []
    for e in originalData:
        line = e.split("\t")
        # fold the 5-point scale [-2 .. 2] down to 3 points [-1, 0, 1]
        if (int(line[2]) == -2):
            line[2] = str(-1)
        elif (int(line[2]) == 2):
            line[2] = str(1)
        newFile.append(line)
    return newFile

''' ******************* COLLAPSE SCALES *************************** '''
originalFile = readerAndWriter.readFile("cleaned/allTopics.tsv")

# find the distribution of the labels before collapsing the scales
dist_count = [0] * 5
for element in originalFile:
    line = element.split("\t")
    dist_count[int(line[2]) + 2] += 1
print "Distribution count of numbers [-2, -1, 0, 1, 2]:", dist_count  #[139, 1086, 2667, 4608, 549]
print "Total number of tweets", np.sum(dist_count)  #9049

distribution = [0] * 5
for num in range(len(dist_count)):
    print "num is", num
    print dist_count[num]
    # cast to float so Python 2 integer division does not truncate to 0
    distribution[num] = dist_count[num] / float(np.sum(dist_count))
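# A minimal sketch of actually applying collapseScales and re-counting the
# labels on the collapsed [-1, 0, 1] scale (variable names here are
# illustrative, not from the original script):
collapsedFile = collapseScales(originalFile)
collapsed_count = [0] * 3
for line in collapsedFile:
    collapsed_count[int(line[2]) + 1] += 1
print "Distribution count of numbers [-1, 0, 1]:", collapsed_count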