def main():
    """Train and evaluate tweet sentiment classifiers.

    Usage:
        python3 classifier.py training_file test_file
        python3 classifier.py training_file num_folds_for_cross_val

    If the second argument parses as an int it is taken as a fold count
    for cross-validation on the training data; otherwise it is treated
    as the path of a held-out test file.
    """
    # Fixed: the old usage string embedded a stray "\ " that printed a
    # literal backslash in the middle of the message.
    usage = ('python3 classifier.py training_file test_file OR\n'
             'python3 classifier.py training_file num_folds_for_cross_val')
    if len(sys.argv) != 3:
        print(usage)
        return

    trainData = parse_tweets(sys.argv[1], 'B')

    # Keep the try narrow: only the int() conversion decides the mode.
    # (Previously the whole cross-validation run sat inside the try, so a
    # ValueError raised during training silently fell through and
    # misinterpreted argv[2] as a test-file path.)
    try:
        fold = int(sys.argv[2])
    except ValueError:
        fold = None

    if fold is not None:
        # n-fold cross-validation on the training set itself.
        '''
        linearSVCClassifier = TweetClassifier(trainData, trainData,
            LinearSVC, name='Linear SVC Classifier', fold=fold)
        linearSVCClassifier.evaluate(TweetClassifier.MODE_CROSS_VALIDATE)
        '''
        logisticRegClassifier = TweetClassifier(
            trainData, trainData, LogisticRegression,
            name='Logistic Regression Classifier', fold=fold)
        logisticRegClassifier.evaluate(TweetClassifier.MODE_CROSS_VALIDATE)
    else:
        # Train on the training file, evaluate on the held-out test file.
        testData = parse_tweets(sys.argv[2], 'B')
        '''
        linearSVCClassifier = TweetClassifier(trainData, testData,
            LinearSVC, name='Linear SVC Classifier')
        linearSVCClassifier.evaluate(TweetClassifier.MODE_TEST_FILE)
        '''
        logisticRegClassifier = TweetClassifier(
            trainData, testData, LogisticRegression,
            name='Logistic Regression Classifier')
        logisticRegClassifier.evaluate(TweetClassifier.MODE_TEST_FILE)
def handleCommandLineArgs(): # retrieve and check command-line arguments #print sys.argv if len(sys.argv) != 3: print "Error, incorrect number of arguments" usage() sys.exit(1) trainingFile = check_argv(sys.argv[1]) if os.path.exists(sys.argv[2]): testFile = check_argv(sys.argv[2]) crossVal = False else: n = check_n(sys.argv[2]) crossVal = True # build Tweet dataset #trainingFile = '/data/cs65/semeval-2015/B/train/twitter-train-full-B.tsv' tweetData = parse_tweets(trainingFile, 'B') if crossVal: # do n-fold cross validation on training set specified trainFolds, testFolds = crossValidation(tweetData, n, False) testData = None else: testData = parse_tweets(testFile, 'B') trainFolds = [tweetData["tweets"].keys()] testFolds = [testData["tweets"].keys()] return tweetData, testData, trainFolds, testFolds
def main():
    """Load the SemEval-2015 task B training tweets and run the tests."""
    global fold
    fold = 5  # presumably read by the cross-validation helpers elsewhere in this module — verify
    trainPath = '/data/cs65/semeval-2015/B/train/twitter-train-full-B.tsv'
    testData(parse_tweets(trainPath, 'B'))
def main():
    """Print the answers to homework questions 1-4 to the terminal."""
    path = '/data/cs65/semeval-2015/B/train/twitter-train-full-B.tsv'
    data = parse_tweets(path, 'B')

    # Q1: number of training examples.
    print("(1) There are %d training examples" % len(data['tweets']))

    # Q2: sentiment label frequencies, first as-is, then with
    # neutral/objective/objective-OR-neutral conflated into neutral.
    print("(2) With conflation of neutral/objective turned off or on:")
    print("OFF:")
    mfsPlain = MFS_counter(data)
    print("---------------")
    print("ON:")
    mfsConflated = MFS_counter(data, True)
    print("---------------")

    # Q3 & Q4: baselines (figures computed by hand from the Q2 counts).
    print("(3)\n\tWith conflation off: 5 sentiments, so 20% tagged correctly.\n\tWith conflation on: 3 sentiments, so 33.3% tagged correctly.")
    print("(4)\n\tWith conflation off: MFS = positive with 37.3% tagged correctly\n\tWith conflation on: MFS = neutral with 48.2% tagged correctly")
def main():
    """Run the full experiment suite (questions 5-11) over the SemEval-2015
    task B training tweets, printing every result to the terminal.

    Each experiment compares classifiers under different preprocessing
    regimes (conflation, case-folding, negation marking, tokenization) and
    stopword-removal thresholds, using 5-fold cross-validation throughout.
    """
    filename = '/data/cs65/semeval-2015/B/train/twitter-train-full-B.tsv'
    tweetData = parse_tweets(filename, 'B')
    # use 5-fold cross-validation
    trainFolds, testFolds = crossValidation(tweetData, 5, False)

    # Preprocessing variants; each builds on the conflated "standard" data.
    standardTweetData = conflateData(tweetData)
    lowerTweetData = lowerData(standardTweetData)
    negatedTweetData = negateData(standardTweetData, ['not'])
    lowerNegatedTweetData = negateData(lowerTweetData, ['not'])
    tokenizedTweetData = tokenizeData(standardTweetData)
    #extras: second negation strategy with a wider trigger-word set
    negateWords = ['not', 'no', 'never']
    negatedTweetData2 = negateData(standardTweetData, negateWords)
    lowerNegatedTweetData2 = negateData(lowerTweetData, negateWords)

    print "\nQuestion 5: Most Frequent Sentiments in %d Chunks:\n" % (
        len(testFolds))
    testMFSList = question5(testFolds, standardTweetData)

    print "\nQuestion 6: Accuracy of MFS on Test Data (with cross-validation):"
    question6(trainFolds, testFolds, standardTweetData)

    print "\nQuestion 7: Accuracy of Decision List Classifier:\n"
    print "Standard Conditions:"
    question7(trainFolds, testFolds, standardTweetData, 0)
    # test stopwords — sweep the removal threshold from 25 to 150 by 25
    print "\nWith Stopwords Removed:"
    for i in range(25, 151, 25):
        print i, "-", question7(trainFolds, testFolds, standardTweetData, i)
    print "\nWith Case-Folding:"
    question7(trainFolds, testFolds, lowerTweetData, 0)

    print "\nQuestion 8: Accuracy of Decision List with Negations after 'not':"
    question7(trainFolds, testFolds, negatedTweetData, 0)
    print "\nApply negation after 'not','no','never':"
    question7(trainFolds, testFolds, negatedTweetData2, 0)

    print "\nQuestion 9: Accuracy of Naive Bayes Classifier:"
    question9(trainFolds, testFolds, standardTweetData, 0)

    # Question 10: Naive Bayes under each preprocessing combination.
    print "\nQuestion 10: Naive Bayes Classifier:"
    print "\n1. With StopWords Removed:"
    for i in range(25, 151, 25):
        print i, '-', question9(trainFolds, testFolds, standardTweetData, i)
    print "\n2. With Case-Folding:"
    question9(trainFolds, testFolds, lowerTweetData, 0)
    print "\n3. With Negations after 'not':"
    question9(trainFolds, testFolds, negatedTweetData, 0)
    print "\n4. With Case-Folding and Negations after 'not':"
    question9(trainFolds, testFolds, lowerNegatedTweetData, 0)
    print "\n5. With StopWords Removed, Case-Folding, and Negations after 'not':"
    for i in range(25, 151, 25):
        print i, '-', question9(trainFolds, testFolds, lowerNegatedTweetData, i)

    # Repeat with the wider negation trigger set.
    print '\nUsing 2nd Negation Strategy...'
    print "\n1. With Negations after 'not', 'never', 'no':"
    question9(trainFolds, testFolds, negatedTweetData2, 0)
    print "\n2. With Case-Folding and Negations after 'not':"
    question9(trainFolds, testFolds, lowerNegatedTweetData2, 0)
    print "\n3. With StopWords Removed, Case-Folding, and Negations after 'not', \
'never', 'no':"
    for i in range(25, 151, 25):
        print i, '-', question9(trainFolds, testFolds, lowerNegatedTweetData2, i)
    print

    print "\nQuestion 11: Accuracy of Decision List and Naive Bayes with Tokenization\n"
    print 'Decision List:'
    question7(trainFolds, testFolds, tokenizedTweetData, 0)
    print 'Naive Bayes:'
    question9(trainFolds, testFolds, tokenizedTweetData, 0)

    # Extras: tokenization combined with case-folding and negation.
    print "\nQuestion 11: Extras\n"
    tokenedLowerTweetData = lowerData(tokenizedTweetData)
    tokenedLowerNegatedTweetData = negateData(tokenedLowerTweetData, negateWords)
    print 'Decision List:'
    question7(trainFolds, testFolds, tokenedLowerNegatedTweetData, 0)
    print 'Naive Bayes:'
    question9(trainFolds, testFolds, tokenedLowerNegatedTweetData, 30)
# NOTE(review): the lines down to the `print` are the tail of a
# most-frequent-sentiment accuracy routine whose `def` line (and the
# bindings of sortSents / tweetData / conflate) lie outside this excerpt —
# indentation below is a reconstruction; confirm against the full file.
MFS = sortSents[-1][0]  # sortSents appears sorted ascending by count, so [-1] is the most frequent
correctCount = 0
tweets = tweetData["tweets"].keys()
for tweetID in tweets:
    sentiments = tweetData["tweets"][tweetID]["answers"]
    if conflate:
        # Collapse objective/neutral answers into a single 'neutral' label.
        if 'objective' in sentiments or 'neutral' in sentiments:
            sentiments = ['neutral']
    if MFS in sentiments:
        correctCount += 1
# Accuracy of always guessing the most frequent sentiment, as a percentage.
print float(correctCount) / len(tweets) * 100

if __name__ == '__main__':
    # Driver: load the training tweets and print answers to questions 1-4.
    filename = '/data/cs65/semeval-2015/B/train/twitter-train-full-B.tsv'
    tweetData = parse_tweets(filename, 'B')
    print "\nQuestion 1:\n"
    question1(tweetData)
    print "\nQuestion 2:\n"
    sentDictionary1 = question2(tweetData, False)
    print "\nQuestion 2 (with conflation to neutral):\n"
    sentDictionary2 = question2(tweetData, True)
    print "\nQuestion 3:\n"
    question3(tweetData)
    print "\nQuestion 4:\n"
    # Question 4 is run both without and with conflation.
    question4(sentDictionary1, tweetData, False)
    question4(sentDictionary2, tweetData, True)
def build_decision_list_file(filename):
    """Parse the tweet file at *filename* and build a decision list from it."""
    return build_decision_list(parse_tweets(filename, 'B'), 'tweets')
# NOTE(review): the lines down to `return scores` are the tail of a
# decision-list scoring function whose `def` (and the bindings of counts,
# sense, feature, presence_elsewhere, scores) lie outside this excerpt —
# indentation is a reconstruction; confirm against the full file.
        # Sum how often this feature appears under every OTHER sense.
        for sense2 in counts:
            if sense2 != sense and feature in counts[sense2]:
                presence_elsewhere += counts[sense2][feature]
        # Smooth: avoid log(0) when the feature never occurs elsewhere.
        if presence_elsewhere == 0:
            presence_elsewhere = 0.1
        # NOTE(review): this divides two logs, log(a)/log(b); a decision-list
        # score is usually log(a/b) = log(a) - log(b). Possibly intentional,
        # but worth confirming against the assignment spec.
        score = log(counts[sense][feature]) / log(presence_elsewhere)
        scores.append((sense, feature, score))
    return scores

def classify(instance, decision_list, MFS):
    """Classify *instance* with the first matching decision-list entry.

    Each entry is (sense, feature, ...); the first entry whose feature is
    present in the instance's bag of words wins. Falls back to *MFS*
    (the most frequent sense) when nothing matches.
    """
    # features = get_features(instance, range(2,4), True, [], 3)
    features = get_bag_of_words(instance, caseFolding=True)
    for entry in decision_list:
        if entry[1] in features:
            return entry[0]
    return MFS

if __name__ == "__main__":
    # Demo driver: show stopword extraction at two document-frequency
    # thresholds, then demonstrate in-place negation marking.
    filename = 'twitter-train-full-B.tsv'
    tweetData = parse_tweets(filename, 'B')
    stopwords = get_stopwords(tweetData['tweets'], 0.2)
    stopwords2 = get_stopwords(tweetData['tweets'], 0.1)
    print("Stopwords with threshold of 0.2\n", stopwords)
    print("Stopwords with threshold of 0.1\n", stopwords2)
    print()
    example = ["This", "is", "not", "a", "very", "good", "example", ",",
               "but", "that", "one", "was", "."]
    print(example)
    # negation_processing mutates the list in place — print before and after.
    negation_processing(example)
    print(example)
def main():
    """Load the sentiment lexicons, then preprocess a tweet file and write it out.

    Usage: python3 preprocess.py <INPUT_FILE_NAME> <OUTPUT_FILE_NAME>

    Side effects: populates the module-level lexicon globals
    (abbreviationDict, emoticonDict, stopWords, positiveWords,
    negativeWords) that preprocess() presumably consults — verify.
    """
    usage = 'python3 preprocess.py <INPUT_FILE_NAME> <OUTPUT_FILE_NAME>'
    if len(sys.argv) != 3:
        print(usage)
        return

    # abbreviation -> expansion, both lower-cased; lines look like
    # "abbr: expansion".
    global abbreviationDict
    abbreviationDict = {}
    with open('abbreviationDict.txt', 'r') as abbrevFile:
        for line in abbrevFile:
            fields = line.split(': ')
            abbreviationDict[fields[0].lower()] = fields[1].lower()

    # emoticon -> replacement word.
    global emoticonDict
    emoticonDict = {}
    with open('emoticonDict.txt', 'r') as emoticonFile:
        for line in emoticonFile:
            fields = line.split()
            # BUG FIX: emoticons were only ever written into
            # abbreviationDict, leaving emoticonDict permanently empty.
            # Populate emoticonDict as intended; keep the abbreviationDict
            # entry too so existing consumers see unchanged contents.
            emoticonDict[fields[0]] = fields[1]
            abbreviationDict[fields[0]] = fields[1]

    global stopWords
    stopWords = []
    with open('stopWordsDict.txt', 'r') as stopFile:
        for line in stopFile:
            stopWords.append(line.split()[0])

    # Opinion lexicons ship in Latin-1, hence the explicit encoding.
    global positiveWords
    positiveWords = {}
    with open('positive-words.txt', 'r', encoding='ISO-8859-1') as posFile:
        for line in posFile:
            positiveWords[line.split()[0]] = 1

    global negativeWords
    negativeWords = {}
    with open('negative-words.txt', 'r', encoding='ISO-8859-1') as negFile:
        for line in negFile:
            negativeWords[line.split()[0]] = 1

    trainFileName = sys.argv[1]
    trainData = parse_tweets(trainFileName, 'B')
    preprocess(trainData)
    outputFileName = sys.argv[2]
    write(trainData, outputFileName)
def main():
    """Build the preprocessing lexicons, then normalize a tweet file to disk.

    Usage: python3 preprocess.py <INPUT_FILE_NAME> <OUTPUT_FILE_NAME>

    Side effects: fills the module-level globals abbreviationDict,
    emoticonDict, stopWords, positiveWords, and negativeWords, which
    preprocess() presumably reads — verify against its definition.
    """
    usage = 'python3 preprocess.py <INPUT_FILE_NAME> <OUTPUT_FILE_NAME>'
    if len(sys.argv) != 3:
        print(usage)
        return

    # "abbr: expansion" per line; both sides stored lower-cased.
    global abbreviationDict
    abbreviationDict = {}
    with open('abbreviationDict.txt', 'r') as fh:
        for line in fh:
            fields = line.split(': ')
            abbreviationDict[fields[0].lower()] = fields[1].lower()

    # whitespace-separated "emoticon replacement" per line.
    global emoticonDict
    emoticonDict = {}
    with open('emoticonDict.txt', 'r') as fh:
        for line in fh:
            fields = line.split()
            # BUG FIX: the original stored emoticons only in
            # abbreviationDict, so emoticonDict stayed empty. Fill
            # emoticonDict as intended while preserving the old
            # abbreviationDict entry for backward compatibility.
            emoticonDict[fields[0]] = fields[1]
            abbreviationDict[fields[0]] = fields[1]

    global stopWords
    stopWords = []
    with open('stopWordsDict.txt', 'r') as fh:
        for line in fh:
            stopWords.append(line.split()[0])

    # The opinion-lexicon files are Latin-1 encoded.
    global positiveWords
    positiveWords = {}
    with open('positive-words.txt', 'r', encoding='ISO-8859-1') as fh:
        for line in fh:
            positiveWords[line.split()[0]] = 1

    global negativeWords
    negativeWords = {}
    with open('negative-words.txt', 'r', encoding='ISO-8859-1') as fh:
        for line in fh:
            negativeWords[line.split()[0]] = 1

    trainFileName = sys.argv[1]
    trainData = parse_tweets(trainFileName, 'B')
    preprocess(trainData)
    outputFileName = sys.argv[2]
    write(trainData, outputFileName)