Example #1
import sys


def main():
    usage = ('python3 classifier.py training_file test_file OR\n'
             'python3 classifier.py training_file num_folds_for_cross_val')

    if len(sys.argv) != 3:
        print(usage)
        return

    trainFileName = sys.argv[1]
    trainData = parse_tweets(trainFileName, 'B')

    try:
        fold = int(sys.argv[2])
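        # disabled alternative: cross-validate with LinearSVC instead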
        '''
        linearSVCClassifier = TweetClassifier(trainData, trainData, \
            LinearSVC, name = 'Linear SVC Classifier', fold = fold)
        linearSVCClassifier.evaluate(TweetClassifier.MODE_CROSS_VALIDATE)
        '''

        logisticRegClassifier = TweetClassifier(trainData, trainData, \
            LogisticRegression, name = 'Logistic Regression Classifier', \
            fold = fold)
        logisticRegClassifier.evaluate(TweetClassifier.MODE_CROSS_VALIDATE)

    except ValueError:
        testFileName = sys.argv[2]
        testData = parse_tweets(testFileName, 'B')
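        # disabled alternative: evaluate LinearSVC on the test file instead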
        '''
        linearSVCClassifier = TweetClassifier(trainData, testData, \
            LinearSVC, name = 'Linear SVC Classifier')
        linearSVCClassifier.evaluate(TweetClassifier.MODE_TEST_FILE)
        '''
        logisticRegClassifier = TweetClassifier(trainData, testData, \
            LogisticRegression, name = 'Logistic Regression Classifier')
        logisticRegClassifier.evaluate(TweetClassifier.MODE_TEST_FILE)
Example #2
import os
import sys


def handleCommandLineArgs():
    # retrieve and check command-line arguments
    #print sys.argv
    if len(sys.argv) != 3:
        print "Error, incorrect number of arguments"
        usage()
        sys.exit(1)

    trainingFile = check_argv(sys.argv[1])

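    # the second argument is either an existing test file or a fold count for cross-validation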
    if os.path.exists(sys.argv[2]):
        testFile = check_argv(sys.argv[2])
        crossVal = False
    else:
        n = check_n(sys.argv[2])
        crossVal = True

    # build Tweet dataset
    #trainingFile = '/data/cs65/semeval-2015/B/train/twitter-train-full-B.tsv'
    tweetData = parse_tweets(trainingFile, 'B')

    if crossVal:
        # do n-fold cross validation on training set specified
        trainFolds, testFolds = crossValidation(tweetData, n, False)
        testData = None
    else:
        testData = parse_tweets(testFile, 'B')
        trainFolds = [tweetData["tweets"].keys()]
        testFolds = [testData["tweets"].keys()]

    return tweetData, testData, trainFolds, testFolds
Example #3
def main():
    filename = '/data/cs65/semeval-2015/B/train/twitter-train-full-B.tsv'
   
    global fold
    fold = 5
    
    tweetData = parse_tweets(filename, 'B')
    testData(tweetData)   
Example #4
def main():
    #Running this file will print the answers to each question to the terminal.
    filename = '/data/cs65/semeval-2015/B/train/twitter-train-full-B.tsv'
    tweetData = parse_tweets(filename, 'B')
    #Q1: How many training examples are there?
    print("(1) There are %d training examples" % len(tweetData['tweets'].keys()))

    #Q2: How often do each of the sentiment labels show up? What about if you
    #    conflate neutral/objective/objective-OR-neutral into neutral?
    print("(2) With conflation of neutral/objective turned off or on:")
    print("OFF:")
    maxKeyCOff = MFS_counter(tweetData)
    print("---------------")
    print("ON:")
    maxKeyCOn = MFS_counter(tweetData, True)
    print("---------------")

    #Q3: What is the random baseline?
    print("(3)\n\tWith conflation off: 5 sentiments, so 20% tagged correctly.\n\tWith conflation on: 3 sentiments, so 33.3% tagged correctly." )

    #Q4: What is the most frequent sentiment baseline?
    print("(4)\n\tWith conflation off: MFS = positive with 37.3% tagged correctly\n\tWith conflation on: MFS = neutral with 48.2% tagged correctly")
Example #5
def main():
    filename = '/data/cs65/semeval-2015/B/train/twitter-train-full-B.tsv'
    tweetData = parse_tweets(filename, 'B')

    # use 5-fold cross-validation
    trainFolds, testFolds = crossValidation(tweetData, 5, False)

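    # build preprocessed variants of the data: conflated labels, case-folded,
    # negated after 'not', and tokenized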
    standardTweetData = conflateData(tweetData)
    lowerTweetData = lowerData(standardTweetData)
    negatedTweetData = negateData(standardTweetData, ['not'])
    lowerNegatedTweetData = negateData(lowerTweetData, ['not'])
    tokenizedTweetData = tokenizeData(standardTweetData)

    #extras:
    negateWords = ['not', 'no', 'never']
    negatedTweetData2 = negateData(standardTweetData, negateWords)
    lowerNegatedTweetData2 = negateData(lowerTweetData, negateWords)

    print "\nQuestion 5: Most Frequent Sentiments in %d Chunks:\n" % (
        len(testFolds))
    testMFSList = question5(testFolds, standardTweetData)

    print "\nQuestion 6: Accuracy of MFS on Test Data (with cross-validation):"
    question6(trainFolds, testFolds, standardTweetData)

    print "\nQuestion 7: Accuracy of Decision List Classifier:\n"
    print "Standard Conditions:"
    question7(trainFolds, testFolds, standardTweetData, 0)
    # test stopwords

    print "\nWith Stopwords Removed:"
    for i in range(25, 151, 25):
        print i, "-",
        question7(trainFolds, testFolds, standardTweetData, i)

    print "\nWith Case-Folding:"
    question7(trainFolds, testFolds, lowerTweetData, 0)

    print "\nQuestion 8: Accuracy of Decision List with Negations after 'not':"
    question7(trainFolds, testFolds, negatedTweetData, 0)

    print "\nApply negation after 'not','no','never':"
    question7(trainFolds, testFolds, negatedTweetData2, 0)

    print "\nQuestion 9: Accuracy of Naive Bayes Classifier:"
    question9(trainFolds, testFolds, standardTweetData, 0)

    print "\nQuestion 10: Naive Bayes Classifier:"

    print "\n1. With StopWords Removed:"
    for i in range(25, 151, 25):
        print i, '-',
        question9(trainFolds, testFolds, standardTweetData, i)

    print "\n2. With Case-Folding:"
    question9(trainFolds, testFolds, lowerTweetData, 0)

    print "\n3. With Negations after 'not':"
    question9(trainFolds, testFolds, negatedTweetData, 0)
    print "\n4. With Case-Folding and Negations after 'not':"
    question9(trainFolds, testFolds, lowerNegatedTweetData, 0)

    print "\n5. With StopWords Removed, Case-Folding, and Negations after 'not':"
    for i in range(25, 151, 25):
        print i, '-',
        question9(trainFolds, testFolds, lowerNegatedTweetData, i)

    print '\nUsing 2nd Negation Strategy...'
    print "\n1. With Negations after 'not', 'never', 'no':"
    question9(trainFolds, testFolds, negatedTweetData2, 0)
    print "\n2. With Case-Folding and Negations after 'not':"
    question9(trainFolds, testFolds, lowerNegatedTweetData2, 0)

    print "\n3. With StopWords Removed, Case-Folding, and Negations after 'not', \
            'never', 'no':"

    for i in range(25, 151, 25):
        print i, '-',
        question9(trainFolds, testFolds, lowerNegatedTweetData2, i)
    print

    print "\nQuestion 11: Accuracy of Decision List and Naive Bayes with Tokenization\n"
    print 'Decision List:'
    question7(trainFolds, testFolds, tokenizedTweetData, 0)
    print 'Naive Bayes:'
    question9(trainFolds, testFolds, tokenizedTweetData, 0)

    print "\nQuestion 11: Extras\n"
    tokenedLowerTweetData = lowerData(tokenizedTweetData)
    tokenedLowerNegatedTweetData = negateData(tokenedLowerTweetData,
                                              negateWords)
    print 'Decision List:'
    question7(trainFolds, testFolds, tokenedLowerNegatedTweetData, 0)
    print 'Naive Bayes:'
    question9(trainFolds, testFolds, tokenedLowerNegatedTweetData, 30)
Example #6
def question4(sentDictionary, tweetData, conflate):
    # header reconstructed (hypothetical): the signature follows the calls in
    # __main__ below; sortSents is assumed to be the (sentiment, count) pairs
    # sorted ascending by count, so the last entry is the most frequent sense
    sortSents = sorted(sentDictionary.items(), key=lambda pair: pair[1])
    MFS = sortSents[-1][0]

    correctCount = 0
    tweets = tweetData["tweets"].keys()
    for tweetID in tweets:
        sentiments = tweetData["tweets"][tweetID]["answers"]
        if conflate:
            if 'objective' in sentiments or 'neutral' in sentiments:
                sentiments = ['neutral']
        if MFS in sentiments:
            correctCount += 1

    print float(correctCount) / len(tweets) * 100


if __name__ == '__main__':
    filename = '/data/cs65/semeval-2015/B/train/twitter-train-full-B.tsv'
    tweetData = parse_tweets(filename, 'B')

    print "\nQuestion 1:\n"
    question1(tweetData)
    print "\nQuestion 2:\n"
    sentDictionary1 = question2(tweetData, False)
    print "\nQuestion 2 (with conflation to neutral):\n"
    sentDictionary2 = question2(tweetData, True)
    print "\nQuestion 3:\n"
    question3(tweetData)
    print "\nQuestion 4:\n"
    question4(sentDictionary1, tweetData, False)
    question4(sentDictionary2, tweetData, True)
Example #7
def build_decision_list_file(filename):
    tweetData = parse_tweets(filename, 'B')
    return build_decision_list(tweetData, 'tweets')
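
A minimal usage sketch (hypothetical file name; parse_tweets and build_decision_list come from the surrounding project):

decisionList = build_decision_list_file('twitter-train-full-B.tsv')
# entries are assumed to be (sense, feature, score) tuples, as in Example #8
print(decisionList[:10])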
Example #8
            for sense2 in counts:
                if sense2 != sense and feature in counts[sense2]:
                    presence_elsewhere += counts[sense2][feature]
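            # smooth a zero count so log(presence_elsewhere) below is defined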
            if presence_elsewhere == 0:
                presence_elsewhere = 0.1
            score = log(counts[sense][feature]) / log(presence_elsewhere)
            scores.append((sense, feature, score))
    return scores

def classify(instance, decision_list, MFS):
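    # the first decision-list entry whose feature appears in the instance
    # decides the label; fall back to the most frequent sense (MFS) otherwise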
    # features = get_features(instance, range(2,4), True, [], 3)
    features = get_bag_of_words(instance, caseFolding=True)  
    for entry in decision_list:
        if entry[1] in features:
            return entry[0]
    return MFS

if __name__ == "__main__":
    filename = 'twitter-train-full-B.tsv'
    tweetData = parse_tweets(filename, 'B')
    stopwords = get_stopwords(tweetData['tweets'], 0.2)
    stopwords2 = get_stopwords(tweetData['tweets'], 0.1)
    print("Stopwords with threshold of 0.2\n", stopwords)
    print("Stopwords with threshold of 0.1\n", stopwords2)

    print()
    example = ["This", "is", "not", "a", "very", "good", "example", ",", "but", "that", "one", "was", "."]
    print(example)
    negation_processing(example)
    print(example)
Example #9
import sys


def main():
    usage = 'python3 preprocess.py <INPUT_FILE_NAME> <OUTPUT_FILE_NAME>'
    if len(sys.argv) != 3:
        print(usage)
        return
    
    global abbreviationDict
    abbreviationDict = {}
    for line in open('abbreviationDict.txt', 'r'):
        fields = line.split(': ')
        abbreviation = fields[0].lower()
        words = fields[1].lower().strip()  # strip the trailing newline
        abbreviationDict[abbreviation] = words

   
    global emoticonDict
    emoticonDict = {}
    for line in open('emoticonDict.txt', 'r'):
        fields = line.split()
        emoticon = fields[0]
        replacement = fields[1]
        emoticonDict[emoticon] = replacement

    global stopWords 
    stopWords = []
    for line in open('stopWordsDict.txt', 'r'):
        stopWords.append(line.split()[0])
    
    
    global positiveWords 
    positiveWords = {}
    for line in open('positive-words.txt', 'r', encoding='ISO-8859-1'):
        positiveWords[line.split()[0]] = 1

    global negativeWords 
    negativeWords = {}
    for line in open('negative-words.txt', 'r', encoding='ISO-8859-1'):
        negativeWords[line.split()[0]] = 1

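    # disabled: alternative lexicon loading (PMI unigram scores and MPQA subjectivity clues)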
    '''
    #global positiveWords
    #global negativeWords
    #positiveWords = {}
    #negativeWords = {}
    for line in open('unigrams-pmilexicon.txt', 'r'):
        currLine = line.split()
        score = float(currLine[1])

        if score >= 4.5:
            positiveWords[currLine[0]] = score
        elif score <= -4.5:
            negativeWords[currLine[0]] = score
    
    global positiveWordsPOS
    global negativeWordsPOS
    global weakPositiveWordsPOS
    global weakNegativeWordsPOS
    positiveWordsPOS = {}
    negativeWordsPOS = {}
    weakPositiveWordsPOS = {}
    weakNegativeWordsPOS = {}
    for line in open('mpqa.tff', 'r'):
        currLine = line.split()
        wordType = currLine[0]
        POS = currLine[3].split('=')[1]

        if POS == 'noun':
            POS = 'NN'
        elif POS == 'adj':
            POS = 'JJ'
        elif POS == 'verb':
            POS = 'VBD'
        elif POS == 'adverb':
            POS = 'RB'

        if wordType == 'type=strongsubj':
            if currLine[5] == 'priorpolarity=positive':
                positiveWordsPOS[currLine[2]] = POS
            else:
                negativeWordsPOS[currLine[2]] = POS
        else:
            if currLine[5] == 'priorpolarity=positive':
                weakPositiveWordsPOS[currLine[2]] = POS
            else:
                weakNegativeWordsPOS[currLine[2]] = POS
    '''
    trainFileName = sys.argv[1]
    trainData = parse_tweets(trainFileName, 'B')
    preprocess(trainData)
    outputFileName = sys.argv[2]
    write(trainData, outputFileName)