# Example #1
def conduct_test(test_set, lexicon_only = False, svm_only = False, number_of_tweets = 3700):
    """Evaluate the classifier on *test_set* in chunks and collect per-chunk scores.

    Tweets are scored in fixed chunks of 100 up to *number_of_tweets*; any
    remaining tweets are scored as one final (possibly larger) chunk.

    Args:
        test_set: mapping of tweet text -> expected label.
        lexicon_only: if True, classify using the sentiment lexicon only.
        svm_only: if True, score with the SVM directly instead of predict().
        number_of_tweets: boundary up to which fixed 100-tweet chunks are used.

    Returns:
        List of per-chunk scores (SVM scores or calculate_accuracy results).
    """
    # Snapshot the dict views once: dict views are not sliceable on
    # Python 3, and a single snapshot keeps keys/values/items aligned.
    tweets = list(test_set.keys())
    labels = list(test_set.values())
    pairs = list(test_set.items())

    # Chunk boundaries: 100-tweet windows, then one tail chunk to the end.
    bounds = [(i, i + 100) for i in range(0, number_of_tweets, 100)]
    bounds.append((number_of_tweets, len(tweets)))

    scores = []
    if svm_only:
        for lo, hi in bounds:
            transformed = svm.vectorize_tweets(tweets[lo:hi], False)
            scores.append(svm.score(transformed, labels[lo:hi]))
    else:
        for lo, hi in bounds:
            predicted = predict(tweets[lo:hi], lexicon_only)
            scores.append(calculate_accuracy(predicted, dict(pairs[lo:hi])))
    return scores
# Example #2
def predict(test_tweets, lexicon_only = False):
    """Classify *test_tweets*: lexicon first, SVM as fallback for undecided ones.

    Args:
        test_tweets: iterable of tweet texts to classify.
        lexicon_only: if True, return the lexicon predictions without the
            SVM fallback pass.

    Returns:
        dict mapping tweet -> predicted label.
    """
    global lexicon
    predicted = sentlex.predict(test_tweets, lexicon)
    if lexicon_only:
        return predicted

    # Tweets the lexicon could not decide are re-classified by the SVM.
    undecided = sentlex.get_unclassified(predicted)
    undecided_transformed = svm.vectorize_tweets(undecided, False)
    predicted_from_svm = svm.predict(undecided_transformed)

    # Bug fix: the original called .update() INSIDE the loop, re-merging the
    # growing dict on every iteration (accidental O(n^2)). Pair each undecided
    # tweet with its SVM label and merge once.
    predicted.update(zip(undecided, predicted_from_svm))

    return predicted
# Example #3
def load_sets(training_file, test_file):
    """Load training/test data, train the SVM, and build the sentiment lexicon.

    Populates the module-level globals consumed by predict() and
    conduct_test().

    Args:
        training_file: path/handle of the training tweets.
        test_file: path/handle of the test tweets.
    """
    global train_set, lexicon_set, test_set, train_labels, train_tweets, lexicon

    # Training data: tweet-level set (with neutrals) for the SVM,
    # word-level set (without neutrals) for the lexicon.
    train_set = loader.tweets_to_tweetlist(training_file, neutral = True)
    lexicon_set = loader.tweets_to_wordlist(training_file, neutral = False)

    # Test data.
    test_set = loader.tweets_to_tweetlist(test_file, neutral = True)

    # Train the SVM analyzer on the vectorized training tweets.
    train_labels = train_set.values()
    train_tweets = svm.vectorize_tweets(train_set.keys())
    svm.train(train_tweets, train_labels)

    # Build the word-ratio lexicon from the positive/negative word lists.
    # NOTE(review): the third argument (10) is presumably a frequency/ratio
    # threshold — confirm against sentlex.get_ratioDict.
    lexicon = sentlex.get_ratioDict(lexicon_set['positive'], lexicon_set['negative'], 10)