def conduct_test(test_set, lexicon_only=False, svm_only=False, number_of_tweets=3700):
    """Score the classifier on ``test_set`` batch by batch.

    The first ``number_of_tweets`` entries are scored in batches of 100,
    and everything from index ``number_of_tweets`` to the end is scored as
    one final batch.

    Args:
        test_set: Mapping whose keys are handed to the classifiers as the
            tweets and whose values are used as the expected labels.
        lexicon_only: Forwarded to ``predict``; ignored when ``svm_only``
            is true.
        svm_only: When true, each batch is vectorized and scored with
            ``svm.score``; otherwise it is classified with ``predict`` and
            scored with ``calculate_accuracy``.
        number_of_tweets: Index at which the 100-tweet batches stop and the
            final tail batch starts.

    Returns:
        A list with one score per 100-tweet batch followed by the score of
        the tail batch.

    Note:
        If ``number_of_tweets`` is not a multiple of 100, the last 100-tweet
        batch runs past ``number_of_tweets`` and overlaps the tail batch.
        Batches that fall beyond the end of ``test_set`` are empty but are
        still passed to the scorer.
    """
    # Materialise keys and values once: dict views cannot be sliced on
    # Python 3, and rebuilding the lists for every batch is wasted work.
    # Both lists come from the same unmodified dict, so index i in one
    # corresponds to index i in the other.
    tweets = list(test_set.keys())
    labels = list(test_set.values())

    # (start, stop) slice bounds: the 100-wide batches, then the tail.
    bounds = [(start, start + 100) for start in range(0, number_of_tweets, 100)]
    bounds.append((number_of_tweets, len(tweets)))

    scores = []
    for start, stop in bounds:
        batch_tweets = tweets[start:stop]
        batch_labels = labels[start:stop]
        if svm_only:
            transformed = svm.vectorize_tweets(batch_tweets, False)
            scores.append(svm.score(transformed, batch_labels))
        else:
            predicted = predict(batch_tweets, lexicon_only)
            expected = dict(zip(batch_tweets, batch_labels))
            scores.append(calculate_accuracy(predicted, expected))
    return scores
def predict(test_tweets, lexicon_only=False):
    """Classify ``test_tweets`` with the lexicon, falling back to the SVM.

    Args:
        test_tweets: The tweets to classify; passed straight to
            ``sentlex.predict`` together with the module-level ``lexicon``.
        lexicon_only: When true, return the lexicon predictions as they
            are, without consulting the SVM.

    Returns:
        The mapping returned by ``sentlex.predict``. Unless ``lexicon_only``
        is set, the entries for tweets reported by
        ``sentlex.get_unclassified`` are overwritten in place with the
        SVM's predictions.
    """
    # ``lexicon`` is a module-level global that is only read here, so no
    # ``global`` declaration is needed.
    predicted_from_lexicon = sentlex.predict(test_tweets, lexicon)
    if lexicon_only:
        return predicted_from_lexicon

    undecided = sentlex.get_unclassified(predicted_from_lexicon)
    # Nothing left for the SVM to decide: avoid handing it an empty batch.
    # len() rather than truthiness, since the container type is not visible
    # here.
    if len(undecided) == 0:
        return predicted_from_lexicon

    undecided_transformed = svm.vectorize_tweets(undecided, False)
    predicted_from_svm = svm.predict(undecided_transformed)

    # Pair each undecided tweet with its SVM label and overwrite the
    # lexicon's entry for it.
    predicted_from_lexicon.update(zip(undecided, predicted_from_svm))
    return predicted_from_lexicon
def load_sets(training_file, test_file):
    """Load the data sets, train the SVM and build the lexicon.

    Everything is stored in module-level globals (``train_set``,
    ``lexicon_set``, ``test_set``, ``train_labels``, ``train_tweets``,
    ``lexicon``); nothing is returned.

    Args:
        training_file: Source for both the training set and the lexicon
            word lists.
        test_file: Source for the test set.
    """
    global train_set, lexicon_set, test_set, train_labels, train_tweets, lexicon

    # Load the training and lexicon sets (both from the training file).
    train_set = loader.tweets_to_tweetlist(training_file, neutral=True)
    lexicon_set = loader.tweets_to_wordlist(training_file, neutral=False)

    # Load the test set.
    test_set = loader.tweets_to_tweetlist(test_file, neutral=True)

    # Train the SVM analyzer. Keys and values are copied into real lists so
    # the SVM helpers receive sequences rather than dict views on Python 3;
    # both come from the same unmodified dict, so their order matches.
    train_labels = list(train_set.values())
    train_tweets = svm.vectorize_tweets(list(train_set.keys()))
    svm.train(train_tweets, train_labels)

    # Build the lexicon from the positive and negative word lists.
    # NOTE(review): the meaning of the third argument (10) is defined by
    # sentlex.get_ratioDict -- confirm there.
    lexicon = sentlex.get_ratioDict(lexicon_set['positive'], lexicon_set['negative'], 10)