class InterestAnalyzer:
    """Drive the interest-classification pipeline over Facebook, Twitter and LinkedIn data.

    The class parses raw data with the per-source parsers, turns the parsed
    files into feature vectors with ``FeatureBuilder``, and runs a
    ``Classifier`` over the testing features.

    ``self.classifier`` is only set by ``retrain_classifier`` or
    ``load_classifier``; call one of them before ``save_classifier`` or
    ``classifier_predict``.
    """

    def __init__(self):
        self.feature_builder = FeatureBuilder()

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at ``path``, closing the file afterwards.

        NOTE: pickle can execute arbitrary code while loading; only use this
        on files produced by this project.
        """
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    @staticmethod
    def _dump_pickle(obj, path):
        """Pickle ``obj`` to ``path`` with the default protocol, closing the file afterwards."""
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    def _parse_and_build_features(self, directories, files, feature_files):
        """Parse every source and build its feature vectors.

        ``directories``, ``files`` and ``feature_files`` are mappings keyed by
        'fb', 'tweets' and 'linkedin'. Each parser reads from
        ``directories[key]`` and writes to ``files[key]``; the feature builder
        then reads ``files[key]`` and writes ``feature_files[key]``.
        """
        print('Parsing Facebook...')
        fbparser.parse({'in': directories['fb'], 'out': files['fb']})
        print('Parsing Twitter...')
        tweetparser.parse({'in': directories['tweets'], 'out': files['tweets']})
        print('Parsing LinkedIn...')
        linkedinparser.parse({'in': directories['linkedin'], 'out': files['linkedin']})

        print('Building features...')
        # Same order as the original code: LinkedIn, then Twitter, then Facebook.
        for source in ('linkedin', 'tweets', 'fb'):
            self.feature_builder.create_feature_vectors(
                files[source], feature_files[source], source)

    def rebuild_features(self):
        """Re-parse the training data and rebuild the training feature files."""
        self._parse_and_build_features(
            TRAINING_DIRECTORIES, TRAINING_FILES, TRAINING_FEATURE_FILES)

    def retrain_classifier(self):
        """Replace ``self.classifier`` with a freshly constructed ``Classifier``.

        NOTE(review): no explicit training call is made here; presumably
        ``Classifier()`` trains itself on construction - confirm.
        """
        print('Training classifier...')
        self.classifier = Classifier()

    def save_classifier(self):
        """Pickle ``self.classifier`` to ``CLASSIFIER_FILE``."""
        self._dump_pickle(self.classifier, CLASSIFIER_FILE)

    def load_classifier(self):
        """Load ``self.classifier`` from the pickle stored at ``CLASSIFIER_FILE``."""
        self.classifier = self._load_pickle(CLASSIFIER_FILE)

    def classifier_predict(self):
        """Parse the testing data, build its features and print the predictions.

        Prints whatever the classifier returns for the LinkedIn features, the
        Twitter features, and the late fusion of the two. The per-source
        predictions are written by the classifier to 'results_l.txt' and
        'results_t.txt', then read back as input for the late-fusion step,
        which writes 'result.txt'.
        """
        self._parse_and_build_features(
            TESTING_DIRECTORIES, TESTING_FILES, TESTING_FEATURE_FILES)

        linkedin_testing_features = np.loadtxt(TESTING_FEATURE_FILES['linkedin'], delimiter=',')
        tweets_testing_features = np.loadtxt(TESTING_FEATURE_FILES['tweets'], delimiter=',')
        # NOTE(review): the Facebook features are loaded (which validates the
        # file) but no prediction below uses them - confirm this is intended.
        fb_testing_features = np.loadtxt(TESTING_FEATURE_FILES['fb'], delimiter=',')

        print('Predicting labels...')
        print('LinkedIn classifier:')
        print(self.classifier.predict_testing_data(
            'linkedin', linkedin_testing_features, TESTING_LABELS_FILE, 'results_l.txt'))
        print('Twitter classifier:')
        print(self.classifier.predict_testing_data(
            'tweets', tweets_testing_features, TESTING_LABELS_FILE, 'results_t.txt'))

        # Read back the per-source predictions written by the two calls above.
        tweets_result_labels = np.loadtxt('results_t.txt', delimiter=',')
        linkedin_result_labels = np.loadtxt('results_l.txt', delimiter=',')

        print('Late fusion classifier:')
        print(self.classifier.predict_late_fusion_testing_data(
            [tweets_result_labels, linkedin_result_labels], TESTING_LABELS_FILE, 'result.txt'))
class SentimentAnalyzer:
    """Drive the tweet sentiment pipeline: parsing, feature building, training and prediction.

    On construction the classifier is loaded from ``CLASSIFIER_FILE`` when
    that file exists, otherwise it is trained (rebuilding the feature files
    first if the training data file is missing).
    """

    def __init__(self):
        self.parser_options = tweetparser.options
        self.classifier = Classifier()
        if os.path.exists(CLASSIFIER_FILE):
            self.classifier.load_classifier(CLASSIFIER_FILE)
        else:
            self.retrain_classifier()

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at ``path``, closing the file afterwards.

        NOTE: pickle can execute arbitrary code while loading; only use this
        on files produced by this project.
        """
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def rebuild_features(self):
        """Re-parse all tweets and rewrite the training/testing/development feature files.

        The unigram feature dictionary is built from the training tweets and
        labels only, then reused to vectorise the testing and development
        tweets. It is finally pickled to ``UNIGRAM_FEATURES_FILE``.
        """
        print('Parsing tweets...')
        tweetparser.parse_all_files(self.parser_options)

        print('Building features...')
        # Build and save features for the training data.
        training_labels = read_labels(TRAINING)
        training_tweets = self._load_pickle(TRAINING_TWEETS)
        unigram_features = buildfeatures.build_unigram_feature_dict(training_tweets, training_labels)
        training_data = buildfeatures.get_feature_vectors(training_tweets, unigram_features)
        np.savetxt(TRAINING_DATA_FILE, training_data, delimiter=',')

        # Build and save features for the testing data.
        testing_tweets = self._load_pickle(TESTING_TWEETS)
        testing_data = buildfeatures.get_feature_vectors(testing_tweets, unigram_features)
        np.savetxt(TESTING_DATA_FILE, testing_data, delimiter=',')

        # Build and save features for the development data.
        development_tweets = self._load_pickle(DEVELOPMENT_TWEETS)
        development_data = buildfeatures.get_feature_vectors(development_tweets, unigram_features)
        np.savetxt(DEVELOPMENT_DATA_FILE, development_data, delimiter=',')

        # Save the processed unigram features (highest pickle protocol, as before).
        with open(UNIGRAM_FEATURES_FILE, 'wb') as handle:
            pickle.dump(unigram_features, handle, -1)

    def retrain_classifier(self):
        """Train a new classifier on the training data and save it to ``CLASSIFIER_FILE``.

        Rebuilds the feature files first when ``TRAINING_DATA_FILE`` is missing.
        """
        if not os.path.exists(TRAINING_DATA_FILE):
            self.rebuild_features()
        training_data = np.loadtxt(TRAINING_DATA_FILE, delimiter=',')
        training_labels = read_labels(TRAINING)

        print('Training classifier...')
        self.classifier = Classifier()
        self.classifier.train(training_data, training_labels)
        self.classifier.save_classifier(CLASSIFIER_FILE)

    def classify_test_tweets(self):
        """Predict labels for the testing tweets and print the classifier's result."""
        testing_tweets = self._load_pickle(TESTING_TWEETS)
        testing_data = np.loadtxt(TESTING_DATA_FILE, delimiter=',')
        testing_labels = read_labels(TESTING)
        testing_topics = read_topics(TESTING)

        print('Predicting labels...')
        print('Testing Results: ' + str(self.classifier.predict_testing_data(
            testing_tweets, testing_data, testing_topics, testing_labels, RESULTS_FILE)))

    def classify_development_tweets(self):
        """Predict labels for the development tweets and print the classifier's result."""
        development_tweets = self._load_pickle(DEVELOPMENT_TWEETS)
        development_data = np.loadtxt(DEVELOPMENT_DATA_FILE, delimiter=',')
        development_labels = read_labels(DEVELOPMENT)
        development_topics = read_topics(DEVELOPMENT)

        print('Predicting labels...')
        print('Development Results: ' + str(self.classifier.predict_testing_data(
            development_tweets, development_data, development_topics, development_labels,
            RESULTS_FILE)))

    def classify_custom_tweets(self, custom_filename):
        """Parse, vectorise and classify the tweets in ``custom_filename``.

        Prints a message and returns when the file does not exist. Any
        ordinary error during processing is reported on stdout instead of
        being raised; ``KeyboardInterrupt`` and ``SystemExit`` propagate.
        """
        if not os.path.exists(custom_filename):
            print('The file ' + custom_filename + ' does not exist.')
            return

        try:
            print('Parsing tweets...')
            custom_tweets = []
            # The parser hands each parsed tweet to the callback; collect them all.
            tweetparser._parse_tweets(custom_filename, custom_tweets.append)
            labels = read_labels(custom_filename)
            topics = read_topics(custom_filename)

            print('Building features...')
            unigram_features = self._load_pickle(UNIGRAM_FEATURES_FILE)
            data = buildfeatures.get_feature_vectors(custom_tweets, unigram_features)

            print('Predicting labels...')
            print('Results: ' + str(self.classifier.predict_testing_data(
                custom_tweets, data, topics, labels, RESULTS_FILE)))
            print('See labels at: ' + RESULTS_FILE)
        except Exception as error:
            print('Something went wrong. File may be in wrong format.')
            # Surface the cause so a failure is not completely hidden.
            print('Details: ' + str(error))

    def cross_validation(self):
        """Print the average weighted F1 score of 10-fold cross-validation on the training data."""
        training_data = np.loadtxt(TRAINING_DATA_FILE, delimiter=',')
        training_labels = read_labels(TRAINING)
        raw_classifier = self.classifier.get_classifier()

        # Inside this method ``cross_validation`` resolves to the module-level
        # name, not to this method: class attributes are not in function scope.
        kf_total = cross_validation.KFold(
            len(training_labels), n_folds=10, shuffle=True, random_state=4)
        scores = cross_validation.cross_val_score(
            raw_classifier, training_data, training_labels,
            cv=kf_total, n_jobs=1, scoring='f1_weighted')
        print('Average F1-Score: ' + str(np.average(scores)))

    def adjust_parser(self):
        """Interactively flip boolean parser switches until 'Back to main menu' is chosen.

        The menu lists every entry of ``self.parser_options`` numbered from 1,
        followed by a final 'Back to main menu' entry. Choosing a switch
        negates its value in place. Out-of-range or non-numeric answers are
        ignored and the menu is shown again.
        """
        # Read the answer as plain text: Python 2's input() would evaluate
        # whatever the user types as an expression.
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3

        length = len(self.parser_options)
        back_option = length + 1
        option = 0
        while option != back_option:
            print('Which parser switch do you want to flip?')
            switches = {}
            for i, (opt, val) in enumerate(self.parser_options.items()):
                switches[i + 1] = opt
                print(str(i + 1) + '. ' + opt + ':' + (' ' * (24 - len(opt))) + str(val))
            print(str(back_option) + '. Back to main menu')

            try:
                option = int(read_line('Answer: '))
            except ValueError:
                option = 0  # not a number: treat as an invalid choice and ask again

            if 0 < option < back_option:
                opt = switches[option]
                self.parser_options[opt] = not self.parser_options[opt]