def classifier_for_twitter_corpus():
    """Evaluate a logistic-regression classifier on the five Twitter folds.

    For each fold ``i`` in 1..5 the splits are read from
    ``input/twitter_corpus/<i>/train.txt`` and ``.../test.txt``.  The
    classifier is fit on the training features and used to predict the
    test split, and the fold's macro-averaged F1 is printed.  After the
    last fold the list of per-fold scores, a classification report and
    the macro precision, recall and F1 over the pooled predictions are
    printed.

    Nothing is returned.  If a corpus file does not exist, a hint asking
    the user to download the corpus is printed and the function returns
    early.  Any other error, including I/O errors raised while extracting
    features, propagates to the caller.
    """
    import errno

    svc = LogisticRegression(class_weight='auto', penalty='l2')
    f_measure = []
    predicted = []
    right = []
    train_data = Bunch()
    test_data = Bunch()

    for i in range(1, 6):
        fold_dir = "input/twitter_corpus/" + str(i)
        try:
            # ``with`` closes both handles as soon as the data is loaded;
            # previously they were left open on every fold.
            with open(fold_dir + "/train.txt") as f_train, \
                    open(fold_dir + "/test.txt") as f_test:
                train_data.reviews = load_data(f_train)
                test_data.reviews = load_data(f_test)
        except IOError as err:
            # IOError + ENOENT is the portable spelling of "file not found":
            # FileNotFoundError does not exist on Python 2, and on Python 3
            # IOError is an alias of OSError, its parent class.
            if err.errno != errno.ENOENT:
                raise
            print("Please download Twitter corpus and put it into "
                  "input/twitter_corpus folder")
            return

        train_data.labels = extract_labels(train_data.reviews)
        test_data.labels = extract_labels(test_data.reviews)
        # NOTE(review): the result is stored but not read in this function;
        # the call is kept in case extract_entities has side effects.
        train_data.entities = extract_entities(train_data.reviews)

        features_train = extract_features_for_twitter_corpus(
            train_data.reviews, True)
        svc.fit(numpy.array(features_train), numpy.array(train_data.labels))

        features_test = extract_features_for_twitter_corpus(
            test_data.reviews, False)
        predicted_block = svc.predict(numpy.array(features_test))
        predicted.extend(predicted_block)
        right.extend(test_data.labels)

        # Score the fold once and reuse the value for printing and storing.
        fold_f1 = metrics.f1_score(
            test_data.labels, predicted_block, average='macro')
        print(fold_f1)
        f_measure.append(fold_f1)

    print(str(f_measure))
    print(classification_report(right, predicted, digits=3))
    print(metrics.precision_score(right, predicted, average='macro'))
    print(metrics.recall_score(right, predicted, average='macro'))
    print(metrics.f1_score(right, predicted, average='macro'))
def classifier_for_cadec_corpus():
    """Evaluate a linear SVM classifier on the five CADEC folds.

    For each fold ``i`` in 1..5 the fold number is printed and the splits
    are read from ``input/cadec_corpus/<i>/train.txt`` and
    ``.../test.txt``.  The classifier is fit on the training features and
    used to predict the test split, and the fold's macro-averaged F1 is
    printed.  After the last fold the list of per-fold scores, a
    classification report and the macro precision, recall and F1 over the
    pooled predictions are printed.

    Nothing is returned.  Errors raised while opening or reading the
    corpus files are not handled here and propagate to the caller.
    """
    svc = LinearSVC(class_weight='auto', penalty='l2')
    f_measure = []
    predicted = []
    right = []
    train_data = Bunch()
    test_data = Bunch()

    for i in range(1, 6):
        print(i)
        fold_dir = "input/cadec_corpus/" + str(i)
        # ``with`` closes both handles as soon as the data is loaded;
        # previously they were left open on every fold.
        with open(fold_dir + "/train.txt") as f_train, \
                open(fold_dir + "/test.txt") as f_test:
            train_data.reviews = load_data(f_train)
            test_data.reviews = load_data(f_test)

        train_data.labels = extract_labels(train_data.reviews)
        test_data.labels = extract_labels(test_data.reviews)
        # NOTE(review): the result is stored but not read in this function;
        # the call is kept in case extract_entities has side effects.
        train_data.entities = extract_entities(train_data.reviews)

        features_train = extract_features_for_cadec_corpus(
            train_data.reviews, True)
        svc.fit(numpy.array(features_train), numpy.array(train_data.labels))

        features_test = extract_features_for_cadec_corpus(
            test_data.reviews, False)
        predicted_block = svc.predict(numpy.array(features_test))
        predicted.extend(predicted_block)
        right.extend(test_data.labels)

        # Score the fold once and reuse the value for printing and storing.
        fold_f1 = metrics.f1_score(
            test_data.labels, predicted_block, average='macro')
        print(fold_f1)
        f_measure.append(fold_f1)

    print(str(f_measure))
    print(classification_report(right, predicted, digits=3))
    print(metrics.precision_score(right, predicted, average='macro'))
    print(metrics.recall_score(right, predicted, average='macro'))
    print(metrics.f1_score(right, predicted, average='macro'))