def evaluate(
    self,
    test_set,
    classifier=None,
    accuracy=True,
    f_measure=True,
    precision=True,
    recall=True,
    verbose=False,
):
    """
    Evaluate and print classifier performance on the test set.

    :param test_set: a list of (tokens, label) tuples to use as gold set.
    :param classifier: a classifier instance (previously trained).
    :param accuracy: if `True`, evaluate classifier accuracy.
    :param f_measure: if `True`, evaluate classifier f_measure.
    :param precision: if `True`, evaluate classifier precision.
    :param recall: if `True`, evaluate classifier recall.
    :return: evaluation results.
    :rtype: dict(str): float
    """
    # Relies on module-level imports (as in nltk.sentiment.sentiment_analyzer):
    #   from collections import defaultdict
    #   from nltk.classify.util import accuracy as eval_accuracy
    #   from nltk.metrics import (precision as eval_precision,
    #                             recall as eval_recall,
    #                             f_measure as eval_f_measure)
    if classifier is None:
        classifier = self.classifier
    print("Evaluating {0} results...".format(type(classifier).__name__))
    metrics_results = {}
    if accuracy:
        accuracy_score = eval_accuracy(classifier, test_set)
        metrics_results["Accuracy"] = accuracy_score

    # Collect, per label, the sets of instance indices in the gold standard
    # and in the classifier output.
    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    labels = set()
    for i, (feats, label) in enumerate(test_set):
        labels.add(label)
        gold_results[label].add(i)
        observed = classifier.classify(feats)
        test_results[observed].add(i)

    for label in labels:
        if precision:
            precision_score = eval_precision(gold_results[label], test_results[label])
            metrics_results["Precision [{0}]".format(label)] = precision_score
        if recall:
            recall_score = eval_recall(gold_results[label], test_results[label])
            metrics_results["Recall [{0}]".format(label)] = recall_score
        if f_measure:
            f_measure_score = eval_f_measure(gold_results[label], test_results[label])
            metrics_results["F-measure [{0}]".format(label)] = f_measure_score

    # Print evaluation results (in alphabetical order)
    if verbose:
        for result in sorted(metrics_results):
            print("{0}: {1}".format(result, metrics_results[result]))

    return metrics_results
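# Usage sketch for the evaluate() method above. It assumes the method belongs
# to nltk.sentiment.SentimentAnalyzer (which is where this code appears to
# come from); the toy documents below are made up purely for illustration.
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats

# Toy labeled documents: (token list, label) pairs.
train_docs = [
    (["great", "movie"], "pos"),
    (["wonderful", "acting"], "pos"),
    (["terrible", "plot"], "neg"),
    (["boring", "scenes"], "neg"),
]
test_docs = [(["great", "acting"], "pos"), (["boring", "plot"], "neg")]

analyzer = SentimentAnalyzer()
unigrams = analyzer.unigram_word_feats(analyzer.all_words(train_docs))
analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigrams)

train_set = analyzer.apply_features(train_docs)
test_set = analyzer.apply_features(test_docs)

analyzer.train(NaiveBayesClassifier.train, train_set)
# Prints Accuracy plus per-label Precision/Recall/F-measure (verbose=True)
# and returns them as a dict.
results = analyzer.evaluate(test_set, verbose=True)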
def demo_vader_tweets(n_instances=None, output=None):
    """
    Classify 10000 positive and negative tweets using the Vader approach.

    :param n_instances: the number of total tweets that have to be classified.
    :param output: the output file where results have to be reported.
    """
    from collections import defaultdict

    from nltk.corpus import twitter_samples
    from nltk.metrics import (accuracy as eval_accuracy,
                              precision as eval_precision,
                              recall as eval_recall,
                              f_measure as eval_f_measure)
    from nltk.sentiment import SentimentIntensityAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields,
                        strip_off_emoticons=False, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields,
                        strip_off_emoticons=False, limit=n_instances)

    pos_docs = parse_tweets_set(positive_csv, label='pos')
    neg_docs = parse_tweets_set(negative_csv, label='neg')

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    # VADER is lexicon-based and needs no training; the training split is kept
    # only for symmetry with the trainable demos.
    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    vader_analyzer = SentimentIntensityAnalyzer()

    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    acc_gold_results = []
    acc_test_results = []
    labels = set()
    for i, (text, label) in enumerate(testing_tweets):
        labels.add(label)
        gold_results[label].add(i)
        acc_gold_results.append(label)
        score = vader_analyzer.polarity_scores(text)['compound']
        observed = 'pos' if score > 0 else 'neg'
        acc_test_results.append(observed)
        test_results[observed].add(i)

    metrics_results = {'Accuracy': eval_accuracy(acc_gold_results, acc_test_results)}
    for label in labels:
        precision_score = eval_precision(gold_results[label], test_results[label])
        metrics_results['Precision [{0}]'.format(label)] = precision_score
        recall_score = eval_recall(gold_results[label], test_results[label])
        metrics_results['Recall [{0}]'.format(label)] = recall_score
        f_measure_score = eval_f_measure(gold_results[label], test_results[label])
        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score

    for result in sorted(metrics_results):
        print('{0}: {1}'.format(result, metrics_results[result]))

    if output:
        output_markdown(output, Approach='Vader', Dataset='labeled_tweets',
                        Instances=n_instances, Results=metrics_results)
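# Run sketch for demo_vader_tweets() above. It assumes the function lives in
# nltk.sentiment.util next to the helpers it calls (json2csv_preprocess,
# parse_tweets_set, split_train_test, output_markdown) and that the required
# NLTK data packages are installed. Note that it writes positive_tweets.csv
# and negative_tweets.csv to the current directory.
import nltk

nltk.download("twitter_samples")  # tweet corpus read via twitter_samples.abspath()
nltk.download("vader_lexicon")    # lexicon needed by SentimentIntensityAnalyzer

# Score 200 tweets (100 positive + 100 negative) and write the metrics to a
# Markdown results file.
demo_vader_tweets(n_instances=200, output="vader_tweets_results.md")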
# Assumed to be defined earlier in the script: `testData` (a DataFrame with
# 'Text', 'IsEthotic' and 'SentimentPolarity' columns), `predictions` (the
# ethotic-statement classifier output, aligned with testData) and `sid`
# (an nltk SentimentIntensityAnalyzer instance). `accuracy_score` is assumed
# to be scikit-learn's; the eval_* aliases follow the nltk.metrics import
# style used in the demos above.
import numpy as np
from nltk.metrics import (accuracy as eval_accuracy, precision as eval_precision,
                          recall as eval_recall, f_measure as eval_f_measure)
from sklearn.metrics import accuracy_score

print('Ethotic Statement Classifier Accuracy')
print(accuracy_score(testData['IsEthotic'], predictions) * 100, '%\n')

print('Ethotic sentences with VADER sentiment classifier')
reference_set = []
test_set = []
for index, item in enumerate(testData['Text']):
    # Only score sentences the classifier predicted as ethotic (label 1).
    if predictions[np.where(item == testData['Text'])].any() == 1:
        print(item)
        reference_set.append(testData['SentimentPolarity'][index])
        polarity_score = sid.polarity_scores(item)
        for j in sorted(polarity_score):
            print('{0}: {1}, \n'.format(j, polarity_score[j]), end='')
        # Map VADER's compound score to the dataset's polarity encoding
        # (1 = positive, 2 = negative).
        if polarity_score['compound'] > 0:
            test_set.append(1)
        else:
            test_set.append(2)

vader_accuracy = eval_accuracy(reference_set, test_set)

# Note: converting the aligned label lists to sets keeps only the distinct
# label values (at most {1, 2}), so the precision/recall/F-measure below
# compare which label values occur in the gold and predicted data, not
# per-instance predictions. See the per-label sketch after this snippet for
# an index-based alternative.
reference_set = set(reference_set)
test_set = set(test_set)
precision_score = eval_precision(reference_set, test_set)
recall_score = eval_recall(reference_set, test_set)
f_measure_score = eval_f_measure(reference_set, test_set)

print('VADER Classification Statistics')
print('Accuracy: ', vader_accuracy * 100, '%')
print('Precision: ', precision_score)
print('Recall: ', recall_score)
print('F-measure: ', f_measure_score)
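# Alternative sketch (not part of the original script): per-label precision,
# recall and F-measure computed from index sets, mirroring the evaluate()
# method at the top of this section. `reference_labels` and `predicted_labels`
# are hypothetical names for the aligned gold/predicted label lists built in
# the loop above (before they are collapsed into sets).
from collections import defaultdict

from nltk.metrics import f_measure, precision, recall


def per_label_metrics(reference_labels, predicted_labels):
    """Return {label: (precision, recall, f_measure)} for aligned label lists."""
    gold_indices = defaultdict(set)
    pred_indices = defaultdict(set)
    for i, (gold, pred) in enumerate(zip(reference_labels, predicted_labels)):
        gold_indices[gold].add(i)
        pred_indices[pred].add(i)
    return {
        label: (
            precision(gold_indices[label], pred_indices[label]),
            recall(gold_indices[label], pred_indices[label]),
            f_measure(gold_indices[label], pred_indices[label]),
        )
        for label in gold_indices
    }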