def writer(class_name, table, cls_to_accept):
    class_writer = unicodecsv.writer(open(class_name, 'w'), encoding='utf-8')
    class_writer.writerow(("tweet_id", "tweet_text"))
    for cls, tweet_id, tweet_text in sql_convenience.extract_classifications_and_tweets(table):
        if cls == cls_to_accept:
            # remove carriage returns
            tweet_text = tweet_text.replace("\n", " ")
            class_writer.writerow((tweet_id, tweet_text))
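# A minimal usage sketch for writer(), assuming the project's sql_convenience
# module is importable and that an annotations table named "annotations_apple"
# exists (the table name is illustrative, not from the original code).
import sql_convenience
import unicodecsv

writer("in_class_tweets.csv", "annotations_apple", sql_convenience.CLASS_IN)
writer("out_of_class_tweets.csv", "annotations_apple", sql_convenience.CLASS_OUT)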
def label_learned_set(vectorizer, clfl, threshold, validation_table):
    for row in sql_convenience.extract_classifications_and_tweets(validation_table):
        cls, tweet_id, tweet_text = row
        spd = vectorizer.transform([tweet_text]).todense()
        predicted_cls = clfl.predict(spd)
        predicted_class = predicted_cls[0]  # turn 1D array of 1 item into 1 item
        predicted_proba = clfl.predict_proba(spd)[0][predicted_class]
        if predicted_proba < threshold and predicted_class == 1:
            predicted_class = 0  # force to out-of-class if we don't trust our answer
        sql_convenience.update_class(tweet_id, validation_table, predicted_class)
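# A minimal sketch of how label_learned_set() might be driven, assuming a
# scikit-learn bag-of-words vectorizer and a probabilistic classifier; the
# training tweets, the 0.7 threshold and the "validation_apple" table name
# below are illustrative assumptions, not part of the original code.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

training_tweets = ["i ate an apple and a banana",
                   "Apple announced a new iPhone today"]
training_classes = [0, 1]  # 0 == out-of-class, 1 == in-class

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(training_tweets)
clfl = LogisticRegression()
clfl.fit(X_train, training_classes)

# relabel everything in the validation table, forcing weak in-class
# predictions (probability below the threshold) back to out-of-class
label_learned_set(vectorizer, clfl, 0.7, "validation_apple")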
if __name__ == "__main__": # gold_std table, comparison_table parser = argparse.ArgumentParser(description='Score results against a gold standard') parser.add_argument('gold_standard_table', help='Name of the gold standard table (e.g. annotations_apple)') parser.add_argument('comparison_table', help='Name of the table we will score against the gold_standard_table (e.g. scikit_apple)') args = parser.parse_args() # counters for the 4 types of classification tp = 0 # True Positives (predicted in class and are actually in class) tn = 0 # True Negatives (predicted out of class and are actually out of class) fp = 0 # False Positives (predicted in class but are actually out of class) fn = 0 # False Negatives (predicted out of class but are actually in class) # for each tweet in comparison table, get tweet_id and cls classifications_and_tweets = sql_convenience.extract_classifications_and_tweets(args.gold_standard_table) for gold_class, tweet_id, tweet in classifications_and_tweets: cls, _, _ = sql_convenience.extract_classification_and_tweet(args.comparison_table, tweet_id) if gold_class == sql_convenience.CLASS_IN: if cls == sql_convenience.CLASS_IN: tp += 1 else: assert cls == sql_convenience.CLASS_OUT fn += 1 else: assert gold_class == sql_convenience.CLASS_OUT if cls == sql_convenience.CLASS_OUT: tn += 1 else: assert cls == sql_convenience.CLASS_IN fp += 1
"""Annotate tweets by hand to create a gold standard""" from __future__ import division # 1/2 == 0.5, as in Py3 from __future__ import absolute_import # avoid hiding global modules with locals from __future__ import print_function # force use of print("hello") from __future__ import unicode_literals # force unadorned strings "" to be unicode without prepending u"" import argparse import sys import sql_convenience import unicodecsv if __name__ == "__main__": parser = argparse.ArgumentParser(description='Tweet annotator') parser.add_argument('keyword', help='Keyword we wish to disambiguate (determines table name and used to filter tweets)') parser.add_argument('--csv', default=None, help='CSV filename to write to (e.g. output.csv), defaults to stdout') args = parser.parse_args() if args.csv is None: writer_stream = sys.stdout else: writer_stream = open(args.csv, "w") writer = unicodecsv.writer(writer_stream, encoding='utf-8') classifications_and_tweets = sql_convenience.extract_classifications_and_tweets(args.keyword) for cls, tweet_id, tweet in classifications_and_tweets: writer.writerow((cls, tweet)) if not writer_stream.isatty(): # close the file (but not stdout if that's what we're using!) writer_stream.close()