def writer(class_name, table, cls_to_accept):
    """Write the tweets of a single class from `table` to a CSV file.

    A header row ("tweet_id", "tweet_text") is written first, followed by one
    row per tweet whose classification equals `cls_to_accept`.

    Args:
        class_name: path of the CSV file to create (opened with mode 'w').
        table: table name handed to
            sql_convenience.extract_classifications_and_tweets.
        cls_to_accept: classification value a row must have to be written.
    """
    # The context manager guarantees the file is flushed and closed, even if
    # extraction or writing raises part-way through.
    with open(class_name, 'w') as csv_file:
        class_writer = unicodecsv.writer(csv_file, encoding='utf-8')
        class_writer.writerow(("tweet_id", "tweet_text"))
        # Read from the `table` argument; relying on a global `args.table`
        # made the parameter dead and broke any caller without that global.
        for cls, tweet_id, tweet_text in sql_convenience.extract_classifications_and_tweets(table):
            if cls == cls_to_accept:
                # replace newlines so the tweet text stays on a single line
                tweet_text = tweet_text.replace("\n", " ")
                class_writer.writerow((tweet_id, tweet_text))
Пример #2
0
def writer(class_name, table, cls_to_accept):
    """Export the tweets classified as `cls_to_accept` in `table` to CSV.

    The output starts with the header row ("tweet_id", "tweet_text"); every
    following row holds the id and text of one accepted tweet.

    Args:
        class_name: filename of the CSV to write (opened with mode 'w').
        table: name of the table to read, passed to
            sql_convenience.extract_classifications_and_tweets.
        cls_to_accept: only rows with exactly this classification are kept.
    """
    # Query the table named by the argument (not a global `args.table`, which
    # need not exist where this function is called).
    rows = sql_convenience.extract_classifications_and_tweets(table)
    # `with` closes the file for us so buffered rows always reach the disk.
    with open(class_name, 'w') as out_file:
        class_writer = unicodecsv.writer(out_file, encoding='utf-8')
        class_writer.writerow(("tweet_id", "tweet_text"))
        for cls, tweet_id, tweet_text in rows:
            if cls != cls_to_accept:
                continue
            # swap newlines for spaces so the text does not span lines
            class_writer.writerow((tweet_id, tweet_text.replace("\n", " ")))
def label_learned_set(vectorizer, clfl, threshold, validation_table):
    """Predict a class for every tweet in `validation_table` and store it.

    For each row, the tweet text is vectorized, the classifier predicts a
    class, and the result is written back with sql_convenience.update_class.
    An in-class prediction (1) whose probability is below `threshold` is
    stored as out-of-class (0) instead.

    Args:
        vectorizer: object whose transform() result supports todense().
        clfl: classifier exposing predict() and predict_proba().
        threshold: minimum probability required to keep a prediction of 1.
        validation_table: table to read tweets from and write classes to.
    """
    rows = sql_convenience.extract_classifications_and_tweets(validation_table)
    for _cls, tweet_id, tweet_text in rows:
        features = vectorizer.transform([tweet_text]).todense()
        # predict() returns one label per input; we passed a single tweet
        label = clfl.predict(features)[0]
        # NOTE(review): indexing the probability row by the label assumes the
        # labels double as column indices (0/1) - confirm against the model
        confidence = clfl.predict_proba(features)[0][label]
        if label == 1 and confidence < threshold:
            # not confident enough to call it in-class
            label = 0
        sql_convenience.update_class(tweet_id, validation_table, label)
def label_learned_set(vectorizer, clfl, threshold, validation_table):
    """Classify each tweet of `validation_table` and record the result.

    Every tweet is transformed by `vectorizer`, classified by `clfl`, and the
    final class is saved through sql_convenience.update_class. A prediction of
    1 is downgraded to 0 when its predicted probability is under `threshold`.

    Args:
        vectorizer: provides transform(); its output must offer todense().
        clfl: provides predict() and predict_proba().
        threshold: probability below which a prediction of 1 is not trusted.
        validation_table: name of the table that is read and then updated.
    """
    for row in sql_convenience.extract_classifications_and_tweets(validation_table):
        _, tweet_id, tweet_text = row
        dense_vector = vectorizer.transform([tweet_text]).todense()
        guess = clfl.predict(dense_vector)[0]  # single input -> single label
        # presumably labels are 0/1 and match predict_proba's column order;
        # verify against how the classifier was trained
        guess_proba = clfl.predict_proba(dense_vector)[0][guess]
        untrusted_positive = guess_proba < threshold and guess == 1
        # fall back to out-of-class when the positive answer is not trusted
        final_class = 0 if untrusted_positive else guess
        sql_convenience.update_class(tweet_id, validation_table, final_class)
Пример #5
0
import unicodecsv

if __name__ == "__main__":
    # Export the (classification, tweet) pairs of the keyword's table as CSV,
    # either to the file named by --csv or to stdout.
    parser = argparse.ArgumentParser(description='Tweet annotator')
    parser.add_argument(
        'keyword',
        help=
        'Keyword we wish to disambiguate (determines table name and used to filter tweets)'
    )
    parser.add_argument(
        '--csv',
        default=None,
        help='CSV filename to write to (e.g. output.csv), defaults to stdout')
    args = parser.parse_args()

    # Remember whether we opened the stream ourselves. isatty() is not a
    # reliable test for "this is stdout": a redirected or piped stdout is not
    # a tty and would have been closed by mistake.
    opened_file = args.csv is not None
    if opened_file:
        writer_stream = open(args.csv, "w")
    else:
        writer_stream = sys.stdout

    try:
        writer = unicodecsv.writer(writer_stream, encoding='utf-8')

        classifications_and_tweets = sql_convenience.extract_classifications_and_tweets(
            args.keyword)
        for cls, tweet_id, tweet in classifications_and_tweets:
            writer.writerow((cls, tweet))
    finally:
        # close the file even if extraction/writing failed, but never stdout
        if opened_file:
            writer_stream.close()
if __name__ == "__main__":
    # Command line: the gold standard table, then the table to score against it
    parser = argparse.ArgumentParser(description='Score results against a gold standard')
    parser.add_argument('gold_standard_table', help='Name of the gold standard table (e.g. annotations_apple)')
    parser.add_argument('comparison_table', help='Name of the table we will score against the gold_standard_table (e.g. scikit_apple)')
    args = parser.parse_args()

    # Confusion-matrix counters:
    #   tp - predicted in class, actually in class
    #   tn - predicted out of class, actually out of class
    #   fp - predicted in class, actually out of class
    #   fn - predicted out of class, actually in class
    tp = 0
    tn = 0
    fp = 0
    fn = 0

    # Walk the gold standard rows and look up the class that the comparison
    # table holds for the same tweet_id
    classifications_and_tweets = sql_convenience.extract_classifications_and_tweets(args.gold_standard_table)
    for gold_class, tweet_id, tweet in classifications_and_tweets:
        cls, _, _ = sql_convenience.extract_classification_and_tweet(args.comparison_table, tweet_id)

        if gold_class == sql_convenience.CLASS_IN:
            # tweet really is in class: hit or miss
            if cls == sql_convenience.CLASS_IN:
                tp += 1
                continue
            assert cls == sql_convenience.CLASS_OUT
            fn += 1
            continue

        # tweet really is out of class: correct rejection or false alarm
        assert gold_class == sql_convenience.CLASS_OUT
        if cls == sql_convenience.CLASS_OUT:
            tn += 1
            continue
        assert cls == sql_convenience.CLASS_IN
        fp += 1
"""Annotate tweets by hand to create a gold standard"""
from __future__ import division  # 1/2 == 0.5, as in Py3
from __future__ import absolute_import  # avoid hiding global modules with locals
from __future__ import print_function  # force use of print("hello")
from __future__ import unicode_literals  # force unadorned strings "" to be unicode without prepending u""
import argparse
import sys
import sql_convenience
import unicodecsv

if __name__ == "__main__":
    # Dump the (classification, tweet) pairs stored for the keyword as CSV,
    # to the --csv file when given, otherwise to stdout.
    parser = argparse.ArgumentParser(description='Tweet annotator')
    parser.add_argument('keyword', help='Keyword we wish to disambiguate (determines table name and used to filter tweets)')
    parser.add_argument('--csv', default=None, help='CSV filename to write to (e.g. output.csv), defaults to stdout')
    args = parser.parse_args()

    # Track ownership of the stream explicitly: testing isatty() would also
    # close stdout whenever it is redirected to a file or a pipe.
    opened_file = args.csv is not None
    writer_stream = open(args.csv, "w") if opened_file else sys.stdout

    try:
        writer = unicodecsv.writer(writer_stream, encoding='utf-8')

        classifications_and_tweets = sql_convenience.extract_classifications_and_tweets(args.keyword)
        for cls, tweet_id, tweet in classifications_and_tweets:
            writer.writerow((cls, tweet))
    finally:
        # release the file on success and on error; stdout is left open
        if opened_file:
            writer_stream.close()