def evaluate(corpus_path, denominator, intercept):
    """Train on one part of a corpus and score the model on the rest.

    The corpus at corpus_path is read with read_gvfi and divided into a
    training and a held-out portion by
    split_corpus(corpus, denominator, intercept).  A model is trained on
    the training portion and its predictions on the held-out portion are
    compared with the desired outcomes.

    Returns a tuple (accuracy, precisions, recalls, f1s, raw):

    * accuracy -- fraction of held-out rows whose predicted outcome
      equals the desired one (0.0 when there are no held-out rows);
    * precisions, recalls, f1s -- dicts from each class in
      non_ignored_classes to the value nltk reports for that class;
    * raw -- the mapping from row to (desired, predicted) returned by
      predicted_and_actual_outcomes.
    """
    corpus = read_gvfi(corpus_path)
    (training, heldout) = split_corpus(corpus, denominator, intercept)
    model, _ = train(training, 30, "lbfgs", 2)
    raw = predicted_and_actual_outcomes(model, heldout)

    # predicteds/desireds will be maps from classes (=outcomes) to sets of
    # corpus rows predicted/desired to be in that class:
    predicteds = dict()
    desireds = dict()
    for outcome in non_ignored_classes:
        for dictionary in predicteds, desireds:
            dictionary[outcome] = set()

    num_correct = 0
    for (row, (desired, predicted)) in raw.items():
        predicteds[predicted].add(row)
        desireds[desired].add(row)
        if desired == predicted:
            num_correct += 1
            # TODO: If we weigh by the annotation's trust, we
            # should use the annotation's trust here (and
            # elsewhere) instead of 1.

    # An empty held-out set would otherwise raise ZeroDivisionError.
    total = len(raw)
    accuracy = float(num_correct) / float(total) if total else 0.0

    precisions = dict()
    recalls = dict()
    f1s = dict()
    for outcome in non_ignored_classes:
        # nltk's metrics take (reference, test): the reference set is the
        # gold standard (desired rows) and the test set is the system
        # output (predicted rows).  Passing them the other way round
        # reports recall as precision and vice versa.
        reference = desireds[outcome]
        test = predicteds[outcome]
        precisions[outcome] = nltk.precision(reference, test)
        recalls[outcome] = nltk.recall(reference, test)
        f1s[outcome] = nltk.f_measure(reference, test)
        # TODO: feed it the right alpha (third arg) for f1.

    return accuracy, precisions, recalls, f1s, raw
def evaluate(corpus_path, denominator, intercept):
    """Train a model on part of a corpus and evaluate it on the remainder.

    read_gvfi loads the corpus from corpus_path; split_corpus(corpus,
    denominator, intercept) yields the (training, heldout) pair.  The
    model trained on the training part is applied to the held-out part
    and its predictions are scored against the desired outcomes.

    Returns (accuracy, precisions, recalls, f1s, raw), where accuracy is
    the share of held-out rows predicted correctly (0.0 if there are no
    held-out rows), the three dicts are keyed by the classes in
    non_ignored_classes and hold the per-class values computed by nltk,
    and raw maps each row to its (desired, predicted) pair as returned by
    predicted_and_actual_outcomes.
    """
    corpus = read_gvfi(corpus_path)
    (training, heldout) = split_corpus(corpus, denominator, intercept)
    model, _ = train(training, 30, 'lbfgs', 2)
    raw = predicted_and_actual_outcomes(model, heldout)

    # predicteds/desireds will be maps from classes (=outcomes) to sets of
    # corpus rows predicted/desired to be in that class:
    predicteds = dict()
    desireds = dict()
    for outcome in non_ignored_classes:
        predicteds[outcome] = set()
        desireds[outcome] = set()

    num_correct = 0
    for (row, (desired, predicted)) in raw.items():
        predicteds[predicted].add(row)
        desireds[desired].add(row)
        if desired == predicted:
            num_correct += 1
            # TODO: If we weigh by the annotation's trust, we
            # should use the annotation's trust here (and
            # elsewhere) instead of 1.

    # Guard the division: with no held-out rows there is nothing to
    # divide by, so report 0.0 instead of raising ZeroDivisionError.
    num_rows = len(raw)
    if num_rows:
        accuracy = float(num_correct) / float(num_rows)
    else:
        accuracy = 0.0

    precisions = dict()
    recalls = dict()
    f1s = dict()
    for outcome in non_ignored_classes:
        # The nltk scoring functions expect (reference, test) with the
        # gold-standard set first and the system's output second, so the
        # desired rows are the reference and the predicted rows the test.
        # With the two swapped, precision and recall trade places.
        reference = desireds[outcome]
        test = predicteds[outcome]
        precisions[outcome] = nltk.precision(reference, test)
        recalls[outcome] = nltk.recall(reference, test)
        f1s[outcome] = nltk.f_measure(reference, test)
        # TODO: feed it the right alpha (third arg) for f1.

    return accuracy, precisions, recalls, f1s, raw
#!/usr/bin/env python """Gets the named entities in each article. Outputs a pickled dict from id_article to entity type to a list of entity names. """ import nltk import pickle from datum import Datum, read_gvfi from optparse import OptionParser option_parser = OptionParser() (options, [corpus_path, out_path]) = option_parser.parse_args() corpus = read_gvfi(corpus_path) def named_entities(datum): """Return a dict from entity type to a list of named entities of that type in the given datum. """ result = {} for sent in nltk.sent_tokenize(datum.article_snippet): for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))): if hasattr(chunk, 'node'): result.setdefault(chunk.node, []) result[chunk.node].append('_'.join(c[0].lower() for c in chunk.leaves())) return result result = dict() for datums in corpus.values(): print '.',
#!/usr/bin/env python
"""Gets the named entities in each article.

Outputs a pickled dict from id_article to entity type to a list of
entity names.
"""

import nltk
import pickle

from datum import Datum, read_gvfi
from optparse import OptionParser

# The command line must carry exactly two positional arguments, which are
# unpacked into the corpus path and the output path.
option_parser = OptionParser()
(options, [corpus_path, out_path]) = option_parser.parse_args()

corpus = read_gvfi(corpus_path)


def named_entities(datum):
    """Return the named entities of a datum, grouped by entity type.

    Each sentence of datum.article_snippet is word-tokenized, POS-tagged
    and chunked with nltk.ne_chunk.  A chunk that carries a ``node``
    attribute is recorded under that ``node`` value; the recorded name is
    the chunk's leaves' first elements, lower-cased and joined by
    underscores.

    The result is a dict from ``node`` value to a list of such names.
    """
    grouped = {}
    sentences = nltk.sent_tokenize(datum.article_snippet)
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        for chunk in nltk.ne_chunk(nltk.pos_tag(tokens)):
            if hasattr(chunk, 'node'):
                bucket = grouped.setdefault(chunk.node, [])
                bucket.append('_'.join(leaf[0].lower()
                                       for leaf in chunk.leaves()))
    return grouped


result = dict()