Example #1
def evaluate(corpus_path, denominator, intercept):
    corpus = read_gvfi(corpus_path)
    (training, heldout) = split_corpus(corpus, denominator, intercept)

    model, _ = train(training, 30, "lbfgs", 2)

    raw = predicted_and_actual_outcomes(model, heldout)

    # predicteds/desireds will be maps from classes (=outcomes) to sets of
    # corpus rows predicted/desired to be in that class:
    predicteds = dict()
    desireds = dict()
    for outcome in non_ignored_classes:
        for dictionary in predicteds, desireds:
            dictionary[outcome] = set()

    num_correct = 0
    for (row, (desired, predicted)) in raw.items():
        predicteds[predicted].add(row)
        desireds[desired].add(row)
        if desired == predicted:
            num_correct += 1
            # TODO: If we weigh by the annotation's trust, we
            # should use the annotation's trust here (and
            # elsewhere) instead of 1.
    accuracy = float(num_correct) / float(len(raw))
    precisions = dict()
    recalls = dict()
    f1s = dict()
    for outcome in non_ignored_classes:
        # NLTK's set metrics take the gold rows as the "reference" set and
        # the predicted rows as the "test" set, so the two assignments must
        # not be swapped (precision divides by |test|, recall by |reference|).
        reference = desireds[outcome]
        test = predicteds[outcome]
        precisions[outcome] = nltk.precision(reference, test)
        recalls[outcome] = nltk.recall(reference, test)
        # f_measure's alpha defaults to 0.5, which is exactly the balanced F1.
        f1s[outcome] = nltk.f_measure(reference, test)
    return accuracy, precisions, recalls, f1s, raw
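
Since the scoring relies on NLTK's set-based metric helpers, here is a small self-contained sketch of how nltk.precision, nltk.recall, and nltk.f_measure behave; the row IDs are made up for illustration:

import nltk

reference = set(['row1', 'row2', 'row3'])  # rows whose desired class is C
test = set(['row2', 'row3', 'row4'])       # rows predicted to be in class C

print nltk.precision(reference, test)  # |reference & test| / |test| = 2/3
print nltk.recall(reference, test)     # |reference & test| / |reference| = 2/3
print nltk.f_measure(reference, test)  # alpha defaults to 0.5, the balanced F1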
Example #2
#!/usr/bin/env python

"""Gets the named entities in each article.  Outputs a pickled dict from
   id_article to entity type to a list of entity names.
"""

import nltk
import pickle
from datum    import Datum, read_gvfi
from optparse import OptionParser

option_parser = OptionParser()
(options, [corpus_path, out_path]) = option_parser.parse_args()

corpus = read_gvfi(corpus_path)

def named_entities(datum):
	"""Return a dict from entity type to a list of named entities of that
	   type in the given datum.
	"""
	result = {}
	for sent in nltk.sent_tokenize(datum.article_snippet):
		for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
			if hasattr(chunk, 'node'):
				result.setdefault(chunk.node, [])
				result[chunk.node].append('_'.join(c[0].lower() for c in chunk.leaves()))
	return result

result = dict()
# Build the mapping described in the docstring and pickle it; this assumes
# corpus maps each id_article to its Datum.
for id_article, datum in corpus.items():
    print '.',  # progress dot per processed article
    result[id_article] = named_entities(datum)
print

with open(out_path, 'wb') as out_file:
    pickle.dump(result, out_file)
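
This script uses the old NLTK 2 tree API, where entity chunks expose a .node attribute; NLTK 3 replaced that attribute with the label() method, so the hasattr(chunk, 'node') test no longer finds entities there. A minimal sketch of the same extraction against the NLTK 3 API, with only the chunk test and label access changed:

import nltk

def named_entities_nltk3(text):
    """NLTK 3 variant: entity subtrees are nltk.Tree instances whose
       label() is the entity type (e.g. PERSON, GPE, ORGANIZATION).
    """
    # Requires the punkt, averaged_perceptron_tagger, maxent_ne_chunker
    # and words data packages (installable via nltk.download).
    result = {}
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if isinstance(chunk, nltk.Tree):
                result.setdefault(chunk.label(), [])
                result[chunk.label()].append(
                    '_'.join(word.lower() for word, tag in chunk.leaves()))
    return result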
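
Reading the pickled output back takes only a pickle.load; the path below is a hypothetical stand-in for the out_path argument given on the command line above:

import pickle

out_path = 'entities.pickle'  # hypothetical path; use the out_path from above

with open(out_path, 'rb') as pickled:
    entities_by_article = pickle.load(pickled)

# entities_by_article maps id_article -> entity type -> list of entity
# names, per the module docstring.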