Example #1
def main():
    usage = "%prog [options] <results-files>"
    description = "Evaluates parse results stored in files by comparing "\
        "them to the gold standard results stored with them, using any "\
        "a variety of metrics."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--errors", dest="errors", action="store_true", help="display errors reading in the files.")
    parser.add_option("--unscored", dest="unscored", action="store_true", help="output a list of files containing no results (i.e. no successful full parses) and exit")
    parser.add_option("-m", "--metric", dest="metric", action="store", help="semantics distance metric to use. Use '-m help' for a list of available metrics")
    parser.add_option("--mopt", "--metric-options", dest="mopts", action="append", help="options to pass to the semantics metric. Use with '--mopt help' with -m to see available options")
    parser.add_option("--mc", "--metric-computation", dest="print_computation", action="store_true", help="show the metric's computation trace for each input")
    options, arguments = parser.parse_args()
        
    if len(arguments) == 0:
        print >>sys.stderr, "Specify at least one file to read the results from"
        sys.exit(1)
    
    deprec_metric = command_line_metric(formalism, "deprec")
    deps_metric = command_line_metric(formalism, "deps")
    
    # Try loading all the input files
    preses = []
    input_pairs = []
    errors = []
    covered = 0
    input_filenames = []
    for filename in arguments:
        try:
            pres = ParseResults.from_file(filename)
        except ParseResults.LoadError, err:
            if options.errors:
                # Print all load errors
                print >>sys.stderr, "Error loading file: %s" % (err)
            errors.append(filename)
            continue
        preses.append(pres)
        
        # Try to get a gold standard result
        gold_result = pres.get_gold_semantics()
        if gold_result is None:
            # Can't evaluate this: ignore it
            if options.unscored:
                print "No gold result for", filename
            continue
        
        # Get the top result's semantics
        if len(pres.semantics) == 0:
            # No results for this
            input_pairs.append((None, gold_result))
            input_filenames.append(filename)
            continue
        top_result = pres.semantics[0][1]
        
        # Got a result and gold result for this
        covered += 1
        input_pairs.append((top_result, gold_result))
        input_filenames.append(filename)
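
    # Hypothetical continuation (not part of the original excerpt): score the
    # collected (top result, gold result) pairs with the two metrics built
    # above, assuming they expose the total_distance/format_distance/identifier
    # interface used by the consistency examples further down.
    for metric in [deprec_metric, deps_metric]:
        distance = metric.total_distance(input_pairs)
        print "%s: %s" % (metric.identifier.capitalize(),
                          metric.format_distance(distance))
    print "Parses covered: %d/%d" % (covered, len(input_pairs))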
Example #2
def main():
    usage = "%prog [options] <results-files>"
    description = "Evaluates parse results stored in files by comparing "\
        "them to the gold standard results stored with them, using any "\
        "a variety of metrics."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--tabbed",
                      dest="tabbed",
                      action="store_true",
                      help="output a tabbed table of values")
    options, arguments = parser.parse_args()

    if len(arguments) == 0:
        print >> sys.stderr, "Specify at least one file to read the results from"
        sys.exit(1)

    deprec_metric = command_line_metric(formalism,
                                        "deprec",
                                        options="output=f")
    deps_metric = command_line_metric(formalism, "deps", options="output=f")

    # Try loading all the input files
    input_pairs = []
    errors = []
    covered = 0
    input_filenames = []
    for filename in arguments:
        try:
            pres = ParseResults.from_file(filename)
        except ParseResults.LoadError, err:
            errors.append(filename)
            continue

        # Try to get a gold standard result
        gold_result = pres.get_gold_semantics()
        if gold_result is None:
            # Can't evaluate this: ignore it
            if options.unscored:
                print "No gold result for", filename
            continue

        # Get the top result's semantics
        if len(pres.semantics) == 0:
            # No results for this
            input_pairs.append((None, gold_result))
            input_filenames.append(filename)
            continue
        top_result = pres.semantics[0][1]

        # Got a result and gold result for this
        covered += 1
        input_pairs.append((top_result, gold_result))
        input_filenames.append(filename)
Example #3
def main():
    usage = "%prog [options] <results-files>"
    description = "Evaluates parse results stored in files by comparing "\
        "them to the gold standard results stored with them, using any "\
        "a variety of metrics."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--tabbed", dest="tabbed", action="store_true", help="output a tabbed table of values")
    options, arguments = parser.parse_args()
        
    if len(arguments) == 0:
        print >>sys.stderr, "Specify at least one file to read the results from"
        sys.exit(1)
    
    deprec_metric = command_line_metric(formalism, "deprec", options="output=f")
    deps_metric = command_line_metric(formalism, "deps", options="output=f")
    
    # Try loading all the input files
    input_pairs = []
    errors = []
    covered = 0
    input_filenames = []
    for filename in arguments:
        try:
            pres = ParseResults.from_file(filename)
        except ParseResults.LoadError, err:
            errors.append(filename)
            continue
        
        # Try to get a gold standard result
        gold_result = pres.get_gold_semantics()
        if gold_result is None:
            # Can't evaluate this: ignore it
            if options.unscored:
                print "No gold result for", filename
            continue
        
        # Get the top result's semantics
        if len(pres.semantics) == 0:
            # No results for this
            input_pairs.append((None, gold_result))
            input_filenames.append(filename)
            continue
        top_result = pres.semantics[0][1]
        
        # Got a result and gold result for this
        covered += 1
        input_pairs.append((top_result, gold_result))
        input_filenames.append(filename)
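
    # Hypothetical continuation (not part of the original excerpt): evaluate
    # the two f-score metrics over the collected pairs and honour --tabbed by
    # printing one tab-separated row instead of labelled lines. Assumes the
    # total_distance/format_distance/identifier interface seen in the
    # consistency examples further down.
    scores = []
    for metric in [deprec_metric, deps_metric]:
        distance = metric.total_distance(input_pairs)
        scores.append((metric.identifier, metric.format_distance(distance)))
    if options.tabbed:
        print "\t".join(value for (name, value) in scores)
    else:
        for name, value in scores:
            print "%s: %s" % (name, value)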
Example #4
def main():
    usage = "%prog [options] <results-dir1> <results-dir2>"
    description = "Measures statistical significance of two sets of results "\
        "using stratified shuffling. Only works with f-score metrics."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-m", "--metric", dest="metric", action="store", help="semantics distance metric to use. Use '-m help' for a list of available metrics")
    parser.add_option("--mopt", "--metric-options", dest="mopts", action="append", help="options to pass to the semantics metric. Use with '--mopt help' with -m to see available options")
    parser.add_option("--mc", "--metric-computation", dest="print_computation", action="store_true", help="show the metric's computation trace for each input")
    parser.add_option("-q", "--quiet", dest="quiet", action="store_true", help="just output the p-value, nothing else")
    parser.add_option("-i", "--iterations", dest="iterations", action="store", type="int", help="number of shuffles to do. Default: 100,000", default=100000)
    parser.add_option("-p", "--pattern", dest="pattern", action="store", help="filename glob pattern to look for in the directories. Default: *.res", default="*.res")
    parser.add_option("-e", "--exhaustive", dest="exhaustive", action="store_true", help="perform all possible shuffles exhaustively. You probably never want to do this. If not set, shuffles randomly for a fixed number of iterations")
    options, arguments = parser.parse_args()
    
    metric = command_line_metric(formalism, options.metric, options.mopts or [])
    if not isinstance(metric, FScoreMetric):
        print >>sys.stderr, "%s is not an f-score metric. The script is only for f-scores"
        sys.exit(1)
    
    if len(arguments) < 2:
        print >>sys.stderr, "Specify two directories to read results from"
        sys.exit(1)
    res_dir1 = arguments[0]
    res_dir2 = arguments[1]
    
    # Look for .res files in the two directories
    filenames1 = glob(os.path.join(res_dir1, options.pattern))
    filenames2 = glob(os.path.join(res_dir2, options.pattern))
    
    # We must be able to pair the filenames
    basenames1 = [os.path.basename(fn) for fn in filenames1]
    basenames2 = [os.path.basename(fn) for fn in filenames2]
    for basename2 in basenames2:
        if basename2 not in basenames1:
            print "No result in set 1 for %s" % basename2
    for basename1 in basenames1:
        if basename1 not in basenames2:
            print "No result in set 2 for %s" % basename1
    # Only use filenames that are in both directories
    basenames = list(set(basenames1) & set(basenames2))
    
    def _load_res(filename):
        try:
            return ParseResults.from_file(filename)
        except ParseResults.LoadError, err:
            if not options.quiet:
                print >>sys.stderr, "Error loading file %s: %s" % (filename, err)
Example #5
def main():
    usage = "%prog [options] <results-files>"
    description = "Evaluates parse results stored in files by comparing "\
        "them to the gold standard results stored with them, using any "\
        "a variety of metrics."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--errors", dest="errors", action="store_true", help="display errors reading in the files.")
    parser.add_option("--unscored", dest="unscored", action="store_true", help="output a list of files containing no results (i.e. no successful full parses) and exit")
    parser.add_option("--timeout", dest="timeout", action="store_true", help="output a list of parses that timed out")
    parser.add_option("-m", "--metric", dest="metric", action="store", help="semantics distance metric to use. Use '-m help' for a list of available metrics")
    parser.add_option("--mopt", "--metric-options", dest="mopts", action="append", help="options to pass to the semantics metric. Use with '--mopt help' with -m to see available options")
    parser.add_option("--mc", "--metric-computation", dest="print_computation", action="store_true", help="show the metric's computation trace for each input")
    parser.add_option("-f", "--f-score", dest="f_score", action="store_true", help="outputs recall, precision and f-score for an f-score-based metric. Just uses the same metric 3 times with output=recall, etc. Will only work with appropriate metrics")
    parser.add_option("-q", "--quiet", dest="quiet", action="store_true", help="just output the numbers, nothing else")
    parser.add_option("-t", "--time", dest="time", action="store_true", help="output average parse time. This is output by default, but hidden in quiet mode unless this switch is used")
    options, arguments = parser.parse_args()
        
    if options.f_score:
        # Special case: get 3 metrics
        metrics = []
        opts = options.mopts or []
        for opt in [ "output=precision", "output=recall" ]:
            metrics.append(command_line_metric(formalism, options.metric, 
                                                                opts+[opt]))
        if not options.quiet:
            print "Evaluating precision, recall and f-score on %s" % metrics[0].name
    else:
        # Get a metric according to the options
        metrics = [command_line_metric(formalism, options.metric, options.mopts)]
        if not options.quiet:
            print "Evaluating using metric: %s" % metrics[0].name
    
    if len(arguments) == 0:
        print >>sys.stderr, "Specify at least one file to read the results from"
        sys.exit(1)
    
    # Try loading all the input files
    input_pairs = []
    errors = []
    covered = 0
    input_filenames = []
    times = []
    timed_out = 0
    for filename in arguments:
        # We read in the whole file (it's pickled, so we have to), but don't 
        #  keep the pres object after the loop iteration, because it can 
        #  be very big
        try:
            pres = ParseResults.from_file(filename)
        except ParseResults.LoadError, err:
            if options.errors:
                # Print all load errors
                print >>sys.stderr, "Error loading file: %s" % (err)
            errors.append(filename)
            continue
        
        if options.timeout and pres.timed_out:
            print "Timed out: %s" % filename
        if pres.timed_out:
            timed_out += 1
        
        # Try to get a gold standard result
        gold_result = pres.get_gold_semantics()
        if gold_result is None:
            # Can't evaluate this: ignore it
            if not options.quiet:
                print "No gold result for", filename
            continue
        
        # Get the top result's semantics
        if len(pres.semantics) == 0:
            # No results for this
            input_pairs.append((None, gold_result))
            input_filenames.append(filename)
            if options.unscored:
                print "No results: %s" % filename
            continue
        top_result = pres.semantics[0][1]
        
        # Got a result and gold result for this
        covered += 1
        input_pairs.append((top_result, gold_result))
        input_filenames.append(filename)
        # Check this for compat with old stored results
        if hasattr(pres, 'cpu_time'):
            times.append(pres.cpu_time)
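
    # Hypothetical continuation (not in the original excerpt): summarise
    # coverage and timing, then score the pairs with each metric, assuming
    # the total_distance/format_distance/identifier interface used in the
    # consistency examples below.
    if not options.quiet:
        print "Coverage: %d/%d (timed out: %d)" % (covered, len(input_pairs), timed_out)
    if times and (options.time or not options.quiet):
        print "Average parse time: %.2fs" % (sum(times) / float(len(times)))
    for metric in metrics:
        distance = metric.total_distance(input_pairs)
        print "%s: %s" % (metric.identifier.capitalize(),
                          metric.format_distance(distance))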
Example #6
def main():
    usage = "%prog [options] <consistency-data>"
    description = "Evaluates annotator consistency."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-m", "--metric", dest="metric", action="store", 
        help="semantics distance metric to use. Use '-m help' for a list of "\
            "available metrics")
    parser.add_option("--mopt", "--metric-options", dest="mopts", 
        action="append", 
        help="options to pass to the semantics metric. Use with '--mopt help' "\
            "with -m to see available options")
    parser.add_option("-f", "--f-score", dest="f_score", action="store_true", 
        help="outputs recall, precision and f-score for an f-score-based "\
            "metric. Just uses the same metric 3 times with output=recall, "\
            "etc. Will only work with appropriate metrics")
    options, arguments = parser.parse_args()
    
    grammar = get_grammar()
    
    if options.metric is not None:
        use_metric = True
        if options.f_score:
            # Special case: get 3 metrics
            metrics = []
            opts = options.mopts or []
            for opt in [ "output=precision", "output=recall", "output=f" ]:
                metrics.append(command_line_metric(formalism, options.metric, 
                                                                    opts+[opt]))
            print "Evaluating precision, recall and f-score on %s" % metrics[0].name
        else:
            # Get a metric according to the options
            metrics = [command_line_metric(formalism, options.metric, options.mopts)]
            print "Evaluating using metric: %s" % metrics[0].name
    else:
        use_metric = False
    
    
    if len(arguments) < 1:
        print >>sys.stderr, "Specify a consistency data file"
        sys.exit(1)
    filename = arguments[0]
    
    consdata = ConsistencyData.from_file(filename)
    
    # Count up matching annotations
    matches = 0
    chords = 0
    for ann1,ann2 in consdata:
        for chord1,chord2 in zip(ann1,ann2):
            chords += 1
            if chord1.category == chord2.category:
                matches += 1
    # Count coordination points: those marked in the re-annotation (rean_coords),
    # those marked in the gold annotation (gold_coords), and those marked the
    # same way (resolved or unresolved) in both (match_coords)
    rean_coords = sum(sum(
                    [1 for crd in seq if crd.treeinfo.coord_unresolved])
                        for seq,gs in consdata) + \
                  sum(sum(
                    [1 for crd in seq if crd.treeinfo.coord_resolved])
                        for seq,gs in consdata)
    gold_coords = sum(sum(
                    [1 for crd in gs if crd.treeinfo.coord_unresolved])
                        for seq,gs in consdata) + \
                  sum(sum(
                    [1 for crd in gs if crd.treeinfo.coord_resolved])
                        for seq,gs in consdata)
    match_coords = sum(sum(
                    [1 for crdr,crdg in zip(seq,gs) if 
                                            crdr.treeinfo.coord_unresolved 
                                            and crdg.treeinfo.coord_unresolved])
                        for seq,gs in consdata) + \
                   sum(sum(
                    [1 for crdr,crdg in zip(seq,gs) if 
                                            crdr.treeinfo.coord_resolved 
                                            and crdg.treeinfo.coord_resolved])
                        for seq,gs in consdata)
    # Compute precision, recall and f-score from this
    precision = 100.0 * (matches + match_coords) / (chords + rean_coords)
    recall = 100.0 * (matches + match_coords) / (chords + gold_coords)
    fscore = 2.0 * precision * recall / (precision+recall)
    print "%d chords" % chords
    print "\nCategory and coordination accuracy:"
    print "Precision: %.2f" % precision
    print "Recall: %.2f" % recall
    print "F-score: %.2f" % fscore
    
    if use_metric:
        print 
        def _parse_seq(seq):
            # Parse the annotations to get a semantics
            try:
                gold_parses = parse_sequence_with_annotations(
                                                    DbInput.from_sequence(seq), 
                                                    grammar=grammar,
                                                    allow_subparses=False)
                # Got a result: return its semantics
                return gold_parses[0].semantics
            except ParseError, err:
                # Could not parse annotated sequence
                print >>sys.stderr, "Could not parse sequence '%s': %s" % \
                                                        (seq.string_name, err)
                return 
        
        # Prepare pairs of gold-standard parse results from the two annotations
        sem_pairs = [
            (_parse_seq(ann1), _parse_seq(ann2)) for (ann1,ann2) in consdata
        ]
        # Compute the distance using the metrics
        for metric in metrics:
            distance = metric.total_distance(sem_pairs)
            print "%s: %s" % (metric.identifier.capitalize(), 
                              metric.format_distance(distance))
Example #7
def main():
    usage = "%prog [options] <consistency-data>"
    description = "Evaluates annotator consistency."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-m", "--metric", dest="metric", action="store",
        help="semantics distance metric to use. Use '-m help' for a list of "\
            "available metrics")
    parser.add_option("--mopt", "--metric-options", dest="mopts",
        action="append",
        help="options to pass to the semantics metric. Use with '--mopt help' "\
            "with -m to see available options")
    parser.add_option("-f", "--f-score", dest="f_score", action="store_true",
        help="outputs recall, precision and f-score for an f-score-based "\
            "metric. Just uses the same metric 3 times with output=recall, "\
            "etc. Will only work with appropriate metrics")
    options, arguments = parser.parse_args()

    grammar = get_grammar()

    if options.metric is not None:
        use_metric = True
        if options.f_score:
            # Special case: get 3 metrics
            metrics = []
            opts = options.mopts or []
            for opt in ["output=precision", "output=recall", "output=f"]:
                metrics.append(
                    command_line_metric(formalism, options.metric,
                                        opts + [opt]))
            print "Evaluating precision, recall and f-score on %s" % metrics[
                0].name
        else:
            # Get a metric according to the options
            metrics = [
                command_line_metric(formalism, options.metric, options.mopts)
            ]
            print "Evaluating using metric: %s" % metrics[0].name
    else:
        use_metric = False

    if len(arguments) < 1:
        print >> sys.stderr, "Specify a consistency data file"
        sys.exit(1)
    filename = arguments[0]

    consdata = ConsistencyData.from_file(filename)

    # Count up matching annotations
    matches = 0
    chords = 0
    for ann1, ann2 in consdata:
        for chord1, chord2 in zip(ann1, ann2):
            chords += 1
            if chord1.category == chord2.category:
                matches += 1
    # Count coordination points: those marked in the re-annotation (rean_coords),
    # those marked in the gold annotation (gold_coords), and those marked the
    # same way (resolved or unresolved) in both (match_coords)
    rean_coords = sum(sum(
                    [1 for crd in seq if crd.treeinfo.coord_unresolved])
                        for seq,gs in consdata) + \
                  sum(sum(
                    [1 for crd in seq if crd.treeinfo.coord_resolved])
                        for seq,gs in consdata)
    gold_coords = sum(sum(
                    [1 for crd in gs if crd.treeinfo.coord_unresolved])
                        for seq,gs in consdata) + \
                  sum(sum(
                    [1 for crd in gs if crd.treeinfo.coord_resolved])
                        for seq,gs in consdata)
    match_coords = sum(sum(
                    [1 for crdr,crdg in zip(seq,gs) if
                                            crdr.treeinfo.coord_unresolved
                                            and crdg.treeinfo.coord_unresolved])
                        for seq,gs in consdata) + \
                   sum(sum(
                    [1 for crdr,crdg in zip(seq,gs) if
                                            crdr.treeinfo.coord_resolved
                                            and crdg.treeinfo.coord_resolved])
                        for seq,gs in consdata)
    # Compute precision, recall and f-score from this
    precision = 100.0 * (matches + match_coords) / (chords + rean_coords)
    recall = 100.0 * (matches + match_coords) / (chords + gold_coords)
    fscore = 2.0 * precision * recall / (precision + recall)
    print "%d chords" % chords
    print "\nCategory and coordination accuracy:"
    print "Precision: %.2f" % precision
    print "Recall: %.2f" % recall
    print "F-score: %.2f" % fscore

    if use_metric:
        print

        def _parse_seq(seq):
            # Parse the annotations to get a semantics
            try:
                gold_parses = parse_sequence_with_annotations(
                    DbInput.from_sequence(seq),
                    grammar=grammar,
                    allow_subparses=False)
                # Got a result: return its semantics
                return gold_parses[0].semantics
            except ParseError, err:
                # Could not parse annotated sequence
                print >>sys.stderr, "Could not parse sequence '%s': %s" % \
                                                        (seq.string_name, err)
                return

        # Prepare pairs of gold-standard parse results from the two annotations
        sem_pairs = [(_parse_seq(ann1), _parse_seq(ann2))
                     for (ann1, ann2) in consdata]
        # Compute the distance using the metrics
        for metric in metrics:
            distance = metric.total_distance(sem_pairs)
            print "%s: %s" % (metric.identifier.capitalize(),
                              metric.format_distance(distance))
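
A quick way to sanity-check the accuracy arithmetic used in the last two examples, with toy numbers rather than anything from the original code:

# Toy check of the precision/recall/f-score arithmetic above: 90 of 100
# chords match, the re-annotation marks 6 coordination points, the gold
# annotation marks 5, and 4 are marked the same way in both.
matches, chords = 90, 100
match_coords, rean_coords, gold_coords = 4, 6, 5
precision = 100.0 * (matches + match_coords) / (chords + rean_coords)  # 88.68
recall = 100.0 * (matches + match_coords) / (chords + gold_coords)     # 89.52
fscore = 2.0 * precision * recall / (precision + recall)               # 89.10
print "P=%.2f R=%.2f F=%.2f" % (precision, recall, fscore)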