def main(): usage = "%prog [options] <results-files>" description = "Evaluates parse results stored in files by comparing "\ "them to the gold standard results stored with them, using any "\ "a variety of metrics." parser = OptionParser(usage=usage, description=description) parser.add_option("--errors", dest="errors", action="store_true", help="display errors reading in the files.") parser.add_option("--unscored", dest="unscored", action="store_true", help="output a list of files containing no results (i.e. no successful full parses) and exit") parser.add_option("-m", "--metric", dest="metric", action="store", help="semantics distance metric to use. Use '-m help' for a list of available metrics") parser.add_option("--mopt", "--metric-options", dest="mopts", action="append", help="options to pass to the semantics metric. Use with '--mopt help' with -m to see available options") parser.add_option("--mc", "--metric-computation", dest="print_computation", action="store_true", help="show the metric's computation trace for each input") options, arguments = parser.parse_args() if len(arguments) == 0: print >>sys.stderr, "Specify at least one file to read the results from" sys.exit(1) deprec_metric = command_line_metric(formalism, "deprec") deps_metric = command_line_metric(formalism, "deps") # Try loading all the input files preses = [] input_pairs = [] errors = [] covered = 0 input_filenames = [] for filename in arguments: try: pres = ParseResults.from_file(filename) except ParseResults.LoadError, err: if options.errors: # Print all load errors print >>sys.stderr, "Error loading file: %s" % (err) errors.append(filename) continue preses.append(pres) # Try to get a gold standard result gold_result = pres.get_gold_semantics() if gold_result is None: # Can't evaluate this: ignore it if options.unscored: print "No gold result for", filename continue # Get the top result's semantics if len(pres.semantics) == 0: # No results for this input_pairs.append((None, gold_result)) input_filenames.append(filename) continue top_result = pres.semantics[0][1] # Got a result and gold result for this covered += 1 input_pairs.append((top_result, gold_result)) input_filenames.append(filename)
def main(): usage = "%prog [options] <results-files>" description = "Evaluates parse results stored in files by comparing "\ "them to the gold standard results stored with them, using any "\ "a variety of metrics." parser = OptionParser(usage=usage, description=description) parser.add_option("--tabbed", dest="tabbed", action="store_true", help="output a tabbed table of values") options, arguments = parser.parse_args() if len(arguments) == 0: print >> sys.stderr, "Specify at least one file to read the results from" sys.exit(1) deprec_metric = command_line_metric(formalism, "deprec", options="output=f") deps_metric = command_line_metric(formalism, "deps", options="output=f") # Try loading all the input files input_pairs = [] errors = [] covered = 0 input_filenames = [] for filename in arguments: try: pres = ParseResults.from_file(filename) except ParseResults.LoadError, err: errors.append(filename) continue # Try to get a gold standard result gold_result = pres.get_gold_semantics() if gold_result is None: # Can't evaluate this: ignore it if options.unscored: print "No gold result for", filename continue # Get the top result's semantics if len(pres.semantics) == 0: # No results for this input_pairs.append((None, gold_result)) input_filenames.append(filename) continue top_result = pres.semantics[0][1] # Got a result and gold result for this covered += 1 input_pairs.append((top_result, gold_result)) input_filenames.append(filename)
def main(): usage = "%prog [options] <results-files>" description = "Evaluates parse results stored in files by comparing "\ "them to the gold standard results stored with them, using any "\ "a variety of metrics." parser = OptionParser(usage=usage, description=description) parser.add_option("--tabbed", dest="tabbed", action="store_true", help="output a tabbed table of values") options, arguments = parser.parse_args() if len(arguments) == 0: print >>sys.stderr, "Specify at least one file to read the results from" sys.exit(1) deprec_metric = command_line_metric(formalism, "deprec", options="output=f") deps_metric = command_line_metric(formalism, "deps", options="output=f") # Try loading all the input files input_pairs = [] errors = [] covered = 0 input_filenames = [] for filename in arguments: try: pres = ParseResults.from_file(filename) except ParseResults.LoadError, err: errors.append(filename) continue # Try to get a gold standard result gold_result = pres.get_gold_semantics() if gold_result is None: # Can't evaluate this: ignore it if options.unscored: print "No gold result for", filename continue # Get the top result's semantics if len(pres.semantics) == 0: # No results for this input_pairs.append((None, gold_result)) input_filenames.append(filename) continue top_result = pres.semantics[0][1] # Got a result and gold result for this covered += 1 input_pairs.append((top_result, gold_result)) input_filenames.append(filename)
def main(): usage = "%prog [options] <results-dir1> <results-dir2>" description = "Measures statistical significance of two sets of results "\ "using stratified shuffling. Only works with f-score metrics." parser = OptionParser(usage=usage, description=description) parser.add_option("-m", "--metric", dest="metric", action="store", help="semantics distance metric to use. Use '-m help' for a list of available metrics") parser.add_option("--mopt", "--metric-options", dest="mopts", action="append", help="options to pass to the semantics metric. Use with '--mopt help' with -m to see available options") parser.add_option("--mc", "--metric-computation", dest="print_computation", action="store_true", help="show the metric's computation trace for each input") parser.add_option("-q", "--quiet", dest="quiet", action="store_true", help="just output the p-value, nothing else") parser.add_option("-i", "--iterations", dest="iterations", action="store", type="int", help="number of shuffles to do. Default: 100,000", default=100000) parser.add_option("-p", "--pattern", dest="pattern", action="store", help="filename glob pattern to look for in the directories. Default: *.res", default="*.res") parser.add_option("-e", "--exhaustive", dest="exhaustive", action="store_true", help="perform all possible shuffles exhaustively. You probably never want to do this. If not set, shuffles randomly for a fixed number of iterations") options, arguments = parser.parse_args() metric = command_line_metric(formalism, options.metric, options.mopts or []) if not isinstance(metric, FScoreMetric): print >>sys.stderr, "%s is not an f-score metric. The script is only for f-scores" sys.exit(1) if len(arguments) < 2: print >>sys.stderr, "Specify two directories to read results from" sys.exit(1) res_dir1 = arguments[0] res_dir2 = arguments[1] # Look for .res files in the two directories filenames1 = glob(os.path.join(res_dir1, options.pattern)) filenames2 = glob(os.path.join(res_dir2, options.pattern)) # We must be able to pair the filenames basenames1 = [os.path.basename(fn) for fn in filenames1] basenames2 = [os.path.basename(fn) for fn in filenames2] for basename2 in basenames2: if basename2 not in basenames1: print "No result in set 1 for %s" % basename2 for basename1 in basenames1: if basename1 not in basenames2: print "No result in set 2 for %s" % basename1 # Only use filenames that are in both directories basenames = list(set(basenames1) & set(basenames2)) def _load_res(filename): try: return ParseResults.from_file(filename) except ParseResults.LoadError, err: if not options.quiet: print >>sys.stderr, "Error loading file %s: %s" % (filename, err)
def main(): usage = "%prog [options] <results-files>" description = "Evaluates parse results stored in files by comparing "\ "them to the gold standard results stored with them, using any "\ "a variety of metrics." parser = OptionParser(usage=usage, description=description) parser.add_option("--errors", dest="errors", action="store_true", help="display errors reading in the files.") parser.add_option("--unscored", dest="unscored", action="store_true", help="output a list of files containing no results (i.e. no successful full parses) and exit") parser.add_option("--timeout", dest="timeout", action="store_true", help="output a list of parses that timed out") parser.add_option("-m", "--metric", dest="metric", action="store", help="semantics distance metric to use. Use '-m help' for a list of available metrics") parser.add_option("--mopt", "--metric-options", dest="mopts", action="append", help="options to pass to the semantics metric. Use with '--mopt help' with -m to see available options") parser.add_option("--mc", "--metric-computation", dest="print_computation", action="store_true", help="show the metric's computation trace for each input") parser.add_option("-f", "--f-score", dest="f_score", action="store_true", help="outputs recall, precision and f-score for an f-score-based metric. Just uses the same metric 3 times with output=recall, etc. Will only work with appropriate metrics") parser.add_option("-q", "--quiet", dest="quiet", action="store_true", help="just output the numbers, nothing else") parser.add_option("-t", "--time", dest="time", action="store_true", help="output average parse time. This is output by default, but hidden in quiet mode unless this switch is used") options, arguments = parser.parse_args() if options.f_score: # Special case: get 3 metrics metrics = [] opts = options.mopts or [] for opt in [ "output=precision", "output=recall" ]: metrics.append(command_line_metric(formalism, options.metric, opts+[opt])) if not options.quiet: print "Evaluating precision, recall and f-score on %s" % metrics[0].name else: # Get a metric according to the options metrics = [command_line_metric(formalism, options.metric, options.mopts)] if not options.quiet: print "Evaluating using metric: %s" % metrics[0].name if len(arguments) == 0: print >>sys.stderr, "Specify at least one file to read the results from" sys.exit(1) # Try loading all the input files input_pairs = [] errors = [] covered = 0 input_filenames = [] times = [] timed_out = 0 for filename in arguments: # We read in the whole file (it's pickled, so we have to), but don't # keep the pres object after the loop iteration, because it can # be very big try: pres = ParseResults.from_file(filename) except ParseResults.LoadError, err: if options.errors: # Print all load errors print >>sys.stderr, "Error loading file: %s" % (err) errors.append(filename) continue if options.timeout and pres.timed_out: print "Timed out: %s" % filename if pres.timed_out: timed_out += 1 # Try to get a gold standard result gold_result = pres.get_gold_semantics() if gold_result is None: # Can't evaluate this: ignore it if not options.quiet: print "No gold result for", filename continue # Get the top result's semantics if len(pres.semantics) == 0: # No results for this input_pairs.append((None, gold_result)) input_filenames.append(filename) if options.unscored: print "No results: %s" % filename continue top_result = pres.semantics[0][1] # Got a result and gold result for this covered += 1 input_pairs.append((top_result, gold_result)) 
input_filenames.append(filename) # Check this for compat with old stored results if hasattr(pres, 'cpu_time'): times.append(pres.cpu_time)
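
# The reporting tail of this script is elided too. A plausible sketch,
# assuming the same total_distance()/format_distance() interface as the
# consistency script below and reusing the covered/times/timed_out counters
# built in the loop; _report is a hypothetical helper:
def _report(metrics, input_pairs, covered, times, timed_out, quiet, show_time):
    if not quiet:
        print "Evaluated on %d inputs (%d with no result, %d timed out)" % \
            (len(input_pairs), len(input_pairs) - covered, timed_out)
    for metric in metrics:
        distance = metric.total_distance(input_pairs)
        print "%s: %s" % (metric.identifier.capitalize(),
                          metric.format_distance(distance))
    # Average parse time: shown by default, hidden in quiet mode unless -t
    # was given (cpu_time is assumed to be in seconds)
    if times and (not quiet or show_time):
        print "Average parse time: %.2fs" % (sum(times) / len(times))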
def main(): usage = "%prog [options] <consistency-data>" description = "Evaluates annotator consistency." parser = OptionParser(usage=usage, description=description) parser.add_option("-m", "--metric", dest="metric", action="store", help="semantics distance metric to use. Use '-m help' for a list of "\ "available metrics") parser.add_option("--mopt", "--metric-options", dest="mopts", action="append", help="options to pass to the semantics metric. Use with '--mopt help' "\ "with -m to see available options") parser.add_option("-f", "--f-score", dest="f_score", action="store_true", help="outputs recall, precision and f-score for an f-score-based "\ "metric. Just uses the same metric 3 times with output=recall, "\ "etc. Will only work with appropriate metrics") options, arguments = parser.parse_args() grammar = get_grammar() if options.metric is not None: use_metric = True if options.f_score: # Special case: get 3 metrics metrics = [] opts = options.mopts or [] for opt in [ "output=precision", "output=recall", "output=f" ]: metrics.append(command_line_metric(formalism, options.metric, opts+[opt])) print "Evaluating precision, recall and f-score on %s" % metrics[0].name else: # Get a metric according to the options metrics = [command_line_metric(formalism, options.metric, options.mopts)] print "Evaluating using metric: %s" % metrics[0].name else: use_metric = False if len(arguments) < 1: print >>sys.stderr, "Specify a consistency data file" sys.exit(1) filename = arguments[0] consdata = ConsistencyData.from_file(filename) # Count up matching annotations matches = 0 chords = 0 for ann1,ann2 in consdata: for chord1,chord2 in zip(ann1,ann2): chords += 1 if chord1.category == chord2.category: matches += 1 # Count matching coordination points rean_coords = sum(sum( [1 for crd in seq if crd.treeinfo.coord_unresolved]) for seq,gs in consdata) + \ sum(sum( [1 for crd in seq if crd.treeinfo.coord_resolved]) for seq,gs in consdata) gold_coords = sum(sum( [1 for crd in gs if crd.treeinfo.coord_unresolved]) for seq,gs in consdata) + \ sum(sum( [1 for crd in gs if crd.treeinfo.coord_resolved]) for seq,gs in consdata) match_coords = sum(sum( [1 for crdr,crdg in zip(seq,gs) if crdr.treeinfo.coord_unresolved and crdg.treeinfo.coord_unresolved]) for seq,gs in consdata) + \ sum(sum( [1 for crdr,crdg in zip(seq,gs) if crdr.treeinfo.coord_resolved and crdg.treeinfo.coord_resolved]) for seq,gs in consdata) # Compute precision, recall and f-score from this precision = 100.0 * (matches + match_coords) / (chords + rean_coords) recall = 100.0 * (matches + match_coords) / (chords + gold_coords) fscore = 2.0 * precision * recall / (precision+recall) print "%d chords" % chords print "\nCategory and coordination accuracy:" print "Precision: %.2f" % precision print "Recall: %.2f" % recall print "F-score: %.2f" % fscore if use_metric: print def _parse_seq(seq): # Parse the annotations to get a semantics try: gold_parses = parse_sequence_with_annotations( DbInput.from_sequence(seq), grammar=grammar, allow_subparses=False) # Got a result: return its semantics return gold_parses[0].semantics except ParseError, err: # Could not parse annotated sequence print >>sys.stderr, "Could not parse sequence '%s': %s" % \ (seq.string_name, err) return # Prepare pairs of gold-standard parse results from the two annotations sem_pairs = [ (_parse_seq(ann1), _parse_seq(ann2)) for (ann1,ann2) in consdata ] # Compute the distance using the metrics for metric in metrics: distance = metric.total_distance(sem_pairs) print "%s: %s" % 
(metric.identifier.capitalize(), metric.format_distance(distance))
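
# A quick worked example of the accuracy computation above, with made-up
# counts: 90 of 100 chords matching, 8 coordination points marked by the
# re-annotator (rean_coords), 10 in the gold annotation (gold_coords), and
# 6 agreeing (match_coords):
#     precision = 100.0 * (90 + 6) / (100 + 8)   ->  88.89
#     recall    = 100.0 * (90 + 6) / (100 + 10)  ->  87.27
#     fscore    = 2 * p * r / (p + r)            ->  88.07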
def main(): usage = "%prog [options] <consistency-data>" description = "Evaluates annotator consistency." parser = OptionParser(usage=usage, description=description) parser.add_option("-m", "--metric", dest="metric", action="store", help="semantics distance metric to use. Use '-m help' for a list of "\ "available metrics") parser.add_option("--mopt", "--metric-options", dest="mopts", action="append", help="options to pass to the semantics metric. Use with '--mopt help' "\ "with -m to see available options") parser.add_option("-f", "--f-score", dest="f_score", action="store_true", help="outputs recall, precision and f-score for an f-score-based "\ "metric. Just uses the same metric 3 times with output=recall, "\ "etc. Will only work with appropriate metrics") options, arguments = parser.parse_args() grammar = get_grammar() if options.metric is not None: use_metric = True if options.f_score: # Special case: get 3 metrics metrics = [] opts = options.mopts or [] for opt in ["output=precision", "output=recall", "output=f"]: metrics.append( command_line_metric(formalism, options.metric, opts + [opt])) print "Evaluating precision, recall and f-score on %s" % metrics[ 0].name else: # Get a metric according to the options metrics = [ command_line_metric(formalism, options.metric, options.mopts) ] print "Evaluating using metric: %s" % metrics[0].name else: use_metric = False if len(arguments) < 1: print >> sys.stderr, "Specify a consistency data file" sys.exit(1) filename = arguments[0] consdata = ConsistencyData.from_file(filename) # Count up matching annotations matches = 0 chords = 0 for ann1, ann2 in consdata: for chord1, chord2 in zip(ann1, ann2): chords += 1 if chord1.category == chord2.category: matches += 1 # Count matching coordination points rean_coords = sum(sum( [1 for crd in seq if crd.treeinfo.coord_unresolved]) for seq,gs in consdata) + \ sum(sum( [1 for crd in seq if crd.treeinfo.coord_resolved]) for seq,gs in consdata) gold_coords = sum(sum( [1 for crd in gs if crd.treeinfo.coord_unresolved]) for seq,gs in consdata) + \ sum(sum( [1 for crd in gs if crd.treeinfo.coord_resolved]) for seq,gs in consdata) match_coords = sum(sum( [1 for crdr,crdg in zip(seq,gs) if crdr.treeinfo.coord_unresolved and crdg.treeinfo.coord_unresolved]) for seq,gs in consdata) + \ sum(sum( [1 for crdr,crdg in zip(seq,gs) if crdr.treeinfo.coord_resolved and crdg.treeinfo.coord_resolved]) for seq,gs in consdata) # Compute precision, recall and f-score from this precision = 100.0 * (matches + match_coords) / (chords + rean_coords) recall = 100.0 * (matches + match_coords) / (chords + gold_coords) fscore = 2.0 * precision * recall / (precision + recall) print "%d chords" % chords print "\nCategory and coordination accuracy:" print "Precision: %.2f" % precision print "Recall: %.2f" % recall print "F-score: %.2f" % fscore if use_metric: print def _parse_seq(seq): # Parse the annotations to get a semantics try: gold_parses = parse_sequence_with_annotations( DbInput.from_sequence(seq), grammar=grammar, allow_subparses=False) # Got a result: return its semantics return gold_parses[0].semantics except ParseError, err: # Could not parse annotated sequence print >>sys.stderr, "Could not parse sequence '%s': %s" % \ (seq.string_name, err) return # Prepare pairs of gold-standard parse results from the two annotations sem_pairs = [(_parse_seq(ann1), _parse_seq(ann2)) for (ann1, ann2) in consdata] # Compute the distance using the metrics for metric in metrics: distance = metric.total_distance(sem_pairs) print "%s: %s" % 
(metric.identifier.capitalize(), metric.format_distance(distance))