def options_help_text(options, intro=None):
    """
    Build a formatted help string describing every option in the list.

    Required options are listed first, with "(REQUIRED)" appended to their
    usage column. If ``intro`` is given, it is prepended as an underlined
    heading. Returns the whole help text as a single string.
    """
    if not options:
        return "This module has no options"
    from jazzparser.utils.tableprint import pprint_table
    from StringIO import StringIO
    required = [o for o in options if o.required]
    optional = [o for o in options if not o.required]
    # One table row per option: name, usage, description — required first
    table_rows = [[o.name, "%s (REQUIRED)" % o.usage, o.help_text]
                  for o in required]
    table_rows += [[o.name, o.usage, o.help_text] for o in optional]
    buf = StringIO()
    # Render the rows as a table into the string buffer
    pprint_table(buf, table_rows, separator="", justs=[True, True, True],
                 widths=[None, 35, 40], blank_row=True)
    text = buf.getvalue()
    buf.close()
    if intro is not None:
        # Prefix the intro as an underlined heading
        text = "%s\n%s\n%s" % (intro, "=" * len(intro), text)
    return text
def count_categories(options, arguments): # Make a Django query to get all the chord data query = Chord.objects.exclude(sequence__analysis_omitted=True) # Allow blank categories to be ignored if options.no_blanks: print >> sys.stderr, "Excluding unannotated chords" query = query.exclude(category="") categories = query.values('category').annotate( count=Count('id')).order_by('category') total = query.count() table_header = [['Category', 'Count', '%']] table_data = [] for data in categories: category = data['category'] and "%s" % data['category'] or "No category" percent = float(data['count']) / float(total) * 100.0 table_data.append([category, data['count'], percent]) # Sort the rows by the count table_data = reversed(sorted(table_data, key=lambda d: d[1])) # Now format the numbers table_data = [[row[0], "%s" % row[1], "%.02f" % row[2]] for row in table_data] # Add the header on the top table_data = table_header + table_data if options.csv: print "\n".join([",".join([v for v in row]) for row in table_data]) else: pprint_table(sys.stdout, table_data, [True, False, False], "|") print "Total chords: %s" % total return 0
def count_categories(options, arguments):
    """
    Count how many times each category annotation is used across the chord
    corpus and print the result as a table (or CSV with ``options.csv``).

    Returns 0 on success (shell-style exit code).
    """
    # Make a Django query to get all the chord data
    query = Chord.objects.exclude(sequence__analysis_omitted=True)
    # Allow blank categories to be ignored
    if options.no_blanks:
        print >>sys.stderr, "Excluding unannotated chords"
        query = query.exclude(category="")
    # Group by category and count the chords in each group
    categories = query.values('category').annotate(count=Count('id')).order_by('category')
    total = query.count()
    table_header = [['Category','Count','%']]
    table_data = []
    for data in categories:
        # Falsy (blank) categories are displayed as "No category"
        category = data['category'] and "%s" % data['category'] or "No category"
        percent = float(data['count']) / float(total) * 100.0
        table_data.append([category, data['count'], percent])
    # Sort the rows by the count
    table_data = reversed(sorted(table_data, key=lambda d: d[1]))
    # Now format the numbers
    table_data = [[row[0], "%s" % row[1], "%.02f" % row[2]] for row in table_data]
    # Add the header on the top
    table_data = table_header + table_data
    if options.csv:
        print "\n".join([",".join([v for v in row]) for row in table_data])
    else:
        pprint_table(sys.stdout, table_data, [True,False,False], "|")
        print "Total chords: %s" % total
    return 0
def count_categories(options, arguments): # Read in the sequence data from the file filename = os.path.abspath(arguments[0]) seqs = SequenceIndex.from_file(filename) category_counts = {} total = 0 # Count up how many times each category is used for seq in seqs.sequences: for chord in seq.iterator(): total += 1 if chord.category not in category_counts: category_counts[chord.category] = 1 else: category_counts[chord.category] += 1 table_header = [['Category','Count','%']] table_data = [] for cat,count in category_counts.items(): category = cat or "No category" percent = float(count) / float(total) * 100.0 table_data.append([category, count, percent]) # Sort the rows by the count table_data = reversed(sorted(table_data, key=lambda d: d[1])) # Now format the numbers table_data = [[row[0], "%s" % row[1], "%.02f" % row[2]] for row in table_data] # Add the header on the top table_data = table_header + table_data if options.csv: print "\n".join([",".join([v for v in row]) for row in table_data]) else: pprint_table(sys.stdout, table_data, [True,False,False], "|") print "Total chords: %s" % total return 0
def run(self, args, state):
    """
    Shell 'help' command.

    With no arguments, prints a table of all available commands. With a
    command name as the first argument, prints detailed usage information
    for that command, including its options.
    """
    from jazzparser.utils.tableprint import pprint_table
    import sys
    if len(args) == 0:
        # Print the command usage info
        table = []
        for tool in state.all_tools:
            # List any alternative names the command can be invoked by
            if len(tool.commands) > 1:
                alts = " [Alternatively: %s]" % ", ".join( tool.commands[1:])
            else:
                alts = ""
            # If the command has options, list them here as well
            if len(tool.tool_options) != 0:
                opts = "\nOptions: %s" % ", ".join(\
                    [opt.name for opt in tool.tool_options])
            else:
                opts = ""
            table.append([tool.usage[0], tool.usage[1] + alts + opts])
        pprint_table(sys.stdout, table, default_just=True, widths=[30,50], \
                     blank_row=True, hanging_indent=4)
        print "\nType 'help <command>' for detailed help about a command"
    else:
        command = args[0]
        if command not in state.tools:
            print "%s is not a valid command." % command
            print "Type 'help' for a full command list."
        else:
            tool = state.tools[command]
            title = "%s Shell Command" % tool.name
            # Compile the help text for the tool's options
            if len(tool.tool_options):
                opts = "\nOptions:"
                # Put required options first
                for opt in [o for o in tool.tool_options if o.required]:
                    opts += "\n %s %s (REQUIRED)\n %s" % \
                        (opt.name, opt.usage, \
                         "\n ".join(wrap(opt.help_text, 75)))
                # Then all the rest.
                # NOTE(review): unlike the required options above, these
                # entries get no leading space — possibly unintentional
                for opt in [ o for o in tool.tool_options if not o.required ]:
                    opts += "\n%s %s\n %s" % \
                        (opt.name, opt.usage, \
                         "\n ".join(wrap(opt.help_text, 75)))
            else:
                opts = ""
            # Print out all of the info
            print """\
%s
%s
Usage: %s %s
Command aliases: %s
%s%s""" % (title, "=" * len(title), tool.usage[0], tool.usage[1],
           ", ".join( tool.commands), tool.help, opts)
def main(): parser = OptionParser() parser.add_option( "-t", "--tagger", dest="tagger", action="store_true", help= "The tagger component to use (full python path to the tagger class). Default: %s" % DEFAULT_TAGGER) options, arguments = parser.parse_args() if options.tagger is not None: tagger = options.tagger else: tagger = DEFAULT_TAGGER # Use the default grammar grammar = Grammar() tagger_class = get_tagger(tagger) total_entropy = 0.0 total_chords = 0 # Compile the data for displaying in a table data = [] for sequence in ChordSequence.objects.filter(analysis_omitted=False): print "Analyzing entropy of model on %s" % sequence.name # Calculate the total word-level entropy of this sequence sequence_chords = list(sequence.iterator()) entropy, sequence_length = sequence_entropy(sequence_chords, grammar, tagger_class) data.append({ 'name': sequence.name.encode('ascii', 'replace'), 'entropy': entropy, 'length': sequence_length, 'entropy_per_chord': (sequence_length != 0 and (entropy / sequence_length) or 0.0), }) if sequence_length: total_entropy += entropy total_chords += sequence_length # Display a table of the results table_data = [['Sequence', 'Entropy', 'Chords', 'Entropy per chord']] + [[ d['name'], "%.4f" % d['entropy'], "%d" % d['length'], "%.4f" % d['entropy_per_chord'] ] for d in data] pprint_table(sys.stdout, table_data, [True, False, False, False]) # Calculate the perplexity over the whole set perplexity = math.pow(2, total_entropy / total_chords) print "### Entropy per chord: %.4f" % (total_entropy / total_chords) print "### Perplexity = %.4f" % perplexity
def run(self, args, state): from jazzparser.utils.tableprint import pprint_table import sys if len(args) == 0: # Print the command usage info table = [] for tool in state.all_tools: if len(tool.commands) > 1: alts = " [Alternatively: %s]" % ", ".join(tool.commands[1:]) else: alts = "" # If the command has options, list them here as well if len(tool.tool_options) != 0: opts = "\nOptions: %s" % ", ".join(\ [opt.name for opt in tool.tool_options]) else: opts = "" table.append([tool.usage[0], tool.usage[1]+alts+opts]) pprint_table(sys.stdout, table, default_just=True, widths=[30,50], \ blank_row=True, hanging_indent=4) print "\nType 'help <command>' for detailed help about a command" else: command = args[0] if command not in state.tools: print "%s is not a valid command." % command print "Type 'help' for a full command list." else: tool = state.tools[command] title = "%s Shell Command" % tool.name # Compile the help text for the tool's options if len(tool.tool_options): opts = "\nOptions:" # Put required options first for opt in [o for o in tool.tool_options if o.required]: opts += "\n %s %s (REQUIRED)\n %s" % \ (opt.name, opt.usage, \ "\n ".join(wrap(opt.help_text, 75))) # Then all the rest for opt in [o for o in tool.tool_options if not o.required]: opts += "\n%s %s\n %s" % \ (opt.name, opt.usage, \ "\n ".join(wrap(opt.help_text, 75))) else: opts = "" # Print out all of the info print """\ %s %s Usage: %s %s Command aliases: %s %s%s""" % (title, "=" * len(title), tool.usage[0], tool.usage[1], ", ".join(tool.commands), tool.help, opts)
def main():
    """
    Dump the contents of a sequence index file in a human-readable form.

    Prints each sequence's chords and durations as a table, optionally with
    category annotations, coordination markers and sequence metadata,
    depending on the command-line options.
    """
    usage = "%prog [options] <seq-file>"
    description = "Outputs the details of all chord sequences from a "\
        "sequence index file to stdout. This is for getting a "\
        "(relatively) human-readable form of the data"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--categories", "-c", dest="categories", action="store_true", help="include category annotations")
    parser.add_option("--coordinations", "-o", dest="coordinations", action="store_true", help="include coordination annotations")
    parser.add_option("--meta", "-m", dest="meta", action="store_true", help="output sequence meta data")
    parser.add_option("--no-map", "-n", dest="no_map", action="store_true", help="don't apply a mapping from the names in the corpus to those used in the paper")
    parser.add_option("--all", "-a", dest="all", action="store_true", help="output everything")
    options, arguments = parser.parse_args()
    if len(arguments) < 1:
        print "You must specify a sequence file"
        sys.exit(1)
    # Get the chord sequence
    seqs = SequenceIndex.from_file(arguments[0])
    # Show the song name
    for seq in seqs:
        print "Chords for '%s'" % seq.string_name
        if options.meta or options.all:
            # Sequence-level metadata
            print "Main key: %s" % seq.key
            print "Bar length: %d" % seq.bar_length
        # Put together a table of chords plus annotations (if requested):
        # first row holds the chords, second their durations
        data = [[ str(chord) for chord in seq ], [ str(chord.duration) for chord in seq ]]
        if options.categories or options.all:
            if options.no_map:
                # Don't apply any mapping to the category names
                data.append([ chord.category for chord in seq ])
            else:
                # Map the names to those used in the paper/thesis
                data.append([ annotation_to_lexicon_name(chord.category) for chord in seq ])
        if options.coordinations or options.all:
            # One marker per chord: ")(" if both flags set, ")" if resolved,
            # "(" if unresolved, "" otherwise
            coords = []
            for chord in seq:
                ti = chord.treeinfo
                if ti.coord_resolved and ti.coord_unresolved:
                    coords.append(")(")
                elif ti.coord_resolved:
                    coords.append(")")
                elif ti.coord_unresolved:
                    coords.append("(")
                else:
                    coords.append("")
            data.append(coords)
        # One table column per chord
        pprint_table(sys.stdout, data, default_just=True)
        print
def main(): parser = OptionParser() usage = "%prog [options] [<seq-db-file>]" description = "Measure the degree of ambiguity (average cats per chord) "\ "for a grammar over a particular dataset" parser.add_option('-g', '--grammar', dest='grammar', action='store', help='Speficy a grammar by name') options, arguments = parser.parse_args() if len(arguments) < 1: print "No sequence index file given: grammar stats only" seq_file = None else: seq_file = arguments[0] # Load the grammar grammar = get_grammar(options.grammar) # Some stats about ambiguity in the grammar table = [] class_cats = [] for class_name, chord_class in grammar.chord_classes.items(): if class_name not in EXCLUDE_CLASSES: cats = grammar.get_signs_for_word(str(chord_class.words[0])) table.append([str(class_name), str(len(cats))]) class_cats.append(len(cats)) table.append(["Mean", "%.2f" % (float(sum(class_cats)) / len(class_cats))]) table.append(["Std dev", "%.2f" % (std(class_cats))]) print "Cats for each chord class:" pprint_table(sys.stdout, table, justs=[True, True]) # Ambiguity stats on the dataset if seq_file is not None: seqs = SequenceIndex.from_file(arguments[0]) counts = [] for seq in seqs: for chord in seq: cats = grammar.get_signs_for_word(chord) counts.append(len(cats)) table = [] table.append(["Chords", str(len(counts))]) table.append( ["Cats per chord", "%.2f" % (float(sum(counts)) / len(counts))]) table.append(["Std dev", "%.2f" % (std(counts))]) print pprint_table(sys.stdout, table, justs=[True, True])
def main():
    """
    Report ambiguity statistics (categories per chord) for a grammar,
    optionally measured over a sequence dataset given as an argument.
    """
    parser = OptionParser()
    # NOTE(review): usage and description are built here but never passed to
    # OptionParser(), so they don't appear in --help output
    usage = "%prog [options] [<seq-db-file>]"
    description = "Measure the degree of ambiguity (average cats per chord) "\
        "for a grammar over a particular dataset"
    # NOTE(review): "Speficy" is a typo in the user-visible help text
    parser.add_option('-g', '--grammar', dest='grammar', action='store', help='Speficy a grammar by name')
    options, arguments = parser.parse_args()
    if len(arguments) < 1:
        print "No sequence index file given: grammar stats only"
        seq_file = None
    else:
        seq_file = arguments[0]
    # Load the grammar
    grammar = get_grammar(options.grammar)
    # Some stats about ambiguity in the grammar
    table = []
    class_cats = []
    for class_name,chord_class in grammar.chord_classes.items():
        if class_name not in EXCLUDE_CLASSES:
            # Categories assigned to a representative word of this class
            cats = grammar.get_signs_for_word(str(chord_class.words[0]))
            table.append([str(class_name), str(len(cats))])
            class_cats.append(len(cats))
    table.append(["Mean", "%.2f" % (float(sum(class_cats))/len(class_cats))])
    table.append(["Std dev", "%.2f" % (std(class_cats))])
    print "Cats for each chord class:"
    pprint_table(sys.stdout, table, justs=[True, True])
    # Ambiguity stats on the dataset
    if seq_file is not None:
        seqs = SequenceIndex.from_file(arguments[0])
        counts = []
        for seq in seqs:
            for chord in seq:
                # Categories the grammar offers for this chord
                cats = grammar.get_signs_for_word(chord)
                counts.append(len(cats))
        table = []
        table.append(["Chords", str(len(counts))])
        table.append(["Cats per chord", "%.2f" % (float(sum(counts)) / len(counts))])
        table.append(["Std dev", "%.2f" % (std(counts))])
        print
        pprint_table(sys.stdout, table, justs=[True, True])
def list_results(results, silent): """ Like jazzparser.parser.list_results, but shows probabilities. Note this doesn't obey the Latex option because I couldn't be bothered. """ import math def _fmt_index(i): return format(i, " >3d") if len(results) == 0: if not silent: print "No results" elif silent: # Only print the results themselves if we're in silent mode for i in range(len(results)): print "%s, %s" % (results[i], fmt_prob(results[i].probability)) else: previous_prob = None # Get the highest scoring probability to compute the ratio of the others if len(results): log_highest_prob = results[0].probability print "Log highest prob: %s" % log_highest_prob table = [["", "", "Prob", "Ratio", "Sign"]] for i in range(len(results)): # Mark where probabilities are identical if previous_prob == results[i].probability: same_marker = "*" else: same_marker = " " # Compute the ratio to the highest probability prob_ratio = math.exp(results[i].probability - log_highest_prob) table.append([ "%s>" % _fmt_index(i), same_marker, fmt_prob(math.exp(results[i].probability)), "%.4f" % prob_ratio, str(results[i]) ]) previous_prob = results[i].probability pprint_table(sys.stdout, table, justs=[True, True, True, True, True])
def main(): parser = OptionParser() parser.add_option("-t", "--tagger", dest="tagger", action="store_true", help="The tagger component to use (full python path to the tagger class). Default: %s" % DEFAULT_TAGGER) options, arguments = parser.parse_args() if options.tagger is not None: tagger = options.tagger else: tagger = DEFAULT_TAGGER # Use the default grammar grammar = Grammar() tagger_class = get_tagger(tagger) total_entropy = 0.0 total_chords = 0 # Compile the data for displaying in a table data = [] for sequence in ChordSequence.objects.filter(analysis_omitted=False): print "Analyzing entropy of model on %s" % sequence.name # Calculate the total word-level entropy of this sequence sequence_chords = list(sequence.iterator()) entropy,sequence_length = sequence_entropy(sequence_chords, grammar, tagger_class) data.append( { 'name' : sequence.name.encode('ascii', 'replace'), 'entropy' : entropy, 'length' : sequence_length, 'entropy_per_chord' : (sequence_length!=0 and (entropy/sequence_length) or 0.0), }) if sequence_length: total_entropy += entropy total_chords += sequence_length # Display a table of the results table_data = [['Sequence', 'Entropy', 'Chords', 'Entropy per chord']] + [ [ d['name'], "%.4f" % d['entropy'], "%d" % d['length'], "%.4f" % d['entropy_per_chord'] ] for d in data ] pprint_table(sys.stdout, table_data, [True, False, False, False]) # Calculate the perplexity over the whole set perplexity = math.pow(2, total_entropy/total_chords) print "### Entropy per chord: %.4f" % (total_entropy/total_chords) print "### Perplexity = %.4f" % perplexity
def list_results(results, silent):
    """
    Like jazzparser.parser.list_results, but shows probabilities.
    Note this doesn't obey the Latex option because I couldn't be bothered.
    """
    import math
    def _fmt_index(i):
        # Right-align the index in a 3-character field
        return format(i, " >3d")
    if len(results) == 0:
        if not silent:
            print "No results"
    elif silent:
        # Only print the results themselves if we're in silent mode
        for i in range(len(results)):
            print "%s, %s" % (results[i], fmt_prob(results[i].probability))
    else:
        previous_prob = None
        # Get the highest scoring probability to compute the ratio of the others
        # (the first result has the highest log probability)
        if len(results):
            log_highest_prob = results[0].probability
            print "Log highest prob: %s" % log_highest_prob
        table = [["", "", "Prob", "Ratio", "Sign"]]
        for i in range(len(results)):
            # Mark where probabilities are identical to the previous row
            if previous_prob == results[i].probability:
                same_marker = "*"
            else:
                same_marker = " "
            # Compute the ratio to the highest probability
            prob_ratio = math.exp(results[i].probability - log_highest_prob)
            table.append(["%s>" % _fmt_index(i), same_marker, fmt_prob(math.exp(results[i].probability)), "%.4f" % prob_ratio, str(results[i])])
            previous_prob = results[i].probability
        pprint_table(sys.stdout, table, justs=[True,True,True,True,True])
def options_help_text(options, intro=None):
    """
    Produces a load of help text to output to the command line to display
    the usage of all of the options in the list.

    If ``intro`` is given, it is prepended as an underlined heading.
    Returns the help text as a single string.
    """
    if len(options) == 0:
        return "This module has no options"
    from jazzparser.utils.tableprint import pprint_table
    from StringIO import StringIO
    rows = []
    # Put required options first
    for opt in [o for o in options if o.required]:
        rows.append([opt.name, "%s (REQUIRED)" % opt.usage, opt.help_text])
    # Then the optional ones
    for opt in [o for o in options if not o.required]:
        rows.append([opt.name, opt.usage, opt.help_text])
    output = StringIO()
    # Print the options in a nice table
    pprint_table(output, rows, separator="", justs=[True, True, True],
                 widths=[None, 35, 40], blank_row=True)
    strout = output.getvalue()
    output.close()
    if intro is not None:
        # Prepend the intro as an underlined heading
        strout = "%s\n%s\n%s" % (intro, "=" * len(intro), strout)
    return strout
def confusion_matrix(matrix):
    """
    Print a confusion matrix as a table on stdout.

    ``matrix`` is a dict keyed by correct value (string); each entry is a
    dict keyed by the value mistaken for it (string), mapping to the number
    of times that mistake was made.
    """
    from jazzparser.utils.tableprint import pprint_table
    import sys
    # Flatten the nested dicts into (correct, incorrect, count) triples
    triples = [[right, wrong, count]
               for right, mistakes in matrix.items()
               for wrong, count in mistakes.items()]
    # Most frequent confusions first
    triples = list(reversed(sorted(triples, key=lambda t: t[2])))
    body = [[right, wrong, str(count)] for right, wrong, count in triples]
    header = [['Correct', 'Incorrect', 'Count'], ['', '', '']]
    return pprint_table(sys.stdout, header + body, separator=" | ",
                        outer_seps=True, justs=[True, True, False])
def main():
    """
    Dump the contents of a sequence index file in a human-readable form.

    Prints each sequence's chords and durations as a table, optionally with
    category annotations, coordination markers and sequence metadata,
    depending on the command-line options.
    """
    usage = "%prog [options] <seq-file>"
    description = "Outputs the details of all chord sequences from a "\
        "sequence index file to stdout. This is for getting a "\
        "(relatively) human-readable form of the data"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--categories", "-c", dest="categories", action="store_true", help="include category annotations")
    parser.add_option("--coordinations", "-o", dest="coordinations", action="store_true", help="include coordination annotations")
    parser.add_option("--meta", "-m", dest="meta", action="store_true", help="output sequence meta data")
    parser.add_option("--no-map", "-n", dest="no_map", action="store_true", help="don't apply a mapping from the names in the corpus to those used in the paper")
    parser.add_option("--all", "-a", dest="all", action="store_true", help="output everything")
    options, arguments = parser.parse_args()
    if len(arguments) < 1:
        print "You must specify a sequence file"
        sys.exit(1)
    # Get the chord sequence
    seqs = SequenceIndex.from_file(arguments[0])
    # Show the song name
    for seq in seqs:
        print "Chords for '%s'" % seq.string_name
        if options.meta or options.all:
            # Sequence-level metadata
            print "Main key: %s" % seq.key
            print "Bar length: %d" % seq.bar_length
        # Put together a table of chords plus annotations (if requested):
        # first row holds the chords, second their durations
        data = [[str(chord) for chord in seq],
                [str(chord.duration) for chord in seq]]
        if options.categories or options.all:
            if options.no_map:
                # Don't apply any mapping to the category names
                data.append([chord.category for chord in seq])
            else:
                # Map the names to those used in the paper/thesis
                data.append([
                    annotation_to_lexicon_name(chord.category)
                    for chord in seq
                ])
        if options.coordinations or options.all:
            # One marker per chord: ")(" if both flags set, ")" if resolved,
            # "(" if unresolved, "" otherwise
            coords = []
            for chord in seq:
                ti = chord.treeinfo
                if ti.coord_resolved and ti.coord_unresolved:
                    coords.append(")(")
                elif ti.coord_resolved:
                    coords.append(")")
                elif ti.coord_unresolved:
                    coords.append("(")
                else:
                    coords.append("")
            data.append(coords)
        # One table column per chord
        pprint_table(sys.stdout, data, default_just=True)
        print
def main():
    """
    Evaluate a tagging model on sequences from an input file.

    Depending on the options, reports tag agreement with the gold standard
    (optionally with a confusion matrix), cross-entropy of the returned tag
    distribution, or (currently unimplemented) tag statistics. Evaluation
    runs per partition, each with its own model, and combines the results.
    """
    def _check_args(args):
        # Require exactly: tagger, model name, input file
        if len(args) != 3:
            print >>sys.stderr, "Specify a tagger, model name and input file"
            sys.exit(1)
        return args[1],args[2]

    partitions,part_ids,options,arguments = prepare_evaluation_options(
        usage = "%prog [options] <tagger> <model-name> <input-file>",
        description = "Evaluate a tagging model by "\
            "tagging sequences from an input file. If the tagger doesn't "\
            "need a model name, use '-' as the model name.",
        check_args = _check_args,
        optparse_groups = [
            (("Tagging",), [
                (("--topt", "--tagger-options"), {'dest':"topts", 'action':"append", 'help':"options to pass to the tagger."}),
            ]),
            (("Output",), [
                (("--no-model-info",), {'dest':"no_model_info", 'action':"store_true", 'help':"turns of outputing of information about the model being used before using it (useful for identifying output piped to a file later, but may be too verbose sometimes)"}),
            ]),
            (("Evaluation", "Type of evaluation and options"), [
                (("-a", "--agreement"), {'dest':"agreement", 'action':"store_true", 'help':"instead of doing any parses, just report the agreement of the tops tags with the gold standard tags."}),
                (("--confusion",), {'dest':"confusion", 'action':"store_true", 'help':"print out confusion matrix after agreement calculation. Applies only in combination with --agreement"}),
                (("-e", "--entropy"), {'dest':"entropy", 'action':"store_true", 'help':"instead of doing any parses, just report the entropy of the returned tag distribution with respect to the gold standard tags."}),
                (("--tag-stats",), {'dest':"tag_stats", 'action':"store_true", 'help':"just output stats about the tags that the model assigns to this sequence (or these sequences)"}),
                (("--topn",), {'dest':"topn", 'type':"int", 'action':"store", 'help':"when evaluating agreement consider the top N tags the tagger returns. By default, allows only the top one to count as a hit.", 'default':1}),
            ]),
        ],
    )
    grammar = Grammar()
    tagger_name = arguments[0]
    model_name = arguments[1]
    # Tagger shouldn't use a model in some cases
    no_tagger_model = model_name == "-"
    # Load the requested tagger class
    tagger_cls = get_tagger(tagger_name)
    topts = ModuleOption.process_option_string(options.topts)

    def _model_info(mname):
        """ Outputs info about the named model """
        if options.no_model_info:
            # Short form: just the model name
            print >>sys.stderr, "Model %s" % mname
        else:
            # Can only output the nice model info if it's a ModelTagger
            if issubclass(tagger_cls, ModelTagger):
                print >>sys.stderr, "======== Model info ========"
                print >>sys.stderr, tagger_cls.MODEL_CLASS.load_model(mname).description
                print >>sys.stderr, "============================"
            else:
                print >>sys.stderr, "Tagger %s using model %s" % (tagger_cls.__name__, mname)

    num_parts = len(partitions)
    # NOTE(review): num_seqs is computed but never used below
    num_seqs = sum([len(p[0]) for p in partitions])
    ################# Evaluation ########################
    if options.tag_stats:
        # NOTE(review): everything below the raise in this branch is dead
        # code kept from an earlier version (it calls a helper that no
        # longer exists — see the comment further down)
        raise NotImplementedError, "fix this if you want it"
        # Print out statistics for each partition, with its model
        if no_tagger_model:
            # There could be some circumstance in which we want to do this,
            # but I can't think what it is, so I'm not implementing it for now
            print >>sys.stderr, "Cannot run tag_stats with no tagger model"
            sys.exit(1)
        all_stats = {}
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            # Output the model training info if requested
            _model_info(model)
            ######## This doesn't exist any more
            stats = sequences_top_tags_dict(tagger_cls, model, sequences, topn=options.topn)
            for tag,num in stats.items():
                if tag in all_stats:
                    all_stats[tag] += stats[tag]
                else:
                    all_stats[tag] = stats[tag]
        pprint_table(sys.stdout, list(reversed(sorted(all_stats.items(), key=lambda r:r[1]))), separator="|")
    elif options.agreement:
        # Print out agreement stats for each partition
        if no_tagger_model:
            # Same as tag_stats: probably no need for this ever
            print >>sys.stderr, "Cannot run agreement with no tagger model"
            sys.exit(1)
        correct = 0
        total = 0
        conf_mat = {}
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            topts['model'] = model
            # Output the model training info if requested
            _model_info(model)
            # Per-partition running totals
            pcorrect = 0
            ptotal = 0
            # Go through each sequence
            for seq in sequences:
                print >>sys.stderr, "Evaluating %s" % seq.string_name
                input = DbInput.from_sequence(seq)
                correct_tags = [chord.category for chord in seq.iterator()]
                cor,tot = tagger_agreement(input, grammar, tagger_cls, correct_tags, options=topts, confusion_matrix=conf_mat, topn=options.topn)
                pcorrect += cor
                ptotal += tot
                print " Sequence: %.1f%%" % (float(cor)/tot*100)
                print " So far: %.1f%%" % (float(pcorrect)/ptotal*100)
            print "Partition %d: %d / %d (%.2f%%)" % (part_num, pcorrect, ptotal, (float(pcorrect)/ptotal*100))
            correct += pcorrect
            total += ptotal
        if num_parts > 1:
            # Print out the overall stats
            print "%d / %d (%f%%)" % (correct,total,(float(correct)/total*100))
        if options.confusion:
            confusion_matrix(conf_mat)
    elif options.entropy:
        print "Calculating cross-entropy of tagger with gold standard tags"
        entropy = 0.0
        num_chords = 0
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            if not no_tagger_model:
                topts['model'] = model
                # Output the model training info if requested
                _model_info(model)
            # Per-partition running totals
            pentropy = 0.0
            pnum_chords = 0
            # Compute the entropy for the partition model
            for seq in sequences:
                print >>sys.stderr, "Evaluating %s" % seq.string_name
                input = " ".join([str(chord) for chord in seq.iterator()])
                correct_tags = [chord.category for chord in seq.iterator()]
                ent,crds = tagger_entropy(input, grammar, tagger_cls, correct_tags, options=topts)
                pentropy += ent
                pnum_chords += crds
                print " %f bits per chord" % (ent/crds)
            print "Partition %d: %f bits per chord (%d chords)" % (part_num, (pentropy/pnum_chords), pnum_chords)
            entropy += pentropy
            num_chords += pnum_chords
        # Print out the stats for all partitions together
        if num_parts > 1:
            print "%f bits per chord (%d chords)" % ((entropy/num_chords), num_chords)
    else:
        print >>sys.stderr, "Select an evaluation operation with one of the options"
        sys.exit(1)
def main(): usage = "%prog [options] <results-files>" description = """\ Read in a ParseResults file, just like result_alignment.py. Examines the \ errors that were made and outputs them in context. """ parser = OptionParser(usage=usage, description=description) parser.add_option("--window", "-w", dest="window", action="store", type="int", help="size of context window to show before and after each error. Default: 2", default=2) parser.add_option("--distance", "--dist", dest="distance", action="store_true", help="show the total distance travelled in the tonal space by the result and the gold standard") parser.add_option("--output-opts", "--oopts", dest="output_opts", action="store", help="options that affect the output formatting. Use '--output-opts help' for a list of options.") parser.add_option("--summary-threshold", dest="summary_threshold", action="store", type="int", help="how many times a substitution/insertion/deletion needs to have happened to be including in the summary (default: 4)", default=4) options, arguments = parser.parse_args() if len(arguments) == 0: print >>sys.stderr, "Specify at least one file to read the results from" sys.exit(1) grammar = get_grammar() grammar.formalism.cl_output_options(options.output_opts) # Size of window of context to show win = options.window errors = [] unscored_files = [] scored = 0 unscored = 0 result_lengths = [] gold_lengths = [] insertions = {} deletions = {} substitutions = {} error_types = {} for filename in arguments: try: top_result, gold_result = get_top_result(filename) except ParseResults.LoadError, err: print >>sys.stderr, "Error loading file: %s" % (err) errors.append(filename) continue else: print "=============================" print "File: %s" % filename if top_result is None: # No alignment was found unscored +=1 print "No result" else: # Wrap these up as a semantics, since some functions need that as input Sems = grammar.formalism.Semantics.Semantics top_sems, gold_sems = Sems(top_result), 
Sems(gold_result) # Do the alignment of the top result and gold result alignment,gold_seq,result_seq = results_alignment(top_result, gold_result) scored += 1 # Get the actual list of coordinates coords = zip(*grammar.formalism.semantics_to_coordinates(gold_sems))[0] funs = zip(*grammar.formalism.semantics_to_functions(gold_sems))[0] gold_coords = zip(coords, funs) coords = zip(*grammar.formalism.semantics_to_coordinates(top_sems))[0] funs = zip(*grammar.formalism.semantics_to_functions(top_sems))[0] result_coords = zip(coords, funs) print "Result length: %d, gold length: %d" % \ (len(result_coords), len(gold_coords)) result_lengths.append(len(result_coords)) gold_lengths.append(len(gold_coords)) if options.distance: # Work out the total distance travelled start, end = gold_coords[-1][0], gold_coords[0][0] gold_vect = end[0] - start[0], end[1] - start[1] # And for the actual result start, end = result_coords[-1][0], result_coords[0][0] result_vect = end[0] - start[0], end[1] - start[1] print "Distance travelled:" print " Gold result:", gold_vect print " Top result: ", result_vect print # Put together a table of error windows table = [ # Header row ["", "Step", "", "Result", "Gold"] ] gold = iter(zip(gold_seq,gold_coords)) result = iter(zip(result_seq,result_coords)) context = [] post_context = 0 unseen = 0 for op in alignment: # Keep a record of how many of each error occur if op not in error_types: error_types[op] = 1 else: error_types[op] += 1 if op == "A": # Aligned pair # Move both sequences on gold_step,gold_point = gold.next() result_step,result_point = result.next() if post_context > 0: # Show this as part of the post-context of an error table.append(["A", str(gold_step), "", str(result_point), str(gold_point)]) context = [] post_context -= 1 else: # Add this to the rolling window of pre-context if len(context) >= win: # We've not shown something here unseen += 1 if win > 0: context.append((gold_step, gold_point, result_step, result_point)) context = 
context[-win:] else: # Mark if there was something we didn't show if unseen: table.append(["", " ...%d..." % unseen, "", "", ""]) unseen = 0 if context: # Show the error's pre-context for (pre_gold_step,pre_gold_point,__,pre_result_point) in context: table.append(["A", str(pre_gold_step), "", str(pre_result_point), str(pre_gold_point)]) context = [] if op == "I": # Inserted in the result result_step,result_point = result.next() table.append(["I", str(result_step), "", str(result_point), ""]) if str(result_step) not in insertions: insertions[str(result_step)] = 1 else: insertions[str(result_step)] += 1 elif op == "D": # Deleted in the result gold_step,gold_point = gold.next() table.append(["D", str(gold_step), "", "", str(gold_point)]) if str(gold_step) not in deletions: deletions[str(gold_step)] = 1 else: deletions[str(gold_step)] += 1 else: # Substituted result_step, result_point = result.next() gold_step, gold_point = gold.next() table.append([str(op), str(result_step), "for %s" % str(gold_step), str(result_point), str(gold_point)]) subst_key = "%s > %s" % (gold_step, result_step) if subst_key not in substitutions: substitutions[subst_key] = 1 else: substitutions[subst_key] += 1 # After anything other than an alignment, cancel the # context window context = [] # Show up to <win> in the post-context of alignments post_context = win # Mark if there was something at the end we didn't show if unseen: table.append(["", " ...%d..." % unseen, "", "", ""]) # Print out the table pprint_table(sys.stdout, table, justs=[True,True,True,True,True]) print "\n"
print "Processed %d result sets" % (scored+unscored) print "Errors processing %d result sets" % len(errors) print "Average result length: %.2f (%d)" % ( float(sum(result_lengths)) / len(result_lengths), sum(result_lengths)) print "Average gold length: %.2f (%d)" % ( float(sum(gold_lengths)) / len(gold_lengths), sum(gold_lengths)) # A table of error types print print "Error types:" error_table = [] for error, count in error_types.items(): if error != "A": error_table.append([error, "%d" % count]) pprint_table(sys.stdout, error_table, justs=[True, False]) # Show common mistakes # Substitutions print print "Common substitutions:" subst_table = [] for subst,count in reversed(sorted(substitutions.items(), key=lambda x:x[1])): if count >= options.summary_threshold: subst_table.append(["%s" % subst, "%d" % count]) pprint_table(sys.stdout, subst_table, justs=[True, False]) # Deletions print print "Common deletions:" del_table = [] for deln,count in reversed(sorted(deletions.items(), key=lambda x:x[1])):
for name, songsem in corpus: # Get the distance from this song dist = metric.distance(result, songsem) distances.append((name, dist, songsem)) # Sort them to get the closest first distances.sort(key=lambda x: x[1]) print # Print out the top results, as many as requested top_results = distances[:print_up_to] table = [["", "Song", "Distance"]] + [[ "*" if res[0] == correct_song else "", "%s" % res[0], "%.2f" % res[1] ] for res in top_results] pprint_table(sys.stdout, table, default_just=True) print if correct_song is not None: # Look for the correct answer in the results for rank, (name, distance, __) in enumerate(distances): # Match up the song name to the correct one if name == correct_song: correct_rank = rank break else: # The song name was not found in the corpus at all correct_rank = None if correct_rank is None: print "Song was not found in corpus"
def main(): usage = "%prog [options] <in-file> [<index1> [<index2> ...]]" description = "Print the names of sequences in a sequence input "\ "file. Optionally specify indices of sequences. If no index "\ "is given, displays all sequences." parser = OptionParser(usage=usage, description=description) parser.add_option("--sa", "-a", "--sort-alpha", "--alpha", dest="alphabetical", action="store_true", help="order sequences alphabetically by name") parser.add_option("--sl", "--sort-length", dest="sort_length", action="store_true", help="order sequences by length") parser.add_option( "-i", "--index", dest="index", action="store_true", help= "also display the indices in the sequence file of each sequence, in the column before the ids" ) parser.add_option("-l", "--lengths", dest="lengths", action="store_true", help="output lengths of the sequences") options, arguments = parser.parse_args() if len(arguments) < 1: print "You must specify an input file" sys.exit(1) seqs = SequenceIndex.from_file(arguments[0]) indices = [int(ind) for ind in arguments[1:]] if len(indices) == 0: sequences = seqs.sequences else: sequences = [seqs.sequence_by_index(index) for index in indices] if options.alphabetical: # Sort by string_name sequences.sort(key=lambda s: s.string_name) elif options.sort_length: # Sort by sequence length sequences.sort(key=lambda s: len(s)) header = ["Song name", "Id"] justs = [True, False] if options.lengths: header.append("Length") justs.append(False) if options.index: header.append("Index") justs.append(False) rows = [header] for seq in sequences: row = [seq.string_name, str(seq.id)] if options.lengths: row.append(str(len(seq))) if options.index: row.append(str(seqs.index_for_id(seq.id))) rows.append(row) pprint_table(sys.stdout, rows, justs=justs)
def main():
    """Command-line entry point: print a table of sequences from an input file.

    Reads a sequence input file (first positional argument) and prints one
    row per sequence: the sequence's name and id, plus optionally its length
    (``-l``) and its index in the file (``-i``).  Further positional
    arguments select individual sequences by index; with none given, all
    sequences are shown.  ``--sa`` sorts rows by name, ``--sl`` by length.
    Exits with status 1 if no input file is given.
    """
    usage = "%prog [options] <in-file> [<index1> [<index2> ...]]"
    description = (
        "Print the names of sequences in a sequence input "
        "file. Optionally specify indices of sequences. If no index "
        "is given, displays all sequences."
    )
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        "--sa",
        "-a",
        "--sort-alpha",
        "--alpha",
        dest="alphabetical",
        action="store_true",
        help="order sequences alphabetically by name",
    )
    parser.add_option(
        "--sl", "--sort-length", dest="sort_length", action="store_true", help="order sequences by length"
    )
    parser.add_option(
        "-i",
        "--index",
        dest="index",
        action="store_true",
        help="also display the indices in the sequence file of each sequence, in the column before the ids",
    )
    parser.add_option("-l", "--lengths", dest="lengths", action="store_true", help="output lengths of the sequences")
    options, arguments = parser.parse_args()

    if len(arguments) < 1:
        print "You must specify an input file"
        sys.exit(1)
    # SequenceIndex is defined elsewhere in the project; presumably parses
    # the sequence file format -- confirm against its module
    seqs = SequenceIndex.from_file(arguments[0])

    # Remaining positional args select individual sequences by index
    indices = [int(ind) for ind in arguments[1:]]
    if len(indices) == 0:
        sequences = seqs.sequences
    else:
        sequences = [seqs.sequence_by_index(index) for index in indices]

    if options.alphabetical:
        # Sort by string_name
        # NOTE(review): when no indices were given, this sorts
        # seqs.sequences in place (sequences aliases it)
        sequences.sort(key=lambda s: s.string_name)
    elif options.sort_length:
        # Sort by sequence length
        sequences.sort(key=lambda s: len(s))

    # Columns: name and id always; length and file index only on request.
    # justs holds the per-column justification flags for pprint_table.
    header = ["Song name", "Id"]
    justs = [True, False]
    if options.lengths:
        header.append("Length")
        justs.append(False)
    if options.index:
        header.append("Index")
        justs.append(False)
    # Build the table, header row first
    rows = [header]
    for seq in sequences:
        row = [seq.string_name, str(seq.id)]
        if options.lengths:
            row.append(str(len(seq)))
        if options.index:
            row.append(str(seqs.index_for_id(seq.id)))
        rows.append(row)
    pprint_table(sys.stdout, rows, justs=justs)
def main():
    """Command-line entry point: examine parse errors in context.

    Reads one or more ParseResults files named on the command line, aligns
    each file's top result against its gold standard, and prints every
    alignment error (insertion ``I``, deletion ``D``, or substitution) in a
    table, together with up to ``--window`` aligned steps of context before
    and after each error.  Per-error counts are accumulated in dicts
    (insertions/deletions/substitutions/error_types) for summary output
    later in the function.

    NOTE(review): this span ends inside the per-file loop; the function
    body continues beyond it.
    """
    usage = "%prog [options] <results-files>"
    description = """\
Read in a ParseResults file, just like result_alignment.py. Examines the \
errors that were made and outputs them in context.
"""
    # Command-line options (Python 2 optparse)
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        "--window", "-w", dest="window", action="store", type="int",
        help="size of context window to show before and after each error. Default: 2",
        default=2)
    parser.add_option(
        "--distance", "--dist", dest="distance", action="store_true",
        help="show the total distance travelled in the tonal space by the result and the gold standard")
    parser.add_option(
        "--output-opts", "--oopts", dest="output_opts", action="store",
        help="options that affect the output formatting. Use '--output-opts help' for a list of options.")
    parser.add_option(
        "--summary-threshold", dest="summary_threshold", action="store", type="int",
        help="how many times a substitution/insertion/deletion needs to have happened to be including in the summary (default: 4)",
        default=4)
    options, arguments = parser.parse_args()

    if len(arguments) == 0:
        # At least one results file is required
        print >> sys.stderr, "Specify at least one file to read the results from"
        sys.exit(1)

    grammar = get_grammar()
    # Hand any output-formatting options over to the formalism
    grammar.formalism.cl_output_options(options.output_opts)

    # Size of window of context to show
    win = options.window

    # Accumulators over all files processed
    errors = []           # filenames that failed to load
    unscored_files = []
    scored = 0            # files that produced a result
    unscored = 0          # files with no result
    result_lengths = []
    gold_lengths = []
    insertions = {}       # inserted step (str) -> count
    deletions = {}        # deleted step (str) -> count
    substitutions = {}    # "gold > result" -> count
    error_types = {}      # alignment op -> count

    for filename in arguments:
        try:
            top_result, gold_result = get_top_result(filename)
        except ParseResults.LoadError, err:
            # Record the failure and carry on with the remaining files
            print >> sys.stderr, "Error loading file: %s" % (err)
            errors.append(filename)
            continue
        else:
            print "============================="
            print "File: %s" % filename

            if top_result is None:
                # No alignment was found
                unscored += 1
                print "No result"
            else:
                # Wrap these up as a semantics, since some functions need that as input
                Sems = grammar.formalism.Semantics.Semantics
                top_sems, gold_sems = Sems(top_result), Sems(gold_result)

                # Do the alignment of the top result and gold result
                alignment, gold_seq, result_seq = results_alignment(
                    top_result, gold_result)
                scored += 1

                # Get the actual list of coordinates
                # (zip(*...)[0] takes the first component of each pair)
                coords = zip(
                    *grammar.formalism.semantics_to_coordinates(gold_sems))[0]
                funs = zip(
                    *grammar.formalism.semantics_to_functions(gold_sems))[0]
                gold_coords = zip(coords, funs)
                coords = zip(
                    *grammar.formalism.semantics_to_coordinates(top_sems))[0]
                funs = zip(
                    *grammar.formalism.semantics_to_functions(top_sems))[0]
                result_coords = zip(coords, funs)

                print "Result length: %d, gold length: %d" % \
                    (len(result_coords), len(gold_coords))
                result_lengths.append(len(result_coords))
                gold_lengths.append(len(gold_coords))

                if options.distance:
                    # Work out the total distance travelled
                    # (vector from first to last coordinate of the path)
                    start, end = gold_coords[-1][0], gold_coords[0][0]
                    gold_vect = end[0] - start[0], end[1] - start[1]
                    # And for the actual result
                    start, end = result_coords[-1][0], result_coords[0][0]
                    result_vect = end[0] - start[0], end[1] - start[1]
                    print "Distance travelled:"
                    print " Gold result:", gold_vect
                    print " Top result: ", result_vect
                    print

                # Put together a table of error windows
                table = [
                    # Header row
                    ["", "Step", "", "Result", "Gold"]
                ]
                gold = iter(zip(gold_seq, gold_coords))
                result = iter(zip(result_seq, result_coords))
                context = []       # rolling pre-context of aligned pairs
                post_context = 0   # aligned pairs still to show after an error
                unseen = 0         # aligned pairs suppressed since last shown
                for op in alignment:
                    # Keep a record of how many of each error occur
                    if op not in error_types:
                        error_types[op] = 1
                    else:
                        error_types[op] += 1

                    if op == "A":
                        # Aligned pair
                        # Move both sequences on
                        gold_step, gold_point = gold.next()
                        result_step, result_point = result.next()
                        if post_context > 0:
                            # Show this as part of the post-context of an error
                            table.append([
                                "A",
                                str(gold_step), "",
                                str(result_point),
                                str(gold_point)
                            ])
                            context = []
                            post_context -= 1
                        else:
                            # Add this to the rolling window of pre-context
                            if len(context) >= win:
                                # We've not shown something here
                                unseen += 1
                            if win > 0:
                                context.append((gold_step, gold_point,
                                                result_step, result_point))
                                # Keep only the last <win> aligned pairs
                                context = context[-win:]
                    else:
                        # Mark if there was something we didn't show
                        if unseen:
                            table.append(
                                ["", " ...%d..." % unseen, "", "", ""])
                            unseen = 0
                        if context:
                            # Show the error's pre-context
                            for (pre_gold_step, pre_gold_point, __,
                                 pre_result_point) in context:
                                table.append([
                                    "A",
                                    str(pre_gold_step), "",
                                    str(pre_result_point),
                                    str(pre_gold_point)
                                ])
                            context = []

                        if op == "I":
                            # Inserted in the result
                            result_step, result_point = result.next()
                            table.append([
                                "I", str(result_step), "", str(result_point), ""
                            ])
                            if str(result_step) not in insertions:
                                insertions[str(result_step)] = 1
                            else:
                                insertions[str(result_step)] += 1
                        elif op == "D":
                            # Deleted in the result
                            gold_step, gold_point = gold.next()
                            table.append(
                                ["D", str(gold_step), "", "", str(gold_point)])
                            if str(gold_step) not in deletions:
                                deletions[str(gold_step)] = 1
                            else:
                                deletions[str(gold_step)] += 1
                        else:
                            # Substituted: consume one step from each sequence
                            result_step, result_point = result.next()
                            gold_step, gold_point = gold.next()
                            table.append([
                                str(op),
                                str(result_step),
                                "for %s" % str(gold_step),
                                str(result_point),
                                str(gold_point)
                            ])
                            subst_key = "%s > %s" % (gold_step, result_step)
                            if subst_key not in substitutions:
                                substitutions[subst_key] = 1
                            else:
                                substitutions[subst_key] += 1
                        # After anything other than an alignment, cancel the
                        # context window
                        context = []
                        # Show up to <win> in the post-context of alignments
                        post_context = win
                # Mark if there was something at the end we didn't show
                if unseen:
                    table.append(["", " ...%d..." % unseen, "", "", ""])
                # Print out the table
                pprint_table(sys.stdout, table,
                             justs=[True, True, True, True, True])
                print "\n"
distances = [] for name,songsem in corpus: # Get the distance from this song dist = metric.distance(result, songsem) distances.append((name,dist,songsem)) # Sort them to get the closest first distances.sort(key=lambda x:x[1]) print # Print out the top results, as many as requested top_results = distances[:print_up_to] table = [["","Song","Distance"]] + [ ["*" if res[0] == correct_song else "", "%s" % res[0], "%.2f" % res[1]] for res in top_results] pprint_table(sys.stdout, table, default_just=True) print if correct_song is not None: # Look for the correct answer in the results for rank,(name,distance,__) in enumerate(distances): # Match up the song name to the correct one if name == correct_song: correct_rank = rank break else: # The song name was not found in the corpus at all correct_rank = None if correct_rank is None: print "Song was not found in corpus"
print "\n" print "Processed %d result sets" % (scored + unscored) print "Errors processing %d result sets" % len(errors) print "Average result length: %.2f (%d)" % ( float(sum(result_lengths)) / len(result_lengths), sum(result_lengths)) print "Average gold length: %.2f (%d)" % ( float(sum(gold_lengths)) / len(gold_lengths), sum(gold_lengths)) # A table of error types print print "Error types:" error_table = [] for error, count in error_types.items(): if error != "A": error_table.append([error, "%d" % count]) pprint_table(sys.stdout, error_table, justs=[True, False]) # Show common mistakes # Substitutions print print "Common substitutions:" subst_table = [] for subst, count in reversed( sorted(substitutions.items(), key=lambda x: x[1])): if count >= options.summary_threshold: subst_table.append(["%s" % subst, "%d" % count]) pprint_table(sys.stdout, subst_table, justs=[True, False]) # Deletions print print "Common deletions:" del_table = []