def test_load_cached(self):
    """
    Checks that two consecutive calls to L{jazzparser.grammar.get_grammar}
    hand back one and the same grammar instance.

    """
    first = get_grammar()
    second = get_grammar()
    # Identity rather than equality: both names must refer to one object
    self.assertIs(first, second)
def get_gold_semantics(self):
    """
    Looks for a gold standard semantics for these results.

    The stored C{gold_parse} is used if there is one. Failing that, if a
    gold annotated chord sequence (C{gold_sequence}) is stored, its
    annotations are parsed with the default grammar to derive the
    semantics. Note that this parsing may take a little bit of time.

    @return: the gold semantics, or C{None} if neither a gold parse nor a
        gold sequence is available, if the annotated sequence could not be
        parsed, or if parsing did not give exactly one result.

    """
    from jazzparser.evaluation.parsing import parse_sequence_with_annotations

    if self.gold_parse is not None:
        return self.gold_parse
    if self.gold_sequence is None:
        # Nothing to derive a gold standard from
        return None

    try:
        parses = parse_sequence_with_annotations(
            self.gold_sequence,
            grammar=get_grammar(),
            allow_subparses=False)
        # Sub-parses were disallowed, so anything other than a single
        #  parse is unexpected: treat it as having no gold standard
        if len(parses) == 1:
            return parses[0].semantics
        return None
    except ParseError:
        # Could not parse annotated sequence
        return None
def get_gold_semantics(self):
    """
    Tries to find a gold standard semantics to compare the results to.

    A stored C{gold_parse} takes priority. If there is none, but a gold
    annotated chord sequence is stored in C{gold_sequence}, the semantics
    is obtained by parsing those annotations with the default grammar,
    which might take a little bit of time.

    @return: the gold semantics if one could be found. C{None} if neither
        C{gold_parse} nor C{gold_sequence} is set, if the annotations
        cannot be parsed, or if the parse gives anything other than
        exactly one result.

    """
    from jazzparser.evaluation.parsing import parse_sequence_with_annotations

    semantics = None
    if self.gold_parse is not None:
        semantics = self.gold_parse
    elif self.gold_sequence is not None:
        # Parse the annotations to get a semantics
        try:
            full_parses = parse_sequence_with_annotations(
                self.gold_sequence,
                grammar=get_grammar(),
                allow_subparses=False)
            # More or fewer than one parse shouldn't happen, since
            #  sub-parses are not allowed: leave the result as None
            if len(full_parses) == 1:
                semantics = full_parses[0].semantics
        except ParseError:
            # Could not parse annotated sequence
            semantics = None
    return semantics
def train(name, training_data, options, grammar=None, logger=None):
    """
    Creates a new L{HalfspanPcfgModel} with empty distributions and adds
    counts to it from each of the training sequences.

    @param name: name to give the new model
    @param training_data: the sequences to train on; each one is passed to
        the model's C{_sequence_train}
    @param options: training options. The values for C{cat_bins},
        C{cutoff}, C{estimator}, C{lexical} and C{chord_mapping} are read.
    @param grammar: grammar to build the model for. The default grammar is
        loaded if none is given.
    @param logger: logger that receives a warning for each sequence that
        fails to train. A dummy logger is created if none is given.
    @raise ValueError: if C{cat_bins} is not set in the options and the
        grammar's C{max_categories} does not supply a value either

    """
    if grammar is None:
        grammar = get_grammar()
    if logger is None:
        logger = create_dummy_logger()

    # Prefer the option; fall back to the value the grammar supplies
    cat_bins = options["cat_bins"] or grammar.max_categories
    if not cat_bins:
        # Nothing given in the grammar either: error
        raise ValueError("no value was given for cat_bins and the "
                         "grammar doesn't supply one")

    # Create a new model with empty distributions
    model = HalfspanPcfgModel(
        name,
        cutoff=options['cutoff'],
        cat_bins=cat_bins,
        estimator=options['estimator'],
        lexical=options['lexical'],
        chordmap=options['chord_mapping'],
        grammar=grammar,
    )

    # Add counts to this model for each sequence
    for sequence in training_data:
        try:
            model._sequence_train(sequence)
        except ModelTrainingError as err:
            # One bad sequence should not stop the rest from being used
            logger.warn("Error training on %s: %s" %
                        (sequence.string_name, err))
def __init__(self, input, options=None, grammar=None, *args, **kwargs):
    """
    Tags the input chord sequence with the HMM path model and stores the
    most probable paths, converted to logical forms, in C{self._paths}.
    Each stored path has its probability set on it.

    @param input: the input to tag. After C{process_chord_input} has been
        applied, C{self.wrapped_input} must be a L{ChordInput}, a
        L{DbInput} or a L{WeightedChordLabelInput}.
    @param options: options dictionary, passed on to the superclass
        constructor. A new empty dictionary is used if none is given.
    @param grammar: grammar whose formalism turns state paths into
        logical forms. The default grammar is loaded if none is given.
    @raise TypeError: if the wrapped input is of none of the supported
        types

    """
    if options is None:
        # Use a fresh dict per call: a dict in the signature would be
        #  shared between every instance created without options
        options = {}
    super(HmmPathBuilder, self).__init__(input, options, *args, **kwargs)
    process_chord_input(self)

    if grammar is None:
        self.grammar = get_grammar()
    else:
        self.grammar = grammar

    #### Tag the input sequence ####
    self._tagged_data = []

    chord_map = self.model.model.chord_map
    if isinstance(self.wrapped_input, ChordInput):
        chords = self.wrapped_input.to_db_input().chords
        observations = [(chord.root, chord_map[chord.type])
                        for chord in chords]
        self.input = chords
    elif isinstance(self.wrapped_input, DbInput):
        observations = [(chord.root, chord_map[chord.type])
                        for chord in self.wrapped_input.chords]
    elif isinstance(self.wrapped_input, WeightedChordLabelInput):
        observations = lattice_to_emissions(input, chord_map=chord_map)
    else:
        # Without this, the failure would only show up below as an
        #  unbound local variable
        raise TypeError("cannot tag input of type '%s'" %
                        type(self.wrapped_input).__name__)

    # Use the ngram model to get the most probable state paths for
    #  the observations
    path_probs = self.model.viterbi_paths(observations,
                                          self.options['paths'])

    self._paths = [
        self.grammar.formalism.backoff_states_to_lf(zip(states, self.times))
        for states, prob in path_probs]
    # Set the probability on each result
    for path, (states, prob) in zip(self._paths, path_probs):
        path.probability = prob
def result_lengths(filename, grammar=None):
    """
    Opens the parse results file and measures the tonal space path length
    of the gold standard and of the first result in the file's list of
    semantics.

    @param filename: parse results file to load
    @param grammar: grammar whose formalism measures the path lengths. The
        default grammar is loaded if none is given.
    @return: pair C{(gold_length, top_length)}. The gold length is 0 if no
        gold semantics is available and the top length is 0 if the file
        contains no results.

    """
    if grammar is None:
        grammar = get_grammar()

    def _path_length(semantics):
        # Looked up on demand, so the formalism is only touched when
        #  there is something to measure
        return grammar.formalism.Evaluation.tonal_space_length(semantics)

    # Load the data in from the file
    parse_results = ParseResults.from_file(filename)

    gold = parse_results.get_gold_semantics()
    gold_length = 0 if gold is None else _path_length(gold)

    # Each entry in the results is indexed to get at its semantics
    ranked = parse_results.semantics
    if len(ranked) == 0:
        # No results: cannot analyse them
        return gold_length, 0
    return gold_length, _path_length(ranked[0][1])
def train(self, data, grammar=None, logger=None):
    """
    Trains an L{HmmPathNgram} model on the given data and stores it in
    C{self.model}. A textual summary of the training settings is stored in
    C{self.model_description}.

    @param data: training sequences. The length of each one is counted
        towards the number of training samples.
    @param grammar: grammar passed to the model trainer. The default
        grammar is loaded if none is given.
    @param logger: not used by this method

    """
    if grammar is None:
        from jazzparser.grammar import get_grammar
        # Load the default grammar
        grammar = get_grammar()

    opts = self.options
    self.model = HmmPathNgram.train(
        data,
        opts['estimator'],
        grammar,
        cutoff=opts['cutoff'],
        chord_map=opts['chord_mapping'],
        order=opts['n'],
        backoff_orders=opts['backoff'])

    # Add some model-specific info into the descriptive text
    #  so we know how it was trained
    summary = {
        'est': get_estimator_name(self.options['estimator']),
        'seqs': len(data),
        'samples': sum(len(seq) for seq in data),
        'order': self.options['n'],
        'backoff': self.options['backoff'],
        'cutoff': self.options['cutoff'],
    }
    self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % summary
def main():
    """
    Command-line entry point: loads the named PCFG model and prints a
    chord sequence generated from it.

    The model name is the first positional argument; the script exits with
    status 1 if it is missing. Log output goes to stderr, at debug level
    if C{--debug} is given and at warning level otherwise.

    """
    parser = OptionParser(
        usage="%prog <model-name>",
        description="Generate chord sequences from a PCFG model")
    parser.add_option(
        "-g", "--grammar", dest="grammar", action="store",
        help="use the named grammar instead of the default.")
    parser.add_option(
        "-d", "--debug", dest="debug", action="store_true",
        help="output debugging information during generation")
    options, arguments = parse_args_with_config(parser)

    log_level = logging.DEBUG if options.debug else logging.WARN
    logger = create_plain_stderr_logger(log_level=log_level)

    if len(arguments) < 1:
        print("Specify a model name")
        sys.exit(1)
    model_name = arguments[0]

    # The model class to use is determined by the grammar's formalism
    PcfgModel = get_grammar(options.grammar).formalism.PcfgModel
    # Load the trained model
    model = PcfgModel.load_model(model_name)

    sequence = model.generate(logger=logger)
    if sequence is not None:
        print(sequence)
    else:
        print("Model did not generate a sequence")
def train(name, training_data, options, grammar=None, logger=None):
    """
    Builds an empty L{HalfspanPcfgModel} and trains it by adding counts
    from every sequence in the training data.

    @param name: name of the model to create
    @param training_data: sequences to train on, each handed to the
        model's C{_sequence_train}
    @param options: training options; C{cat_bins}, C{cutoff},
        C{estimator}, C{lexical} and C{chord_mapping} are looked up
    @param grammar: grammar the model is built for; the default grammar
        is loaded if this is C{None}
    @param logger: receives a warning whenever training on a sequence
        fails; a dummy logger is created if this is C{None}
    @raise ValueError: if there is no value for C{cat_bins} in the
        options and the grammar's C{max_categories} gives none either

    """
    if grammar is None:
        grammar = get_grammar()
    if logger is None:
        logger = create_dummy_logger()

    # If cat_bins wasn't given, read it from the grammar
    cat_bins = options["cat_bins"]
    if not cat_bins:
        cat_bins = grammar.max_categories
    if not cat_bins:
        # Nothing given in the grammar either: error
        message = "no value was given for cat_bins and the "\
                  "grammar doesn't supply one"
        raise ValueError(message)

    # Create a new model with empty distributions
    model_settings = dict(
        cutoff=options['cutoff'],
        cat_bins=cat_bins,
        estimator=options['estimator'],
        lexical=options['lexical'],
        chordmap=options['chord_mapping'],
        grammar=grammar)
    model = HalfspanPcfgModel(name, **model_settings)

    # Add counts to this model for each sequence
    for sequence in training_data:
        try:
            model._sequence_train(sequence)
        except ModelTrainingError as err:
            # Report the failure and carry on with the other sequences
            warning = "Error training on %s: %s" % (sequence.string_name, err)
            logger.warn(warning)
def __init__(self, input, options=None, grammar=None, *args, **kwargs):
    """
    Labels the MIDI input with a chord lattice using an
    L{HPChordLabeler} model, then tags the lattice with an
    L{HmmPathBuilder}.

    The labeler is kept in C{self.labeler}, the beamed lattice in
    C{self.lattice} and the path builder in C{self.hmmpath}.

    @param input: the input to label; it is passed to the labeler's
        C{label_lattice}
    @param options: options dictionary, passed on to the superclass
        constructor. A new empty dictionary is used if none is given.
    @param grammar: grammar to use, also passed to the L{HmmPathBuilder}.
        The default grammar is loaded if none is given.

    """
    if options is None:
        # Use a fresh dict per call: a dict in the signature would be
        #  shared between every instance created without options
        options = {}
    super(MidiHmmPathBuilder, self).__init__(input, options, *args, **kwargs)

    if grammar is None:
        self.grammar = get_grammar()
    else:
        self.grammar = grammar

    # Make a copy of the options that we will pass through to HmmPath
    tagger_options = self.options.copy()
    # Remove the options that the tagger doesn't need
    labeling_model_name = tagger_options.pop("labeling_model")
    latticen = tagger_options.pop("latticen")
    beam_ratio = tagger_options.pop("lattice_beam")
    viterbi = tagger_options.pop("label_viterbi")
    partition_labeler = tagger_options.pop("partition_labeler")

    # Use an HP chord labeler to label the MIDI data
    # Partition the labeling model if requested and a partition number
    #  was given for the supertagger
    if partition_labeler and "partition" in self.options and \
            self.options["partition"] is not None:
        labeling_model_name += "%d" % self.options["partition"]

    # First run the chord labeler on the MIDI input
    # Load a labeling model
    labeler = HPChordLabeler.load_model(labeling_model_name)
    self.labeler = labeler
    # Get chord labels from the model: get a lattice of possible chords
    lattice = labeler.label_lattice(
        input,
        options={"n": latticen, "nokey": True, "viterbi": viterbi},
        corpus=True)
    # Store the lattice for later reference
    self.lattice = lattice
    # Beam the lattice to get rid of very low probability labels
    lattice.apply_ratio_beam(ratio=beam_ratio)

    # Tag the lattice
    self.hmmpath = HmmPathBuilder(lattice, tagger_options, grammar,
                                  *args, **kwargs)
def result_lengths(filename, grammar=None):
    """
    Loads a parse results file and reports the tonal space path lengths
    of its gold standard and of the first entry in its results.

    @param filename: the parse results file to open
    @param grammar: grammar whose formalism is used to measure the
        lengths; the default grammar is loaded if this is C{None}
    @return: C{(gold_length, top_length)}, where the gold length is 0 if
        there is no gold semantics and the top length is 0 if there are
        no results

    """
    if grammar is None:
        grammar = get_grammar()

    # Load the data in from the file
    loaded = ParseResults.from_file(filename)

    # Measure the length of the gold standard, if there is one
    gold_length = 0
    gold_semantics = loaded.get_gold_semantics()
    if gold_semantics is not None:
        gold_length = grammar.formalism.Evaluation.tonal_space_length(
            gold_semantics)

    # Measure the first result, if there is one
    top_length = 0
    all_results = loaded.semantics
    if len(all_results) != 0:
        first_semantics = all_results[0][1]
        top_length = grammar.formalism.Evaluation.tonal_space_length(
            first_semantics)
    return gold_length, top_length
def get_gold_analysis(self):
    """
    Gets a gold analysis by parsing this input's annotations with the
    default grammar.

    @return: the semantics of the first parse

    """
    from jazzparser.evaluation.parsing import parse_sequence_with_annotations
    from jazzparser.grammar import get_grammar

    grammar = get_grammar()
    # Only accept a parse of the whole sequence
    full_parses = parse_sequence_with_annotations(
        self, grammar, allow_subparses=False)
    first_parse = full_parses[0]
    return first_parse.semantics
def keys_for_sequence(sequence, grammar=None):
    """
    Takes a chord sequence from the chord corpus, parses it using its
    annotations and works out the key of each chord.

    The keys are read off the semantics of the parse: the formalism
    supplies a list of keys with the times at which they start, and each
    chord is given the key in force at its start time. Time is
    accumulated from the chords' C{duration} values.

    @param sequence: annotated chord sequence. Its C{chords} are iterated
        over.
    @param grammar: grammar to parse with. The default grammar is loaded
        if none is given.
    @return: list of C{(chord, key)} pairs, one for each chord in
        C{sequence.chords}, in order
    @raise ParseError: if the annotations do not give exactly one full
        parse of the sequence

    """
    from jazzparser.evaluation.parsing import parse_sequence_with_annotations

    if grammar is None:
        grammar = get_grammar()
    # Try parsing the sequence according to the tree in the database
    sub_parses = parse_sequence_with_annotations(sequence, grammar)
    if len(sub_parses) != 1:
        # We can only continue if we got a full parse. This also covers
        #  an empty result, which used to fail on the indexing below
        raise ParseError("could not fully parse the sequence %s." %
                         sequence.string_name)
    sems = sub_parses[0].semantics

    # Get the keys for this LF, and the times when they start
    keys = grammar.formalism.semantics_to_keys(sems)
    key_roots, change_times = zip(*keys)
    key_roots = iter(key_roots)
    change_times = iter(change_times)

    # Get the first key as the current key
    key = next(key_roots)
    # Ignore the first time, as it should be 0
    next(change_times)
    # The next time at which we'll need to change: None once there are
    #  no more key changes to come
    next_change = next(change_times, None)

    chord_keys = []
    time = 0
    for chord in sequence.chords:
        if next_change is not None and time >= next_change:
            # Move onto the next key. There is one key root per change
            #  time, so a root is always available here
            key = next(key_roots)
            next_change = next(change_times, None)
        # Add the next chord with the current key value. Once the
        #  changes run out, the rest of the chords keep the last key
        chord_keys.append((chord, key))
        time += chord.duration
    return chord_keys
def keys_for_sequence(sequence, grammar=None):
    """
    Parses a chord sequence from the chord corpus using its annotations
    and pairs each chord with the key it is in.

    The formalism turns the semantics of the parse into a list of keys
    and the times at which they start. Walking through the chords and
    summing their C{duration} values gives each chord's start time, and
    the chord is assigned whichever key has most recently started.

    @param sequence: annotated chord sequence, whose C{chords} are
        iterated over
    @param grammar: grammar to parse with; the default grammar is loaded
        if this is C{None}
    @return: list with one C{(chord, key)} pair per chord in
        C{sequence.chords}, in order
    @raise ParseError: if parsing the annotations does not give exactly
        one full parse

    """
    from jazzparser.evaluation.parsing import parse_sequence_with_annotations

    if grammar is None:
        grammar = get_grammar()

    # Try parsing the sequence according to the tree in the database
    sub_parses = parse_sequence_with_annotations(sequence, grammar)
    if len(sub_parses) != 1:
        # We can only continue if we got a full parse; no parse at all
        #  is reported the same way rather than as an indexing error
        raise ParseError("could not fully parse the sequence %s." %
                         sequence.string_name)
    sems = sub_parses[0].semantics

    # Get the keys for this LF, and the times when they start
    key_roots, change_times = zip(*grammar.formalism.semantics_to_keys(sems))
    remaining_keys = iter(key_roots)
    remaining_changes = iter(change_times)

    # Start in the first key. Its start time is skipped, as it should be 0
    key = next(remaining_keys)
    next(remaining_changes)
    # None means that no further key changes are left
    next_change = next(remaining_changes, None)

    chord_keys = []
    time = 0
    for chord in sequence.chords:
        change_due = next_change is not None and time >= next_change
        if change_due:
            # Keys and change times come in pairs, so there is always a
            #  key left to move onto when a change is due
            key = next(remaining_keys)
            next_change = next(remaining_changes, None)
        # Every chord is added exactly once, with the key now in force
        chord_keys.append((chord, key))
        time += chord.duration
    return chord_keys
def train(self, sequences, grammar=None, logger=None):
    """
    Trains a L{MultiChordNgramModel} on bulk chord sequence input and
    stores it in C{self.model}. A textual summary of the training
    settings is stored in C{self.model_description}.

    @param sequences: training input; must be a L{DbBulkInput} or an
        L{AnnotatedDbBulkInput}
    @param grammar: grammar supplying the possible tags (C{pos_tags}).
        The default grammar is loaded if none is given.
    @param logger: not used by this method
    @raise TaggerTrainingError: if the input is not of one of the
        accepted types

    """
    if grammar is None:
        from jazzparser.grammar import get_grammar
        # Load the default grammar
        grammar = get_grammar()

    # We can only train on annotated chord sequence input
    if not isinstance(sequences, (DbBulkInput, AnnotatedDbBulkInput)):
        raise TaggerTrainingError(
            "can only train ngram-multi model "
            "on bulk db chord input (bulk-db or bulk-db-annotated). Got "
            "input of type '%s'" % type(sequences).__name__)

    opts = self.options
    backoff_cutoff = opts['backoff_cutoff']
    backoff_kwargs = {} if backoff_cutoff is None \
        else {'cutoff': backoff_cutoff}

    # Get all the possible pos tags from the grammar
    schemata = grammar.pos_tags
    # Build the emission domain to include all the observations that
    #  theoretically could occur, not just those that are seen -
    #  we might not see all interval/chord type pairs in the data.
    chord_types = list(set(opts['chord_mapping'].values()))

    self.model = MultiChordNgramModel.train(
        sequences,
        schemata,
        chord_types,
        opts['estimator'],
        cutoff=opts['cutoff'],
        chord_map=opts['chord_mapping'],
        order=opts['n'],
        backoff_orders=opts['backoff'],
        backoff_kwargs=backoff_kwargs)

    # Add some model-specific info into the descriptive text
    #  so we know how it was trained
    summary = {
        'est': get_estimator_name(self.options['estimator']),
        'seqs': len(sequences),
        'cutoff': self.options['cutoff'],
        'chordmap': self.options['chord_mapping'].name,
        'order': self.options['n'],
        'backoff': self.options['backoff'],
    }
    self.model_description = """\
Order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d\
""" % summary
def main():
    """
    Command-line entry point: measures the degree of ambiguity (number of
    categories per chord) of a grammar.

    A table of the number of categories for each chord class of the
    grammar is always printed, skipping the classes in
    C{EXCLUDE_CLASSES}. If a sequence index file is given as the first
    positional argument, the number of categories per chord over that
    dataset is printed as well.

    """
    usage = "%prog [options] [<seq-db-file>]"
    description = "Measure the degree of ambiguity (average cats per chord) "\
        "for a grammar over a particular dataset"
    # The usage and description have to be handed to the parser to show
    #  up in the help output
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-g', '--grammar', dest='grammar', action='store',
                      help='Specify a grammar by name')
    options, arguments = parser.parse_args()

    if len(arguments) < 1:
        print("No sequence index file given: grammar stats only")
        seq_file = None
    else:
        seq_file = arguments[0]
    # Load the grammar
    grammar = get_grammar(options.grammar)

    # Some stats about ambiguity in the grammar
    table = []
    class_cats = []
    for class_name, chord_class in grammar.chord_classes.items():
        if class_name not in EXCLUDE_CLASSES:
            # The first word of the class is looked up to get its signs
            cats = grammar.get_signs_for_word(str(chord_class.words[0]))
            table.append([str(class_name), str(len(cats))])
            class_cats.append(len(cats))

    table.append(["Mean",
                  "%.2f" % (float(sum(class_cats)) / len(class_cats))])
    table.append(["Std dev", "%.2f" % (std(class_cats))])
    print("Cats for each chord class:")
    pprint_table(sys.stdout, table, justs=[True, True])

    # Ambiguity stats on the dataset
    if seq_file is not None:
        seqs = SequenceIndex.from_file(seq_file)

        counts = []
        for seq in seqs:
            for chord in seq:
                cats = grammar.get_signs_for_word(chord)
                counts.append(len(cats))

        table = []
        table.append(["Chords", str(len(counts))])
        table.append(["Cats per chord",
                      "%.2f" % (float(sum(counts)) / len(counts))])
        table.append(["Std dev", "%.2f" % (std(counts))])

        # Blank line between the two tables
        print("")
        pprint_table(sys.stdout, table, justs=[True, True])
def results_alignment(top_result, gold, grammar=None):
    """
    Aligns a parse result with the gold standard using the tonal space
    alignment of the grammar's formalism.

    @param top_result: the result to compare to the gold standard
    @param gold: the gold standard
    @param grammar: grammar whose formalism performs the alignment. The
        default grammar is loaded if none is given.
    @return: tuple containing the alignment and the two sequences in the
        form that they were compared (gold, top result).

    """
    if grammar is None:
        grammar = get_grammar()

    evaluation = grammar.formalism.Evaluation
    # Perform the alignment: note that the gold standard goes first
    operations, gold_form, result_form = \
        evaluation.tonal_space_alignment(gold, top_result)
    return operations, gold_form, result_form
def results_alignment(top_result, gold, grammar=None):
    """
    Computes the tonal space alignment between the gold standard and a
    parse result, as defined by the grammar's formalism.

    @param top_result: result to align with the gold standard
    @param gold: gold standard to align the result with
    @param grammar: grammar whose formalism does the alignment; the
        default grammar is loaded if this is C{None}
    @return: tuple containing the alignment and the two sequences in the
        form that they were compared (gold, top result).

    """
    active_grammar = get_grammar() if grammar is None else grammar

    # The formalism's aligner takes the gold standard as first argument
    align = active_grammar.formalism.Evaluation.tonal_space_alignment
    alignment, compared_gold, compared_result = align(gold, top_result)
    return (alignment, compared_gold, compared_result)
def generate_tag_list(filename, grammar=None):
    """
    Generates a list of possible tags to be stored along with a C&C
    model. It contains all tags that are in the grammar: the keys of the
    grammar's C{families}.

    The tags are written to the file one per line, with no newline after
    the last one. Any existing file is overwritten.

    @param filename: path of the file to write the tags to
    @param grammar: grammar to take the tags from. The default grammar is
        loaded if none is given.

    """
    if grammar is None:
        # Only import the grammar loader when it is actually needed
        from jazzparser.grammar import get_grammar
        # Load the default grammar
        grammar = get_grammar()

    tags = grammar.families.keys()
    data = "\n".join(tags)
    # The context manager closes the file even if the write fails
    with open(filename, 'w') as tag_file:
        tag_file.write(data)
def main():
    """
    Command-line entry point: reports how ambiguous a grammar is, in
    terms of the number of categories it assigns per chord.

    The number of categories for each chord class in the grammar is
    always tabulated, leaving out the classes in C{EXCLUDE_CLASSES}. If
    the first positional argument names a sequence index file, the
    number of categories per chord in that dataset is tabulated too.

    """
    usage = "%prog [options] [<seq-db-file>]"
    description = "Measure the degree of ambiguity (average cats per chord) "\
        "for a grammar over a particular dataset"
    # Pass the usage and description on, so that the help output shows
    #  them
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-g', '--grammar', dest='grammar', action='store',
                      help='Specify a grammar by name')
    options, arguments = parser.parse_args()

    if len(arguments) < 1:
        print("No sequence index file given: grammar stats only")
        seq_file = None
    else:
        seq_file = arguments[0]
    # Load the grammar
    grammar = get_grammar(options.grammar)

    # Some stats about ambiguity in the grammar
    table = []
    class_cats = []
    for class_name, chord_class in grammar.chord_classes.items():
        if class_name in EXCLUDE_CLASSES:
            continue
        # Look up the signs for the first word of the class
        cats = grammar.get_signs_for_word(str(chord_class.words[0]))
        table.append([str(class_name), str(len(cats))])
        class_cats.append(len(cats))

    mean_cats = float(sum(class_cats)) / len(class_cats)
    table.append(["Mean", "%.2f" % mean_cats])
    table.append(["Std dev", "%.2f" % (std(class_cats))])
    print("Cats for each chord class:")
    pprint_table(sys.stdout, table, justs=[True, True])

    # Ambiguity stats on the dataset
    if seq_file is not None:
        seqs = SequenceIndex.from_file(seq_file)

        counts = []
        for seq in seqs:
            for chord in seq:
                counts.append(len(grammar.get_signs_for_word(chord)))

        mean_count = float(sum(counts)) / len(counts)
        table = [
            ["Chords", str(len(counts))],
            ["Cats per chord", "%.2f" % mean_count],
            ["Std dev", "%.2f" % (std(counts))],
        ]

        # Blank line between the two tables
        print("")
        pprint_table(sys.stdout, table, justs=[True, True])
def get_depend_graph(semantics):
    """
    Builds the dependency graph for a semantics in two forms: the graph's
    index form and its form labelled with a tag for each point of the
    semantics.

    Each tag has the form C{"<name>,<function>"}, where the name is the
    roman name of the point's coordinate with every C{"-"} and C{"b"}
    removed.

    Note that this sets the formalism's output options to
    C{tsformat=coord} on the default grammar.

    @param semantics: the semantics to build the graph for
    @return: two-element list: the index form of the graph, then the
        tagged form

    """
    formalism = get_grammar().formalism
    # 'coord', 'xycoord', 'alpha' or 'roman'
    formalism.cl_output_options("tsformat=coord")

    # Take the first column of each of the formalism's outputs
    coords = list(zip(*formalism.semantics_to_coordinates(semantics)))[0]
    funs = list(zip(*formalism.semantics_to_functions(semantics)))[0]

    tags = [
        "%s,%s" % (
            coordinate_to_roman_name(coord).replace("-", "").replace("b", ""),
            fun)
        for coord, fun in zip(coords, funs)]

    # Only the graph is needed here, not the second returned value
    graph, _ = semantics_to_dependency_graph(semantics)
    # NOTE(review): eval is applied to the string form of the graph's
    #  output. Presumably that is trusted, internally generated data:
    #  confirm before using this with anything else
    tagged_graph = eval("%s" % graph.get_graph_pos(tags))
    index_graph = eval("%s" % graph.get_graph_index())
    return [index_graph, tagged_graph]
def get_gold_analysis(self):
    """
    Parses the annotations, if present, to get a gold analysis.

    Unlike L{AnnotatedDbInput}, this input type cannot be assumed to
    have annotations, so a failure to parse them is not treated as an
    error.

    @return: the semantics of the first parse, or C{None} if the
        annotations could not be parsed

    """
    from jazzparser.evaluation.parsing import parse_sequence_with_annotations
    from jazzparser.grammar import get_grammar
    from jazzparser.parsers import ParseError

    try:
        full_parses = parse_sequence_with_annotations(
            self, get_grammar(), allow_subparses=False)
    except ParseError:
        # Missing or incomplete annotations: no gold analysis available
        return None
    return full_parses[0].semantics
def __init__(self, input, options=None, grammar=None, *args, **kwargs):
    """
    Runs an L{HPChordLabeler} model over the MIDI input to get a lattice
    of chord labels, then tags that lattice with an L{HmmPathBuilder}.

    The labeler is stored in C{self.labeler}, the lattice (after beaming)
    in C{self.lattice} and the path builder in C{self.hmmpath}.

    @param input: the input to label, handed to the labeler's
        C{label_lattice}
    @param options: options dictionary, passed on to the superclass
        constructor; a new empty dictionary is used if this is C{None}
    @param grammar: grammar to use, which is also passed on to the
        L{HmmPathBuilder}; the default grammar is loaded if this is
        C{None}

    """
    if options is None:
        # A dict in the signature would be one object shared by every
        #  call that leaves the options out: create a new one each time
        options = {}
    super(MidiHmmPathBuilder, self).__init__(input, options, *args, **kwargs)

    if grammar is None:
        self.grammar = get_grammar()
    else:
        self.grammar = grammar

    # Make a copy of the options that we will pass through to HmmPath
    hmmpath_options = self.options.copy()
    # Remove the options that the tagger doesn't need
    labeling_model_name = hmmpath_options.pop('labeling_model')
    latticen = hmmpath_options.pop('latticen')
    beam_ratio = hmmpath_options.pop('lattice_beam')
    viterbi = hmmpath_options.pop('label_viterbi')
    partition_labeler = hmmpath_options.pop('partition_labeler')

    # Use an HP chord labeler to label the MIDI data
    # Partition the labeling model if requested and a partition number
    #  was given for the supertagger
    if partition_labeler and 'partition' in self.options and \
            self.options['partition'] is not None:
        labeling_model_name += "%d" % self.options['partition']

    # First run the chord labeler on the MIDI input
    # Load a labeling model
    labeler = HPChordLabeler.load_model(labeling_model_name)
    self.labeler = labeler
    # Get chord labels from the model: get a lattice of possible chords
    label_options = {
        'n': latticen,
        'nokey': True,
        'viterbi': viterbi,
    }
    lattice = labeler.label_lattice(input, options=label_options,
                                    corpus=True)
    # Store the lattice for later reference
    self.lattice = lattice
    # Beam the lattice to get rid of very low probability labels
    lattice.apply_ratio_beam(ratio=beam_ratio)

    # Tag the lattice
    self.hmmpath = HmmPathBuilder(lattice, hmmpath_options, grammar,
                                  *args, **kwargs)
def main():
    """
    Command-line entry point for deleting a PCFG model, or every
    partition of a partitioned model.

    The model name is the first positional argument. If C{--partitions}
    is given, the models named by appending each partition number to the
    base name are deleted instead. Nothing is deleted unless all of the
    models exist. Exits with status 1 if no model name is given or if a
    model does not exist.

    """
    parser = OptionParser(
        usage="%prog [<options>] <model-name>",
        description="Delete a PCFG model")
    parser.add_option(
        "-p", "--partitions", dest="partitions", action="store", type="int",
        help="Number of partitions the model is divided into")
    parser.add_option(
        "-g", "--grammar", dest="grammar", action="store",
        help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)

    # Load a grammar and get the pcfg model class for its formalism
    PcfgModel = get_grammar(options.grammar).formalism.PcfgModel

    if len(arguments) == 0:
        sys.stderr.write("Specify a model name\n")
        models = PcfgModel.list_models()
        sys.stderr.write("Available models: %s\n" % ", ".join(models))
        sys.exit(1)
    model_name = arguments[0]
    print("Model base name: %s" % model_name)

    if options.partitions is None:
        model_names = [model_name]
    else:
        # One model per partition, named by suffixing the partition number
        model_names = ["%s%d" % (model_name, parti)
                       for parti in range(options.partitions)]

    # First check all the models exist
    for part_model in model_names:
        if part_model not in PcfgModel.list_models():
            print("The model '%s' does not exist" % part_model)
            sys.exit(1)

    # Now delete them one by one
    for part_model in model_names:
        # The model has to be loaded in order to delete it
        PcfgModel.load_model(part_model).delete()
        print("Removed model: %s" % part_model)
def main():
    """
    Command-line entry point for debugging a PCFG model: loads the named
    model and an annotated input file, and builds the implicit
    normal-form tree for the input sequence from its annotations.

    The model name is the first positional argument and the input file
    the second. The script exits with status 1 if either is missing.

    @raise ModelTrainingError: if no tree can be built for the sequence

    """
    usage = "%prog <model-name>"
    description = "Debug a PCFG model"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-g", "--grammar", dest="grammar", action="store",
                      help="use the named grammar instead of the default.")
    parser.add_option("-d", "--debug", dest="debug", action="store_true",
                      help="output debugging information during generation")
    parser.add_option("--file-options", "--fopt", dest="file_options",
                      action="store", help="options for the input file "
                      "(--file). Type '--fopt help' for a list of available "
                      "options.")
    options, arguments = parse_args_with_config(parser)

    if len(arguments) < 1:
        print("Specify a model name")
        sys.exit(1)
    model_name = arguments[0]

    if len(arguments) < 2:
        print("Specify an input file")
        # Stop here, like for a missing model name: carrying on would
        #  fail when the filename is read from the arguments below
        sys.exit(1)

    grammar = get_grammar(options.grammar)
    PcfgModel = grammar.formalism.PcfgModel
    # Load the trained model
    model = PcfgModel.load_model(model_name)

    # Try getting a file from the command-line options
    input_data = command_line_input(filename=arguments[1],
                                    filetype="db",
                                    options=options.file_options)

    # Prepare the input and annotations
    sequence = input_data.sequence
    categories = [chord.category for chord in sequence.iterator()]
    str_inputs = input_data.inputs
    # Build the implicit normal-form tree from the annotations
    try:
        tree = build_tree_for_sequence(sequence)
    except TreeBuildError as err:
        raise ModelTrainingError("could not build a tree for '%s': %s" %
                                 (sequence.string_name, err))
def main():
    """
    Command-line entry point: prints the description of a named PCFG
    model.

    The model name is the first positional argument. If it is missing,
    the available models are listed on stderr and the script exits with
    status 1.

    """
    parser = OptionParser(
        usage="%prog <model-name> [options]",
        description="Outputs a summary of a named model (counts, etc)")
    parser.add_option(
        "-g", "--grammar", dest="grammar", action="store",
        help="use the named grammar instead of the default.")
    options, arguments = parser.parse_args()

    # The model class comes from the formalism of the chosen grammar
    PcfgModel = get_grammar(options.grammar).formalism.PcfgModel

    if len(arguments) == 0:
        sys.stderr.write("Specify a model name\n")
        models = PcfgModel.list_models()
        sys.stderr.write("Available models: %s\n" % ", ".join(models))
        sys.exit(1)

    # Load the trained model
    model = PcfgModel.load_model(arguments[0])
    print(model.description())
def main():
    """
    Command-line entry point: outputs a summary of the PCFG model named
    by the first positional argument, as given by the model's
    C{description}.

    Without a model name, the names of the available models are written
    to stderr and the script exits with status 1.

    """
    usage = "%prog <model-name> [options]"
    description = "Outputs a summary of a named model (counts, etc)"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-g", "--grammar",
                      dest="grammar",
                      action="store",
                      help="use the named grammar instead of the default.")
    options, arguments = parser.parse_args()

    grammar = get_grammar(options.grammar)
    model_class = grammar.formalism.PcfgModel

    if not len(arguments):
        # Help the user out by showing what they could have asked for
        stderr = sys.stderr
        stderr.write("Specify a model name\n")
        available = ", ".join(model_class.list_models())
        stderr.write("Available models: %s\n" % available)
        sys.exit(1)
    model_name = arguments[0]

    # Load the trained model and show its summary
    summary = model_class.load_model(model_name).description()
    print(summary)
def main():
    """
    Command-line entry point: deletes the PCFG model named by the first
    positional argument.

    With C{--partitions}, one model per partition is deleted, each named
    by the base name followed by the partition number. All the models
    are checked for existence before any is deleted. The script exits
    with status 1 if the model name is missing or a model does not
    exist.

    """
    usage = "%prog [<options>] <model-name>"
    description = "Delete a PCFG model"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-p", "--partitions",
                      dest="partitions",
                      action="store",
                      type="int",
                      help="Number of partitions the model is divided into")
    parser.add_option("-g", "--grammar",
                      dest="grammar",
                      action="store",
                      help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)

    # Load a grammar
    grammar = get_grammar(options.grammar)
    # Get the pcfg model class for the formalism
    model_class = grammar.formalism.PcfgModel

    if not len(arguments):
        sys.stderr.write("Specify a model name\n")
        available = ", ".join(model_class.list_models())
        sys.stderr.write("Available models: %s\n" % available)
        sys.exit(1)

    model_name = arguments[0]
    print("Model base name: %s" % model_name)

    num_partitions = options.partitions
    if num_partitions is not None:
        parts = [(i, "%s%d" % (model_name, i)) for i in range(num_partitions)]
    else:
        # Not partitioned: just the one model, under the base name
        parts = [(None, model_name)]

    # First check all the models exist
    for _, part_model in parts:
        if part_model in model_class.list_models():
            continue
        print("The model '%s' does not exist" % part_model)
        sys.exit(1)

    # Now delete them one by one
    for _, part_model in parts:
        # Load the model
        model = model_class.load_model(part_model)
        model.delete()
        print("Removed model: %s" % part_model)
def __init__(self, input, options=None, grammar=None, *args, **kwargs):
    """
    Runs the HMM path model over the input chords and keeps the most
    probable state paths, converted to logical forms by the grammar's
    formalism, in C{self._paths}. The probability of each path is set on
    it.

    @param input: the input to tag. Once C{process_chord_input} has been
        applied, C{self.wrapped_input} has to be a L{ChordInput}, a
        L{DbInput} or a L{WeightedChordLabelInput}.
    @param options: options dictionary, passed on to the superclass
        constructor; a new empty dictionary is used if this is C{None}
    @param grammar: grammar whose formalism converts the state paths;
        the default grammar is loaded if this is C{None}
    @raise TypeError: if the wrapped input has none of the supported
        types

    """
    if options is None:
        # A dict in the signature would be one object shared by every
        #  call that leaves the options out: create a new one each time
        options = {}
    super(HmmPathBuilder, self).__init__(input, options, *args, **kwargs)
    process_chord_input(self)

    self.grammar = get_grammar() if grammar is None else grammar

    #### Tag the input sequence ####
    self._tagged_data = []

    chord_map = self.model.model.chord_map
    wrapped = self.wrapped_input
    if isinstance(wrapped, ChordInput):
        chords = wrapped.to_db_input().chords
        observations = [(chord.root, chord_map[chord.type])
                        for chord in chords]
        self.input = chords
    elif isinstance(wrapped, DbInput):
        observations = [(chord.root, chord_map[chord.type])
                        for chord in wrapped.chords]
    elif isinstance(wrapped, WeightedChordLabelInput):
        observations = lattice_to_emissions(input, chord_map=chord_map)
    else:
        # Fail here with a clear message, instead of later on with an
        #  unbound local variable
        raise TypeError("cannot tag input of type '%s'" %
                        type(wrapped).__name__)

    # Use the ngram model to get the most probable state paths for
    #  the observations
    path_probs = self.model.viterbi_paths(observations,
                                          self.options['paths'])

    to_lf = self.grammar.formalism.backoff_states_to_lf
    self._paths = [to_lf(zip(states, self.times))
                   for states, prob in path_probs]
    # Set the probability on each result
    for path, (states, prob) in zip(self._paths, path_probs):
        path.probability = prob
def train(self, sequences, grammar=None, logger=None):
    """
    Trains a L{PrecomputedNgramModel} on the given chord sequences and
    stores it in C{self.model}.

    Each sequence is turned into a list of (observation, tag) pairs: the
    observations are computed from the pairs that C{group_pairs} yields
    for the sequence and the tags are the sequence's C{categories}. The
    chord mapping is stored in C{self.chordmap}, its name in
    C{self.chordmap_name} and a textual summary of the training settings
    in C{self.model_description}.

    @param sequences: the training sequences
    @param grammar: grammar supplying the possible tags (C{pos_tags}).
        The default grammar is loaded if none is given.
    @param logger: not used by this method

    """
    from jazzparser.utils.nltk.ngram import PrecomputedNgramModel

    if grammar is None:
        from jazzparser.grammar import get_grammar
        # Load the default grammar
        grammar = get_grammar()

    opts = self.options
    order = opts['n']
    backoff_order = opts['backoff']
    chordmap = opts['chord_mapping']
    self.chordmap = chordmap
    self.chordmap_name = chordmap.name

    def _labelled_observations(seq):
        # Pair up the observation for each chord pair with its tag
        chord_pairs = group_pairs(seq, none_final=True)
        return [(observation_from_chord_pair(c1, c2, chordmap), category)
                for (c1, c2), category in zip(chord_pairs, seq.categories)]

    # Get data in the form of lists of (observation,tag) pairs
    training_data = [_labelled_observations(seq) for seq in sequences]

    # Get all the possible pos tags from the grammar
    label_dom = grammar.pos_tags
    # Build the emission domain to include all the observations that
    #  theoretically could occur, not just those that are seen -
    #  we might not see all interval/chord type pairs in the data.
    chord_types = chordmap.values()
    emission_dom = ["%d-%s" % (interval, chord)
                    for interval in range(12)
                    for chord in chord_types]

    # Ignore unlabelled data
    ignores = ['']

    backoff_cutoff = opts['backoff_cutoff']
    backoff_kwargs = {} if backoff_cutoff is None \
        else {'cutoff': backoff_cutoff}

    # Precompute the transition matrix and store it along with the model
    self.model = PrecomputedNgramModel.train(
        order,
        training_data,
        label_dom,
        emission_dom=emission_dom,
        cutoff=opts['cutoff'],
        backoff_order=backoff_order,
        estimator=opts['estimator'],
        ignore_list=ignores,
        backoff_kwargs=backoff_kwargs)

    # Add some model-specific info into the descriptive text
    #  so we know how it was trained
    summary = {
        'est': get_estimator_name(self.options['estimator']),
        'seqs': len(training_data),
        # Total number of pairs over all of the sequences
        'samples': sum(len(seq_data) for seq_data in training_data),
        'order': self.options['n'],
        'backoff': self.options['backoff'],
        'cutoff': self.options['cutoff'],
        'chordmap': self.chordmap_name,
    }
    self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % summary
def setUp(self):
    """
    Loads the default grammar and picks out its C{coord} rule, storing
    them in C{self.grammar} and C{self.coord}.

    """
    grammar = get_grammar()
    self.grammar = grammar
    # Rules are looked up by name
    self.coord = grammar.rules_by_name['coord']
def setUp(self):
    """
    Loads the default grammar and picks out its C{dev} rule, storing
    them in C{self.grammar} and C{self.devel}.

    """
    grammar = get_grammar()
    self.grammar = grammar
    # Rules are looked up by name
    self.devel = grammar.rules_by_name['dev']
def _count_feature(features, key):
    """Records one sighting of C{key} in the C{features} counter dict."""
    # NOTE(review): the first sighting stores 0, not 1, exactly as the
    #  original inline code did, so each value is (occurrences - 1).
    #  Confirm whether this is intended before changing it.
    if key not in features:
        features[key] = 0
    else:
        features[key] += 1


def _label(node, fields):
    """Joins the chosen comma-separated fields of a node label with ':'."""
    parts = node.split(",")
    return ":".join(str(parts[i]) for i in fields)


def _count_unigrams(features, graph, field):
    """Counts a UNIGRAM feature for the dependent of every arc in C{graph}."""
    for arc in graph:
        _count_feature(features, "UNIGRAM:" + _label(arc[0], (field,)))


def _count_bigrams(features, graph, fields):
    """Counts a BIGRAM (dependent, head) feature for every arc in C{graph}."""
    for arc in graph:
        dependent = _label(arc[0], fields)
        if arc[1] == "ROOT":
            head = "ROOT"
        else:
            head = _label(arc[1], fields)
        _count_feature(features, "BIGRAM:" + dependent + ":" + head)


def _count_root_trigrams(features, graph, fields):
    """
    Counts a TRIGRAM feature for every arc headed by ROOT whose two
    preceding arcs are not headed by ROOT.

    """
    for i in range(len(graph)):
        if graph[i][1] != "ROOT":
            continue
        # NOTE(review): for i < 2 the indices i-1 and i-2 are negative and
        #  so wrap round to the end of the list. This is kept from the
        #  original code; confirm that it is intended.
        if graph[i-1][1] != "ROOT" and graph[i-2][1] != "ROOT":
            labels = []
            for j in (i, i-1, i-2):
                parts = graph[j][0].split(",")
                labels.append(":".join(parts[k] for k in fields))
            _count_feature(features, "TRIGRAM:" + ":".join(labels))


def main():
    """
    Reads every parse results file matching C{PARSES_FILES} and counts
    unigram, bigram and trigram features over the dependency graph of
    each file's gold standard semantics.

    Files that cannot be loaded are added to C{errors} and skipped.
    Files with no results, or for which no gold standard semantics is
    available, are skipped too. The counts are accumulated in a local
    dictionary, which the code here neither returns nor outputs.

    """
    features = {}
    for file_results in glob.glob(PARSES_FILES):
        # We read in the whole file (it's pickled, so we have to), but don't
        #  keep the pres object after the loop iteration, because it can
        #  be very big
        try:
            pres = ParseResults.from_file(file_results)
        except ParseResults.LoadError as err:
            if options.errors:
                # Print all load errors
                sys.stderr.write("Error loading file: %s\n" % (err,))
            errors.append(file_results)
            continue

        print(file_results)
        if len(pres.semantics) == 0:
            continue

        gold_result = pres.get_gold_semantics()
        if gold_result is None:
            # get_gold_semantics() returns None if there is neither a stored
            #  gold parse nor a parsable gold sequence: nothing to count
            sys.stderr.write("No gold semantics for %s: skipping\n" %
                             file_results)
            continue

        # 'coord', 'xycoord', 'alpha' or 'roman'
        grammar = get_grammar()
        formalism = grammar.formalism
        formalism.cl_output_options("tsformat=coord")
        coords = list(zip(*formalism.semantics_to_coordinates(gold_result)))[0]
        funs = list(zip(*formalism.semantics_to_functions(gold_result)))[0]
        # One "<roman name>,<function>" tag per point on the gold path
        tags = ["%s,%s" % (coordinate_to_roman_name(coord), fun)
                for (coord, fun) in zip(coords, funs)]

        gold_graph, gold_time_map = semantics_to_dependency_graph(gold_result)
        # NOTE(review): eval() is run here on the string form of the graph,
        #  which is derived from the pickled results file. Only run this on
        #  trusted files, or replace it with a direct accessor on the graph.
        depend_graph = eval("%s" % gold_graph.get_graph_pos(tags))
        gold_graph = eval("%s" % gold_graph.get_graph_index())

        # Words
        _count_unigrams(features, gold_graph, 0)
        _count_unigrams(features, depend_graph, 0)
        # Tags
        _count_unigrams(features, depend_graph, 1)

        # Bigram Words
        _count_bigrams(features, gold_graph, (0,))
        _count_bigrams(features, depend_graph, (0,))
        # Bigram Tags
        _count_bigrams(features, depend_graph, (1,))
        # Bigram Words/Tags
        _count_bigrams(features, depend_graph, (0, 1))

        # Trigram words
        _count_root_trigrams(features, gold_graph, (0,))
        _count_root_trigrams(features, depend_graph, (0,))
        # Trigram tags
        _count_root_trigrams(features, depend_graph, (1,))
        # Trigram words/tags
        _count_root_trigrams(features, depend_graph, (0, 1))
def main():
    """
    Command-line entry point for running a supertagger on its own.

    Defines the command-line options, loads the grammar and the
    requested tagger class and validates the tagger options.

    @return: 0 if help text was requested and printed, 1 if the tagger
        could not be loaded or its options were invalid, C{None}
        otherwise

    """
    usage = "%prog [<options>]"
    description = "Runs a supertagger from the Jazz Parser to tag some input "\
        "but just outputs the results, rather than continuing to parse."
    optparser = OptionParser(usage=usage, description=description)

    # Tagger options
    optparser.add_option("-t", "--tagger", "--supertagger", dest="supertagger", action="store",
        help="run the parser using the named supertagger. Use '-t help' to "
             "see the list of available taggers. Default: %s" % settings.DEFAULT_SUPERTAGGER,
        default=settings.DEFAULT_SUPERTAGGER)
    # The tagger module is selected with -t in this script: the help text
    #  used to point at a '-u' flag that is not defined here
    optparser.add_option("--topt", "--tagger-options", dest="topts", action="append",
        help="specify options for the tagger. Type '--topt help', using "
             "'-t <name>' to select a tagger module, to get a list of options.")
    # Commonly-used misc
    optparser.add_option("-g", "--grammar", dest="grammar", action="store",
        help="use the named grammar instead of the default.")
    # File input options
    optparser.add_option("--file", "-f", dest="file", action="store",
        help="use a file to get parser input from. Use --filetype to "
             "specify the type of the file.")
    optparser.add_option("--filetype", "--ft", dest="filetype", action="store",
        help="select the file type for the input file (--file). Use "
             "'--filetype help' for a list of available types. Default: chords",
        default='chords')
    optparser.add_option("--file-options", "--fopt", dest="file_options", action="store",
        help="options for the input file (--file). Type '--fopt help', "
             "using '--ft <type>' to select file type, for a list of "
             "available options.")
    # Misc options
    optparser.add_option("-v", "--debug", dest="debug", action="store_true",
        help="output verbose debugging information.")
    optparser.add_option("-i", "--interactive", dest="interactive", action="store_true",
        help="instead of just outputing all tags in one go, wait for user "
             "input between each iteration of adaptive supertagging")
    # Logging options
    optparser.add_option("--logger", dest="logger", action="store",
        help="directory to put parser logging in. A filename based on an "
             "identifier for each individual input will be appended.")

    # Read in command line options and args
    options, clinput = parse_args_with_config(optparser)

    ########################### Option processing ####################
    if options.logger:
        # Directory
        parse_logger_dir = options.logger
        check_directory(parse_logger_dir)
    else:
        parse_logger_dir = None

    ######## Grammar ########
    # Read in the grammar
    grammar = get_grammar(options.grammar)

    ######## Supertagger ########
    # Load the supertagger requested
    if options.supertagger.lower() == "help":
        print("Available taggers are: %s" % ", ".join(TAGGERS))
        return 0
    try:
        tagger_cls = get_tagger(options.supertagger)
    except TaggerLoadError:
        logger.error("The tagger '%s' could not be loaded. Possible "\
            "taggers are: %s" % (options.supertagger, ", ".join(TAGGERS)))
        return 1

    # Get supertagger options before initializing the tagger
    if options.topts is not None:
        toptstr = options.topts
        if "help" in [s.strip().lower() for s in toptstr]:
            # Output this tagger's option help
            from jazzparser.utils.options import options_help_text
            print(options_help_text(tagger_cls.TAGGER_OPTIONS,
                                    intro="Available options for selected tagger"))
            return 0
        # --topt may be given several times: combine into one option string
        toptstr = ":".join(toptstr)
    else:
        toptstr = ""
    topts = ModuleOption.process_option_string(toptstr)
    # Check that the options are valid
    try:
        tagger_cls.check_options(topts)
    except ModuleOptionError as err:
        print("Problem with tagger options (--topt): %s" % err)
        return 1
def setUp(self):
    """
    Loads a grammar and keeps references to its forward (C{appf}) and
    backward (C{appb}) application rules for use in the tests.

    """
    self.grammar = get_grammar()
    rules = self.grammar.rules_by_name
    self.fapply = rules['appf']
    self.bapply = rules['appb']
def main():
    """
    Command-line entry point for running a supertagger on its own.

    Builds the option parser from a table of option definitions, loads
    the grammar and the requested tagger class and validates the tagger
    options.

    @return: 0 if help text was requested and printed, 1 if the tagger
        could not be loaded or its options were invalid, C{None}
        otherwise

    """
    usage = "%prog [<options>]"
    description = "Runs a supertagger from the Jazz Parser to tag some input "\
        "but just outputs the results, rather than continuing to parse."
    optparser = OptionParser(usage=usage, description=description)

    # (flags, keyword arguments) for each option, in the order they are added
    option_defs = [
        # Tagger options
        (("-t", "--tagger", "--supertagger"), dict(
            dest="supertagger",
            action="store",
            help="run the parser using the named supertagger. Use '-t help' "
                 "to see the list of available taggers. Default: %s" %
                 settings.DEFAULT_SUPERTAGGER,
            default=settings.DEFAULT_SUPERTAGGER)),
        # The tagger module is selected with -t in this script: the help
        #  text used to point at a '-u' flag that is not defined here
        (("--topt", "--tagger-options"), dict(
            dest="topts",
            action="append",
            help="specify options for the tagger. Type '--topt help', using "
                 "'-t <name>' to select a tagger module, to get a list of "
                 "options.")),
        # Commonly-used misc
        (("-g", "--grammar"), dict(
            dest="grammar",
            action="store",
            help="use the named grammar instead of the default.")),
        # File input options
        (("--file", "-f"), dict(
            dest="file",
            action="store",
            help="use a file to get parser input from. Use --filetype to "
                 "specify the type of the file.")),
        (("--filetype", "--ft"), dict(
            dest="filetype",
            action="store",
            help="select the file type for the input file (--file). Use "
                 "'--filetype help' for a list of available types. "
                 "Default: chords",
            default='chords')),
        (("--file-options", "--fopt"), dict(
            dest="file_options",
            action="store",
            help="options for the input file (--file). Type '--fopt help', "
                 "using '--ft <type>' to select file type, for a list of "
                 "available options.")),
        # Misc options
        (("-v", "--debug"), dict(
            dest="debug",
            action="store_true",
            help="output verbose debugging information.")),
        (("-i", "--interactive"), dict(
            dest="interactive",
            action="store_true",
            help="instead of just outputing all tags in one go, wait for "
                 "user input between each iteration of adaptive "
                 "supertagging")),
        # Logging options
        (("--logger",), dict(
            dest="logger",
            action="store",
            help="directory to put parser logging in. A filename based on "
                 "an identifier for each individual input will be "
                 "appended.")),
    ]
    for flags, kwargs in option_defs:
        optparser.add_option(*flags, **kwargs)

    # Read in command line options and args
    options, clinput = parse_args_with_config(optparser)

    ########################### Option processing ####################
    if options.logger:
        # Directory
        parse_logger_dir = options.logger
        check_directory(parse_logger_dir)
    else:
        parse_logger_dir = None

    ######## Grammar ########
    # Read in the grammar
    grammar = get_grammar(options.grammar)

    ######## Supertagger ########
    # Load the supertagger requested
    if options.supertagger.lower() == "help":
        print("Available taggers are: %s" % ", ".join(TAGGERS))
        return 0
    try:
        tagger_cls = get_tagger(options.supertagger)
    except TaggerLoadError:
        logger.error("The tagger '%s' could not be loaded. Possible "\
            "taggers are: %s" % (options.supertagger, ", ".join(TAGGERS)))
        return 1

    # Get supertagger options before initializing the tagger
    if options.topts is None:
        toptstr = ""
    else:
        if "help" in [s.strip().lower() for s in options.topts]:
            # Output this tagger's option help
            from jazzparser.utils.options import options_help_text
            print(options_help_text(
                tagger_cls.TAGGER_OPTIONS,
                intro="Available options for selected tagger"))
            return 0
        # --topt may be given several times: combine into one option string
        toptstr = ":".join(options.topts)
    topts = ModuleOption.process_option_string(toptstr)

    # Check that the options are valid
    try:
        tagger_cls.check_options(topts)
    except ModuleOptionError as err:
        print("Problem with tagger options (--topt): %s" % err)
        return 1
def build_tree_for_sequence(sequence, debug_stack=False, grammar=None, logger=None):
    """
    Builds the tree structure for an annotated chord sequence by running
    a shift-reduce pseudo-parse over the chords' lexical categories.

    Most of the structure is implicit in the categories. The rest
    (where coordinations end) is read from each chord's C{treeinfo}.

    @param sequence: chord sequence; its C{iterator()} supplies the chords
    @param debug_stack: if true, print the stack after every shift
    @param grammar: grammar to look categories up in. The default
        grammar is loaded if this is C{None}
    @param logger: if given, a trace of the operations is logged to it
    @return: a C{SyntacticTreeRoot} over whatever is left on the stack
    @raise TreeBuildError: if a chord's category is not in the lexicon,
        or a coordination middle marker is left on the stack at the end

    """
    # Read in the possible categories from the grammar
    if grammar is None:
        grammar = get_grammar()

    def _log(template, *args):
        # Only format and output the message if we've got a logger
        if logger is not None:
            logger.info(template % args)

    pending = []
    shift_reduce = []
    categories = []
    for chord in sequence.iterator():
        unlabelled = chord.category is None or chord.category == ""
        if unlabelled:
            category = None
            cat_name = None
        else:
            # Try getting a family for the specified category
            if chord.category not in grammar.families:
                raise TreeBuildError("Could not find the category %s in "\
                    "the lexicon" % chord.category)
            # Assume there's only one entry per family, or at least that if
            #  there are multiple they have the same argument structure.
            family = grammar.families[chord.category]
            category = family[0].entries[0].sign.category
            cat_name = chord.category
        # Put the generalized form of the category into the stack
        gen_cat = generalize_category(category, grammar.formalism)
        # Attach a tree leaf to this chord
        gen_cat.tree = SyntacticTerminal(chord, category=cat_name)
        pending.append(gen_cat)
        categories.append("%s <= %s" % (chord, category))
    _log("CATEGORIES %s", categories)

    # Reversed, so that popping from the end takes the chords in order
    pending = list(reversed(pending))
    stack = []
    rules = [compf, compb, appf, appb, cont]

    # Now do the vague pseudo-parse
    while len(pending) > 0:
        # SHIFT
        shift_reduce.append("S")
        stack.append(pending.pop())
        if debug_stack:
            print(stack)
        _log("SHIFT stack = %s, input = %s", stack, pending)

        # Use the additional information given to us to override default
        #  rule applications
        treeinfo = stack[-1].tree.chord.treeinfo
        # End of the first part of a coordination: keep reducing, but add
        #  a special marker afterwards
        coord_unresolved = True if treeinfo.coord_unresolved else False
        # End of the second part of a coordination: keep reducing, then
        #  apply coordination
        coord_resolved = True if treeinfo.coord_resolved else False

        # REDUCE
        # Keep trying every rule on the top of the stack until a full
        #  pass over the rules changes nothing
        changed = True
        while changed:
            changed = False
            for rule in rules:
                if rule(stack):
                    shift_reduce.append("R(%s)" % rule.name)
                    changed = True
                    _log("REDUCE %s, stack = %s", rule.name, stack)

        if coord_resolved:
            # Try to reduce the coordination
            coord(stack)
        if coord_unresolved:
            # Add a special marker to the stack so we know where the
            #  coordination began
            stack.append(CoordinationMiddleMarker())

    # Every coordination marker should have been consumed by now
    if any(isinstance(cat, CoordinationMiddleMarker) for cat in stack):
        raise TreeBuildError("Coordination middle marker not "\
            "matched by an end marker. Stack: %s" % strs(stack, ", "))

    subtrees = [cat.tree for cat in stack]
    return SyntacticTreeRoot(subtrees, shift_reduce=shift_reduce)
def main():
    """
    Command-line entry point for training PCFG models.

    Takes a model name and a sequence index file as arguments, trains a
    model of the grammar formalism's C{PcfgModel} class on the sequences
    and saves it. With C{--partitions}, one model is trained per dataset
    returned by C{holdout_partition}, with the partition number appended
    to the model name.

    """
    usage = "%prog [<options>] <model-name> <training-input>"
    description = "Training of PCFG models."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        "-p", "--partitions", dest="partitions", action="store", type="int",
        help="Number of partitions to divide the data into. "
             "For train, divides the input file, trains a model on each "
             "partition's complement and appends partition number to "
             "the model names. For del, appends partition numbers to model "
             "names and deletes all the models. Recache does similarly. "
             "Has no effect for parse.")
    parser.add_option(
        '--opts', dest="training_opts", action="store",
        help="options to pass to the model trainer. Type '--opts help' for "
             "a list of options")
    parser.add_option(
        "--debug", dest="debug", action="store_true",
        help="Output verbose logging information to stderr")
    parser.add_option(
        "-g", "--grammar", dest="grammar", action="store",
        help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)

    # Create a logger for training
    log_level = logging.DEBUG if options.debug else logging.WARN
    logger = create_logger(log_level=log_level, name="training", stderr=True)

    # Load a grammar and get the pcfg model class for its formalism
    grammar = get_grammar(options.grammar)
    PcfgModel = grammar.formalism.PcfgModel

    # Parse the option string
    raw_opts = options.training_opts
    if raw_opts is None:
        opts = {}
    elif raw_opts.lower() == "help":
        print(options_help_text(PcfgModel.TRAINING_OPTIONS,
                                intro="Training options for PCFGs"))
        sys.exit(0)
    else:
        opts = ModuleOption.process_option_dict(
            ModuleOption.process_option_string(raw_opts),
            PcfgModel.TRAINING_OPTIONS)

    if len(arguments) == 0:
        sys.stderr.write("Specify a model name\n")
        models = PcfgModel.list_models()
        sys.stderr.write("Available models: %s\n" % ", ".join(models))
        sys.exit(1)
    model_name = arguments[0]
    print("%s %s" % ("Model base name:", model_name))

    partitioned = options.partitions is not None
    if partitioned:
        # One model per partition, named by appending the partition number
        parts = [(i, "%s%d" % (model_name, i))
                 for i in range(options.partitions)]
    else:
        parts = [(None, model_name)]

    if len(arguments) < 2:
        sys.stderr.write("Specify an input file to read sequence data from\n")
        sys.exit(1)
    # Read in the training data from the given file
    seqs = SequenceIndex.from_file(arguments[1])

    if partitioned:
        # Prepare each training partition
        datasets = holdout_partition(seqs.sequences, options.partitions)
    else:
        datasets = [seqs.sequences]

    for dataset, (_, part_model) in zip(datasets, parts):
        # Train the named model on the sequence data
        model = PcfgModel.train(part_model, dataset, opts, grammar=grammar,
                                logger=logger)
        model.save()
        print("%s %s" % ("Trained model", part_model))
def main():
    """
    Command-line entry point: parses the sequences in a sequence index
    file using the annotations stored in the same file.

    Each sequence is parsed with C{parse_sequence_with_annotations}.
    Counts of full parses, partial parses and failures are kept, and
    the semantics of every full parse is collected. Derivation traces
    and tonal space paths are printed on request.

    @return: 0 if parser option help was requested and printed,
        C{None} otherwise

    """
    usage = "%prog [options] <seq-file>"
    description = "Parses a sequence from a sequence index file using the "\
        "annotations stored in the same file."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--popt", "--parser-options", dest="popts", action="append",
        help="specify options for the parser. Type '--popt help' to get a "
             "list of options (we use a DirectedCkyParser)")
    parser.add_option("--derivations", "--deriv", dest="derivations", action="store_true",
        help="print out derivation traces of all the results")
    parser.add_option("--index", "-i", dest="index", action="store", type="int",
        help="parse just the sequence with this index")
    parser.add_option("--quiet", "-q", dest="quiet", action="store_true",
        help="show only errors in the output")
    parser.add_option("--tonal-space", "--ts", dest="tonal_space", action="store_true",
        help="show the tonal space path (with -q, shows only paths)")
    parser.add_option("--output-set", "-o", dest="output_set", action="store",
        help="store the analyses to a tonal space analysis set with this name")
    parser.add_option("--trace-parse", "-t", dest="trace_parse", action="store_true",
        help="output a trace of the shift-reduce parser's operations in "
             "producing the full interpretation from the annotations")
    options, arguments = parser.parse_args()

    if len(arguments) < 1:
        print("You must specify a sequence file")
        sys.exit(1)

    if options.popts is not None:
        poptstr = options.popts
        if "help" in [s.strip().lower() for s in poptstr]:
            # Output this parser's option help
            print(options_help_text(
                DirectedCkyParser.PARSER_OPTIONS,
                intro="Available options for the directed parser"))
            return 0
        # --popt may be given several times, so options.popts is a list.
        #  It must be combined into a single option string before it is
        #  processed: it used to be passed on as a list
        poptstr = ":".join(poptstr)
    else:
        poptstr = ""
    popts = ModuleOption.process_option_string(poptstr)

    grammar = get_grammar()
    if options.quiet:
        logger = create_plain_stderr_logger(log_level=logging.ERROR)
    else:
        logger = create_plain_stderr_logger()

    # Only trace the parser's operations if asked to
    if options.trace_parse:
        parse_logger = logger
    else:
        parse_logger = None

    seq_index = SequenceIndex.from_file(arguments[0])
    # Get the chord sequence(s)
    if options.index is None:
        seqs = seq_index.sequences
    else:
        seqs = [seq_index.sequence_by_index(options.index)]
    logger.info("%d sequences\n" % len(seqs))

    full_analyses = []
    stats = {
        'full': 0,
        'partial': 0,
        'fail': 0,
    }
    # Try parsing every sequence
    for seq in seqs:
        logger.info("====== Sequence %s =======" % seq.string_name)
        try:
            results = parse_sequence_with_annotations(
                seq, grammar, logger=logger, parse_logger=parse_logger)
        except ParseError as err:
            logger.error("Error parsing: %s" % err)
            stats['fail'] += 1
            continue

        # This may have resulted in multiple partial parses
        logger.info("%d partial parses" % len(results))
        fully_parsed = len(results) == 1
        if fully_parsed:
            stats['full'] += 1
        else:
            stats['partial'] += 1

        if options.derivations:
            # Output the derivation trace for each partial parse
            for result in results:
                print("")
                print(result.derivation_trace)

        # Guard against an empty result list: there's no path to show then
        if options.tonal_space and results:
            # Output the tonal space coordinates
            path = grammar.formalism.sign_to_coordinates(results[0])
            for i, point in enumerate(path):
                print("%d, %d: %s" % (seq.id, i, point))

        # Only include a result in the output analyses if it was a full parse
        if fully_parsed:
            full_analyses.append((seq.string_name, results[0].semantics))
        else:
            logger.warn("%s was not included in the output analyses, "\
                "since it was not fully parsed" % seq.string_name)
def main():
    """
    Command-line entry point for evaluating annotator consistency.

    Reads pairs of annotations of the same sequences from a consistency
    data file and prints precision, recall and f-score over matching
    categories and coordination points. If a metric is given with
    C{-m}, both annotations of each pair are also parsed and the total
    distance between the resulting semantics is printed for the metric
    (or for its precision, recall and f-score variants with C{-f}).

    """
    usage = "%prog [options] <consistency-data>"
    description = "Evaluates annotator consistency."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-m", "--metric", dest="metric", action="store",
        help="semantics distance metric to use. Use '-m help' for a list of "\
            "available metrics")
    parser.add_option("--mopt", "--metric-options", dest="mopts", action="append",
        help="options to pass to the semantics metric. Use with '--mopt help' "\
            "with -m to see available options")
    parser.add_option("-f", "--f-score", dest="f_score", action="store_true",
        help="outputs recall, precision and f-score for an f-score-based "\
            "metric. Just uses the same metric 3 times with output=recall, "\
            "etc. Will only work with appropriate metrics")
    options, arguments = parser.parse_args()

    grammar = get_grammar()

    # NOTE(review): 'formalism' below is not defined anywhere in this
    #  function. Unless the module defines it globally, this raises a
    #  NameError and was presumably meant to be grammar.formalism: confirm.
    if options.metric is not None:
        use_metric = True
        if options.f_score:
            # Special case: get 3 metrics
            opts = options.mopts or []
            metrics = [
                command_line_metric(formalism, options.metric, opts + [opt])
                for opt in ["output=precision", "output=recall", "output=f"]
            ]
            print("Evaluating precision, recall and f-score on %s" %
                  metrics[0].name)
        else:
            # Get a metric according to the options
            metrics = [
                command_line_metric(formalism, options.metric, options.mopts)
            ]
            print("Evaluating using metric: %s" % metrics[0].name)
    else:
        use_metric = False

    if len(arguments) < 1:
        sys.stderr.write("Specify a consistency data file\n")
        sys.exit(1)
    filename = arguments[0]

    consdata = ConsistencyData.from_file(filename)

    def _coord_points(seq):
        # Number of coordination marks (unresolved + resolved) on a sequence
        total = 0
        for crd in seq:
            if crd.treeinfo.coord_unresolved:
                total += 1
            if crd.treeinfo.coord_resolved:
                total += 1
        return total

    def _matching_coord_points(seq, gold):
        # Number of coordination marks found at the same position in both
        total = 0
        for crdr, crdg in zip(seq, gold):
            if crdr.treeinfo.coord_unresolved and crdg.treeinfo.coord_unresolved:
                total += 1
            if crdr.treeinfo.coord_resolved and crdg.treeinfo.coord_resolved:
                total += 1
        return total

    def _percent(hits, total):
        # Nothing to score against (e.g. empty data): report 0 rather than
        #  dying with a ZeroDivisionError
        if total == 0:
            return 0.0
        return 100.0 * hits / total

    # Count up matching annotations
    matches = 0
    chords = 0
    for ann1, ann2 in consdata:
        for chord1, chord2 in zip(ann1, ann2):
            chords += 1
            if chord1.category == chord2.category:
                matches += 1

    # Count matching coordination points
    rean_coords = sum(_coord_points(seq) for seq, gs in consdata)
    gold_coords = sum(_coord_points(gs) for seq, gs in consdata)
    match_coords = sum(_matching_coord_points(seq, gs) for seq, gs in consdata)

    # Compute precision, recall and f-score from this
    precision = _percent(matches + match_coords, chords + rean_coords)
    recall = _percent(matches + match_coords, chords + gold_coords)
    if precision + recall == 0:
        fscore = 0.0
    else:
        fscore = 2.0 * precision * recall / (precision + recall)
    print("%d chords" % chords)
    print("\nCategory and coordination accuracy:")
    print("Precision: %.2f" % precision)
    print("Recall: %.2f" % recall)
    print("F-score: %.2f" % fscore)

    if use_metric:
        print("")

        def _parse_seq(seq):
            # Parse the annotations to get a semantics
            try:
                gold_parses = parse_sequence_with_annotations(
                    DbInput.from_sequence(seq),
                    grammar=grammar,
                    allow_subparses=False)
                # Got a result: return its semantics
                return gold_parses[0].semantics
            except ParseError as err:
                # Could not parse annotated sequence: report it and give
                #  None as this sequence's semantics
                sys.stderr.write("Could not parse sequence '%s': %s\n" %
                                 (seq.string_name, err))
                return

        # Prepare pairs of gold-standard parse results from the two annotations
        sem_pairs = [(_parse_seq(ann1), _parse_seq(ann2))
                     for (ann1, ann2) in consdata]
        # Compute the distance using the metrics
        for metric in metrics:
            distance = metric.total_distance(sem_pairs)
            print("%s: %s" % (metric.identifier.capitalize(),
                              metric.format_distance(distance)))
def main():
    """
    Trains one or more PCFG models from the command line.

    The first argument is the base name for the model(s) and the second
    a sequence index file to read the training sequences from. Without
    C{--partitions} a single model is trained on all the sequences.
    With it, a model is trained on each dataset that
    C{holdout_partition} returns and named with the base name plus the
    partition number. Each model is saved once trained.

    """
    usage = "%prog [<options>] <model-name> <training-input>"
    description = "Training of PCFG models."
    parser = OptionParser(usage=usage, description=description)
    partitions_help = (
        "Number of partitions to divide the data into. "
        "For train, divides the input file, trains a model on each "
        "partition's complement and appends partition number to "
        "the model names. For del, appends partition numbers to model "
        "names and deletes all the models. Recache does similarly. "
        "Has no effect for parse.")
    parser.add_option("-p", "--partitions", dest="partitions",
                      action="store", type="int", help=partitions_help)
    parser.add_option('--opts', dest="training_opts", action="store",
                      help="options to pass to the model trainer. Type "
                           "'--opts help' for a list of options")
    parser.add_option("--debug", dest="debug", action="store_true",
                      help="Output verbose logging information to stderr")
    parser.add_option("-g", "--grammar", dest="grammar", action="store",
                      help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)

    # Verbose logging only if debugging was requested
    log_level = logging.WARN
    if options.debug:
        log_level = logging.DEBUG
    # Create a logger for training
    logger = create_logger(log_level=log_level, name="training", stderr=True)

    # Load a grammar
    grammar = get_grammar(options.grammar)
    # Get the pcfg model class for the formalism
    PcfgModel = grammar.formalism.PcfgModel

    # Parse the option string
    if options.training_opts is None:
        opts = {}
    elif options.training_opts.lower() == "help":
        help_text = options_help_text(PcfgModel.TRAINING_OPTIONS,
                                      intro="Training options for PCFGs")
        print(help_text)
        sys.exit(0)
    else:
        option_values = ModuleOption.process_option_string(
            options.training_opts)
        opts = ModuleOption.process_option_dict(option_values,
                                                PcfgModel.TRAINING_OPTIONS)

    if len(arguments) == 0:
        sys.stderr.write("Specify a model name\n")
        models = PcfgModel.list_models()
        sys.stderr.write("Available models: %s\n" % ", ".join(models))
        sys.exit(1)
    model_name = arguments[0]
    print("%s %s" % ("Model base name:", model_name))

    # Work out the name of every model we're going to train
    if options.partitions is None:
        part_names = [model_name]
    else:
        part_names = ["%s%d" % (model_name, number)
                      for number in range(options.partitions)]

    if len(arguments) < 2:
        sys.stderr.write("Specify an input file to read sequence data from\n")
        sys.exit(1)
    # Read in the training data from the given file
    seqs = SequenceIndex.from_file(arguments[1])

    if options.partitions is None:
        datasets = [seqs.sequences]
    else:
        # Prepare each training partition
        datasets = holdout_partition(seqs.sequences, options.partitions)

    for dataset, part_model in zip(datasets, part_names):
        # Train the named model on the sequence data
        model = PcfgModel.train(part_model, dataset, opts,
                                grammar=grammar, logger=logger)
        model.save()
        print("%s %s" % ("Trained model", part_model))
def main():
    """
    Command-line entry point: shows the errors made in parse results,
    each in a window of context.

    For every results file given, the top result is aligned with the
    gold standard and a table is printed with each insertion, deletion
    and substitution, surrounded by up to C{--window} aligned steps
    before and after. Counts of each type of error, and of the specific
    insertions, deletions and substitutions, are kept across all files.

    """
    usage = "%prog [options] <results-files>"
    description = """\
Read in a ParseResults file, just like result_alignment.py. Examines the \
errors that were made and outputs them in context.
"""
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--window", "-w", dest="window", action="store", type="int",
        help="size of context window to show before and after each error. "
             "Default: 2",
        default=2)
    parser.add_option("--distance", "--dist", dest="distance", action="store_true",
        help="show the total distance travelled in the tonal space by the "
             "result and the gold standard")
    parser.add_option("--output-opts", "--oopts", dest="output_opts", action="store",
        help="options that affect the output formatting. Use "
             "'--output-opts help' for a list of options.")
    parser.add_option("--summary-threshold", dest="summary_threshold", action="store", type="int",
        help="how many times a substitution/insertion/deletion needs to "
             "have happened to be including in the summary (default: 4)",
        default=4)
    options, arguments = parser.parse_args()

    if len(arguments) == 0:
        sys.stderr.write("Specify at least one file to read the results from\n")
        sys.exit(1)

    grammar = get_grammar()
    formalism = grammar.formalism
    formalism.cl_output_options(options.output_opts)

    # Size of window of context to show
    win = options.window

    def _increment(counter, key):
        # Add one to a count in a dict, starting from 1 for unseen keys
        counter[key] = counter.get(key, 0) + 1

    def _path_points(sems):
        # Pair up the coordinate and the function of each point on a path
        coords = list(zip(*formalism.semantics_to_coordinates(sems)))[0]
        funs = list(zip(*formalism.semantics_to_functions(sems)))[0]
        return list(zip(coords, funs))

    def _skipped_row(count):
        # Table row to stand in for aligned steps that aren't shown
        return ["", " ...%d..." % count, "", "", ""]

    errors = []
    unscored_files = []
    scored = 0
    unscored = 0
    result_lengths = []
    gold_lengths = []
    insertions = {}
    deletions = {}
    substitutions = {}
    error_types = {}
    for filename in arguments:
        try:
            top_result, gold_result = get_top_result(filename)
        except ParseResults.LoadError as err:
            sys.stderr.write("Error loading file: %s\n" % (err,))
            errors.append(filename)
            continue

        print("=============================")
        print("File: %s" % filename)
        if top_result is None:
            # No alignment was found
            unscored += 1
            print("No result")
            continue

        # Wrap these up as a semantics, since some functions need that as input
        Sems = formalism.Semantics.Semantics
        top_sems, gold_sems = Sems(top_result), Sems(gold_result)

        # Do the alignment of the top result and gold result
        alignment, gold_seq, result_seq = results_alignment(top_result, gold_result)
        scored += 1
        # Get the actual list of coordinates
        gold_coords = _path_points(gold_sems)
        result_coords = _path_points(top_sems)

        print("Result length: %d, gold length: %d" %
              (len(result_coords), len(gold_coords)))
        result_lengths.append(len(result_coords))
        gold_lengths.append(len(gold_coords))

        if options.distance:
            # Work out the total distance travelled
            start, end = gold_coords[-1][0], gold_coords[0][0]
            gold_vect = end[0] - start[0], end[1] - start[1]
            # And for the actual result
            start, end = result_coords[-1][0], result_coords[0][0]
            result_vect = end[0] - start[0], end[1] - start[1]
            print("Distance travelled:")
            print("%s %s" % (" Gold result:", gold_vect))
            print("%s %s" % (" Top result: ", result_vect))
        print("")

        # Put together a table of error windows, starting with a header row
        table = [["", "Step", "", "Result", "Gold"]]
        gold = iter(zip(gold_seq, gold_coords))
        result = iter(zip(result_seq, result_coords))
        # Aligned steps seen since the last error and not yet shown
        context = []
        # Number of aligned steps still to show after the last error
        post_context = 0
        # Number of aligned steps dropped since the last row was added
        unseen = 0
        for op in alignment:
            # Keep a record of how many of each error occur
            _increment(error_types, op)

            if op == "A":
                # Aligned pair
                # Move both sequences on
                gold_step, gold_point = next(gold)
                result_step, result_point = next(result)
                if post_context > 0:
                    # Show this as part of the post-context of an error
                    table.append(["A", str(gold_step), "",
                                  str(result_point), str(gold_point)])
                    context = []
                    post_context -= 1
                else:
                    # Add this to the rolling window of pre-context
                    if len(context) >= win:
                        # We've not shown something here
                        unseen += 1
                    if win > 0:
                        context.append((gold_step, gold_point,
                                        result_step, result_point))
                        context = context[-win:]
                continue

            # Anything else is an error
            # Mark if there was something we didn't show
            if unseen:
                table.append(_skipped_row(unseen))
                unseen = 0
            # Show the error's pre-context
            for (pre_gold_step, pre_gold_point, __, pre_result_point) in context:
                table.append(["A", str(pre_gold_step), "",
                              str(pre_result_point), str(pre_gold_point)])

            if op == "I":
                # Inserted in the result
                result_step, result_point = next(result)
                table.append(["I", str(result_step), "", str(result_point), ""])
                _increment(insertions, str(result_step))
            elif op == "D":
                # Deleted in the result
                gold_step, gold_point = next(gold)
                table.append(["D", str(gold_step), "", "", str(gold_point)])
                _increment(deletions, str(gold_step))
            else:
                # Substituted
                result_step, result_point = next(result)
                gold_step, gold_point = next(gold)
                table.append([str(op), str(result_step),
                              "for %s" % str(gold_step),
                              str(result_point), str(gold_point)])
                _increment(substitutions,
                           "%s > %s" % (gold_step, result_step))

            # After anything other than an alignment, cancel the
            #  context window
            context = []
            # Show up to <win> in the post-context of alignments
            post_context = win

        # Mark if there was something at the end we didn't show
        if unseen:
            table.append(_skipped_row(unseen))
        # Print out the table
        pprint_table(sys.stdout, table, justs=[True, True, True, True, True])
        print("\n")
def main(): usage = "%prog [options] <results-files> <index>" description = "Prints a dependency tree for a parse result" parser = OptionParser(usage=usage, description=description) parser.add_option("-l", "--latex", dest="latex", action="store_true", help="output Latex for the graphs using tikz-dependency") parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file (--file). Type '--fopt help' for a list of available options.") options, arguments = parser.parse_args() if len(arguments) < 1: print >>sys.stderr, "Specify a file to read the results from" sys.exit(1) filename = arguments[0] if len(arguments) < 2: print >>sys.stderr, "Specify an of the sequence to load" sys.exit(1) index = int(arguments[1]) grammar = get_grammar() # We always need an index, so this is given as an argument # Put it in the options list for loading the file fopts = options.file_options if fopts and len(fopts): fopts += ":index=%d" % index else: fopts = "index=%d" % index # Load the sequence index file dbinput = command_line_input(filename=filename, filetype="db", options=fopts) name = dbinput.name anal = parse_sequence_with_annotations(dbinput, grammar)[0] graph, time_map = semantics_to_dependency_graph(anal.semantics) # Join together chords that are on the same dependency node times = iter(sorted(time_map.values())) dep_time = times.next() current_chord = [] joined_chords = [] finished = False for chord_time,chord in sorted(dbinput.sequence.time_map.items()): if chord_time >= dep_time and not finished: if len(current_chord): joined_chords.append(current_chord) current_chord = [chord] try: dep_time = times.next() except StopIteration: finished = True else: current_chord.append(chord) joined_chords.append(current_chord) chords = [" ".join(filter_latex(str(crd)) for crd in item) for item in joined_chords] annotations = [" ".join(filter_latex(crd.category) for crd in item) for item in joined_chords] graph.words = annotations if 
options.latex: # Exit with status 1 if we don't output anything exit_status = 1 # Output a full Latex document in one go if name is not None: title = r"""\title{%s} \author{} \date{}""" % name.capitalize() maketitle = r"\maketitle\thispagestyle{empty}\vspace{-20pt}" else: title = "" maketitle = "" # Print the header print r"""\documentclass[a4paper]{article} \usepackage{tikz-dependency} %% You may need to set paperheight (for width) and paperwidth (for height) to get things to fit \usepackage[landscape,margin=1cm,paperheight=50cm]{geometry} \pagestyle{empty} %(title)s \begin{document} %(maketitle)s \tikzstyle{every picture}+=[remember picture] \centering """ % \ { 'title' : title, 'maketitle' : maketitle } if graph is not None: exit_status = 0 print dependency_graph_to_latex(graph, fmt_lab=_fmt_label, extra_rows=[chords]) print "\n\\vspace{15pt}" # Finish off the document print r""" \end{document} """ sys.exit(exit_status) else: # Not outputing Latex print graph
def main(): usage = "%prog [options] <consistency-data>" description = "Evaluates annotator consistency." parser = OptionParser(usage=usage, description=description) parser.add_option("-m", "--metric", dest="metric", action="store", help="semantics distance metric to use. Use '-m help' for a list of "\ "available metrics") parser.add_option("--mopt", "--metric-options", dest="mopts", action="append", help="options to pass to the semantics metric. Use with '--mopt help' "\ "with -m to see available options") parser.add_option("-f", "--f-score", dest="f_score", action="store_true", help="outputs recall, precision and f-score for an f-score-based "\ "metric. Just uses the same metric 3 times with output=recall, "\ "etc. Will only work with appropriate metrics") options, arguments = parser.parse_args() grammar = get_grammar() if options.metric is not None: use_metric = True if options.f_score: # Special case: get 3 metrics metrics = [] opts = options.mopts or [] for opt in [ "output=precision", "output=recall", "output=f" ]: metrics.append(command_line_metric(formalism, options.metric, opts+[opt])) print "Evaluating precision, recall and f-score on %s" % metrics[0].name else: # Get a metric according to the options metrics = [command_line_metric(formalism, options.metric, options.mopts)] print "Evaluating using metric: %s" % metrics[0].name else: use_metric = False if len(arguments) < 1: print >>sys.stderr, "Specify a consistency data file" sys.exit(1) filename = arguments[0] consdata = ConsistencyData.from_file(filename) # Count up matching annotations matches = 0 chords = 0 for ann1,ann2 in consdata: for chord1,chord2 in zip(ann1,ann2): chords += 1 if chord1.category == chord2.category: matches += 1 # Count matching coordination points rean_coords = sum(sum( [1 for crd in seq if crd.treeinfo.coord_unresolved]) for seq,gs in consdata) + \ sum(sum( [1 for crd in seq if crd.treeinfo.coord_resolved]) for seq,gs in consdata) gold_coords = sum(sum( [1 for crd in gs if 
crd.treeinfo.coord_unresolved]) for seq,gs in consdata) + \ sum(sum( [1 for crd in gs if crd.treeinfo.coord_resolved]) for seq,gs in consdata) match_coords = sum(sum( [1 for crdr,crdg in zip(seq,gs) if crdr.treeinfo.coord_unresolved and crdg.treeinfo.coord_unresolved]) for seq,gs in consdata) + \ sum(sum( [1 for crdr,crdg in zip(seq,gs) if crdr.treeinfo.coord_resolved and crdg.treeinfo.coord_resolved]) for seq,gs in consdata) # Compute precision, recall and f-score from this precision = 100.0 * (matches + match_coords) / (chords + rean_coords) recall = 100.0 * (matches + match_coords) / (chords + gold_coords) fscore = 2.0 * precision * recall / (precision+recall) print "%d chords" % chords print "\nCategory and coordination accuracy:" print "Precision: %.2f" % precision print "Recall: %.2f" % recall print "F-score: %.2f" % fscore if use_metric: print def _parse_seq(seq): # Parse the annotations to get a semantics try: gold_parses = parse_sequence_with_annotations( DbInput.from_sequence(seq), grammar=grammar, allow_subparses=False) # Got a result: return its semantics return gold_parses[0].semantics except ParseError, err: # Could not parse annotated sequence print >>sys.stderr, "Could not parse sequence '%s': %s" % \ (seq.string_name, err) return # Prepare pairs of gold-standard parse results from the two annotations sem_pairs = [ (_parse_seq(ann1), _parse_seq(ann2)) for (ann1,ann2) in consdata ] # Compute the distance using the metrics for metric in metrics: distance = metric.total_distance(sem_pairs) print "%s: %s" % (metric.identifier.capitalize(), metric.format_distance(distance))
def main(): usage = "%prog [options] <results-files>" description = """\ Read in a ParseResults file, just like result_alignment.py. Examines the \ errors that were made and outputs them in context. """ parser = OptionParser(usage=usage, description=description) parser.add_option( "--window", "-w", dest="window", action="store", type="int", help= "size of context window to show before and after each error. Default: 2", default=2) parser.add_option( "--distance", "--dist", dest="distance", action="store_true", help= "show the total distance travelled in the tonal space by the result and the gold standard" ) parser.add_option( "--output-opts", "--oopts", dest="output_opts", action="store", help= "options that affect the output formatting. Use '--output-opts help' for a list of options." ) parser.add_option( "--summary-threshold", dest="summary_threshold", action="store", type="int", help= "how many times a substitution/insertion/deletion needs to have happened to be including in the summary (default: 4)", default=4) options, arguments = parser.parse_args() if len(arguments) == 0: print >> sys.stderr, "Specify at least one file to read the results from" sys.exit(1) grammar = get_grammar() grammar.formalism.cl_output_options(options.output_opts) # Size of window of context to show win = options.window errors = [] unscored_files = [] scored = 0 unscored = 0 result_lengths = [] gold_lengths = [] insertions = {} deletions = {} substitutions = {} error_types = {} for filename in arguments: try: top_result, gold_result = get_top_result(filename) except ParseResults.LoadError, err: print >> sys.stderr, "Error loading file: %s" % (err) errors.append(filename) continue else: print "=============================" print "File: %s" % filename if top_result is None: # No alignment was found unscored += 1 print "No result" else: # Wrap these up as a semantics, since some functions need that as input Sems = grammar.formalism.Semantics.Semantics top_sems, gold_sems = 
Sems(top_result), Sems(gold_result) # Do the alignment of the top result and gold result alignment, gold_seq, result_seq = results_alignment( top_result, gold_result) scored += 1 # Get the actual list of coordinates coords = zip( *grammar.formalism.semantics_to_coordinates(gold_sems))[0] funs = zip( *grammar.formalism.semantics_to_functions(gold_sems))[0] gold_coords = zip(coords, funs) coords = zip( *grammar.formalism.semantics_to_coordinates(top_sems))[0] funs = zip( *grammar.formalism.semantics_to_functions(top_sems))[0] result_coords = zip(coords, funs) print "Result length: %d, gold length: %d" % \ (len(result_coords), len(gold_coords)) result_lengths.append(len(result_coords)) gold_lengths.append(len(gold_coords)) if options.distance: # Work out the total distance travelled start, end = gold_coords[-1][0], gold_coords[0][0] gold_vect = end[0] - start[0], end[1] - start[1] # And for the actual result start, end = result_coords[-1][0], result_coords[0][0] result_vect = end[0] - start[0], end[1] - start[1] print "Distance travelled:" print " Gold result:", gold_vect print " Top result: ", result_vect print # Put together a table of error windows table = [ # Header row ["", "Step", "", "Result", "Gold"] ] gold = iter(zip(gold_seq, gold_coords)) result = iter(zip(result_seq, result_coords)) context = [] post_context = 0 unseen = 0 for op in alignment: # Keep a record of how many of each error occur if op not in error_types: error_types[op] = 1 else: error_types[op] += 1 if op == "A": # Aligned pair # Move both sequences on gold_step, gold_point = gold.next() result_step, result_point = result.next() if post_context > 0: # Show this as part of the post-context of an error table.append([ "A", str(gold_step), "", str(result_point), str(gold_point) ]) context = [] post_context -= 1 else: # Add this to the rolling window of pre-context if len(context) >= win: # We've not shown something here unseen += 1 if win > 0: context.append((gold_step, gold_point, result_step, 
result_point)) context = context[-win:] else: # Mark if there was something we didn't show if unseen: table.append( ["", " ...%d..." % unseen, "", "", ""]) unseen = 0 if context: # Show the error's pre-context for (pre_gold_step, pre_gold_point, __, pre_result_point) in context: table.append([ "A", str(pre_gold_step), "", str(pre_result_point), str(pre_gold_point) ]) context = [] if op == "I": # Inserted in the result result_step, result_point = result.next() table.append([ "I", str(result_step), "", str(result_point), "" ]) if str(result_step) not in insertions: insertions[str(result_step)] = 1 else: insertions[str(result_step)] += 1 elif op == "D": # Deleted in the result gold_step, gold_point = gold.next() table.append( ["D", str(gold_step), "", "", str(gold_point)]) if str(gold_step) not in deletions: deletions[str(gold_step)] = 1 else: deletions[str(gold_step)] += 1 else: # Substituted result_step, result_point = result.next() gold_step, gold_point = gold.next() table.append([ str(op), str(result_step), "for %s" % str(gold_step), str(result_point), str(gold_point) ]) subst_key = "%s > %s" % (gold_step, result_step) if subst_key not in substitutions: substitutions[subst_key] = 1 else: substitutions[subst_key] += 1 # After anything other than an alignment, cancel the # context window context = [] # Show up to <win> in the post-context of alignments post_context = win # Mark if there was something at the end we didn't show if unseen: table.append(["", " ...%d..." % unseen, "", "", ""]) # Print out the table pprint_table(sys.stdout, table, justs=[True, True, True, True, True]) print "\n"
def main(): usage = "%prog [options] <results-files> <index>" description = "Prints a dependency tree for a parse result" parser = OptionParser(usage=usage, description=description) parser.add_option("-l", "--latex", dest="latex", action="store_true", help="output Latex for the graphs using tikz-dependency") parser.add_option( "--file-options", "--fopt", dest="file_options", action="store", help= "options for the input file (--file). Type '--fopt help' for a list of available options." ) options, arguments = parser.parse_args() if len(arguments) < 1: print >> sys.stderr, "Specify a file to read the results from" sys.exit(1) filename = arguments[0] if len(arguments) < 2: print >> sys.stderr, "Specify an of the sequence to load" sys.exit(1) index = int(arguments[1]) grammar = get_grammar() # We always need an index, so this is given as an argument # Put it in the options list for loading the file fopts = options.file_options if fopts and len(fopts): fopts += ":index=%d" % index else: fopts = "index=%d" % index # Load the sequence index file dbinput = command_line_input(filename=filename, filetype="db", options=fopts) name = dbinput.name anal = parse_sequence_with_annotations(dbinput, grammar)[0] graph, time_map = semantics_to_dependency_graph(anal.semantics) # Join together chords that are on the same dependency node times = iter(sorted(time_map.values())) dep_time = times.next() current_chord = [] joined_chords = [] finished = False for chord_time, chord in sorted(dbinput.sequence.time_map.items()): if chord_time >= dep_time and not finished: if len(current_chord): joined_chords.append(current_chord) current_chord = [chord] try: dep_time = times.next() except StopIteration: finished = True else: current_chord.append(chord) joined_chords.append(current_chord) chords = [ " ".join(filter_latex(str(crd)) for crd in item) for item in joined_chords ] annotations = [ " ".join(filter_latex(crd.category) for crd in item) for item in joined_chords ] graph.words = annotations 
if options.latex: # Exit with status 1 if we don't output anything exit_status = 1 # Output a full Latex document in one go if name is not None: title = r"""\title{%s} \author{} \date{}""" % name.capitalize() maketitle = r"\maketitle\thispagestyle{empty}\vspace{-20pt}" else: title = "" maketitle = "" # Print the header print r"""\documentclass[a4paper]{article} \usepackage{tikz-dependency} %% You may need to set paperheight (for width) and paperwidth (for height) to get things to fit \usepackage[landscape,margin=1cm,paperheight=50cm]{geometry} \pagestyle{empty} %(title)s \begin{document} %(maketitle)s \tikzstyle{every picture}+=[remember picture] \centering """ % \ { 'title' : title, 'maketitle' : maketitle } if graph is not None: exit_status = 0 print dependency_graph_to_latex(graph, fmt_lab=_fmt_label, extra_rows=[chords]) print "\n\\vspace{15pt}" # Finish off the document print r""" \end{document} """ sys.exit(exit_status) else: # Not outputing Latex print graph
def train(self, sequences, grammar=None, logger=None):
    """
    Trains the n-gram model from annotated chord sequences and stores it
    in C{self.model}, along with a textual description of the training
    setup in C{self.model_description}.

    @param sequences: training sequences. Each must support
        C{group_pairs} and have a C{categories} attribute giving one
        label per chord.
    @param grammar: grammar to take the set of possible tags
        (C{pos_tags}) from. The default grammar is loaded if not given.
    @param logger: not used by this method.

    """
    from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
    if grammar is None:
        from jazzparser.grammar import get_grammar
        # Load the default grammar
        grammar = get_grammar()

    chordmap = self.options['chord_mapping']
    self.chordmap = chordmap
    self.chordmap_name = chordmap.name

    # Get data in the form of lists of (observation,tag) pairs
    training_data = [
        [(observation_from_chord_pair(c1, c2, chordmap), c1cat)
            for ((c1,c2),c1cat) in zip(group_pairs(seq, none_final=True),
                                       seq.categories)]
        for seq in sequences]

    # Get all the possible pos tags from the grammar
    label_dom = grammar.pos_tags
    # Build the emission domain to include all the observations that
    #  theoretically could occur, not just those that are seen -
    #  we might not see all interval/chord type pairs in the data.
    chord_types = chordmap.values()
    emission_dom = ["%d-%s" % (interval,chord)
                        for interval in range(12)
                        for chord in chord_types]

    # Ignore unlabelled data
    ignores = ['']

    if self.options['backoff_cutoff'] is None:
        backoff_kwargs = {}
    else:
        backoff_kwargs = {'cutoff' : self.options['backoff_cutoff']}

    # Precompute the transition matrix and store it along with the model
    self.model = PrecomputedNgramModel.train(
                        self.options['n'],
                        training_data,
                        label_dom,
                        emission_dom=emission_dom,
                        cutoff=self.options['cutoff'],
                        backoff_order=self.options['backoff'],
                        estimator=self.options['estimator'],
                        ignore_list=ignores,
                        backoff_kwargs=backoff_kwargs)

    # Add some model-specific info into the descriptive text
    #  so we know how it was trained
    est_name = get_estimator_name(self.options['estimator'])
    self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
        {
            'est' : est_name,
            'seqs' : len(training_data),
            # Total number of (observation,tag) pairs: add up the lengths
            #  instead of concatenating every list
            'samples' : sum(len(seq_data) for seq_data in training_data),
            'order' : self.options['n'],
            'backoff' : self.options['backoff'],
            'cutoff' : self.options['cutoff'],
            'chordmap' : self.chordmap_name,
        }
def train(self, inputs, grammar=None, logger=None):
    """
    Trains the chord class HMM: initializes it, optionally trains the
    initial transition distribution from annotated chords, then retrains
    it with Baum-Welch on the MIDI inputs.

    @type inputs: L{jazzparser.data.input.MidiTaggerTrainingBulkInput} or
        list of L{jazzparser.data.input.Input}s
    @param inputs: training MIDI data. Annotated chord sequences can
        optionally be supplied as well, by loading a bulk db input file
        into the MidiTaggerTrainingBulkInput.
    @param grammar: grammar to build the model for. The default grammar
        is loaded if not given.
    @param logger: passed on to the Baum-Welch trainer.

    """
    if grammar is None:
        from jazzparser.grammar import get_grammar
        # Fall back on the default grammar
        grammar = get_grammar()

    if len(inputs) == 0:
        # Nothing to train on
        return

    # Only the first input's type is checked: there's no guarantee the
    #  rest are the same, but something is badly wrong if they're not
    input_type = detect_input_type(inputs[0], allowed=['segmidi'])

    # Pick up the chord training data, if any came with the inputs
    chord_inputs = None
    if isinstance(inputs, MidiTaggerTrainingBulkInput):
        if inputs.chords is not None:
            chord_inputs = inputs.chords

    opts = self.options
    # Set up the model with initial emission distributions for chord classes
    self.hmm = ChordClassHmm.initialize_chord_classes(
                    opts['ccprob'],
                    opts['maxnotes'],
                    grammar,
                    metric=opts['metric'],
                    illegal_transitions=opts['illegal_transitions'],
                    fixed_root_transitions=opts['fixed_roots'])

    if not chord_inputs:
        # No chord data: the transition distribution stays uniform
        self.hmm.add_history("No annotated chord training data given. "\
                        "Transition distribution initialized to uniform.")
    else:
        # Use the chord data to give the transition distribution its
        #  initial training
        self.hmm.add_history("Training initial transition distribution "\
                        "from annotated chord data")
        self.hmm.train_transition_distribution(chord_inputs, grammar,
                                        contprob=opts['contprob'])

    # Hand on to the Baum-Welch trainer only those options that it defines
    trainer_opt_names = [opt.name for opt in ChordClassBaumWelchTrainer.OPTIONS]
    bw_opts = {}
    for (name,val) in opts.items():
        if name in trainer_opt_names:
            bw_opts[name] = val
    retrainer = ChordClassBaumWelchTrainer(self.hmm, options=bw_opts)

    # Callback that saves this model
    save_callback = lambda: self.save()
    # Do the Baum-Welch (EM) retraining
    retrainer.train(inputs, logger=logger, save_callback=save_callback)

    description_values = {
        'ccprob' : opts['ccprob'],
        'metric' : opts['metric'],
        'contprob' : opts['contprob'],
    }
    self.model_description = """\
Initial chord class emission prob: %(ccprob)f
Initial self-transition prob: %(contprob)s
Metrical model: %(metric)s
""" % description_values
def build_tree_for_sequence(sequence, debug_stack=False, grammar=None, logger=None): """ Run through the motions of parsing the sequence in order to build its tree structure. Most of the structure is implicit in the lexical categories. Additional information is given in the TreeInfo model, associated with chords. """ # Read in the possible categories from the grammar if grammar is None: grammar = get_grammar() # This function will format a string and output it to a logger if logging if logger is None: def _log(*args): pass else: def _log(string, *args): string = string % args logger.info(string) input = [] shift_reduce = [] categories = [] for chord in sequence.iterator(): # Try getting a family for the specified category if chord.category is None or chord.category == "": category = None cat_name = None else: if chord.category not in grammar.families: raise TreeBuildError, "Could not find the category %s in "\ "the lexicon" % chord.category # Assume there's only one entry per family, or at least that if # there are multiple they have the same argument structure. 
category = grammar.families[ chord.category][0].entries[0].sign.category cat_name = chord.category # Put the generalized form of the category into the stack gen_cat = generalize_category(category, grammar.formalism) # Attached a tree leaf to this chord gen_cat.tree = SyntacticTerminal(chord, category=cat_name) input.append(gen_cat) categories.append("%s <= %s" % (chord, category)) _log("CATEGORIES %s", categories) input = list(reversed(input)) stack = [] rules = [compf, compb, appf, appb, cont] # Now do the vague pseudo-parse while len(input) > 0: # SHIFT shift_reduce.append("S") stack.append(input.pop()) if debug_stack: print stack _log("SHIFT stack = %s, input = %s", stack, input) # Use the additional information given to us to override default # rule applications coord_unresolved = False coord_resolved = False if stack[-1].tree.chord.treeinfo.coord_unresolved: # This is the end of the first part of a coordination. # Continue reducing, but add a special marker afterwards coord_unresolved = True if stack[-1].tree.chord.treeinfo.coord_resolved: # The end of the second part of a coordination. # Continue reducing, then apply coordination coord_resolved = True # REDUCE # Try combining the top categories on the stack changed = True while changed: changed = False # Try each rule and see whether it applies for rule in rules: res = rule(stack) if res: shift_reduce.append("R(%s)" % rule.name) changed = True _log("REDUCE %s, stack = %s", rule.name, stack) if coord_resolved: # Try to reduce the coordination coord(stack) if coord_unresolved: # Add a special marker to the stack so we know where the # coordination began stack.append(CoordinationMiddleMarker()) for cat in stack: if isinstance(cat, CoordinationMiddleMarker): raise TreeBuildError, "Coordination middle marker not "\ "matched by an end marker. Stack: %s" % strs(stack, ", ") tree = SyntacticTreeRoot([cat.tree for cat in stack], shift_reduce=shift_reduce) return tree
def main(): set_proc_title("jazzparser") ######################################################## usage = "jazzparser [<options>]" description = "The main parser interface for the Jazz Parser" ## Process the input options optparser = OptionParser(usage=usage, description=description) ### # File input options group = OptionGroup(optparser, "Input", "Input type and location") optparser.add_option_group(group) group.add_option("--file", "-f", dest="file", action="store", help="use a file to get parser input from. Use --filetype to specify the type of the file.") group.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file (--file). Use '--filetype help' for a list of available types. Default: chords", default='chords') group.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file (--file). Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.") group.add_option("--index", "--indices", dest="input_index", action="store", help="select individual inputs to process. Specify as a comma-separated list of indices. All inputs are loaded as usual, but only the ith input is processed, for each i in the list") group.add_option("--only-load", dest="only_load", action="store_true", help="don't do anything with the inputs, just load and list them. Handy for checking the inputs load and getting their indices") group.add_option("--partitions", dest="partitions", action="store", type="int", help="divide the input data into this number of partitions and use a different set of models for each. For any parser, tagger and backoff that takes a 'model' argument, the partition number will be appended to the given value") group.add_option("--seq-parts", "--sequence-partitions", dest="sequence_partitions", action="store", help="use a chord sequence index to partition the inputs. 
Input type (bulk) must support association of the inputs with chord sequences by id. Sequences in the given sequence index file are partitioned n ways (--partitions) and the inputs are processed according to their associated sequence.") group.add_option("--continue", "--skip-done", dest="skip_done", action="store_true", help="skip any inputs for which a readable results file already exists. This is useful for continuing a bulk job that was stopped in the middle") ### group = OptionGroup(optparser, "Parser", "Parser, supertagger and backoff parser") optparser.add_option_group(group) group.add_option("-d", "--derivations", dest="derivations", action="store_true", help="keep derivation logs during parse.") group.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.") # Parser options group.add_option("-p", "--parser", dest="parser", action="store", help="use the named parser algorithm instead of the default. Use '-p help' to see the list of available parsers. Default: %s" % settings.DEFAULT_PARSER, default=settings.DEFAULT_PARSER) group.add_option("--popt", "--parser-options", dest="popts", action="append", help="specify options for the parser. Type '--popt help', using '--parser <name>' to select a parser module, to get a list of options.") # Tagger options group.add_option("-t", "--tagger", "--supertagger", dest="supertagger", action="store", help="run the parser using the named supertagger. Use '-t help' to see the list of available taggers. Default: %s" % settings.DEFAULT_SUPERTAGGER, default=settings.DEFAULT_SUPERTAGGER) group.add_option("--topt", "--tagger-options", dest="topts", action="append", help="specify options for the tagger. 
Type '--topt help', using '-u <name>' to select a tagger module, to get a list of options.") # Backoff options group.add_option("-b", "--backoff", "--noparse", dest="backoff", action="store", help="use the named backoff model as a backoff if the parser produces no results") group.add_option("--bopt", "--backoff-options", "--backoff-options", "--npo", dest="backoff_opts", action="append", help="specify options for the backoff model. Type '--npo help', using '--backoff <name>' to select a backoff modules, to get a list of options.") ### # Multiprocessing options group = OptionGroup(optparser, "Multiprocessing") optparser.add_option_group(group) group.add_option("--processes", dest="processes", action="store", type="int", help="number of processes to create to perform parses in parallel. Default: 1, i.e. no process pool. Use -1 to create a process for every input", default=1) ### # Output options group = OptionGroup(optparser, "Output") optparser.add_option_group(group) group.add_option("--output", dest="output", action="store", help="directory name to output parse results to. A filename specific to the individual input will be appended to this") group.add_option("--topn", dest="topn", action="store", type="int", help="limit the number of final results to store in the output file to the top n by probability. By default, stores all") group.add_option("--output-opts", "--oopts", dest="output_opts", action="store", help="options that affect the output formatting. Use '--output-opts help' for a list of options.") group.add_option("-a", "--atomic-results", dest="atoms_only", action="store_true", help="only include atomic categories in the results.") group.add_option("-l", "--latex", dest="latex", action="store_true", help="output all results as Latex source. 
Used to produce a whole Latex document, but doesn't any more") group.add_option("--all-times", dest="all_times", action="store_true", help="display all timing information on semantics in output.") group.add_option("-v", "--debug", dest="debug", action="store_true", help="output verbose debugging information.") group.add_option("--time", dest="time", action="store_true", help="time how long the parse takes and output with the results.") group.add_option("--no-results", dest="no_results", action="store_true", help="don't print out the parse results at the end. Obviously you'll want to make sure they're going to a file (--output). This is useful for bulk parse jobs, where the results produce a lot of unnecessary output") group.add_option("--no-progress", dest="no_progress", action="store_true", help="don't output the summary of completed sequences after each one finishes") ### # Output analysis and harmonical group = OptionGroup(optparser, "Output processing", "Output analysis and harmonical") optparser.add_option_group(group) group.add_option("--harmonical", dest="harmonical", action="store", help="use the harmonical to play the chords justly intoned according to the top result and output to a wave file.") group.add_option("--enharmonical", dest="enharmonical", action="store", help="use the harmonical to play the chords in equal temperament and output to a wave file.") group.add_option("--midi", dest="midi", action="store_true", help="generate MIDI files from the harmonical, instead of wave files.") group.add_option("--tempo", dest="tempo", action="store", type=int, help="tempo to use for the generated music (see --harmonical/--enharmonical). 
Default: 120", default=120) group.add_option("--lh-analysis", dest="lh_analysis", action="store_true", help="output the Longuet-Higgins space interpretation of the semantics for each result.") group.add_option("--lh-coordinates", dest="lh_coord", action="store_true", help="like lh-analysis, but displays the coordinates of the points instead of their names.") ### # Logging options group = OptionGroup(optparser, "Logging") optparser.add_option_group(group) group.add_option("--long-progress", dest="long_progress", action="store_true", help="print a summary of the chart so far after each chord/word has been processed.") group.add_option("--progress", "--short-progress", dest="short_progress", action="store_true", help="print a small amount of information out during parsing to indicate progress.") group.add_option("--logger", dest="logger", action="store", help="directory to put parser logging in. A filename based on an identifier for each individual input will be appended.") ### # Shell options group = OptionGroup(optparser, "Shell", "Interactive shell for inspecting results and parser state") optparser.add_option_group(group) group.add_option("-i", "--interactive", dest="interactive", action="store_true", help="enter interactive mode after parsing.") group.add_option("--error", dest="error_shell", action="store_true", help="catch any errors, report them and then enter the interactive shell. 
This also catches keyboard interrupts, so you can use it to halt parsing and enter the shell.") # Read in command line options and args options, clinput = parse_args_with_config(optparser) ########################### Option processing #################### # Get log level option first, so we can start using the logger if options.debug: log_level = logging.DEBUG else: log_level = logging.INFO # Set up a logger init_logging(log_level) if options.latex: settings.OPTIONS.OUTPUT_LATEX = True if options.logger: # Directory parse_logger_dir = options.logger check_directory(parse_logger_dir) else: parse_logger_dir = None ######## Grammar ######## # Check the grammar actually exists grammar_names = get_grammar_names() if options.grammar is not None and options.grammar not in grammar_names: # This is not a valid grammar name logger.error("The grammar '%s' does not exist. Possible "\ "grammars are: %s." % (options.grammar, ", ".join(grammar_names))) return 1 grammar = get_grammar(options.grammar) ######## Parser ######## # Load the requested parser from jazzparser.parsers import PARSERS if options.parser.lower() == "help": print "Available parsers are: %s" % ", ".join(PARSERS) return 0 try: parser_cls = get_parser(options.parser) except ParserLoadError: logger.error("The parser '%s' could not be loaded. Possible "\ "parsers are: %s" % (options.parser, ", ".join(PARSERS))) return 1 # Get parser options if options.popts is not None: poptstr = options.popts if "help" in [s.strip().lower() for s in poptstr]: # Output this tagger's option help from jazzparser.utils.options import options_help_text print options_help_text(parser_cls.PARSER_OPTIONS, intro="Available options for selected parser") return 0 poptstr = ":".join(poptstr) else: poptstr = "" popts = ModuleOption.process_option_string(poptstr) # Check that the options are valid try: parser_cls.check_options(popts) except ModuleOptionError, err: logger.error("Problem with parser options (--popt): %s" % err) return 1