Example #1
def main():
    usage = "%prog [options] <model-type> <model_name> <in-file>"
    description = "Trains a backoff builder model using the given "\
        "input data. Specify a model type (ngram, etc) and a name to "\
        "identify it. The data file should be a stored SequenceIndex file."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.")
    parser.add_option('--opts', dest="training_opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.")
    # File input options
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Same filetypes as jazzparser", default='bulk-db')
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    options, arguments = parse_args_with_config(parser)
    
    if len(arguments) < 3:
        print >>sys.stderr, "You must specify a model type, a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[2])
    model_type = arguments[0]
    model_name = arguments[1]
    
    builder_cls = get_backoff_builder(model_type)
    model_cls = builder_cls.MODEL_CLASS
    
    # Load the sequence data from a dbinput file
    input_data = command_line_input(filename=filename, 
                                    filetype=options.filetype, 
                                    options=options.file_options,
                                    allowed_types=['bulk-db', 'bulk-db-annotated'])
    
    # Handle any training options that were given on the command line
    if options.training_opts is None:
        training_opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(model_cls.TRAINING_OPTIONS, intro="Training options for %s" % model_cls.__name__)
        sys.exit(0)
    else:
        training_opts = ModuleOption.process_option_dict(
                            ModuleOption.process_option_string(options.training_opts), 
                            model_cls.TRAINING_OPTIONS)
        
    if options.partitions is not None:
        parts = holdout_partition(input_data, options.partitions)
        models = [(builder_cls.partition_model_name(model_name,num),seqs) for \
                        num,seqs in enumerate(parts)]
    else:
        models = [(model_name,input_data)]
    
    for part_name,seqs in models:
        # Instantiate a fresh model with this name
        model = model_cls(part_name, options=training_opts)
        # Train it with the loaded data
        model.train(seqs)
        model.save()
        print "Trained model %s" % (part_name)
Example #2
 def check_options(cls, options):
     """
     Normally, options are validated when the tagger is instantiated. 
     This allows you to check them before that.
     
     """
     return ModuleOption.process_option_dict(options, cls.TAGGER_OPTIONS)
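
Because check_options is exposed as a classmethod, scripts can fail fast on bad options before any expensive setup. A hedged sketch of that use (SomeTagger is a placeholder for a concrete Tagger subclass, and the option values are invented):

try:
    checked = SomeTagger.check_options({'model': 'mymodel'})
except ModuleOptionError, err:
    print >>sys.stderr, "Problem with tagger options: %s" % err
    sys.exit(1)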
Example #3
def main():
    usage = "%prog [options] <song-set> <results-file0> [<results-file1> ...]"
    parser = OptionParser(usage=usage)
    parser.add_option("--popt", "--parser-options", dest="popts", action="append", help="specify options for the parser that interprets the gold standard annotations. Type '--popt help' to get a list of options (we use a DirectedCkyParser)")
    parser.add_option("-m", "--metric", dest="metric", action="store", help="semantics distance metric to use. Use '-m help' for a list of available metrics")
    parser.add_option("--mopt", "--metric-options", dest="mopts", action="append", help="options to pass to the semantics metric. Use with '--mopt help' with -m to see available options")
    parser.add_option("-r", "--print-results", dest="print_results", action="store", default=5, type="int", help="number of top search results to print for each query (parse result). Default: 5. Use -1 to print distances from all songs in the corpus")
    parser.add_option("-g", "--gold-only", dest="gold_only", action="store_true", help="skip results that have no gold standard sequence associated with them (we can't tell which is the right answer for these)")
    parser.add_option("--mc", "--metric-computation", dest="metric_computation", action="store_true", help="output the computation information for the metric between the parse result and each top search result")
    options, arguments = parser.parse_args()
    
    # For now, we always use the music_halfspan formalism with this script
    # If we wanted to make it generic, we'd just load the formalism according 
    #  to a command-line option
    formalism = Formalism
    
    # Process parser options
    if options.popts is not None:
        poptstr = options.popts
        if "help" in [s.strip().lower() for s in poptstr]:
            # Output this parser's option help
            print options_help_text(DirectedCkyParser.PARSER_OPTIONS, intro="Available options for gold standard interpreter")
            sys.exit(0)
        poptstr = ":".join(poptstr)
    else:
        poptstr = ""
    popts = ModuleOption.process_option_string(poptstr)
    # Check that the options are valid
    try:
        DirectedCkyParser.check_options(popts)
    except ModuleOptionError, err:
        logger.error("Problem with parser options (--popt): %s" % err)
        sys.exit(1)
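
Since --popt is declared with action="append", optparse collects repeated occurrences into a list, which the code above joins with ':' before handing it to ModuleOption.process_option_string. A small demonstration of that optparse behaviour (the option values are invented):

from optparse import OptionParser

demo = OptionParser()
demo.add_option("--popt", dest="popts", action="append")
opts, args = demo.parse_args(["--popt", "a=1", "--popt", "b=2"])
print opts.popts            # ['a=1', 'b=2']
print ":".join(opts.popts)  # 'a=1:b=2'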
Example #4
 def process_option_dict(cls, options):
     """
     Verifies and processes the training option values. Returns the 
     processed dict.
     
     """
     return ModuleOption.process_option_dict(options, cls.OPTIONS)
Example #5
def main():
    usage = "%prog [options] <chord-corpus-file> <chord-labeling-model> <midi-file>"
    description = "Like findsong, but searches by chord label sequence "\
        "similarity. The input is not a results file, but a midi file, or "\
        "a midi bulk input (CSV)."
    parser = OptionParser(usage=usage)
    parser.add_option("--popt", "--parser-options", dest="popts", action="append", help="specify options for the parser that interprets the gold standard annotations. Type '--popt help' to get a list of options (we use a DirectedCkyParser)")
    parser.add_option("-r", "--print-results", dest="print_results", action="store", default=5, type="int", help="number of top search results to print for each query (parse result). Default: 5. Use -1 to print distances from all songs in the corpus")
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", default="bulk-segmidi", help="filetype to read in. Use 'segmidi' to read a single midi file, or 'bulk-segmidi' (default) to read many from a CSV")
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    parser.add_option("--labeler-options", "--lopt", dest="labeler_options", action="store", help="options for the labeler. Type '--lopt help' for a list of available options.")
    parser.add_option("-g", "--gold-only", dest="gold_only", action="store_true", help="skip results that have no gold standard sequence associated with them (we can't tell which is the right answer for these)")
    parser.add_option("--align", "--print-alignment", dest="print_alignment", action="store_true", help="print out the full alignment between the labeling and the top match")
    options, arguments = parser.parse_args()
    
    # Process parser options
    if options.popts is not None:
        poptstr = options.popts
        if "help" in [s.strip().lower() for s in poptstr]:
            # Output this parser's option help
            print options_help_text(DirectedCkyParser.PARSER_OPTIONS, intro="Available options for gold standard interpreter")
            sys.exit(0)
        poptstr = ":".join(poptstr)
    else:
        poptstr = ""
    popts = ModuleOption.process_option_string(poptstr)
    # Check that the options are valid
    try:
        DirectedCkyParser.check_options(popts)
    except ModuleOptionError, err:
        logger.error("Problem with parser options (--popt): %s" % err)
        sys.exit(1)
Example #6
 def process_option_dict(cls, options):
     """
     Verifies and processes the training option values. Returns the 
     processed dict.
     
     """
     return ModuleOption.process_option_dict(options, cls.OPTIONS)
Example #7
 def check_options(cls, options):
     """
     Normally, options are validated when the tagger is instantiated. 
     This allows you to check them before that.
     
     """
     return ModuleOption.process_option_dict(options, cls.TAGGER_OPTIONS)
Example #8
class ModelBackoffBuilder(BackoffBuilder):
    """
    Subclass of L{BackoffBuilder} that handles model loading.
    
    """
    MODEL_CLASS = None  # This should be set by subclasses

    BUILDER_OPTIONS = BackoffBuilder.BUILDER_OPTIONS + [
        ModuleOption('model', filter=str,
            help_text="Model name. This model must have been previously trained. Required",
            usage="model=X, where X is the name of a trained model",
            required=True),
        ModuleOption('partition', filter=int,
            help_text="If given, the numbered partition of the partitioned "\
                "model will be used. (This generally involves appending the "\
                "partition number to the model name.)",
            usage="partition=P, where P is an int",
            default=None
        ),
    ]

    def __init__(self, *args, **kwargs):
        BackoffBuilder.__init__(self, *args, **kwargs)
        # Check the subclass is properly defined
        if type(self).MODEL_CLASS is None:
            raise NotImplementedError, "BackoffBuilder "\
                "subclass %s does not define a model class" % type(self).__name__
        if self.options['partition'] is not None:
            self.model_name = type(self).partition_model_name(
                self.options['model'], self.options['partition'])
        else:
            self.model_name = self.options['model']
        self.logger.info("Backoff model: %s" % self.model_name)

        # Load a TaggerModel subclass instance to load the trained model data
        self.model = (type(self).MODEL_CLASS).load_model(self.model_name)

    @staticmethod
    def partition_model_name(model_name, partition_number):
        """
        The model name to use when the given partition number is requested. 
        The default implementation simply appends the number to the model 
        name. Subclasses may override this if they want to do something 
        different.
        
        """
        return "%s%d" % (model_name, partition_number)
Example #9
    def __init__(self,
                 grammar,
                 tagger,
                 options={},
                 backoff=None,
                 backoff_options={},
                 logger=None):
        """
        @param grammar: the L{jazzparser.grammar.Grammar} instance to use for 
            parsing
        @param tagger: the L{jazzparser.taggers.tagger.Tagger} subclass 
            instance to use to tag the input
        @param backoff: an optional 
            L{jazzparser.backoff.base.BackoffBuilder} class 
            to use as a fallback if the parser returns no parses. Whether 
            this is used and in what circumstances depends on the type of 
            parser.
        @param backoff_options: dictionary of options to pass to the backoff 
            model if it gets used.
        @type logger: C{logging.Logger}
        @param logger: a logger to which all progress information during 
            parsing will be written. By default, outputs to stderr.
        
        """
        self.grammar = grammar
        self.tagger = tagger
        self.backoff_options = backoff_options
        if backoff is not None:
            # Look up the backoff model if one is requested
            self.backoff = backoff
            # Pre-check the options dict
            # This will be done again by the module when instantiated, but
            #  we do it now to verify the options
            ModuleOption.process_option_dict(backoff_options,
                                             backoff.BUILDER_OPTIONS)
        else:
            self.backoff = None
        # Initialize using parser-specific options
        self.options = type(self).check_options(options)

        if logger is None:
            # Output to stderr instead
            self.logger = create_plain_stderr_logger()
        else:
            self.logger = logger

        self.timed_out = False
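
Because the constructor pre-checks backoff_options, a bad option dict fails at construction time rather than later, when the backoff model is finally invoked. A hedged sketch (MyParser and MyBuilder are placeholders for concrete subclasses, and this assumes process_option_dict raises ModuleOptionError on invalid options, as the calling scripts above suggest):

try:
    parser = MyParser(grammar, tagger,
                      backoff=MyBuilder,
                      backoff_options={'no_such_option': '1'})
except ModuleOptionError, err:
    # Raised by the pre-check in __init__, before any parsing happens
    print >>sys.stderr, "Problem with backoff options: %s" % err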
Example #10
 def check_options(cls, options):
     """
     In normal parser usage, the options dictionary is checked for 
     validity when the parser is instantiated. In this interface, you may 
     want to check the options before this point using this method.
     
     """
     return ModuleOption.process_option_dict(options, cls.PARSER_OPTIONS)
Example #11
 def check_options(cls, options):
     """
     In normal parser usage, the options dictionary is checked for 
     validity when the parser is instantiated. In this interface, you may 
     want to check the options before this point using this method.
     
     """
     return ModuleOption.process_option_dict(options, cls.PARSER_OPTIONS)
Example #12
 def process_training_options(self):
     """
     Verifies and processes the training option values. Access them in 
     self.options.
     
     """
     self._options = ModuleOption.process_option_dict(self._options_dict, 
                                                      self.TRAINING_OPTIONS)
Example #13
def get_chord_mapping_module_option(name="chord_mapping"):
    return ModuleOption(name, 
                        filter=choose_from_dict(NAMED_MAPPINGS),
                        help_text="Choose a mapping to apply to chord types "\
                            "to reduce the chord vocabulary",
                        usage="%s=M, where M is one of %s. Default: %s" % \
                            (name, ", ".join(MAPPINGS), DEFAULT_MAPPING),
                        default=get_chord_mapping())
Example #14
 def process_training_options(self):
     """
     Verifies and processes the training option values. Access them in 
     self.options.
     
     """
     self._options = ModuleOption.process_option_dict(
         self._options_dict, self.TRAINING_OPTIONS)
Example #15
 def __init__(self, model, options={}):
     self.model = model
     # Check this model is of one of the types we can train
     if type(model) not in self.MODEL_TYPES:
         raise RaphstoHmmParameterError, "trainer %s cannot train a model "\
             "of type %s" % (type(self).__name__, type(model).__name__)
     
     self.options = ModuleOption.process_option_dict(options, self.OPTIONS)
     self.model_cls = type(model)
Example #16
    def __init__(self, model, options={}):
        self.model = model
        # Check this model is of one of the types we can train
        if type(model) not in self.MODEL_TYPES:
            raise RaphstoHmmParameterError, "trainer %s cannot train a model "\
                "of type %s" % (type(self).__name__, type(model).__name__)

        self.options = ModuleOption.process_option_dict(options, self.OPTIONS)
        self.model_cls = type(model)
Example #17
 def __init__(cls, name, bases, dict):
     # Skip all this when the base class is created
     if name != "FormalismBase":
         # Initialize all the output options
         # If they're never set by whatever script is running, this 
         #  ensures that their default values are available
         formalism = cls.get_name()
         opts = ModuleOption.process_option_dict({}, cls.output_options)
         # Store this so it's globally available to the formalism
         settings.OPTIONS.OUTPUT[formalism] = opts
Example #18
 def __init__(cls, name, bases, dict):
     # Skip all this when the base class is created
     if name != "FormalismBase":
         # Initialize all the output options
         # If they're never set by whatever script is running, this
         #  ensures that their default values are available
         formalism = cls.get_name()
         opts = ModuleOption.process_option_dict({}, cls.output_options)
         # Store this so it's globally available to the formalism
         settings.OPTIONS.OUTPUT[formalism] = opts
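
The effect of this metaclass hook is that merely defining a formalism class publishes its default output options globally; no script has to set them explicitly. A minimal self-contained sketch of the same pattern (simplified stand-ins, not jazzparser's actual classes):

REGISTRY = {}

class OptionMeta(type):
    def __init__(cls, name, bases, dct):
        super(OptionMeta, cls).__init__(name, bases, dct)
        if name != "Base":
            # Record each option's default as soon as the class is defined
            REGISTRY[name] = dict(cls.output_options)

class Base(object):
    __metaclass__ = OptionMeta
    output_options = []

class MyFormalism(Base):
    output_options = [("tsformat", "coord")]

print REGISTRY["MyFormalism"]   # {'tsformat': 'coord'}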
Example #19
 def process_output_options(cls, optdict):
     """
     Makes output options globally available, based on a dictionary.
     
     @see: L{output_options}.
     
     """
     formalism = cls.get_name()
     opts = ModuleOption.process_option_dict(optdict, cls.output_options)
     settings.OPTIONS.OUTPUT[formalism] = opts
Example #20
 def process_output_options(cls, optdict):
     """
     Makes output options globally available, based on a dictionary.
     
     @see: L{output_options}.
     
     """
     formalism = cls.get_name()
     opts = ModuleOption.process_option_dict(optdict, cls.output_options)
     settings.OPTIONS.OUTPUT[formalism] = opts
Example #21
 def __init__(self, grammar, tagger, options={}, backoff=None, 
                 backoff_options={}, logger=None):
     """
     @param grammar: the L{jazzparser.grammar.Grammar} instance to use for 
         parsing
     @param tagger: the L{jazzparser.taggers.tagger.Tagger} subclass 
         instance to use to tag the input
     @param backoff: an optional 
         L{jazzparser.backoff.base.BackoffBuilder} class 
         to use as a fallback if the parser returns no parses. Whether 
         this is used and in what circumstances depends on the type of 
         parser.
     @param backoff_options: dictionary of options to pass to the backoff 
         model if it gets used.
     @type logger: C{logging.Logger}
     @param logger: a logger to which all progress information during 
         parsing will be written. By default, outputs to stderr.
     
     """
     self.grammar = grammar
     self.tagger = tagger
     self.backoff_options = backoff_options
     if backoff is not None:
         # Look up the backoff model if one is requested
         self.backoff = backoff
         # Pre-check the options dict
         # This will be done again by the module when instantiated, but 
         #  we do it now to verify the options
         ModuleOption.process_option_dict(backoff_options, 
                                          backoff.BUILDER_OPTIONS)
     else:
         self.backoff = None
     # Initialize using parser-specific options
     self.options = type(self).check_options(options)
     
     if logger is None:
         # Output to stderr instead
         self.logger = create_plain_stderr_logger()
     else:
         self.logger = logger
     
     self.timed_out = False
Example #22
class MidiTaggerTrainingBulkInput(SegmentedMidiBulkInput):
    """
    Subclass of L{SegmentedMidiBulkInput} for taking training input for midi 
    supertaggers. This is identical to L{SegmentedMidiBulkInput}, but has an 
    additional option C{chords} to specify a path from which to read an 
    L{AnnotatedDbBulkInput}. This may be used by the training procedure to 
    initialize or train parameters, in addition to the main midi training input.
    
    Additionally accepts all options accepted by L{AnnotatedDbBulkInput}; these 
    are passed on to L{AnnotatedDbBulkInput} when the chords file is read in.
    
    """
    FILE_INPUT_OPTIONS = \
            SegmentedMidiBulkInput.FILE_INPUT_OPTIONS + \
            [ModuleOption('chords',
                     help_text="path from which to read a bulk-db input, "\
                        "which may be used in addition to the midi training "\
                        "data by the training procedure",
                     usage="chords=F, where F is an filename")] + \
            AnnotatedDbBulkInput.FILE_INPUT_OPTIONS

    def __init__(self, inputs, chords=None):
        self.inputs = inputs
        self.chords = chords

    @staticmethod
    def from_file(filename, options={}):
        if 'chords' in options and options['chords'] is not None:
            # Read in the AnnotatedDbBulkInput from this file
            # Take AnnotatedDbBulkInput's options out of the option dict
            dboptions = {}
            for dbopt in AnnotatedDbBulkInput.FILE_INPUT_OPTIONS:
                if dbopt.name in options:
                    dboptions[dbopt.name] = options.pop(dbopt.name)
            chords = AnnotatedDbBulkInput.from_file(options['chords'],
                                                    options=dboptions)
        else:
            chords = None
        # Read the main midi data just as SegmentedMidiBulkInput does
        main_data = SegmentedMidiBulkInput.from_file(filename, options)
        return MidiTaggerTrainingBulkInput(main_data.inputs, chords=chords)

    def subset(self, *ranges):
        # Custom implementation so subsets get the chord input
        return MidiTaggerTrainingBulkInput(\
            sum([self.inputs[start:end] for (start,end) in ranges], []),
            chords=self.chords)
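
The subset override matters because training scripts partition their input by calling subset; without it, the partitioned inputs would silently lose the auxiliary chord data. A toy illustration (plain ints and a string stand in for real midi inputs and chord data):

bulk = MidiTaggerTrainingBulkInput(range(10), chords="chord-data")
sub = bulk.subset((0, 3), (7, 10))
print sub.inputs   # [0, 1, 2, 7, 8, 9]
print sub.chords   # 'chord-data' -- carried over by the override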
Example #23
 def cl_output_options(cls, string):
     """
     Convenience method so you don't have to do this lots of times over.
     
     Take a string of output options from the command line and set the 
     output options from it.
     
     Should only be used in command-line scripts.
     
     """
     if string is not None and string.lower() == "help":
         print "Available output options"
         print "========================"
         print options_help_text(cls.output_options)
         sys.exit(0)
     optdict = ModuleOption.process_option_string(string)
     cls.process_output_options(optdict)
Example #24
 def cl_output_options(cls, string):
     """
     Convenience method so you don't have to do this lots of times over.
     
     Take a string of output options from the command line and set the 
     output options from it.
     
     Should only be used in command-line scripts.
     
     """
     if string is not None and string.lower() == "help":
         print "Available output options"
         print "========================"
         print options_help_text(cls.output_options)
         sys.exit(0)
     optdict = ModuleOption.process_option_string(string)
     cls.process_output_options(optdict)
Example #25
class SongSelfSimilarityTool(Tool):
    """
    For fooling around with comparing songs to themselves to see what happens.
    
    """
    name = "Self similarity"

    commands = ['selfsim']
    usage = ('selfsim <song-num>', "")
    help = ""
    tool_options = Tool.tool_options + [
        ModuleOption('local', filter=str_to_bool,
                     usage="local=B, where B is true or false",
                     default=False,
                     help_text="Sort results by local alignment score, not "\
                        "global"),
    ]

    def run(self, args, state):
        from jazzparser.formalisms.music_halfspan.evaluation import \
                        tonal_space_local_alignment, tonal_space_distance
        songnum = int(args[0])

        name, song = get_song(songnum, state)
        songset = state.get_data("songset")
        distances = []
        # Try comparing this song to each song in the set
        for other_name, other_song in songset.analyses:
            # Align locally and globally
            ops,steps1,steps2,local_distance = \
                    tonal_space_local_alignment(other_song.lf, song.lf)
            global_distance = \
                    tonal_space_distance(other_song.lf, song.lf)
            distances.append((other_name, local_distance, global_distance))

        # Sort the results
        if self.options['local']:
            distances.sort(key=lambda x: x[1])
        else:
            distances.sort(key=lambda x: x[2])
        # Print out each one
        print "Aligned %s with:" % name
        for other_name, local_distance, global_distance in distances:
            print "%s:  local: %s,  global: %s" % \
                (other_name,local_distance,global_distance)
Example #26
class SongDependencyGraphTool(Tool):
    """
    Converts a song's semantics to a tree. Mainly just for debugging.
    
    """
    name = "Song dependency graph"
    commands = ['depgraph', 'dep']
    usage = ('depgraph <song-num>', "converts the semantics of the song to a "\
        "dependency graph representation")
    tool_options = Tool.tool_options + [
        ModuleOption('res',
                     filter=str_to_bool,
                     usage="res=B, where B is true or false",
                     default=False,
                     help_text="Show a result, instead of a corpus song"),
    ]
    help = """\
Converts the semantics of the numbered song to its tree representation that 
will be used for comparison to other logical forms. This is mainly for 
debugging and has no use in itself.
"""

    def run(self, args, state):
        from jazzparser.formalisms.music_halfspan.harmstruct import \
                                            semantics_to_dependency_graph
        if self.options['res']:
            resnum = int(args[0])
            res = state.results[resnum]
            song = res.semantics
            print "Dependency graph for result %d\n" % resnum
        else:
            songnum = int(args[0])
            name, song = get_song(songnum, state)
            print "Dependency graph for '%s'\n" % name

        print "Semantics:"
        print song
        print
        graph, times = semantics_to_dependency_graph(song)
        print graph
Example #27
class CkyParser(Parser):
    """
    CkyParser is the central class for the jazz chord sequence 
    recogniser parsing mechanism. 
    It constitutes the "algorithm" module of the system.
    It begins with a set of signs assigned to the input by the 
    tagger and parses to produce a chart, from which the resultant 
    signs can be extracted.
    
    """
    shell_tools = [
        ChartTool(),
        InteractiveChartTool(),
    ]
    PARSER_OPTIONS = Parser.PARSER_OPTIONS + [
        ModuleOption('max_iter', filter=int,
            help_text="Maximum number of parser iterations to perform "\
                "before giving up. If 0 or unspecified, continues "\
                "until parse is complete.",
            usage="max_iter=X, where X is an integer.",
            default=0,
        ),
        ModuleOption('min_iter', filter=int,
            help_text="Usually, the parser will stop as soon as it finds a "\
                "full parse. Use min_iter to make it continue parsing until "\
                "it has done min_iter iterations or the tagger has ceased to "\
                "return any categories. Use -1 to keep going until the tagger "\
                "gives no more categories.",
            usage="min_iter=X, where X is an integer.",
            default=0,
        ),
        ModuleOption('parses', filter=int,
            help_text="Number of parses to require before we terminate. "\
                "Default is 1: the parser will terminate as soon as it finds "\
                "at least one full parse (unless another option, like "\
                "min_iter, prevents it",
            usage="parses=X, where X is an integer",
            default=1,
        ),
        ModuleOption('timeout', filter=int,
            help_text="Maximum time allowed for the main parse loop, in "\
                "minutes. If this is exceded, the backoff will kick "\
                "in, if one is specified. Otherwise, no results will be "\
                "returned. The parser will not stop as soon as the timeout "\
                "expires, but after finishing processing the current input "\
                "word. 0 (default) imposes no timeout.",
            usage="timeout=X, where X is an integer number of seconds.",
            default=0,
        ),
        ModuleOption('inspect', filter=str_to_bool,
            help_text="If true, the graphical chart inspector will be "\
                "displayed during parsing.",
            usage="inspect=X, where X is a boolean value.",
            default=False
        ),
        ModuleOption('inspect_persist', filter=str_to_bool,
            help_text="Makes the chart inspector window persist after parsing "\
                "is completed. By default, it will be killed",
            usage="inspect_persist=X, where X is a boolean value.",
            default=False
        ),
        ModuleOption('dump_chart', filter=new_file_option,
            help_text="A file to dump the chart state to during parsing. "\
                "The first dump will be when the chart is created and "\
                "new dumps will be made throughout the parse.",
            usage="dump_chart=X, where X is a filename."
        ),
        ModuleOption('derivations', filter=str_to_bool,
            help_text="Store derivation traces along with the results",
            usage="derivations=X, where X is a boolean value",
            default=None,
        ),
    ]

    def _create_chart(self, *args, **kwargs):
        self.chart = Chart(self.grammar, *args, **kwargs)
        return self.chart

    def _add_signs(self, offset=0, prob_adder=None):
        """
        Adds new signs to the chart from the supertagger, using the given 
        offset when requesting them from the tagger.
        
        @rtype: list of tuples
        @return: all the signs that were actually added. Each is represented 
            by a tuple (start_node, end_node, sign)
        
        """
        signs = self.tagger.get_signs(offset)
        words = self.tagger.get_string_input()
        if signs is None or len(signs) == 0:
            return []
        # Add each new sign to the chart
        added = []
        for (start, end, signtup) in signs:
            word_list = words[start:end]
            word = " ".join(w for w in word_list)
            # Add the probabilities as an attribute to the signs
            cat, tag, prob = signtup
            if prob_adder is not None:
                prob_adder(start, end, signtup, word_list)
            # Add the signs to the chart
            newadd = self.chart.add_word_signs([signtup[0]],
                                               start,
                                               word,
                                               end_node=end)
            # Keep a record of those that got added
            if newadd:
                added.append((start, end, signtup))
        return added

    def parse(self, derivations=False, summaries=False, inspect=False):
        """
        Run the parser on the input, using the specified tagger. Runs 
        the CKY parsing algorithm to do chart parsing. For details of 
        chart parsing, see Chart class.
        
        If the parser was given a maximum number of iterations, the 
        routine will return as usual after this number is completed, 
        even if no parses have been found.
        
        @type derivations: bool
        @param derivations: store derivation traces, which 
            can subsequently be used to trace all the derivations that 
            led to any given sign in the chart. Overridden by the module 
            option if it's given
        @type summaries: int/bool
        @param summaries: output chart summary information to stderr during 
            parsing to track progress. Set to 2 to output some info, 
            but not the full chart.
        @type inspect: bool
        @param inspect: launch a graphical chart inspector during the 
            parse to display interactive chart information.
            
        @return: a list of signs that span the full input.
        """
        if 'derivations' in self.options and \
                self.options['derivations'] is not None:
            derivations = self.options['derivations']

        # Time execution if we're showing any summaries
        time = bool(summaries)
        # Find out from the tagger how long the input it read in was
        input_length = self.tagger.input_length
        # Create and initialise a chart for parsing
        # Don't initialise the chart with signs - we'll add signs gradually instead
        chart = self._create_chart([[]] * input_length,
                                   derivations=derivations)

        # Launch a chart inspector if requested
        if self.options['inspect'] or inspect:
            # Get a string form of the input to display
            input_strs = self.tagger.get_string_input()
            chart.launch_inspector(input=input_strs)
        # Start dumping the chart if requested
        if self.options['dump_chart']:
            # Make the first dump of the empty chart
            from .chart import dump_chart
            dump_chart(chart, self.options['dump_chart'])
        # Stop after a given number of iterations
        if self.options['max_iter'] == 0:
            max_iter = None
        else:
            max_iter = self.options['max_iter']

        if self.options['min_iter'] == -1:
            # Special case: never stop until we've got all the categories
            min_iter = None
        else:
            min_iter = self.options['min_iter']

        required_parses = self.options['parses']

        timeout = 60 * self.options['timeout']
        check_timeout = timeout > 0
        # Make sure the timed out flag is unset to start with
        self.timed_out = False

        # This is where progress output will go
        # Note that it's not the same as logger, which is the main system logger
        prog_logger = self.logger

        if check_timeout:
            prog_logger.info("Due to timeout after %d mins" %
                             self.options['timeout'])

        ##################################################
        ### Here is the parser itself.
        # Keep track of how long since we started for timing out
        timeout_timer = ExecutionTimer(clock=True)

        signs_taken = [0] * input_length

        offset = 0
        last_lexicals = [0] * (input_length)
        try:
            # Keep adding signs until none left, or we get a full parse,
            #  or we complete the maximum iterations allowed
            # Keep going if min_iter is None (special value meaning don't stop
            #  when we get a parse)
            while (min_iter is None or (offset < min_iter) \
                                        or len(chart.parses) < required_parses):
                if max_iter is not None and offset >= max_iter:
                    # Exceeded maximum number of iterations: give up
                    prog_logger.info("Reached maximum number of iterations: "\
                                        "continuing to backoff/fail")
                    break
                prog_logger.info(">>> Parsing iteration: %d" % (offset + 1))
                # Get new signs from the tagger
                added = self._add_signs(offset=offset)
                # Note whether we added anything new
                if added:
                    # Apply unary rules to these new signs
                    added_spans = set([(start, end)
                                       for (start, end, sign) in added])
                    for (start, end) in added_spans:
                        chart.apply_unary_rules(start, end)
                else:
                    # No new signs added by the tagger: no point in continuing
                    prog_logger.info("No new signs added: ending parse")
                    break

                ##### Main parser loop: produce all possible results
                # Set end point to each node
                for end in range(1, input_length + 1):
                    if time:
                        # Start a timer
                        timer = ExecutionTimer()
                    chart.apply_unary_rules(end - 1, end)

                    # Set start point to each node before the end, in reverse order
                    for start in range(end - 2, -1, -1):
                        for middle in range(start + 1, end):
                            chart.apply_binary_rules(start, middle, end)

                            # Check whether the timeout has expired and don't process
                            #  any more if it has
                            if check_timeout:
                                # Check whether the timeout has passed
                                if int(timeout_timer.get_time()) > timeout:
                                    # Move on to post-parse stuff
                                    raise ParserTimeout

                        # Check for new unary rule applications
                        chart.apply_unary_rules(start, end)

                    if summaries:
                        prog_logger.info(
                            "Completed parsing up to node %d / %d (%.2f secs)"
                            % (end, input_length, timer.get_time()))
                        if summaries != 2:
                            prog_logger.info(chart.summary)
                    if self.options['dump_chart']:
                        # Dump an update of the chart to the file
                        dump_chart(chart, self.options['dump_chart'])

                if summaries:
                    prog_logger.info("Completed parsing to end of sequence")
                    if summaries != 2:
                        prog_logger.info(chart.summary)

                offset += 1
        except ParserTimeout:
            # The given timeout elapsed: just continue with no parses
            prog_logger.info("Parse timeout (%d mins) expired: continuing "\
                            "to backoff/fail" % self.options['timeout'])
            # Set the timed_out flag so we can check later whether we timed out
            self.timed_out = True
        except KeyboardInterrupt:
            # We pass the interrupt on to a higher level, but first kill
            #  the inspector window, so it doesn't hang around and mess up
            self.chart.kill_inspector()
            raise

        parses = chart.parses
        if len(parses) == 0 and self.backoff is not None:
            prog_logger.info("Using backoff model")
            backoff_results = self.run_backoff()
            if len(backoff_results) > 0:
                for res in backoff_results:
                    # Put the semantics result into a sign, with a dummy
                    #  syntactic category
                    sign = self.grammar.formalism.Syntax.Sign(
                        self.grammar.formalism.Syntax.DummyCategory(), res)
                    # If the semantics has a probability, put this on the sign
                    if hasattr(res, "probability"):
                        sign.probability = res.probability
                    parses.append(sign)
        elif len(parses):
            prog_logger.info("Parse finished with %d results" % len(parses))
        else:
            prog_logger.info("Parse finished with no results")

        # Close the inspector window if one was opened
        if not self.options['inspect_persist']:
            self.chart.kill_inspector()

        return parses
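
The three nested loops in parse() implement the standard CKY ordering: for each end node, every (start, middle, end) split is tried, with start moving backwards so that all shorter spans ending at end are complete before longer spans that contain them. A stripped-down sketch of just that visiting order:

def cky_splits(n):
    # Yield (start, middle, end) in the order the loops above visit them
    for end in range(1, n + 1):
        for start in range(end - 2, -1, -1):
            for middle in range(start + 1, end):
                yield (start, middle, end)

for split in cky_splits(3):
    print split
# (0, 1, 2)
# (1, 2, 3)
# (0, 1, 3)
# (0, 2, 3)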
Example #28
def main():
    usage = "%prog [options] <model-type> <model_name> <in-file>"
    description = "Trains a supertagging model using the given "\
        "input data. Specify a model type (baseline1, etc) and a name to "\
        "identify it. The data file may be a stored SequenceIndex file, or "\
        "any other type of bulk data file. "\
        "This can only be used with the follow types of models: %s" % ", ".join(TRAINABLE_MODELS)
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.")
    parser.add_option('--opts', dest="training_opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.")
    # File input options
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Same filetypes as jazzparser", default='bulk-db')
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    # Logging output
    parser.add_option('--log', dest="log", action="store", help="file to output training logs to. Specify a base filename; <modelname>.log will be added to the end")
    options, arguments = parse_args_with_config(parser)
    
    grammar = Grammar()
    
    # Get the model type first: we might not need the other args
    if len(arguments) == 0:
        print >>sys.stderr, "You must specify a model type, a model name and an input data file as arguments"
        sys.exit(1)
    model_type = arguments[0]
    
    if model_type not in TRAINABLE_MODELS:
        print >>sys.stderr, "'%s' is not a valid model type. Available taggers are: %s" % \
            (model_type, ", ".join(TRAINABLE_MODELS))
        sys.exit(1)
    if model_type not in TAGGERS:
        print >>sys.stderr, "'%s' isn't a registered model type. Check that "\
            "the name in TRAINABLE_MODELS is correct" % model_type
        sys.exit(1)
    
    tagger_cls = get_tagger(model_type)
    if not issubclass(tagger_cls, ModelTagger):
        print >>sys.stderr, "'%s' tagger cannot be trained with this script. Only model taggers can be." % (tagger_cls.__name__)
        sys.exit(1)
    model_cls = tagger_cls.MODEL_CLASS
    
    # Handle any training options that were given on the command line
    if options.training_opts is None:
        training_opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(model_cls.TRAINING_OPTIONS, intro="Training options for %s" % model_cls.__name__)
        sys.exit(0)
    else:
        training_opts = ModuleOption.process_option_dict(
                            ModuleOption.process_option_string(options.training_opts), 
                            model_cls.TRAINING_OPTIONS)
    
    # Get the rest of the args
    if len(arguments) < 3:
        print >>sys.stderr, "You must specify a model type, a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[2])
    model_name = arguments[1]

    # Load the sequence data
    # Only allow bulk types
    input_data = command_line_input(filename=filename, 
                                    filetype=options.filetype, 
                                    options=options.file_options,
                                    allowed_types=get_input_type_names(single=False, bulk=True))
    
    if options.partitions is not None and options.partitions > 1:
        parts = input_data.get_partitions(options.partitions)[1]
        models = [(tagger_cls.partition_model_name(model_name,num),seqs) for \
                                                num,seqs in enumerate(parts)]
    else:
        models = [(model_name,input_data)]
    
    for part_name,seqs in models:
        # Instantiate a fresh model with this name
        model = model_cls(part_name, options=training_opts)
        if options.log is not None:
            # Prepare a logger
            logfile = "%s%s.log" % (options.log, part_name)
            print "Logging output to file %s" % logfile
            logger = create_logger(filename=logfile)
        else:
            logger = None
            
        # Train the model with the loaded data
        model.train(seqs, logger=logger)
        model.save()
        print "Trained model %s" % (part_name)
Example #29
class Formalism(FormalismBase):
    rules = {
        'application': rules.ApplicationRule,
        'composition': rules.CompositionRule,
        'development': rules.DevelopmentRule,
        'coordination': rules.CoordinationRule,
        'tonicrepetition': rules.TonicRepetitionRule,
        'cadencerepetition': rules.CadenceRepetitionRule,
    }

    lexicon_builder = staticmethod(domxml.build_sign_from_node)
    # We don't need to do anything to distinguish variables
    distinguish_categories = staticmethod(lambda x, y: None)
    unify = staticmethod(syntax.unify)
    # This doesn't need to do anything for now
    clean_results = staticmethod(lambda x: x)

    shell_tools = [
        TimeOutputTool(),
        songtools.LoadCorpusTool(),
        songtools.ListSongsTool(),
        songtools.PrintAnalysisTool(),
        songtools.ResultSongTSEditDistanceTool(),
        songtools.ResultSongDependencyRecoveryTool(),
        songtools.RecogniseSongTool(),
        songtools.SongSelfSimilarityTool(),
        songtools.SongTreeTool(),
        songtools.SongDependencyGraphTool(),
    ]

    output_options = [
        ModuleOption('tsformat',
                 choose_from_list(['coord', 'xycoord', 'roman', 'alpha']),
                 help_text="Tonal space output format",
                 default="coord",
                 usage="tsformat=X, where X is one of 'coord', 'xycoord', "\
                    "'alpha' or 'roman'"),
    ]

    backoff_states_to_lf = staticmethod(semantics.backoff_states_to_lf)
    semantics_to_coordinates = staticmethod(semantics.semantics_to_coordinates)
    semantics_to_functions = staticmethod(semantics.semantics_to_functions)
    semantics_to_keys = staticmethod(semantics.semantics_to_keys)

    semantics_distance_metrics = [
        distance.TonalSpaceEditDistance,
        distance.LargestCommonEmbeddedSubtrees,
        distance.RandomDistance,
        distance.DependencyGraphSize,
        distance.OptimizedDependencyRecovery,
        distance.DependencyRecovery,
    ]

    PcfgModel = pcfg.HalfspanPcfgModel

    class Syntax(FormalismBase.Syntax):
        Sign = syntax.Sign
        ComplexCategory = syntax.ComplexCategory
        AtomicCategory = syntax.AtomicCategory
        Slash = syntax.Slash
        DummyCategory = syntax.DummyCategory
        merge_equal_signs = staticmethod(syntax.merge_equal_signs)

        # Unlike previous formalisms, we can't use the normal category
        #  structure abstraction, so we inject our own handling of
        #  half categories
        pre_generalize_category = staticmethod(syntax.pre_generalize_category)

        @classmethod
        def is_complex_category(cls, obj):
            """
            For the sake of efficiency, override this and don't use 
            isinstance.
            This gets called a LOT of times!
            """
            return not obj.ATOMIC

        @classmethod
        def is_atomic_category(cls, obj):
            """
            For the sake of efficiency, override this and don't use 
            isinstance.
            This gets called a LOT of times!
            
            This works because the category classes in this formalism 
            all define ATOMIC, so we don't need to check the type.
            
            """
            return obj.ATOMIC

    class Semantics(FormalismBase.Semantics):
        Semantics = semantics.Semantics
        apply = staticmethod(semantics.apply)
        compose = staticmethod(semantics.compose)

    class PcfgParser(object):
        """ Formalism interface for the PcfgParser parser module. """
        # Function to generate the representation of a category to
        #  be used to index the model
        category_representation = staticmethod(pcfg.model_category_repr)
        # Mapping between the short names used for rules in annotated
        #  trees and the rule instantiations
        rule_short_names = {
            'compf': ('composition', {
                'dir': 'forward'
            }),
            'compb': ('composition', {
                'dir': 'backward'
            }),
            'appf': ('application', {
                'dir': 'forward'
            }),
            'appb': ('application', {
                'dir': 'backward'
            }),
            'cont': ('development', {}),
            'coord': ('coordination', {}),
        }
        category_relative_chord = staticmethod(pcfg.category_relative_chord)

    class Evaluation(FormalismBase.Evaluation):
        tonal_space_alignment_costs = staticmethod(
            evaluation.tonal_space_alignment_costs)
        tonal_space_distance = staticmethod(evaluation.tonal_space_distance)
        tonal_space_f_score = staticmethod(evaluation.tonal_space_f_score)
        tonal_space_alignment_score = staticmethod(
            evaluation.tonal_space_alignment_score)
        tonal_space_alignment = staticmethod(evaluation.tonal_space_alignment)

        tonal_space_length = staticmethod(evaluation.tonal_space_length)
        """ Number of points on the tonal space path represented by the semantics """
Example #30
def main():
    usage = "%prog [options] <model_name> <in-file>"
    description = "Trains a chord labeling model using the given "\
        "input data. The data file may be a stored SequenceIndex file, or "\
        "any other type of bulk data file."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.")
    parser.add_option('--opts', dest="training_opts", action="append", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.")
    # File input options
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Same filetypes as jazzparser", default='bulk-db')
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    # Logging output
    parser.add_option('--log', dest="log", action="store", help="file to output training logs to. Specify a base filename; <modelname>.log will be added to the end")
    options, arguments = parse_args_with_config(parser)
    
    grammar = Grammar()
    
    # Handle any training options that were given on the command line
    if options.training_opts is None:
        training_opts = {}
    elif "help" in [opt.lower() for opt in options.training_opts]:
        print options_help_text(HPChordLabeler.TRAINING_OPTIONS, intro="Training options:")
        sys.exit(0)
    else:
        training_opts = ModuleOption.process_option_string(options.training_opts)
        
    if len(arguments) < 2:
        print >>sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]
    
    # Load the sequence data
    # Only allow bulk types
    input_data = command_line_input(filename=filename, 
                                    filetype=options.filetype, 
                                    options=options.file_options,
                                    allowed_types=get_input_type_names(single=False, bulk=True))
    
    # Only partition the chord data, not the MIDI data
    if options.partitions is not None and not \
            (isinstance(input_data, MidiTaggerTrainingBulkInput) and \
             input_data.chords is not None):
        print >>sys.stderr, "Can only partition chord data and no chord data "\
            "was supplied"
        sys.exit(1)
    
    if options.partitions:
        # The input includes chord training data
        parts = input_data.chords.get_partitions(options.partitions)[1]
        models = [("%s%d" % (model_name,num),chord_data) \
            for num,chord_data in enumerate(parts)]
    else:
        models = [(model_name,None)]
    
    for part_name,chord_data in models:
        if options.log is not None:
            # Prepare a logger
            logfile = "%s%s.log" % (options.log, part_name)
            print "Logging output to file %s" % logfile
            logger = create_logger(filename=logfile)
        else:
            logger = None
        
        # Create a fresh model with this name
        model = HPChordLabeler.train(input_data, part_name, 
                                     logger=logger, 
                                     options=training_opts,
                                     chord_data=chord_data)
        print "Trained model %s" % (part_name)
Example #31
 def check_options(cls, options):
     return ModuleOption.process_option_dict(options, cls.BUILDER_OPTIONS)
Example #32
class NgramTagger(ModelTagger):
    MODEL_CLASS = NgramTaggerModel
    TAGGER_OPTIONS = ModelTagger.TAGGER_OPTIONS + [
        ModuleOption('decode', filter=choose_from_list(DECODERS),
            help_text="Decoding method for inference.",
            usage="decode=X, where X is one of %s" % \
                                ", ".join("'%s'" % d for d in DECODERS),
            default="forward-backward"),
    ]
    INPUT_TYPES = ['db', 'chords']

    def __init__(self, grammar, input, options={}, *args, **kwargs):
        """
        Tags using an ngram model backed by NLTK.
        
        """
        super(NgramTagger, self).__init__(grammar, input, options, *args,
                                          **kwargs)
        process_chord_input(self)

        #### Tag the input sequence ####
        self._tagged_data = []
        self._batch_ranges = []
        # Group the input into pairs to get observations
        inpairs = group_pairs(self.input, none_final=True)
        # Convert the pairs into observations
        observations = [
            observation_from_chord_pair(pair[0], pair[1], self.model.chordmap)
            for pair in inpairs
        ]

        # Use the ngram model to get tag probabilities for each input by
        # computing the forward probability matrix
        if self.options['decode'] == "viterbi":
            probabilities = self.model.viterbi_probabilities(observations)
        elif self.options['decode'] == "forward":
            probabilities = self.model.forward_probabilities(observations)
        else:
            probabilities = self.model.forward_backward_probabilities(
                observations)

        word_tag_probs = []

        for index, probs in enumerate(probabilities):
            features = {
                'duration': self.durations[index],
                'time': self.times[index],
            }
            word_signs = []
            # Now assign a probability to each tag, given the observation
            for tag in self.model.tags:
                # Read a full sign out of the grammar
                sign = self.grammar.get_sign_for_word_by_tag(
                    self.input[index], tag, extra_features=features)
                if sign is not None:
                    # Read off the probability from the matrix
                    probability = probs[tag]
                    word_signs.append((sign, tag, probability))

            # Shuffle first so that signs with equal probability end up in
            #  random order
            random.shuffle(word_signs)
            # Then sort by probability, highest first
            word_signs.sort(key=lambda x: x[2], reverse=True)
            self._tagged_data.append(word_signs)

            # Store the list of probabilities for tags, which we'll use
            #  after we've tagged every word to work out the sizes
            #  of the tag batches
            word_tag_probs.append([p for __, __, p in word_signs])

        if self.options['best']:
            # Only return one for each word
            self._batch_ranges = [[(0, 1)] for i in range(len(self.input))]
        else:
            # Work out the number of tags to return in each batch
            batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
            # So far, this has assigned a probability to every possible
            #  tag. We don't want the tagger ever to return the least
            #  probable batch of tags, unless it's the only one.
            #batch_sizes = [batches[:-1] if len(batches) > 1 else batches for batches in batch_sizes]
            # Transform these into a form that's easier to use for getting the signs
            self._batch_ranges = [[(sum(batches[:i]),sum(batches[:i+1])) for i in range(len(batches))] \
                                    for batches in batch_sizes]

    def get_signs(self, offset=0):
        all_signs = []
        for start_node in range(len(self.input)):
            # Get the indices of the signs to return in this offset batch
            ranges = self._batch_ranges[start_node]
            if offset >= len(ranges):
                # No more batches left for this word
                continue
            start, end = ranges[offset]
            signs = self._tagged_data[start_node][start:end]
            # Add each sign to the output list along with its node values
            for sign in signs:
                all_signs.append((start_node, start_node + 1, sign))
        return all_signs

    def get_word(self, index):
        return self.input[index]
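
# Usage sketch (not from the source): tag a textual chord sequence with a
# trained ngram model. The model name 'bigram0' and the ChordInput import
# path are assumptions; check_options validates option values before the
# tagger is instantiated.
from jazzparser.grammar import get_grammar
from jazzparser.data.input import ChordInput   # assumed module path

grammar = get_grammar()
chords = ChordInput.from_string("C G7 C")      # hypothetical chord string
opts = {'model': "bigram0", 'decode': "viterbi"}
NgramTagger.check_options(opts)
tagger = NgramTagger(grammar, chords, options=opts)
# First batch of signs: (start_node, end_node, (sign, tag, prob)) triples
for start, end, (sign, tag, prob) in tagger.get_signs(offset=0):
    print "%d-%d: %s (p=%.3f)" % (start, end, tag, prob)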
示例#33
0
     logger.error("The tagger '%s' could not be loaded. Possible "\
         "taggers are: %s" % (options.supertagger, ", ".join(TAGGERS)))
     return 1
     
 # Get supertagger options before initializing the tagger
 if options.topts is not None:
     toptstr = options.topts
     if "help" in [s.strip().lower() for s in toptstr]:
         # Output this tagger's option help
         from jazzparser.utils.options import options_help_text
         print options_help_text(tagger_cls.TAGGER_OPTIONS, intro="Available options for selected tagger")
         return 0
     toptstr = ":".join(toptstr)
 else:
     toptstr = ""
 topts = ModuleOption.process_option_string(toptstr)
 # Check that the options are valid
 try:
     tagger_cls.check_options(topts)
 except ModuleOptionError, err:
     logger.error("Problem with tagger options (--topt): %s" % err)
     return 1
 
 ######## Backoff ########
 # Load the requested backoff model, if any
 if options.backoff is not None:
     from jazzparser.backoff import BUILDERS
     if options.backoff.lower() == "help":
         print "Available backoff model types are: %s" % ", ".join(BUILDERS)
         return 0
     try:
示例#34
0
 def process_labeling_options(opts):
     """ Verifies and processes the labeling option values (dict). """
     return ModuleOption.process_option_dict(opts, HPChordLabeler.LABELING_OPTIONS)
示例#35
0
 def process_training_options(opts):
     """ Verifies and processes the training option values. """
     return ModuleOption.process_option_dict(opts, HPChordLabeler.TRAINING_OPTIONS)
示例#36
0
     else:
         # No metric found matching this name
         print "No metric '%s'" % options.metric
         sys.exit(1)
 print >>sys.stderr, "Using distance metric: %s" % metric_cls.name
 # Now process the metric options
 if options.mopts is not None:
     moptstr = options.mopts
     if "help" in [s.strip().lower() for s in moptstr]:
         # Output this parser's option help
         print options_help_text(metric_cls.OPTIONS, intro="Available options for metric '%s'" % metric_cls.name)
         sys.exit(0)
     moptstr = ":".join(moptstr)
 else:
     moptstr = ""
 mopts = ModuleOption.process_option_string(moptstr)
 # Instantiate the metric with these options
 metric = metric_cls(options=mopts)
 
     
 if len(arguments) < 2:
     print >>sys.stderr, "Specify a song corpus name and one or more files to read results from"
     sys.exit(1)
 
 # The first argument is a TonalSpaceAnalysisSet
 corpus_name = arguments[0]
 # Load the corpus file
 corpus = TonalSpaceAnalysisSet.load(corpus_name)
 
 # The rest of the args are result files to analyze
 res_files = arguments[1:]
示例#37
0
def main():
    usage = "%prog [options] <model_name> <in-file>"
    description = "Loads a chord labeling model and uses it to assign chord "\
        "labels to the given MIDI file."
    parser = OptionParser(usage=usage, description=description)
    # File input options
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Same filetypes as jazzparser", default='segmidi')
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    # Labeling options
    parser.add_option("--labeler-options", "--lopt", dest="labeler_options", action="append", help="options for the labeler. Type '--lopt help' for a list of available options.")
    parser.add_option("--no-key", "--nk", dest="no_key", action="store_true", help="merge together labels with the same key (same as --lopt nokey)")
    # Output options
    parser.add_option("--single", "-1", dest="single", action="store_true", help="show only one chord per time segment (same as --lopt n=1, but formats the output in a simpler way)")
    parser.add_option('-r', '--realize', dest="realize", action="store", help="realize the chord sequence as a midi file, overlaid on the input")
    parser.add_option('--chords-only', dest="chords_only", action="store_true", help="only realize the chords: don't overlay on the input midi (only works with -r)")
    options, arguments = parse_args_with_config(parser)
    
    if options.labeler_options is not None and "help" in options.labeler_options:
        print options_help_text(HPChordLabeler.LABELING_OPTIONS, intro="Options for HP chord labeler")
        sys.exit(0)
        
    if len(arguments) < 2:
        print >>sys.stderr, "You must specify a model name and an input "\
            "(MIDI) data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]
    
    # Process the labeler options
    lopt_dict = ModuleOption.process_option_string(options.labeler_options)
    if options.single:
        # No point in getting more than one label, since we only display one
        lopt_dict['n'] = 1
    if options.no_key:
        # Just set the nokey option
        lopt_dict['nokey'] = True
    
    # Check they're valid before doing anything else
    HPChordLabeler.process_labeling_options(lopt_dict)
    
    input_data = command_line_input(filename, 
                                    filetype=options.filetype, 
                                    options=options.file_options,
                                    allowed_types=['segmidi','bulk-segmidi'])
    bulk = is_bulk_type(type(input_data))
    if not bulk:
        # Wrap a single input in a list so both cases are handled alike
        input_data = [input_data]
        
    for i,data in enumerate(input_data):
        input_stream = data.stream
        print "Read midi data in %d segments" % len(data)
        
        # Load the model
        model = HPChordLabeler.load_model(model_name)
        # Perform labeling
        labels = model.label(data, options=lopt_dict)
        # Try labeling as it will be passed to the tagger
        labs = model.label_lattice(data, options=lopt_dict)
        
        if options.single:
            # Special output for single label output
            print ", ".join(["%s" % timelabs[0][0] for timelabs in labels])
        else:
            # Print out the labels for each timestep
            for time,timelabs in enumerate(labels):
                print "%d: %s" % (time, 
                    ", ".join(["%s (%.2e)" % (label,prob) for (label,prob) in timelabs]))
        
        if options.realize is not None:
            # Get the single best chord label for each time
            best_labels = [timelabs[0][0] for timelabs in labels]
            # Realize as a midi file
            print "Realizing output chord sequence"
            real = ChordSequenceRealizer(best_labels, 
                                         model.chord_vocab, 
                                         resolution=input_stream.resolution, 
                                         chord_length=data.time_unit,
                                         text_events=True)
            if options.chords_only:
                # Don't overlay
                stream = real.generate(offset=data.tick_offset)
            else:
                stream = real.generate(overlay=input_stream, offset=data.tick_offset)
                
            if bulk:
                filename = "%s-%d" % (options.realize, i)
            else:
                filename = options.realize
            write_midifile(stream, filename)
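
# Example invocations (hypothetical script and file names), assuming this
# main() is exposed as a command-line script. Positional arguments are
# <model_name> <in-file>, as checked above:
#
#   python label_midi.py mymodel song.mid
#   python label_midi.py --single mymodel song.mid
#   python label_midi.py --lopt nokey -r out.mid mymodel song.mid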
示例#38
0
class CandcMultiTagger(CandcTagger):
    """
    Uses the C&C supertagger component to get multiple tags for each 
    word.
    """
    command = "msuper"
    # Use a very low beta, so we get loads of tags, even improbable ones
    extra_args = ["--beta", "0.0"]
    
    TAGGER_OPTIONS = CandcTagger.TAGGER_OPTIONS + [
        ModuleOption('ignore-unknown', filter=str_to_bool,
            help_text="Ignore any tags that the tagger returns but which "\
                "are not found in the grammar. By default, an error will "\
                "be thrown.",
            usage="ignore-unknown=True (default False)",
            default=False),
    ]
    
    def __init__(self, *args, **kwargs):
        super(CandcMultiTagger, self).__init__(*args, **kwargs)
        
    def _tags_from_output(self, output):
        tags = []
        # Split up the output text to extract tags and probabilities
        for line in output.split("\n"):
            line = line.strip()
            if len(line):
                cols = line.split("\t")
                num_results = int(cols[2])
                results = []
                all_tags = []
                # Get the tags and probs from the output
                for result_num in range(num_results):
                    cat = cols[3+result_num*2]
                    prob = float(cols[4+result_num*2])
                    results.append((cat, prob))
                    all_tags.append(cat)
                
                # Check all the tags are covered and add them with 0 prob if not
                for tag in self.tag_list:
                    if tag not in all_tags:
                        results.append((tag, 0.0))
                
                tags.append(list(reversed(sorted(results, key=lambda x:x[1]))))
        
        if len(tags) != self.input_length:
            raise CandcTaggingError, "C&C output did not give a correct "\
                "set of tags: %s" % output
        
        # Redistribute the tag probability to account for unseen tags
        if self.options['unseen_tag_prob'] > 0.0:
            unseen_prob = self.options['unseen_tag_prob']
            # Scale down everything that has a probability
            prob_scale = 1.0 - unseen_prob
            for i in range(len(tags)):
                # Add reserved mass equally to every tag
                prob_add = unseen_prob / len(tags[i])
                tags[i] = [(tag,(prob*prob_scale+prob_add)) for \
                                    tag,prob in tags[i]]
        
        skip_tags = []
        # Work out what tags we're going to ignore altogether
        if self.options['ignore-unknown']:
            for tag_sequence in tags:
                for tag,prob in tag_sequence:
                    if tag not in self.grammar.families:
                        # This tag's not in the grammar: just ignore it
                        skip_tags.append(tag)
                        logger.warn("Ignoring tag '%s', which is not in "\
                            "the grammar." % tag)
        #~ #### I've already done this above
        #~ # Some tags get given zero probability by the model, either because 
        #~ #  it's not smoothing enough, or because of rounding errors
        #~ # We do a basic smoothing here, giving everything with 0 probability 
        #~ #  a probability smaller than the smallest the model assigned
        #~ smoothed_tags = []
        #~ for tag_probs in tags:
            #~ zeros = sum(prob == 0.0 for (tag,prob) in tag_probs)
            #~ # No need to smooth if everything got some prob
            #~ if zeros:
                #~ smallest = min(prob for (tag,prob) in tag_probs if prob > 0.0)
                #~ if smallest == 1.0:
                    #~ # This occasionally happens and messes things up
                    #~ # Just reserve a small amount for the zeros in this case
                    #~ smallest = 0.001
                #~ # Divide the smallest probability among the zero prob tags 
                #~ #  and discount the others
                #~ smooth_prob = smallest / zeros
                #~ discount = 1.0-(smallest)
                #~ tag_probs = [(tag, prob*discount if prob > 0.0 
                                                 #~ else smooth_prob) 
                                            #~ for (tag,prob) in tag_probs]
            #~ smoothed_tags.append(tag_probs)
        #~ print smoothed_tags
        
        signs = [[] for i in range(self.input_length)]
        # Get an actual sign for each word/tag combination
        for index,word in enumerate(self.tokens):
            for (tag,prob) in tags[index]:
                if tag not in skip_tags:
                    # Consult the grammar to get a suitable sign if we can
                    sign = self.grammar.get_sign_for_word_by_tag(
                                            word,
                                            tag,
                                            extra_features={
                                                'time' : self.times[index],
                                                'duration' : self.durations[index]
                                            })
                    signs[index].append((sign,tag, prob))
                
        self.batch_sizes = []
        for results in signs:
            # Work out the batches that these should be returned in
            self.batch_sizes.append(batch_sizes([p for __,__,p in results], self.tag_batch_ratio))
        return signs
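
# Worked sketch of the unseen_tag_prob redistribution above: every assigned
# probability is scaled by (1 - unseen_tag_prob) and the reserved mass is
# shared equally among the tags, so the distribution still sums to 1.
# Plain Python, independent of the tagger; tag names are made up.
tag_probs = [("T1", 0.7), ("T2", 0.3), ("T3", 0.0)]
unseen_prob = 0.1
prob_scale = 1.0 - unseen_prob
prob_add = unseen_prob / len(tag_probs)
redistributed = [(tag, prob * prob_scale + prob_add)
                 for (tag, prob) in tag_probs]
print redistributed                               # T3 now has non-zero prob
print sum(prob for (__, prob) in redistributed)   # still 1.0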
示例#39
0
                                 options=options.file_options,
                                 allowed_types=['segmidi', 'bulk-segmidi'])
 if isinstance(input_data, SegmentedMidiInput):
     # Single input
     input_data = [input_data]
 
 
 # Work out how many results to print out
 if options.print_results == -1:
     print_up_to = None
 else:
     print_up_to = options.print_results
     
 
 # Process the labeler options
 lopt_dict = ModuleOption.process_option_string(options.labeler_options)
 # No point in getting more than one label, since we'll only use one
 lopt_dict['viterbi'] = True
 lopt_dict['nokey'] = True
 # Load the chord labeling model
 model_name = arguments[1]
 model = HPChordLabeler.load_model(model_name)
 
 ranks = []
 num_ranked = 0
 for midi_file in input_data:
     # Skip any inputs that don't have a gold sequence associated with them
     # We won't know what the correct answer is
     if options.gold_only and midi_file.gold is None:
         continue
     
示例#40
0
class NgramTaggerModel(TaggerModel):
    MODEL_TYPE = 'ngram'
    # Set up possible options for training
    TRAINING_OPTIONS = [
        ModuleOption('n', filter=int,
            help_text="Length of the n-grams which this model will use.",
            usage="n=N, where N is an integer. Defaults to bigrams", default=2),
        ModuleOption('backoff', filter=int,
            help_text="Number of orders of backoff to use. This must be "\
                "less than n. E.g. if using a trigram model (n=3) you can "\
                "set backoff=2 to back off to bigrams and from bigrams "\
                "to unigrams. Set to 0 to use no backoff at all (default).",
            usage="backoff=X, where X is an integer < n", default=0),
        ModuleOption('cutoff', filter=int,
            help_text="In estimating probabilities, treat any counts below "\
                "cutoff as zero",
            usage="cutoff=X, where X is an integer", default=0),
        ModuleOption('backoff_cutoff', filter=int,
            help_text="Apply a different cutoff setting to the backoff model. "\
                "Default is to use the same as the main model",
            usage="backoff_cutoff=X, where X is an integer"),
        ModuleOption('estimator', filter=choose_from_dict(ESTIMATORS),
            help_text="A way of constructing a probability model given "\
                "the set of counts from the data. Default is to use "\
                "laplace (add-one) smoothing.",
            usage="estimator=X, where X is one of: %s" % \
                ", ".join(ESTIMATORS.keys()), default=laplace_estimator),
        # Add the standard chord mapping option ("chord_mapping")
        get_chord_mapping_module_option(),
    ] + TaggerModel.TRAINING_OPTIONS

    def __init__(self, model_name, model=None, chordmap=None, *args, **kwargs):
        """
        An n-gram model to be used as a tagging model. Uses NLTK to 
        represent, train and evaluate the n-gram model.
        
        """
        super(NgramTaggerModel, self).__init__(model_name, *args, **kwargs)
        self.model = model

        self.chordmap = get_chord_mapping(chordmap)
        self.chordmap_name = chordmap

        if self.options['n'] <= self.options['backoff']:
            # This is not allowed
            # We can only back off n-1 orders for an n-gram model
            raise TaggingModelError, "tried to load an n-gram model with "\
                "more orders of backoff than are possible (backing off "\
                "%d orders on a %d-gram model)" % \
                    (self.options['backoff'], self.options['n'])

    def train(self, sequences, grammar=None, logger=None):
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()

        N = self.options['n']
        backoff = self.options['backoff']
        chordmap = self.options['chord_mapping']
        self.chordmap = chordmap
        self.chordmap_name = chordmap.name

        # Get data in the form of lists of (observation,tag) pairs
        training_data = [[(observation_from_chord_pair(c1, c2, chordmap), c1cat) \
                                for ((c1,c2),c1cat) in zip(group_pairs(seq, none_final=True),seq.categories)]
                                    for seq in sequences]
        # Get all the possible pos tags from the grammar
        label_dom = grammar.pos_tags
        # Build the emission domain to include all the observations that
        #  theoretically could occur, not just those that are seen -
        #  we might not see all interval/chord type pairs in the data.
        chord_types = chordmap.values()
        emission_dom = sum(
            [["%d-%s" % (interval, chord) for chord in chord_types]
             for interval in range(12)], [])

        # Ignore unlabelled data
        ignores = ['']

        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff': self.options['backoff_cutoff']}

        # Precompute the transition matrix and store it along with the model
        self.model = PrecomputedNgramModel.train(
            self.options['n'],
            training_data,
            label_dom,
            emission_dom=emission_dom,
            cutoff=self.options['cutoff'],
            backoff_order=self.options['backoff'],
            estimator=self.options['estimator'],
            ignore_list=ignores,
            backoff_kwargs=backoff_kwargs)

        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(training_data),
                'samples' : len(sum(training_data, [])),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
                'chordmap' : self.chordmap_name,
            }

    @staticmethod
    def _load_model(data):
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel

        model = PrecomputedNgramModel.from_picklable_dict(data['model'])
        name = data['name']
        chordmap = data.get("chordmap", None)
        return NgramTaggerModel(name, model=model, chordmap=chordmap)

    def _get_model_data(self):
        data = {
            'name': self.model_name,
            'model': self.model.to_picklable_dict(),
            'chordmap': self.chordmap_name,
        }
        return data

    def generate_chord_sequence(self, length=20):
        """
        Just for a laugh, use the trained n-gram to generate a chord 
        sequence and output it in a playable form.
        Returns a tuple: (chords, tags)
        
        @todo: this isn't implemented yet for n-grams. It's not a 
        high priority, but would be fun.
        
        """
        # This would be easy to add, since the NgramModel already implements
        #  random generation itself
        raise NotImplementedError, "not yet done generation for n-grams"
        # The (unreachable) code below is what the other tagger did:

        from jazzparser.utils.chords import int_to_chord_numeral
        # Use the model to generate randomly
        rand_seq = self.model.random_sample(random.Random(), length)
        pitch = 0
        chords = []
        prochords, tags = zip(*rand_seq)
        # Convert the generated observations into readable chords
        for chord in prochords:
            interval, __, ctype = chord.partition("-")
            chords.append("%s%s" % (int_to_chord_numeral(pitch), ctype))
            pitch = (pitch + int(interval)) % 12
        return (chords, tags)

    def forward_probabilities(self, sequence):
        """ Interface to the NgramModel's forward_probabilities """
        return self.model.forward_probabilities(sequence)

    def forward_backward_probabilities(self, sequence):
        return self.model.gamma_probabilities(sequence, dictionary=True)

    def viterbi_probabilities(self, sequence):
        return self.model.viterbi_selector_probabilities(sequence)

    def _get_tags(self):
        return self.model.label_dom

    tags = property(_get_tags)

    #### Readable output of the parameters ####
    def _get_readable_params(self):
        try:
            text = ""

            # Include the stored model description
            text += self.model_description

            text += "\nNum emissions: %d\n" % self.model.num_emissions
            text += "\nShowing only probs for non-zero counts. "\
                    "Others may have a non-zero prob by smoothing\n"

            text += "\nChord mapping: %s:\n" % self.chordmap.name
            for (crdin, crdout) in self.chordmap.items():
                text += "  %s -> %s\n" % (crdin, crdout)

            # Emission distribution
            text += "\nEmission dist:\n"
            for label in sorted(self.model.label_dom):
                text += "  %s:\n" % label
                probs = reversed(sorted(
                            [(self.model.emission_dist[label].prob(em),em) for \
                                em in self.model.emission_dist[label].samples()]))
                for (prob, em) in probs:
                    text += "    %s: %s\n" % (em, prob)

            text += "\n\nTransition dist:\n"
            for history in sorted(self.model.label_dist.conditions()):
                text += "  %s\n" % str(history)
                dist = [(self.model.label_dist[history].prob(lab), lab)
                        for lab in self.model.label_dist[history].samples()]
                for prob, label in reversed(sorted(dist)):
                    text += "    %s: %s\n" % (str(label), prob)

            return text
        except AttributeError, err:
            # Catch this, because otherwise it just looks like the attribute
            #  (readable_parameters) doesn't exist (stupid Python behaviour)
            raise ValueError, "error generating model description "\
                            "(attribute error): %s" % err
示例#41
0
def command_line_input(filename=None, filetype=None, options="", \
        allowed_types=None, default_type=None):
    """
    Utility function for processing file input options from the command line.
    Pass in as args the values straight from the command line options to 
    select a filename, filetype and list of options.
    
    Typical command-line options for this purpose (for an optparse option parser C{op})::
     op.add_option("--file", "-f", dest="file", action="store", help="use a file to get input from")
     op.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Use '--filetype help' for a list of available types")
     op.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Use '--fopt help', with '--ft <type>', for a list of available options")
    Then you can call this function as::
     command_line_input(filename=options.file, filetype=options.filetype, options=options.file_options)
    
    @type allowed_types: list of strs
    @param allowed_types: types of input you want the user to be able to give.
        If not given, all types are allowed
    @type default_type: str
    @param default_type: filetype to assume if no other filetype is given
    @rtype: L{InputReader} subclass
    @return: the input wrapper of appropriate type, or None if no input file 
        was given
    
    """
    if allowed_types is None:
        allowed_types = get_input_type_names()

    if filetype is None and default_type is not None:
        filetype = default_type

    # Catch a request for filetype help
    if filetype is not None and filetype.lower() == "help":
        # Output possible file types
        print "Allowed input types: %s" % ", ".join(allowed_types)
        sys.exit(0)

    # Check that the filetype is valid and get the input type class if it is
    input_type = get_input_type(filetype)
    if input_type is None:
        raise InputTypeError, "Unknown filetype '%s'. Allowed types are: %s" % \
            (filetype, ", ".join(allowed_types))
    type_name = input_type_name(input_type)
    if type_name not in allowed_types:
        raise InputTypeError, "Cannot accept input of type '%s'. Allowed "\
            "types are: %s" % (filetype, ", ".join(allowed_types))

    if options is not None and options.lower() == "help":
        # Output help text
        from jazzparser.utils.options import options_help_text
        print options_help_text(input_type.FILE_INPUT_OPTIONS,
                                intro="Available options for input type %s" %
                                type_name)
        sys.exit(0)

    if filename is None:
        return None

    # First get a dict of the options
    file_options = ModuleOption.process_option_string(options)
    # Process the options as appropriate for this type
    file_options = input_type.process_option_dict(file_options)

    # Instantiate the input from the file as appropriate for the input type
    input_data = input_type.from_file(filename, file_options)
    return input_data
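
# Usage sketch: restrict a script to textual chord input and fall back to
# the 'chords' filetype when none is given. The filename and the option
# string are assumptions; 'chords' is one of the input type names used
# elsewhere in these examples, and 'roman' is its file input option.
input_data = command_line_input(filename="song.chords",
                                filetype=None,
                                options="roman=true",
                                allowed_types=['chords'],
                                default_type='chords')
if input_data is None:
    print "No input file given"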
示例#42
0
def main():
    usage = "%prog [<options>] <model-name> <training-input>"
    description = "Training of PCFG models."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", \
        help="Number of partitions to divide the data into. "\
            "For train, divides the input file, trains a model on each "\
            "partition's complement and appends partition number to "\
            "the model names. For del, appends partition numbers to model "\
            "names and deletes all the models. Recache does similarly. "\
            "Has no effect for parse.")
    parser.add_option('--opts', dest="training_opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options")
    parser.add_option("--debug", dest="debug", action="store_true", help="Output verbose logging information to stderr")
    parser.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)
    
    if options.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.WARN
    # Create a logger for training
    logger = create_logger(log_level = log_level,
                  name = "training",
                  stderr = True)
    
    # Load a grammar
    grammar = get_grammar(options.grammar)
    # Get the pcfg model class for the formalism
    PcfgModel = grammar.formalism.PcfgModel
        
    # Parse the option string
    if options.training_opts is None:
        opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(PcfgModel.TRAINING_OPTIONS, 
                                            intro="Training options for PCFGs")
        sys.exit(0)
    else:
        opts = ModuleOption.process_option_dict(
                    ModuleOption.process_option_string(options.training_opts),
                    PcfgModel.TRAINING_OPTIONS)
    
    if len(arguments) == 0:
        print >>sys.stderr, "Specify a model name"
        models = PcfgModel.list_models()
        print >>sys.stderr, "Available models: %s" % ", ".join(models)
        sys.exit(1)
    model_name = arguments[0]
    print "Model base name:", model_name
    
    if options.partitions is not None:
        parts = [(i, "%s%d" % (model_name, i)) for i in range(options.partitions)]
    else:
        parts = [(None, model_name)]
    
    if len(arguments) < 2:
        print >>sys.stderr, "Specify an input file to read sequence data from"
        sys.exit(1)
    # Read in the training data from the given file
    seqs = SequenceIndex.from_file(arguments[1])
    
    if options.partitions is not None:
        # Prepare each training partition
        datasets = holdout_partition(seqs.sequences, options.partitions)
    else:
        datasets = [seqs.sequences]
        
    for dataset,(parti,part_model) in zip(datasets,parts):
        # Train the named model on the sequence data
        model = PcfgModel.train(part_model, dataset, opts, grammar=grammar, 
                                logger=logger)
        model.save()
        print "Trained model", part_model
示例#43
0
class ChordInput(Input):
    """
    Input wrapper for textual chord input.
    
    This is the simplest type of input, usually taken from the command line.
    
    You must provide a list of chord symbols and either a list of durations 
    or a list of times when constructing this. To process pure text (which 
    includes computing durations/times and splitting up chords), use 
    L{ChordInput.from_string}.
    
    """
    FILE_INPUT_OPTIONS = [
        ModuleOption('roman', filter=str_to_bool,
                     help_text="read chord symbols as roman numberals. "\
                        "Default is to assume note names",
                     usage="roman=B, where B is a boolean",
                     default=False),
    ]

    def __init__(self,
                 inputs,
                 durations=None,
                 times=None,
                 roman=False,
                 *args,
                 **kwargs):
        super(ChordInput, self).__init__(*args, **kwargs)

        self.inputs = inputs
        self.durations = durations
        self.times = times
        self.roman = roman

        # Compute the durations from times or vice versa
        if durations is None and times is None:
            raise ValueError, "cannot create a ChordInput with neither "\
                "times nor durations given"
        elif times is None:
            self.times = [
                sum(durations[:i], Fraction(0)) for i in range(len(durations))
            ]
        elif durations is None:
            from jazzparser.utils.base import group_pairs
            self.durations = [
                time1 - time0 for (time0, time1) in group_pairs(times)
            ] + [Fraction(1)]

        # Convert all strings to internal chord representation
        # Done now so we check the chords can all be understood before doing
        #  anything else
        self.chords = [
            Chord.from_name(name, roman=roman).to_db_mirror()
            for name in inputs
        ]
        for chord, dur in zip(self.chords, self.durations):
            chord.duration = dur

    @staticmethod
    def from_string(input, name="<string input>", roman=False):
        """
        Produce a wrapped-up version of the input directly from an input string, 
        which may come, for example, from the command line.
        
        """
        from jazzparser.utils.input import assign_durations, strip_input
        # Get durations from the original string before doing anything else
        durations = assign_durations(input)
        # Remove unwanted characters from the string
        input = strip_input(input)
        # Tokenise the string
        chords = input.split()
        return ChordInput(chords, durations=durations, name=name, roman=roman)

    def __str__(self):
        return " ".join(["%s" % i for i in self.inputs])

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        return self.inputs[item]

    def slice(self, start=None, end=None):
        return ChordInput(self.inputs[start:end],
                          self.durations[start:end],
                          self.times[start:end],
                          roman=self.roman,
                          name=self.name)

    @staticmethod
    def from_file(filename, options={}):
        # Read the whole contents of the file
        f = open(filename, 'r')
        try:
            data = f.read()
        finally:
            f.close()
        # Just treat the whole file as one sequence
        return ChordInput.from_string(data,
                                      name="File: %s" % filename,
                                      roman=options['roman'])

    def to_db_input(self):
        """
        This data type is useful for reading textual input. For internal 
        processing, however, it can be converted to a L{DbInput}, which 
        is generally more convenient to handle.
        
        """
        return DbInput(self.inputs,
                       durations=self.durations,
                       chords=self.chords)
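
# Usage sketch: wrap a textual chord sequence and convert it for internal
# processing. The chord symbols here are note-name style (the default,
# roman=False); durations are assigned by from_string via assign_durations.
chord_in = ChordInput.from_string("C G7 C", name="example")
print len(chord_in), "chords:", str(chord_in)
db_in = chord_in.to_db_input()   # DbInput is easier to handle internally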
示例#44
0
 def process_option_list(self, options):
     optdict = ModuleOption.process_option_string(options)
     self.options = ModuleOption.process_option_dict(optdict, self.tool_options)
示例#45
0
def main():
    def _check_args(args):
        if len(args) != 3:
            print >>sys.stderr, "Specify a tagger, model name and input file"
            sys.exit(1)
        return args[1],args[2]
    
    partitions,part_ids,options,arguments = prepare_evaluation_options(
        usage = "%prog [options] <tagger> <model-name> <input-file>",
        description = "Evaluate a tagging model by "\
            "tagging sequences from an input file. If the tagger doesn't "\
            "need a model name, use '-' as the model name.",
        check_args = _check_args,
        optparse_groups = [
            (("Tagging",),
                [(("--topt", "--tagger-options"), 
                    {'dest':"topts", 'action':"append", 'help':"options to pass to the tagger."}),
                ]),
            (("Output",), 
                [(("--no-model-info",), 
                    {'dest':"no_model_info", 'action':"store_true", 'help':"turns of outputing of information about the model being used before using it (useful for identifying output piped to a file later, but may be too verbose sometimes)"}),
                ]),
            (("Evaluation", "Type of evaluation and options"),
                [(("-a", "--agreement"), 
                    {'dest':"agreement", 'action':"store_true", 'help':"instead of doing any parses, just report the agreement of the tops tags with the gold standard tags."}),
                 (("--confusion",), 
                    {'dest':"confusion", 'action':"store_true", 'help':"print out confusion matrix after agreement calculation. Applies only in combination with --agreement"}),
                 (("-e", "--entropy"), 
                    {'dest':"entropy", 'action':"store_true", 'help':"instead of doing any parses, just report the entropy of the returned tag distribution with respect to the gold standard tags."}),
                 (("--tag-stats",), 
                    {'dest':"tag_stats", 'action':"store_true", 'help':"just output stats about the tags that the model assigns to this sequence (or these sequences)"}),
                 (("--topn",), 
                    {'dest':"topn", 'type':"int", 'action':"store", 'help':"when evaluating agreement consider the top N tags the tagger returns. By default, allows only the top one to count as a hit.", 'default':1}),
                ]),
        ],
    )
    
    grammar = Grammar()
    
    tagger_name = arguments[0]
    model_name = arguments[1]
    # Tagger shouldn't use a model in some cases
    no_tagger_model = model_name == "-"
    
    # Load the requested tagger class
    tagger_cls = get_tagger(tagger_name)
    topts = ModuleOption.process_option_string(options.topts)
    
    def _model_info(mname):
        """ Outputs info about the named model """
        if options.no_model_info:
            print >>sys.stderr, "Model %s" % mname
        else:
            # Can only output the nice model info if it's a ModelTagger
            if issubclass(tagger_cls, ModelTagger):
                print >>sys.stderr, "======== Model info ========"
                print >>sys.stderr, tagger_cls.MODEL_CLASS.load_model(mname).description
                print >>sys.stderr, "============================"
            else:
                print >>sys.stderr, "Tagger %s using model %s" % (tagger_cls.__name__, mname)
    
    num_parts = len(partitions)
    num_seqs = sum([len(p[0]) for p in partitions])
    
    ################# Evaluation ########################
    if options.tag_stats:
        raise NotImplementedError, "fix this if you want it"
        # Print out statistics for each partition, with its model
        if no_tagger_model:
            # There could be some circumstance in which we want to do this, 
            #  but I can't think what it is, so I'm not implementing it for now
            print >>sys.stderr, "Cannot run tag_stats with no tagger model"
            sys.exit(1)
        all_stats = {}
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            # Output the model training info if requested
            _model_info(model)
            ######## This doesn't exist any more
            stats = sequences_top_tags_dict(tagger_cls, model, sequences, topn=options.topn)
            for tag,num in stats.items():
                if tag in all_stats:
                    all_stats[tag] += stats[tag]
                else:
                    all_stats[tag] = stats[tag]
        pprint_table(sys.stdout, list(reversed(sorted(all_stats.items(), key=lambda r:r[1]))), separator="|")
    elif options.agreement:
        # Print out agreement stats for each partition
        if no_tagger_model:
            # Same as tag_stats: probably no need for this ever
            print >>sys.stderr, "Cannot run agreement with no tagger model"
            sys.exit(1)
        correct = 0
        total = 0
        conf_mat = {}
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            topts['model'] = model
            # Output the model training info if requested
            _model_info(model)
            pcorrect = 0
            ptotal = 0
            # Go through each sequence
            for seq in sequences:
                print >>sys.stderr, "Evaluating %s" % seq.string_name
                input = DbInput.from_sequence(seq)
                correct_tags = [chord.category for chord in seq.iterator()]
                cor,tot = tagger_agreement(input, grammar, tagger_cls, correct_tags, options=topts, confusion_matrix=conf_mat, topn=options.topn)
                pcorrect += cor
                ptotal += tot
                print "  Sequence: %.1f%%" % (float(cor)/tot*100)
                print "  So far: %.1f%%" % (float(pcorrect)/ptotal*100)
            print "Partition %d: %d / %d (%.2f%%)" % (part_num, pcorrect, ptotal, (float(pcorrect)/ptotal*100))
            correct += pcorrect
            total += ptotal
        if num_parts > 1:
            # Print out the overall stats
            print "%d / %d (%f%%)" % (correct,total,(float(correct)/total*100))
        if options.confusion:
            confusion_matrix(conf_mat) 
    elif options.entropy:
        print "Calculating cross-entropy of tagger with gold standard tags"
        entropy = 0.0
        num_chords = 0
        for parti in range(num_parts):
            sequences,model,part_num = partitions[parti]
            if not no_tagger_model:
                topts['model'] = model
                # Output the model training info if requested
                _model_info(model)
            pentropy = 0.0
            pnum_chords = 0
            # Compute the entropy for the partition model
            for seq in sequences:
                print >>sys.stderr, "Evaluating %s" % seq.string_name
                input = " ".join([str(chord) for chord in seq.iterator()])
                correct_tags = [chord.category for chord in seq.iterator()]
                ent,crds = tagger_entropy(input, grammar, tagger_cls, correct_tags, options=topts)
                pentropy += ent
                pnum_chords += crds
                print "   %f bits per chord" % (ent/crds)
            print "Partition %d: %f bits per chord (%d chords)" % (part_num, (pentropy/pnum_chords), pnum_chords)
            entropy += pentropy
            num_chords += pnum_chords
        # Print out the stats for all partitions together
        if num_parts > 1:
            print "%f bits per chord (%d chords)" % ((entropy/num_chords), num_chords)
    else:
        print >>sys.stderr, "Select an evaluation operation with one of the options"
        sys.exit(1)
示例#46
0
class DbInput(Input):
    """
    Wrapper for input from the database, rather than the command line.
    No point in reducing db input to a string, then reinterpreting it.
    
    If only one of C{times} and C{durations} is given, the other will 
    be computed from it. Computing C{times} from durations involves 
    assuming that the first chord occurs at time 0. Computing 
    C{durations} from C{times} involves assuming that the last chord 
    has a length of 1.
    
    At least one of C{times} and C{durations} must be given.
    
    We also store the id of the chord sequence that this came from (C{id}) and 
    the sequence representation itself (C{sequence}). This may be C{None} in 
    some cases.
    
    Confusingly (for historical reasons!), C{inputs} contains string chord 
    labels. C{chords} contains the db_mirrors representation of the chords.
    
    """
    FILE_INPUT_OPTIONS = [
        ModuleOption('index',
                     filter=int,
                     help_text="read the sequence with index (not id) X",
                     usage="index=X, where X is an int",
                     required=True),
    ]

    def __init__(self, inputs, durations=None, times=None, id=None, \
                    chords=None, sequence=None, *args, **kwargs):
        super(DbInput, self).__init__(*args, **kwargs)

        self.inputs = inputs
        self.durations = durations
        self.times = times
        self.id = id
        self.chords = chords
        self.sequence = sequence

        if durations is None and times is None:
            raise ValueError, "cannot create a DbInput with neither "\
                "times nor durations given"
        elif times is None:
            self.times = [sum(durations[:i]) for i in range(len(durations))]
        elif durations is None:
            from jazzparser.utils.base import group_pairs
            self.durations = [
                time1 - time0 for (time0, time1) in group_pairs(times)
            ] + [Fraction(1)]

    def get_gold_analysis(self):
        """
        Parses the annotations, if present, to get a gold analysis. Unlike 
        L{AnnotatedDbInput}, this input type cannot be assumed to have 
        annotations. It will therefore not raise an error if annotations 
        are missing or incomplete, but just return None.
        
        """
        from jazzparser.evaluation.parsing import parse_sequence_with_annotations
        from jazzparser.grammar import get_grammar
        from jazzparser.parsers import ParseError

        try:
            parses = parse_sequence_with_annotations(self,
                                                     get_grammar(),
                                                     allow_subparses=False)
        except ParseError:
            return None
        else:
            return parses[0].semantics

    @staticmethod
    def from_sequence(seq):
        """
        Creates a DbInput from a database representation of a sequence.
        
        """
        chords = list(seq)
        inputs = [str(chord) for chord in chords]
        durations = [chord.duration for chord in seq]
        return DbInput(inputs, durations=durations, name=seq.string_name, \
                        id=seq.id, chords=chords, sequence=seq)

    def __str__(self):
        return " ".join(["%s" % i for i in self.inputs])

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        return self.inputs[item]

    def slice(self, start=None, end=None):
        if self.chords:
            chords = self.chords[start:end]
        else:
            chords = None
        return DbInput(self.inputs[start:end],
                       self.durations[start:end],
                       self.times[start:end],
                       id=self.id,
                       name=self.name,
                       chords=chords,
                       sequence=self.sequence)

    @staticmethod
    def from_file(filename, options={}):
        # Load up a sequence index file according to the filename
        seqs = SequenceIndex.from_file(filename)
        # Get a sequence by index from the file
        seq = seqs.sequence_by_index(options['index'])
        if seq is None:
            raise InputReadError("%d is not a valid sequence index in %s" % \
                (options['index'], filename))
        # Get the data from the sequence
        return DbInput.from_sequence(seq)
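
# Sketch of the times/durations completion described in the docstring
# (hypothetical chord labels, assuming the Input base class needs no
# further arguments): times are cumulative sums of the durations; when
# computed the other way, the last chord is assumed to last 1 unit.
by_durations = DbInput(["C", "G7", "C"], durations=[4, 4, 2])
print by_durations.times        # [0, 4, 8]
by_times = DbInput(["C", "G7", "C"], times=[0, 4, 8])
print by_times.durations        # [4, 4, Fraction(1, 1)]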
示例#47
0
 def process_option_dict(cls, optdict):
     return ModuleOption.process_option_dict(optdict,
                                             cls.FILE_INPUT_OPTIONS)
示例#48
0
class SegmentedMidiInput(Input):
    """
    Input wrapper for MIDI files with extra information about segmentation, 
    in the form it's needed for the Raphael and Stoddard model and midi 
    supertagging models: that is, offset (start of first bar) and bar length.
    
    Each segment is a midi L{midi.EventStream}. It also has the additional 
    attribute C{segment_start}, giving the tick time at which the segment 
    begins in the original midi stream.
    
    Optionally also stores a gold standard analysis in the form of a 
    db annotated chord sequence: see L{AnnotatedDbInput}.
    
    """
    FILE_INPUT_OPTIONS = [
        ModuleOption('time_unit', filter=float,
                     help_text="number of beats (by the MIDI file resolution) "\
                        "to take to be one time unit",
                     usage="time_unit=X, where X is an int or float",
                     required=False,
                     default=4),
        ModuleOption('tick_offset', filter=int,
                     help_text="time in MIDI ticks at which the first time "\
                        "unit begins",
                     usage="tick_offset=X, where X is an int",
                     required=False,
                     default=0),
        ModuleOption('truncate', filter=int,
                     help_text="truncate the input to this length.",
                     usage="truncate=L, where L is an integer"),
    ]
    SHELL_TOOLS = Input.SHELL_TOOLS + [
        tools.PlayMidiChunksTool(),
        tools.PrintMidiChunksTool()
    ]

    def __init__(self,
                 inputs,
                 time_unit=4,
                 tick_offset=0,
                 stream=None,
                 gold=None,
                 sequence_index=None,
                 *args,
                 **kwargs):
        """
        
        @type inputs: list of L{midi.EventStream}s
        @param inputs: the midi data segments
        @type stream: L{midi.EventStream}
        @param stream: the original, unsegmented midi stream
        @type time_unit: int or float
        @param time_unit: number of beats to take as the basic unit 
            of time for observations
        @type tick_offset: int
        @param tick_offset: number of ticks after which the first bar begins
        
        """
        super(SegmentedMidiInput, self).__init__(*args, **kwargs)

        self.stream = stream
        self.time_unit = time_unit
        self.tick_offset = tick_offset
        self.inputs = inputs
        self.gold = gold
        self.sequence_index = sequence_index

        self.tick_unit = int(stream.resolution * time_unit)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        return self.inputs[item]

    def __str__(self):
        if self.name is not None:
            return "<MIDI: %s (%d)>" % (self.name, len(self))
        else:
            return "<MIDI: %d chunks>" % len(self)

    def slice(self, start=None, end=None):
        return SegmentedMidiInput(self.inputs[start:end],
                                  durations=self.durations[start:end],
                                  times=self.times[start:end],
                                  name=self.name,
                                  stream=self.stream,
                                  sequence_index=self.sequence_index)

    def get_gold_analysis(self):
        # This may be None if no analysis was in the input
        return self.gold

    @staticmethod
    def from_file(filename, options={}, gold=None, sequence_index=None):
        from midi import read_midifile
        from os.path import basename
        # Read and parse the midi file
        stream = read_midifile(filename)
        # Get the required segmentation parameters from the options
        time_unit = options['time_unit']
        tick_offset = options['tick_offset']
        # Use the filename as an identifier
        name = basename(filename)

        return SegmentedMidiInput.from_stream(stream,
                                              time_unit=time_unit,
                                              tick_offset=tick_offset,
                                              name=name,
                                              truncate=options['truncate'],
                                              gold=gold,
                                              only_notes=True,
                                              sequence_index=sequence_index)

    @staticmethod
    def from_stream(stream,
                    time_unit=4,
                    tick_offset=0,
                    name=None,
                    only_notes=True,
                    truncate=None,
                    gold=None,
                    sequence_index=None):
        """
        Creates a L{SegmentedMidiInput} from a midi event stream.
        
        @type only_notes: bool
        @param only_notes: if True, only includes note-on/note-off events in 
            the segments. If False, the stream will be sliced so that each 
            segment repeats things like program change events at the beginning.
            Including only notes, however, makes the preprocessing very much 
            faster
        
        """
        # Divide the stream up into slices of the right size
        # Number of ticks in each slice
        tick_unit = int(stream.resolution * time_unit)
        if len(stream.trackpool) == 0:
            end_time = 0
        else:
            end_time = max(stream.trackpool).tick

        if only_notes:
            from midi import EventStream, NoteOnEvent, NoteOffEvent, EndOfTrackEvent
            # Only include notes in the stream
            # This is much simpler and faster than the alternative
            events = [ev for ev in list(sorted(stream.trackpool)) if \
                        type(ev) in [NoteOnEvent, NoteOffEvent]]
            events = iter(events)
            try:
                current_event = events.next()
                # Get up to the start point in the stream
                while current_event.tick < tick_offset:
                    current_event = events.next()
            except StopIteration:
                # Got to the end of the stream before we even started
                inputs = []
            else:
                inputs = []
                for chunk_start in range(tick_offset, end_time, tick_unit):
                    chunk_end = chunk_start + tick_unit
                    slc = EventStream()
                    slc.add_track()
                    slc.format = stream.format
                    slc.resolution = stream.resolution
                    slc.segment_start = chunk_start

                    # Add all the note events in this time period
                    try:
                        while current_event.tick < chunk_end:
                            slc.add_event(current_event)
                            current_event = events.next()
                        # Add the end of track event
                        eot = EndOfTrackEvent()
                        eot.tick = chunk_end
                        slc.add_event(eot)
                    except StopIteration:
                        # Reached the end of the stream
                        inputs.append(slc)
                        break

                    inputs.append(slc)
        else:
            # Use slices to do all the necessary repetition of ongoing events
            from midi.slice import EventStreamSlice
            start_times = range(tick_offset, end_time, tick_unit)
            # First slice starts at the offset value
            slices = [
                EventStreamSlice(stream, chunk_start, chunk_start + tick_unit)
                for chunk_start in start_times
            ]
            inputs = [slc.to_event_stream(repeat_playing=False, cancel_playing=False) \
                                for slc in slices]
            # Associate the start time with each segment
            for slc, start_time in zip(inputs, start_times):
                slc.segment_start = start_time

        # Remove empty segments from the start and end
        current = 0
        # There's always one event - the end of track
        while len(inputs[current].trackpool) < 2:
            current += 1
        inputs = inputs[current:]
        # And the end
        current = len(inputs) - 1
        while len(inputs[current].trackpool) < 2:
            current -= 1
        inputs = inputs[:current + 1]

        if truncate is not None:
            inputs = inputs[:truncate]

        return SegmentedMidiInput(inputs,
                                  time_unit=time_unit,
                                  tick_offset=tick_offset,
                                  name=name,
                                  stream=stream,
                                  gold=gold,
                                  sequence_index=sequence_index)
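
# Usage sketch: segment a midi file into two-beat chunks (hypothetical
# filename; option names as defined in FILE_INPUT_OPTIONS above, with
# process_option_dict filling in defaults such as 'truncate').
opts = SegmentedMidiInput.process_option_dict({
    'time_unit': 2.0,
    'tick_offset': 0,
})
segmidi = SegmentedMidiInput.from_file("song.mid", options=opts)
print str(segmidi)
for segment in segmidi:
    # Each segment is an EventStream that records where it started
    print "segment at tick %d" % segment.segment_start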
示例#49
0
class SegmentedMidiBulkInput(BulkInput):
    """
    A CSV file containing midi file paths and the parameters for segmenting 
    each one.
    
    May store an index of a gold analysis with each input. This should appear 
    in column 4 (counting from 0). If these are given, the first line of the 
    file should specify 
    the path to the sequence input file as follows::
    
      GOLD: <relative path>
    
    Columns: filename, time unit, tick offset, ignore (bool, optional), gold id (int, optional)
    
    """
    INPUT_TYPE = SegmentedMidiInput
    FILE_INPUT_OPTIONS = [
        ModuleOption('truncate',
                     filter=int,
                     help_text="truncate each input to this length.",
                     usage="truncate=L, where L is an integer")
    ]
    SHELL_TOOLS = BulkInput.SHELL_TOOLS + [tools.PlayBulkMidiChunksTool()]

    def __init__(self, inputs):
        self.inputs = inputs

    def __str__(self):
        return "<bulk midi: %s>" % (" ".join([str(mid)
                                              for mid in self.inputs]))

    @staticmethod
    def writeln(csv,
                filename,
                time_unit=None,
                tick_offset=0,
                ignore=False,
                seq_index=None):
        """
        Writes a line to a segmidi bulk input file, opened as a CSV writer.
        
        """
        row = [
            "%s" % filename,
            "%f" % time_unit if time_unit else "2",
            "%d" % tick_offset,
            "TRUE" if ignore else "",
            "%d" % seq_index if seq_index is not None else "",
        ]
        csv.writerow(row)

    @staticmethod
    def from_file(filename, options={}):
        import csv, os
        # Read in the CSV file
        infile = open(filename, 'r')
        try:
            reader = csv.reader(infile)
            data = list(reader)
        finally:
            infile.close()

        base_path = os.path.abspath(os.path.dirname(filename))

        # Check the first line of the file for GOLD input
        if data[0][0].startswith("GOLD:"):
            gold_path = data[0][0][len("GOLD:"):].strip()
            gold_path = os.path.join(base_path, gold_path)
            # Load the annotated data
            gold_data = AnnotatedDbBulkInput.from_file(gold_path)
            # Ignore this first line now
            data = data[1:]
        else:
            gold_data = None

        # Read the file's data and process it
        inputs = []
        for row in data:
            # Optional col 4 allows us to ignore rows for training while
            #  keeping their parameters in the file
            if len(row) > 3:
                ignore = str_to_bool(row[3])
            else:
                ignore = False

            if not ignore:
                filename = row[0]
                # Read in the midi file
                midi = os.path.join(base_path, filename)

                # Prepare the parameters
                if row[1]:
                    time_unit = float(row[1])
                else:
                    time_unit = 2.0

                if row[2]:
                    tick_offset = int(row[2])
                else:
                    tick_offset = 0

                if len(row) > 4 and gold_data is not None and row[4].strip():
                    # A gold sequence analysis was given: load it up
                    seq_index = int(row[4])
                    gold = gold_data[seq_index].get_gold_analysis()
                else:
                    seq_index = None
                    gold = None

                # Rebind to a new name: reusing 'options' would shadow the
                #  function argument on subsequent iterations
                input_options = SegmentedMidiInput.process_option_dict({
                    'time_unit': time_unit,
                    'tick_offset': tick_offset,
                    'truncate': options.get('truncate'),
                })
                inputs.append(
                    SegmentedMidiInput.from_file(midi,
                                                 options=input_options,
                                                 gold=gold,
                                                 sequence_index=seq_index))
        return SegmentedMidiBulkInput(inputs)
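# A sketch of the CSV format parsed by from_file() above (the file contents
#  and paths here are hypothetical). The optional GOLD line points at the
#  gold sequence file; each data row is:
#    filename, time unit, tick offset, ignore (optional), gold id (optional)
#
#    GOLD: sequences.seq
#    songs/solar.mid,2.0,0,,12
#    songs/giant_steps.mid,1.5,480,TRUE,
#
bulk = SegmentedMidiBulkInput.from_file("midi_inputs.csv",
                                        options={'truncate': None})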
Example #50
class ChordBulkInput(BulkInput):
    """
    A file containing a list of textual chord sequences. This used to be 
    provided fully in the top-level parser script as input processing.
    
    """
    INPUT_TYPE = ChordInput
    FILE_INPUT_OPTIONS = [
        ModuleOption('start', filter=int,
                     help_text="line number to start reading from",
                     usage="start=X, where X is an int"),
        ModuleOption('end', filter=int,
                     help_text="line number at which to stop reading",
                     usage="end=X, where X is an int"),
        ModuleOption('roman', filter=str_to_bool,
                     help_text="read chord symbols as roman numberals. "\
                        "Default is to assume note names",
                     usage="roman=B, where B is a boolean",
                     default=False),
    ]

    def __init__(self, inputs, output_lines=None):
        self.inputs = inputs
        self.output_lines = output_lines

    @staticmethod
    def from_file(filename, options={}):
        f = open(filename, 'r')
        try:
            lines = f.readlines()
        finally:
            f.close()
        lines = [l.rstrip("\n") for l in lines]

        # Use the start and end line numbers if they were given
        if 'start' in options:
            lines = lines[options['start']:]
        if 'end' in options:
            lines = lines[:options['end']]

        # Do all the preprocessing
        output_lines = {}
        inputs = []
        sequence_name = None
        for line in lines:
            # If this is an output comment, output it and move to the next item
            if line.startswith(">>"):
                # If this is also a name definition, use it for the next sequence
                if line[2:].startswith("="):
                    sequence_name = line[3:-1]
                    output_lines[len(inputs)] = line[3:]
                else:
                    output_lines[len(inputs)] = line[2:]
                continue
            elif line.startswith("//"):
                # Non-printing comment
                # This could also be a name definition
                if line[2:].startswith("="):
                    output_lines[len(inputs)] = line[3:-1]
                continue
            elif len(line.strip()) == 0:
                # Ignore blank lines
                continue
            else:
                # Otherwise it's an actual chord sequence
                inputs.append(
                    ChordInput.from_string(line,
                                           name=sequence_name,
                                           roman=options['roman']))
                # Reset the sequence name
                sequence_name = None
        return ChordBulkInput(inputs, output_lines=output_lines)

    def to_db_inputs(self):
        """
        @see: L{ChordInput.to_db_input}
        """
        return DbBulkInput([chords.to_db_input() for chords in self.inputs])
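# A sketch of the text format parsed by from_file() above (the contents here
#  are hypothetical). ">>" lines are printed comments, ">>=" names the next
#  sequence, "//" lines are non-printing comments; any other non-blank line
#  is a chord sequence:
#
#    >>=My Blues
#    // key of C
#    C7 F7 C7 G7 F7 C7
#
chords = ChordBulkInput.from_file("sequences.txt", options={'roman': False})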
Example #51
def command_line_input(filename=None, filetype=None, options="", allowed_types=None, default_type=None):
    """
    Utility function for processing file input options from the command line.
    Pass in as args the values straight from the command line options to 
    select a filename, filetype and list of options.
    
    Typical command-line options for this purpose (for an optparse option parser C{op})::
    
     op.add_option("--file", "-f", dest="file", action="store", help="use a file to get input from")
     op.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Use '--filetype help' for a list of available types")
     op.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Use '--fopt help', with '--ft <type>', for a list of available options")
    
    Then you can call this function as::
    
     command_line_input(filename=options.file, filetype=options.filetype, options=options.file_options)
    
    @type allowed_types: list of strs
    @param allowed_types: types of input you want the user to be able to give.
        If not given, all types are allowed
    @type default_type: str
    @param default_type: filetype to assume if no other filetype is given
    @rtype: L{InputReader} subclass
    @return: the input wrapper of appropriate type, or None if no input file 
        was given
    
    """
    if allowed_types is None:
        allowed_types = get_input_type_names()

    if filetype is None and default_type is not None:
        filetype = default_type

    # Catch a request for filetype help
    if filetype is not None and filetype.lower() == "help":
        # Output possible file types
        print "Allowed input types: %s" % ", ".join(allowed_types)
        sys.exit(0)

    # Check that the filetype is valid and get the input type class if it is
    input_type = get_input_type(filetype)
    if input_type is None:
        raise InputTypeError, "Unknown filetype '%s'. Allowed types are: "\
            "%s" % (filetype, ", ".join(allowed_types))
    type_name = input_type_name(input_type)
    if type_name not in allowed_types:
        raise InputTypeError, "Cannot accept input of type '%s'. Allowed "\
            "types are: %s" % (filetype, ", ".join(allowed_types))

    if options is not None and options.lower() == "help":
        # Output help text
        from jazzparser.utils.options import options_help_text

        print options_help_text(input_type.FILE_INPUT_OPTIONS, intro="Available options for input type %s" % type_name)
        sys.exit(0)

    if filename is None:
        return None

    # First get a dict of the options
    file_options = ModuleOption.process_option_string(options)
    # Process the options as appropriate for this type
    file_options = input_type.process_option_dict(file_options)

    # Instantiate the input from the file as appropriate for the input type
    input_data = input_type.from_file(filename, file_options)
    return input_data
Example #52
def main():
    usage = "%prog [options] <model_name> <in-file>"
    description = "Trains a chord labeling model using the given "\
        "input data. The data file may be a stored SequenceIndex file, or "\
        "any other type of bulk data file."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        '-p',
        '--partitions',
        dest="partitions",
        action="store",
        type="int",
        help=
        "train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number."
    )
    parser.add_option(
        '--opts',
        dest="training_opts",
        action="append",
        help=
        "options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type."
    )
    # File input options
    parser.add_option(
        "--filetype",
        "--ft",
        dest="filetype",
        action="store",
        help=
        "select the file type for the input file. Same filetypes as jazzparser",
        default='bulk-db')
    parser.add_option(
        "--file-options",
        "--fopt",
        dest="file_options",
        action="store",
        help=
        "options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options."
    )
    # Logging output
    parser.add_option(
        '--log',
        dest="log",
        action="store",
        help=
        "file to output training logs to. Specify a base filename; <modelname>.log will be added to the end"
    )
    options, arguments = parse_args_with_config(parser)

    grammar = Grammar()

    # Handle any training options that were given on the command line
    if options.training_opts is None:
        training_opts = {}
    elif "help" in [opt.lower() for opt in options.training_opts]:
        print options_help_text(HPChordLabeler.TRAINING_OPTIONS,
                                intro="Training options:")
        sys.exit(0)
    else:
        # --opts may be given multiple times: join the option strings
        training_opts = ModuleOption.process_option_string(
            ":".join(options.training_opts))

    if len(arguments) < 2:
        print >> sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]

    # Load the sequence data
    # Only allow bulk types
    input_data = command_line_input(filename=filename,
                                    filetype=options.filetype,
                                    options=options.file_options,
                                    allowed_types=get_input_type_names(
                                        single=False, bulk=True))

    # Only partition the chord data, not the MIDI data
    if options.partitions is not None and not \
            (isinstance(input_data, MidiTaggerTrainingBulkInput) and \
             input_data.chords is not None):
        print >>sys.stderr, "Can only partition chord data and no chord data "\
            "was supplied"
        sys.exit(1)

    if options.partitions:
        # The input includes chord training data
        parts = input_data.chords.get_partitions(options.partitions)[1]
        models = [("%s%d" % (model_name,num),chord_data) \
            for num,chord_data in enumerate(parts)]
    else:
        models = [(model_name, None)]

    for part_name, chord_data in models:
        if options.log is not None:
            # Prepare a logger
            logfile = "%s%s.log" % (options.log, part_name)
            print "Logging output to file %s" % logfile
            logger = create_logger(filename=logfile)
        else:
            logger = None

        # Create a fresh model with this name
        model = HPChordLabeler.train(input_data,
                                     part_name,
                                     logger=logger,
                                     options=training_opts,
                                     chord_data=chord_data)
        print "Trained model %s" % (part_name)
Example #53
 def process_option_dict(cls, optdict):
     """
     Validates a dict of file input options against this input type's 
     FILE_INPUT_OPTIONS, applying filters and filling in defaults.
     
     """
     return ModuleOption.process_option_dict(optdict, cls.FILE_INPUT_OPTIONS)
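# An illustrative use of the classmethod above, applied to the
#  SegmentedMidiBulkInput options from example #49 (the values here are
#  hypothetical): filters are applied and defaults filled in.
raw_opts = {'truncate': '16'}     # e.g. parsed from a --fopt string
file_opts = SegmentedMidiBulkInput.process_option_dict(raw_opts)
# file_opts['truncate'] should now be the int 16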
Example #54
def main():
    set_proc_title("jazzparser")
    ########################################################
    usage = "jazzparser [<options>]"
    description = "The main parser interface for the Jazz Parser"
    ## Process the input options
    optparser = OptionParser(usage=usage, description=description)
    ###
    # File input options
    group = OptionGroup(optparser, "Input", "Input type and location")
    optparser.add_option_group(group)
    group.add_option("--file", "-f", dest="file", action="store", help="use a file to get parser input from. Use --filetype to specify the type of the file.")
    group.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file (--file). Use '--filetype help' for a list of available types. Default: chords", default='chords')
    group.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file (--file). Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    group.add_option("--index", "--indices", dest="input_index", action="store", help="select individual inputs to process. Specify as a comma-separated list of indices. All inputs are loaded as usual, but only the ith input is processed, for each i in the list")
    group.add_option("--only-load", dest="only_load", action="store_true", help="don't do anything with the inputs, just load and list them. Handy for checking the inputs load and getting their indices")
    group.add_option("--partitions", dest="partitions", action="store", type="int", help="divide the input data into this number of partitions and use a different set of models for each. For any parser, tagger and backoff that takes a 'model' argument, the partition number will be appended to the given value")
    group.add_option("--seq-parts", "--sequence-partitions", dest="sequence_partitions", action="store", help="use a chord sequence index to partition the inputs. Input type (bulk) must support association of the inputs with chord sequences by id. Sequences in the given sequence index file are partitioned n ways (--partitions) and the inputs are processed according to their associated sequence.")
    group.add_option("--continue", "--skip-done", dest="skip_done", action="store_true", help="skip any inputs for which a readable results file already exists. This is useful for continuing a bulk job that was stopped in the middle")
    ###
    group = OptionGroup(optparser, "Parser", "Parser, supertagger and backoff parser")
    optparser.add_option_group(group)
    group.add_option("-d", "--derivations", dest="derivations", action="store_true", help="keep derivation logs during parse.")
    group.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.")
    # Parser options
    group.add_option("-p", "--parser", dest="parser", action="store", help="use the named parser algorithm instead of the default. Use '-p help' to see the list of available parsers. Default: %s" % settings.DEFAULT_PARSER, default=settings.DEFAULT_PARSER)
    group.add_option("--popt", "--parser-options", dest="popts", action="append", help="specify options for the parser. Type '--popt help', using '--parser <name>' to select a parser module, to get a list of options.")
    # Tagger options
    group.add_option("-t", "--tagger", "--supertagger", dest="supertagger", action="store", help="run the parser using the named supertagger. Use '-t help' to see the list of available taggers. Default: %s" % settings.DEFAULT_SUPERTAGGER, default=settings.DEFAULT_SUPERTAGGER)
    group.add_option("--topt", "--tagger-options", dest="topts", action="append", help="specify options for the tagger. Type '--topt help', using '-u <name>' to select a tagger module, to get a list of options.")
    # Backoff options
    group.add_option("-b", "--backoff", "--noparse", dest="backoff", action="store", help="use the named backoff model as a backoff if the parser produces no results")
    group.add_option("--bopt", "--backoff-options", "--backoff-options", "--npo", dest="backoff_opts", action="append", help="specify options for the  backoff model. Type '--npo help', using '--backoff <name>' to select a backoff modules, to get a list of options.")
    ###
    # Multiprocessing options
    group = OptionGroup(optparser, "Multiprocessing")
    optparser.add_option_group(group)
    group.add_option("--processes", dest="processes", action="store", type="int", help="number of processes to create to perform parses in parallel. Default: 1, i.e. no process pool. Use -1 to create a process for every input", default=1)
    ###
    # Output options
    group = OptionGroup(optparser, "Output")
    optparser.add_option_group(group)
    group.add_option("--output", dest="output", action="store", help="directory name to output parse results to. A filename specific to the individual input will be appended to this")
    group.add_option("--topn", dest="topn", action="store", type="int", help="limit the number of final results to store in the output file to the top n by probability. By default, stores all")
    group.add_option("--output-opts", "--oopts", dest="output_opts", action="store", help="options that affect the output formatting. Use '--output-opts help' for a list of options.")
    group.add_option("-a", "--atomic-results", dest="atoms_only", action="store_true", help="only include atomic categories in the results.")
    group.add_option("-l", "--latex", dest="latex", action="store_true", help="output all results as Latex source. Used to produce a whole Latex document, but doesn't any more")
    group.add_option("--all-times", dest="all_times", action="store_true", help="display all timing information on semantics in output.")
    group.add_option("-v", "--debug", dest="debug", action="store_true", help="output verbose debugging information.")
    group.add_option("--time", dest="time", action="store_true", help="time how long the parse takes and output with the results.")
    group.add_option("--no-results", dest="no_results", action="store_true", help="don't print out the parse results at the end. Obviously you'll want to make sure they're going to a file (--output). This is useful for bulk parse jobs, where the results produce a lot of unnecessary output")
    group.add_option("--no-progress", dest="no_progress", action="store_true", help="don't output the summary of completed sequences after each one finishes")
    ###
    # Output analysis and harmonical
    group = OptionGroup(optparser, "Output processing", "Output analysis and harmonical")
    optparser.add_option_group(group)
    group.add_option("--harmonical", dest="harmonical", action="store", help="use the harmonical to play the chords justly intoned according to the top result and output to a wave file.")
    group.add_option("--enharmonical", dest="enharmonical", action="store", help="use the harmonical to play the chords in equal temperament and output to a wave file.")
    group.add_option("--midi", dest="midi", action="store_true", help="generate MIDI files from the harmonical, instead of wave files.")
    group.add_option("--tempo", dest="tempo", action="store", type=int, help="tempo to use for the generated music (see --harmonical/--enharmonical). Default: 120", default=120)
    group.add_option("--lh-analysis", dest="lh_analysis", action="store_true", help="output the Longuet-Higgins space interpretation of the semantics for each result.")
    group.add_option("--lh-coordinates", dest="lh_coord", action="store_true", help="like lh-analysis, but displays the coordinates of the points instead of their names.")
    ###
    # Logging options
    group = OptionGroup(optparser, "Logging")
    optparser.add_option_group(group)
    group.add_option("--long-progress", dest="long_progress", action="store_true", help="print a summary of the chart so far after each chord/word has been processed.")
    group.add_option("--progress", "--short-progress", dest="short_progress", action="store_true", help="print a small amount of information out during parsing to indicate progress.")
    group.add_option("--logger", dest="logger", action="store", help="directory to put parser logging in. A filename based on an identifier for each individual input will be appended.")
    ###
    # Shell options
    group = OptionGroup(optparser, "Shell", "Interactive shell for inspecting results and parser state")
    optparser.add_option_group(group)
    group.add_option("-i", "--interactive", dest="interactive", action="store_true", help="enter interactive mode after parsing.")
    group.add_option("--error", dest="error_shell", action="store_true", help="catch any errors, report them and then enter the interactive shell. This also catches keyboard interrupts, so you can use it to halt parsing and enter the shell.")
    
    # Read in command line options and args
    options, clinput = parse_args_with_config(optparser)

    ########################### Option processing ####################
    
    # Get log level option first, so we can start using the logger
    if options.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    # Set up a logger
    init_logging(log_level)
    
    if options.latex:
        settings.OPTIONS.OUTPUT_LATEX = True
    
    if options.logger:
        # Directory
        parse_logger_dir = options.logger
        check_directory(parse_logger_dir)
    else:
        parse_logger_dir = None
    
    ######## Grammar ########
    # Check the grammar actually exists
    grammar_names = get_grammar_names()
    if options.grammar is not None and options.grammar not in grammar_names:
        # This is not a valid grammar name
        logger.error("The grammar '%s' does not exist. Possible "\
            "grammars are: %s." % (options.grammar, ", ".join(grammar_names)))
        return 1
    grammar = get_grammar(options.grammar)
        
    ######## Parser ########
    # Load the requested parser
    from jazzparser.parsers import PARSERS
    if options.parser.lower() == "help":
        print "Available parsers are: %s" % ", ".join(PARSERS)
        return 0
    try:
        parser_cls = get_parser(options.parser)
    except ParserLoadError:
        logger.error("The parser '%s' could not be loaded. Possible "\
            "parsers are: %s" % (options.parser, ", ".join(PARSERS)))
        return 1
        
    # Get parser options
    if options.popts is not None:
        poptstr = options.popts
        if "help" in [s.strip().lower() for s in poptstr]:
            # Output this tagger's option help
            from jazzparser.utils.options import options_help_text
            print options_help_text(parser_cls.PARSER_OPTIONS, intro="Available options for selected parser")
            return 0
        poptstr = ":".join(poptstr)
    else:
        poptstr = ""
    popts = ModuleOption.process_option_string(poptstr)
    # Check that the options are valid
    try:
        parser_cls.check_options(popts)
    except ModuleOptionError, err:
        logger.error("Problem with parser options (--popt): %s" % err)
        return 1
Example #55
def main():
    usage = "%prog [options] <model_name> <input-file>"
    description = (
        "Trains a model for the RaphSto chord labelling "
        "algorithm on a file that contains a list of midi files with "
        "training options"
    )
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        "-p",
        "--partitions",
        dest="partitions",
        action="store",
        type="int",
        help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.",
    )
    parser.add_option(
        "--opts",
        dest="opts",
        action="store",
        help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.",
    )
    parser.add_option(
        "--proc",
        "--processes",
        dest="processes",
        action="store",
        type="int",
        help="number of parallel processes to spawn for the training. Use -1 to spawn one per training sequence (after splitting: see split_length)",
        default=1,
    )
    parser.add_option(
        "--max-length",
        dest="max_length",
        action="store",
        type="int",
        help="limits the length of the training midi sequences in chunks",
    )
    parser.add_option(
        "--split-length",
        dest="split_length",
        action="store",
        type="int",
        help="limits the length of the training midi sequences in chunks, but instead of throwing away everything after the first N chunks, splits it off as if it were starting a new sequence. This is good for multiprocessing, since many short sequences can be multitasked, whilst few long ones cannot",
    )
    parser.add_option(
        "--min-length",
        dest="min_length",
        action="store",
        type="int",
        help="ignores any sequences under this number of chunks. This is useful with --split-length, which can leave very short sequences from the end of a split sequence",
    )
    parser.add_option(
        "--progress-out",
        dest="progress_out",
        action="store",
        help="output logging info to a file instead of the command line",
    )
    parser.add_option(
        "--init-model",
        dest="init_model",
        action="store",
        help="initialize the model using parameters from an already trained model",
    )
    parser.add_option(
        "--init-ctrans",
        dest="init_ctrans",
        action="store",
        help="initialize the chord transition distribution using these parameters. Comma-separated list of params given as C0->C1-P, where C0 and C1 are chords (I, II, etc) and P is a float probability",
    )
    parser.add_option(
        "--chord-set",
        dest="chord_set",
        action="store",
        help="use a chord set other than the default. Use value 'help' to see a list. Has no effect in combination with --init-model, since the old model's chord set will be used",
    )
    parser.add_option(
        "-m",
        "--model-type",
        dest="model_type",
        action="store",
        help="select a model type: one of %s (default: standard)" % ", ".join(mt for mt in MODEL_TYPES.keys()),
        default="standard",
    )
    options, arguments = parse_args_with_config(parser)

    if options.opts is not None and options.opts == "help":
        print options_help_text(RaphstoBaumWelchTrainer.OPTIONS, intro="Training options for Raphael and Stoddard HMMs")
        sys.exit(0)
    opts = ModuleOption.process_option_string(options.opts)

    if len(arguments) < 2:
        print >> sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]

    print >> sys.stderr, "Raphsto training beginning at %s" % datetime.now().isoformat(" ")
    # Create a logger to output the progress of the training to stdout or a file
    if options.progress_out is not None:
        stdout = False
        logfile = options.progress_out
        print >> sys.stderr, "Outputing logging info to %s" % logfile
    else:
        stdout = True
        logfile = None
        print >> sys.stderr, "Outputing logging to stdout"
    logger = create_logger(name="raphsto_train", filename=logfile, stdout=stdout)
    logger.info("Raphael and Stoddard HMM model training")

    if options.model_type not in MODEL_TYPES:
        print >> sys.stderr, "Model type must be one of: %s" % ", ".join(mt for mt in MODEL_TYPES)
        sys.exit(1)
    model_cls = MODEL_TYPES[options.model_type]

    if options.chord_set == "help":
        print "Available chord sets: %s" % ", ".join(constants.CHORD_SETS.keys())
        sys.exit(0)
    elif options.chord_set is not None:
        # Check this chord set exists
        if options.chord_set not in constants.CHORD_SETS:
            print >> sys.stderr, "Chord set '%s' does not exist" % options.chord_set
            sys.exit(1)
        else:
            logger.info("Using chord set '%s'" % options.chord_set)

    # Read in the training data
    midis = InputSourceFile(filename)
    handlers = midis.get_handlers()
    logger.info("Reading in %d midi files..." % len(midis.inputs))
    training_data = []
    for i, mh in enumerate(handlers):
        logger.info("%s: %s" % (i, midis.inputs[i][0]))
        emissions = mh.get_emission_stream()[0]
        if options.max_length is not None and len(emissions) > options.max_length:
            logger.info("Truncating file %d to %d chunks (was %d)" % (i, options.max_length, len(emissions)))
            emissions = emissions[: options.max_length]
        if options.split_length is not None:
            logger.info("Splitting sequence %d into sequence no longer " "than %d chunks" % (i, options.split_length))
            # Split up the sequence if it's too long
            while len(emissions) > options.split_length:
                training_data.append(emissions[: options.split_length])
                emissions = emissions[options.split_length :]
        training_data.append(emissions)

    if options.min_length is not None:
        # Make sure there are no sequences under the minimum length
        # Just throw away any that are
        before_chuck = len(training_data)
        training_data = [seq for seq in training_data if len(seq) >= options.min_length]
        if len(training_data) != before_chuck:
            logger.info(
                "Threw away %d short sequences (below %d chunks)"
                % ((before_chuck - len(training_data)), options.min_length)
            )

    logger.info(
        "Training on %d sequences. Lengths: %s"
        % (len(training_data), ", ".join(str(len(seq)) for seq in training_data))
    )

    if options.partitions is not None:
        parts = holdout_partition(training_data, options.partitions)
        models = [("%s%d" % (model_name, num), data) for num, data in enumerate(parts)]
    else:
        models = [(model_name, training_data)]

    # Number of processes to use
    if options.processes == -1:
        # Special value: means number of training sequences (one process per sequence)
        processes = len(training_data)
    else:
        processes = options.processes

    for part_name, data in models:
        # Instantiate a fresh model with this name
        logger.info("Training model '%s' on %d midis" % (part_name, len(data)))
        if options.init_model is not None:
            logger.info("Initializing using parameters from model '%s'" % options.init_model)
            # Load an already trained model as initialization
            model = model_cls.initialize_existing_model(options.init_model, model_name=part_name)
        else:
            # TODO: make these probs an option
            ctype_params = (0.5, 0.3, 0.2)
            logger.info("Initializing to naive chord types using parameters: " "%s, %s, %s" % ctype_params)
            init_kwargs = {"model_name": part_name}
            if options.chord_set is not None:
                # Specify a chord set for the model
                init_kwargs["chord_set"] = options.chord_set
            model = model_cls.initialize_chord_types(ctype_params, **init_kwargs)

            # Initialize the chord transition probabilities if given
            if options.init_ctrans is not None:
                logger.info("Initializing chord transition distribution to %s" % options.init_ctrans)
                model.set_chord_transition_probabilities(options.init_ctrans)
        # Retrain it with the loaded data
        trainer = model_cls.get_trainer()(model, options=opts)
        trainer.train(data, logger=logger, processes=processes, save_intermediate=True)
    print >> sys.stderr, "Training terminating at %s" % datetime.now().isoformat(" ")
Example #56
class DirectedCkyParser(Parser):
    """
    DirectedCkyParser is a special version of the CKY parser that tries 
    to produce a parse according to a pre-built derivation tree.
    
    Why?
    Canonical trees are stored implicitly in the Jazz corpus. We can 
    build the explicit structure of the trees, in accordance with the 
    implicit manual annotations, but this will not contain any signs 
    on internal nodes. The structure does not produce a parse in itself 
    or even verify that the sequence can be parsed with that structure.
    
    The purpose of the DirectedCkyParser is to take a description of 
    this annotated structure and actually perform the parse, packing 
    the chart with only those signs that the derivation structure 
    produces.
    
    The parser should be used with a tagger that assigns only those 
    signs that were annotated. Use the PretaggedTagger to do this.
    
    """
    PARSER_OPTIONS = Parser.PARSER_OPTIONS + [
        ModuleOption(
            'derivations',
            filter=bool,
            help_text="Store derivation traces along with the results",
            usage="derivations=X, where X is 'True' or 'False'.",
            default=None,
        ),
    ]

    def __init__(self, grammar, tagger, derivation_tree=None, *args, **kwargs):
        if derivation_tree is None:
            raise ValueError, "DirectedCkyParser must be instantiated "\
                "with a derivation tree in kwarg 'derivation_tree'."
        self.derivation_tree = derivation_tree
        super(DirectedCkyParser, self).__init__(grammar, tagger, *args,
                                                **kwargs)

    def _create_chart(self, *args, **kwargs):
        self.chart = Chart(self.grammar, *args, **kwargs)
        return self.chart

    def parse(self, derivations=False, summaries=False):
        """
        Run the parser on the input, using the specified tagger. Runs 
        the CKY parsing algorithm to do chart parsing. For details of 
        chart parsing, see Chart class.
        """
        if self.options.get('derivations') is not None:
            derivations = self.options['derivations']

        # Find out from the tagger how long the input it read in was
        input_length = self.tagger.input_length
        # Create and initialise a chart for parsing
        # Don't initialise the chart with signs - we'll add signs gradually instead
        # Use a fresh empty list for each position: [[]] * n would alias 
        #  a single list across the whole input
        chart = self._create_chart(signs=[[] for _ in range(input_length)],
                                   derivations=derivations)

        ##################################################
        ### Here is the parser itself

        # Only get signs from the tagger once: we expect to get them all first time
        # Add all the lexical signs to the chart
        for word in range(input_length):
            new_cat_pairs = self.tagger.get_signs_for_word(word)
            new_cats = [cat for (cat, tag, prob) in new_cat_pairs]
            chart.add_word_signs(new_cats, word, self.tagger.get_word(word))

        ##### Main parser loop: produce only the signs that we're directed to produce
        # Get a mapping from the tree's short rule names to the rule instances
        rule_mapping = self.grammar.formalism.PcfgParser.rule_short_names

        # Perform the parse bottom up by a depth-first left-to-right
        #  recursion on the derivation tree. Recursively parse children
        #  of each node, before applying rules for the node itself.
        def _fill_chart(start, tree_node):
            """
            Recursively fills the chart using the subtree rooted by 
            tree_node, using start as the leftmost node of the chart.
            Returns the resulting rightmost node covered by this 
            span.
            """
            if hasattr(tree_node, 'children') and len(tree_node.children) > 0:
                if len(tree_node.children) > 2:
                    raise DirectedParseError, "invalid derivation tree. "\
                        "Nodes may have up to 2 children. This node has "\
                        "%d: %s" % (len(tree_node.children), tree_node)
                ### An internal node
                # First recurse to the sub-parses
                sub_end = start
                middle = None
                for child in tree_node.children:
                    sub_end = _fill_chart(sub_end, child)
                    if middle is None:
                        # Store the first node after the start as the middle node
                        middle = sub_end
                # We now know where this span ends.
                end = sub_end
                # Apply the rule associated with the node
                try:
                    rule_details = rule_mapping[tree_node.rule]
                except KeyError:
                    raise DirectedParseError, "tree node %s specifies a "\
                        "rule '%s' which is not defined for this "\
                        "formalism. Are you using the right formalism "\
                        "for your data?" % (tree_node, tree_node.rule)
                rule_cls = self.grammar.formalism.rules[rule_details[0]]
                # Instantiate the rule
                rule_kwargs = {
                    'grammar': self.grammar,
                    'modalities': self.grammar.modality_tree,
                }
                rule_kwargs.update(rule_details[1])
                rule = rule_cls(**rule_kwargs)
                # Try applying the rule to the arguments we've generated
                # Check we have the right number of children
                if len(tree_node.children) != rule.arity:
                    raise DirectedParseError, "a node was encountered "\
                        "that does not have the right number of children "\
                        "for its rule. %s must have %d children." % \
                        (tree_node.rule, rule.arity)
                # Apply the rule to its one or two arguments
                if rule.arity == 1:
                    added = chart.apply_unary_rule(rule, start, end)
                    debug_inputs = "%s, [%s]" % (rule, ", ".join(
                        ["%s" % s for s in chart.get_signs(start, end)]))
                elif rule.arity == 2:
                    added = chart.apply_binary_rule(rule, start, middle, end)
                    debug_inputs = "%s, [%s] and [%s]" % (rule, ", ".join(
                        ["%s" % s
                         for s in chart.get_signs(start, middle)]), ", ".join(
                             ["%s" % s for s in chart.get_signs(middle, end)]))
                # If nothing was added to the chart, the rule must have failed
                if not added:
                    # No point in continuing, since stuff further up the
                    #  tree will inevitably fail
                    raise DirectedParseError, "failed to apply rule %s. "\
                        "Giving up on parse. "\
                        "Tree: %s. Inputs: %s." % \
                        (tree_node.rule, tree_node, debug_inputs)
            elif hasattr(tree_node, 'chord'):
                ### Leaf node
                # We assume this lines up with the correct position in
                #  the tags that the tagger has given us.
                # This arc is a leaf, so only has a span of 1.
                end = start + 1
            else:
                # Tree does not conform to correct interface
                raise DirectedParseError, "derivation tree for directed "\
                    "parse should be made up of internal trees with "\
                    "children and leaves with a chord attribute. This "\
                    "node is neither: %s" % tree_node
            return end

        # Fill the chart from the root of the tree, covering the whole input
        _fill_chart(0, self.derivation_tree)

        return chart.parses
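# A minimal usage sketch for the parser above (not from the original source).
#  The PretaggedTagger constructor arguments are assumed, and 'grammar',
#  'input_sequence', 'annotated_tags' and 'tree' are placeholders:
tagger = PretaggedTagger(grammar, input_sequence, tags=annotated_tags)
parser = DirectedCkyParser(grammar, tagger, derivation_tree=tree)
parses = parser.parse(derivations=True)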
Example #57
class CandcTagger(ModelTagger):
    """
    Superclass of both kinds of C&C tagger. Don't use this: use one 
    of the subclasses below.
    """
    MODEL_CLASS = CandcTaggerModel
    COMPATIBLE_FORMALISMS = [
        'music_roman',
        'music_keyspan',
        'music_halfspan',
    ]
    INPUT_TYPES = ['db', 'chords']
    # Probability ratio between one tag and the next that allows the 
    #  second to be returned in the same batch as the first
    TAG_BATCH_RATIO = 0.8
    DEFAULT_UNSEEN_TAG_PROB = 0.001
    
    TAGGER_OPTIONS = [
        ModuleOption('batch', filter=float, 
            help_text="Probability ratio between one tag and the next "\
                "that allows the second to be returned in the same batch.",
            usage="batch=X, where X is a floating point value between 0 and 1",
            default=TAG_BATCH_RATIO),
        ModuleOption('model', 
            help_text="Name of the C&C trained model to use. Use the C&C "\
                "training scripts to produce this.",
            usage="model=X, where X is the model name. Split up multi-level models with dots.",
            required=True),
        ModuleOption('unseen_tag_prob', filter=float, 
            help_text="Probability mass reserved on each word so that some "\
                "probability is assigned to tags never seen in the training "\
                "set. This is a form of plus-n smoothing. "\
                "Substracted from the total probability of tags for "\
                "each word and distributed evenly across all tags.", 
            usage="unseen_tag_prob=X, where X is a floating point value between 0 and 1",
            default=DEFAULT_UNSEEN_TAG_PROB),
        ModuleOption('last_batch', filter=str_to_bool, 
            help_text="Use all possible tags, including the last, lowest "\
                "probability batch, which typically acts as a bin for "\
                "all remaining tags", 
            usage="last_batch=X, where X is 'true' or 'false'",
            default=True),
    ] + ModelTagger.TAGGER_OPTIONS
    
    def __init__(self, grammar, input, options={}, dict_cutoff=5, *args, **kwargs):
        super(CandcTagger, self).__init__(grammar, input, options, *args, **kwargs)
        process_chord_input(self)
        
        if type(self) == CandcTagger:
            raise NotImplementedError, "Tried to instantiate CandcTagger "\
                "directly. You should use one of its subclasses."
        self.tag_batch_ratio = self.options['batch']
        model = self.options['model'].split('.')
        
        # Check that candc is available for supertagging
        if not os.path.exists(settings.CANDC.BASE_PATH):
            raise CandcConfigurationError, "The C&C parser base "\
                "directory %s does not exist" % settings.CANDC.BASE_PATH
        if not os.path.exists(settings.CANDC.MODELS_PATH):
            raise CandcConfigurationError, "The C&C parser models "\
                "directory %s does not exist" % settings.CANDC.MODELS_PATH
        candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command)
        if not os.path.exists(candc_cmd):
            raise CandcConfigurationError, "The C&C supertagger command "\
                "%s does not exist. Have you built it?" % candc_cmd
        # Check the model exists
        candc_model = os.path.join(settings.CANDC.MODELS_PATH, *(model))
        if not os.path.exists(candc_model):
            raise CandcConfigurationError, "The C&C model given (%s) "\
                "doesn't exist." % candc_model
        
        # Create a logger to dump the output to
        logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model))
        candc_logger = create_logger(filename=logfile)
        self.logger.info("Logging C&C output to %s" % logfile)
        # Note in the log what we're trying to tag
        candc_logger.info("Tagging: %s" % " ".join([str(crd) for crd in self.input]))
        
        # Read in the list of tags to smooth over
        self.tag_list = read_tag_list(os.path.join(candc_model, "tags"))
        
        # Read in extra options
        opts_filename = os.path.join(candc_model, "jpopts")
        if not os.path.exists(opts_filename):
            self.extra_opts = {}
        else:
            with open(opts_filename, 'r') as opts_file:
                self.extra_opts = dict(
                    [line.strip("\n").split(":", 1) 
                        for line in opts_file.readlines()])
        # Pull the chord mapping out of the options
        self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None))
        
        # Spawn a process to do the tagging
        candc_command = [candc_cmd, "--model", candc_model, 
                        "--dict_cutoff", "%d" % dict_cutoff]+self.extra_args
        self.tagger = Popen(candc_command, 
                            stdin=PIPE, stdout=PIPE, stderr=PIPE)
        candc_logger.info("C&C command: %s" % " ".join(candc_command))
            
        self.tokens = self.input
        # Build some observations from the tokens
        observations = [
            interval_observation_from_chord_string_pair(ch1,ch2,type_mapping=self.chordmap) 
                for ch1,ch2 in group_pairs(self.tokens+[None])
        ]
        # Add a dummy POS tag to each input item
        self.observations = ["%s|C" % t for t in observations]
        candc_logger.info("Input: %s" % " ".join(self.observations))
        
        # Run the tagger on this input
        try:
            tagger_out, tagger_err = self.tagger.communicate(" ".join(self.observations))
        except OSError, err:
            logger.error("Could not run the C&C supertagger (%s)" % err)
            candc_logger.error("Error: %s" % err)
            # Output the actual error that the command returned
            error = self.tagger.stderr.read()
            logger.error("C&C returned the error: %s" % error)
            candc_logger.error("C&C error: %s" % error)
            raise CandcTaggingError, "error running the C&C supertagger: %s" % error
        # C&C uses ANSI color commands in the output
        # Remove them
        tagger_out = remove_ansi_colors(tagger_out)
        tagger_err = remove_ansi_colors(tagger_err)
        # The tagger process should now be terminated. Check it didn't fall over
        return_code = self.tagger.returncode
        if return_code < 0:
            raise CandcTaggingError, "The C&C tagger terminated with return code %s. "\
                "Error output for the tagging: %s" % (return_code, tagger_err)
        
        # Format the string for slightly easier reading in the logfile
        log_output = tagger_out.replace("\t", ", ")
        output_lines = [line for line in log_output.split("\n") if line.strip()]
        log_output = "\n".join(["%d-%d: %s" % (i,i+1,outline) for (i,outline) in enumerate(output_lines)])
        candc_logger.info("Output: %s" % log_output)
        candc_logger.info("Stderr output: %s" % tagger_err)
        
        # Get the tags out of the tagger output.
        # We ignore the first two items (word and POS tag) and take the third (category)
        # The output format for the different taggers varies
        self.tags = self._tags_from_output(tagger_out)
        
        # Check for bogus tags
        # The tagger may return tags that can't actually be 
        #  instantiated with the word, since it doesn't know about 
        #  the lexicon: ignore them
        #print "\n".join(", ".join(tag for (sign,tag,prob) in taglist) for taglist in self.tags)
        self.tags = [
            [(sign,tag,prob) for (sign,tag,prob) in self.tags[time] \
                    if sign is not None] 
                for time in range(len(self.tags))]
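# An illustrative sketch (not from the original source) of the batching
#  behaviour TAG_BATCH_RATIO describes above: a tag joins the previous
#  tag's batch if its probability is at least ratio times the previous
#  tag's probability. One plausible reading:
def batch_tags(tags, ratio=0.8):
    """tags: a list of (sign, tag, prob) triples, sorted by descending prob."""
    batches = []
    last_prob = None
    for sign, tag, prob in tags:
        if batches and last_prob is not None and prob >= ratio * last_prob:
            # Close enough to the previous tag: same batch
            batches[-1].append((sign, tag, prob))
        else:
            # Probability has dropped too far: start a new batch
            batches.append([(sign, tag, prob)])
        last_prob = prob
    return batches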
Example #58
 def __init__(self, options={}):
     self.options = ModuleOption.process_option_dict(options, self.OPTIONS)
Example #59
def main():
    usage = "%prog [<options>]"
    description = "Runs a supertagger from the Jazz Parser to tag some input "\
        "but just outputs the results, rather than continuing to parse."
    optparser = OptionParser(usage=usage, description=description)
    
    # Tagger options
    optparser.add_option("-t", "--tagger", "--supertagger", dest="supertagger", action="store", help="run the parser using the named supertagger. Use '-t help' to see the list of available taggers. Default: %s" % settings.DEFAULT_SUPERTAGGER, default=settings.DEFAULT_SUPERTAGGER)
    optparser.add_option("--topt", "--tagger-options", dest="topts", action="append", help="specify options for the tagger. Type '--topt help', using '-u <name>' to select a tagger module, to get a list of options.")
    # Commonly-used misc
    optparser.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.")
    # File input options
    optparser.add_option("--file", "-f", dest="file", action="store", help="use a file to get parser input from. Use --filetype to specify the type of the file.")
    optparser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file (--file). Use '--filetype help' for a list of available types. Default: chords", default='chords')
    optparser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file (--file). Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    # Misc options
    optparser.add_option("-v", "--debug", dest="debug", action="store_true", help="output verbose debugging information.")
    optparser.add_option("-i", "--interactive", dest="interactive", action="store_true", help="instead of just outputing all tags in one go, wait for user input between each iteration of adaptive supertagging")
    # Logging options
    optparser.add_option("--logger", dest="logger", action="store", help="directory to put parser logging in. A filename based on an identifier for each individual input will be appended.")
    # Read in command line options and args
    options, clinput = parse_args_with_config(optparser)
    
    ########################### Option processing ####################
    if options.logger:
        # Directory
        parse_logger_dir = options.logger
        check_directory(parse_logger_dir)
    else:
        parse_logger_dir = None
    
    ######## Grammar ########
    # Read in the grammar
    grammar = get_grammar(options.grammar)
        
    ######## Supertagger ########
    # Load the supertagger requested
    if options.supertagger.lower() == "help":
        print "Available taggers are: %s" % ", ".join(TAGGERS)
        return 0
    try:
        tagger_cls = get_tagger(options.supertagger)
    except TaggerLoadError:
        logger.error("The tagger '%s' could not be loaded. Possible "\
            "taggers are: %s" % (options.supertagger, ", ".join(TAGGERS)))
        return 1
        
    # Get supertagger options before initializing the tagger
    if options.topts is not None:
        toptstr = options.topts
        if "help" in [s.strip().lower() for s in toptstr]:
            # Output this tagger's option help
            from jazzparser.utils.options import options_help_text
            print options_help_text(tagger_cls.TAGGER_OPTIONS, intro="Available options for selected tagger")
            return 0
        toptstr = ":".join(toptstr)
    else:
        toptstr = ""
    topts = ModuleOption.process_option_string(toptstr)
    # Check that the options are valid
    try:
        tagger_cls.check_options(topts)
    except ModuleOptionError, err:
        print "Problem with tagger options (--topt): %s" % err
        return 1
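# A behaviour sketch for the option processing above (values illustrative).
#  Option strings are colon-separated key=value pairs, so repeated --topt
#  flags, joined with ':', parse into a single dict of raw values; filters
#  and defaults are applied later by check_options/process_option_dict:
topts = ModuleOption.process_option_string("batch=0.9:model=mymodel")
# expected: {'batch': '0.9', 'model': 'mymodel'}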