Example no. 1
def main():
    usage = "%prog [options] <model-type> <model_name> <in-file>"
    description = "Trains a supertagging model using the given "\
        "input data. Specify a model type (baseline1, etc) and a name to "\
        "identify it. The data file may be a stored SequenceIndex file, or "\
        "any other type of bulk data file. "\
        "This can only be used with the follow types of models: %s" % ", ".join(TRAINABLE_MODELS)
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.")
    parser.add_option('--opts', dest="training_opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.")
    # File input options
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Same filetypes as jazzparser", default='bulk-db')
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    # Logging output
    parser.add_option('--log', dest="log", action="store", help="file to output training logs to. Specify a base filename; <modelname>.log will be added to the end")
    options, arguments = parse_args_with_config(parser)
    
    grammar = Grammar()
    
    # Get the model type first: we might not need the other args
    if len(arguments) == 0:
        print >>sys.stderr, "You must specify a model type, a model name and an input data file as arguments"
        sys.exit(1)
    model_type = arguments[0]
    
    if model_type not in TRAINABLE_MODELS:
        print >>sys.stderr, "'%s' is not a valid model type. Available taggers are: %s" % \
            (model_type, ", ".join(TRAINABLE_MODELS))
        sys.exit(1)
    if model_type not in TAGGERS:
        print >>sys.stderr, "'%s' isn't a registered model type. Check that "\
            "the name in TRAINABLE_MODELS is correct" % model_type
        sys.exit(1)
    
    tagger_cls = get_tagger(model_type)
    if not issubclass(tagger_cls, ModelTagger):
        print >>sys.stderr, "'%s' tagger cannot be trained with this script. Only model taggers can be." % (tagger_cls.__name__)
        sys.exit(1)
    model_cls = tagger_cls.MODEL_CLASS
    
    # Handle any training options that were given on the command line
    if options.training_opts is None:
        training_opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(model_cls.TRAINING_OPTIONS, intro="Training options for %s" % model_cls.__name__)
        sys.exit(0)
    else:
        training_opts = ModuleOption.process_option_dict(
                            ModuleOption.process_option_string(options.training_opts), 
                            model_cls.TRAINING_OPTIONS)
    
    # Get the rest of the args
    if len(arguments) < 3:
        print >>sys.stderr, "You must specify a model type, a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[2])
    model_name = arguments[1]

    # Load the sequence data
    # Only allow bulk types
    input_data = command_line_input(filename=filename, 
                                    filetype=options.filetype, 
                                    options=options.file_options,
                                    allowed_types=get_input_type_names(single=False, bulk=True))
    
    if options.partitions is not None and options.partitions > 1:
        parts = input_data.get_partitions(options.partitions)[1]
        models = [(tagger_cls.partition_model_name(model_name,num),seqs) for \
                                                num,seqs in enumerate(parts)]
    else:
        models = [(model_name,input_data)]
    
    for part_name,seqs in models:
        # Instantiate a fresh model with this name
        model = model_cls(part_name, options=training_opts)
        if options.log is not None:
            # Prepare a logger
            logfile = "%s%s.log" % (options.log, part_name)
            print "Logging output to file %s" % logfile
            logger = create_logger(filename=logfile)
        else:
            logger = None
            
        # Train the model with the loaded data
        model.train(seqs, logger=logger)
        model.save()
        print "Trained model %s" % (part_name)
Example no. 2
def main():
    usage = "%prog [<options>] <model-name> <training-input>"
    description = "Training of PCFG models."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", \
        help="Number of partitions to divide the data into. "\
            "For train, divides the input file, trains a model on each "\
            "partition's complement and appends partition number to "\
            "the model names. For del, appends partition numbers to model "\
            "names and deletes all the models. Recache does similarly. "\
            "Has no effect for parse.")
    parser.add_option('--opts', dest="training_opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options")
    parser.add_option("--debug", dest="debug", action="store_true", help="Output verbose logging information to stderr")
    parser.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)
    
    if options.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.WARN
    # Create a logger for training
    logger = create_logger(log_level = log_level,
                  name = "training",
                  stderr = True)
    
    # Load a grammar
    grammar = get_grammar(options.grammar)
    # Get the pcfg model class for the formalism
    PcfgModel = grammar.formalism.PcfgModel
        
    # Parse the option string
    if options.training_opts is None:
        opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(PcfgModel.TRAINING_OPTIONS, 
                                            intro="Training options for PCFGs")
        sys.exit(0)
    else:
        opts = ModuleOption.process_option_dict(
                    ModuleOption.process_option_string(options.training_opts),
                    PcfgModel.TRAINING_OPTIONS)
    
    if len(arguments) == 0:
        print >>sys.stderr, "Specify a model name"
        models = PcfgModel.list_models()
        print >>sys.stderr, "Available models: %s" % ", ".join(models)
        sys.exit(1)
    model_name = arguments[0]
    print "Model base name:", model_name
    
    if options.partitions is not None:
        parts = [(i, "%s%d" % (model_name, i)) for i in range(options.partitions)]
    else:
        parts = [(None, model_name)]
    
    if len(arguments) < 2:
        print >>sys.stderr, "Specify an input file to read sequence data from"
        sys.exit(1)
    # Read in the training data from the given file
    seqs = SequenceIndex.from_file(arguments[1])
    
    if options.partitions is not None:
        # Prepare each training partition
        datasets = holdout_partition(seqs.sequences, options.partitions)
    else:
        datasets = [seqs.sequences]
        
    for dataset,(parti,part_model) in zip(datasets,parts):
        # Train the named model on the sequence data
        model = PcfgModel.train(part_model, dataset, opts, grammar=grammar, 
                                logger=logger)
        model.save()
        print "Trained model", part_model
Example no. 3
def main():
    usage = "%prog [options] <model_name> <in-file>"
    description = "Trains a chord labeling model using the given "\
        "input data. The data file may be a stored SequenceIndex file, or "\
        "any other type of bulk data file."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        '-p',
        '--partitions',
        dest="partitions",
        action="store",
        type="int",
        help=
        "train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number."
    )
    parser.add_option(
        '--opts',
        dest="training_opts",
        action="append",
        help=
        "options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type."
    )
    # File input options
    parser.add_option(
        "--filetype",
        "--ft",
        dest="filetype",
        action="store",
        help=
        "select the file type for the input file. Same filetypes as jazzparser",
        default='bulk-db')
    parser.add_option(
        "--file-options",
        "--fopt",
        dest="file_options",
        action="store",
        help=
        "options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options."
    )
    # Logging output
    parser.add_option(
        '--log',
        dest="log",
        action="store",
        help=
        "file to output training logs to. Specify a base filename; <modelname>.log will be added to the end"
    )
    options, arguments = parse_args_with_config(parser)

    grammar = Grammar()

    # Handle any training options that were given on the command line
    if options.training_opts is None:
        training_opts = {}
    elif "help" in [opt.lower() for opt in options.training_opts]:
        print options_help_text(HPChordLabeler.TRAINING_OPTIONS,
                                intro="Training options:")
        sys.exit(0)
    else:
        training_opts = ModuleOption.process_option_string(
            options.training_opts)

    if len(arguments) < 2:
        print >> sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]

    # Load the sequence data
    # Only allow bulk types
    input_data = command_line_input(filename=filename,
                                    filetype=options.filetype,
                                    options=options.file_options,
                                    allowed_types=get_input_type_names(
                                        single=False, bulk=True))

    # Only partition the chord data, not the MIDI data
    if options.partitions is not None and not \
            (isinstance(input_data, MidiTaggerTrainingBulkInput) and \
             input_data.chords is not None):
        print >>sys.stderr, "Can only partition chord data and no chord data "\
            "was supplied"
        sys.exit(1)

    if options.partitions:
        # The input includes chord training data
        parts = input_data.chords.get_partitions(options.partitions)[1]
        models = [("%s%d" % (model_name,num),chord_data) \
            for num,chord_data in enumerate(parts)]
    else:
        models = [(model_name, None)]

    for part_name, chord_data in models:
        if options.log is not None:
            # Prepare a logger
            logfile = "%s%s.log" % (options.log, part_name)
            print "Logging output to file %s" % logfile
            logger = create_logger(filename=logfile)
        else:
            logger = None

        # Create a fresh model with this name
        model = HPChordLabeler.train(input_data,
                                     part_name,
                                     logger=logger,
                                     options=training_opts,
                                     chord_data=chord_data)
        print "Trained model %s" % (part_name)
Example no. 4
def main():
    usage = "%prog [options] <model_name> <input-file>"
    description = (
        "Trains a model for the RaphSto chord labelling "
        "algorithm on a file that contains a list of midi files with "
        "training options"
    )
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        "-p",
        "--partitions",
        dest="partitions",
        action="store",
        type="int",
        help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.",
    )
    parser.add_option(
        "--opts",
        dest="opts",
        action="store",
        help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.",
    )
    parser.add_option(
        "--proc",
        "--processes",
        dest="processes",
        action="store",
        type="int",
        help="number of parallel processes to spawn for the training. Use -1 to spawn one per training sequence (after splitting: see split_length)",
        default=1,
    )
    parser.add_option(
        "--max-length",
        dest="max_length",
        action="store",
        type="int",
        help="limits the length of the training midi sequences in chunks",
    )
    parser.add_option(
        "--split-length",
        dest="split_length",
        action="store",
        type="int",
        help="limits the length of the training midi sequences in chunks, but instead of throwing away everything after the first N chunks, splits it off as if it were starting a new sequence. This is good for multiprocessing, since many short sequences can be multitasked, whilst few long ones cannot",
    )
    parser.add_option(
        "--min-length",
        dest="min_length",
        action="store",
        type="int",
        help="ignores any sequences under this number of chunks. This is useful with --split-length, which can leave very short sequences from the end of a split sequence",
    )
    parser.add_option(
        "--progress-out",
        dest="progress_out",
        action="store",
        help="output logging info to a file instead of the command line",
    )
    parser.add_option(
        "--init-model",
        dest="init_model",
        action="store",
        help="initialize the model using parameters from an already trained model",
    )
    parser.add_option(
        "--init-ctrans",
        dest="init_ctrans",
        action="store",
        help="initialize the chord transition distribution using these parameters. Comma-separated list of params given as C0->C1-P, where C0 and C1 are chords (I, II, etc) and P is a float probability",
    )
    parser.add_option(
        "--chord-set",
        dest="chord_set",
        action="store",
        help="use a chord set other than the default. Use value 'help' to see a list. Has no effect in combination with --init-model, since the old model's chord set will be used",
    )
    parser.add_option(
        "-m",
        "--model-type",
        dest="model_type",
        action="store",
        help="select a model type: one of %s (default: standard)" % ", ".join(mt for mt in MODEL_TYPES.keys()),
        default="standard",
    )
    options, arguments = parse_args_with_config(parser)

    if options.opts is not None and options.opts == "help":
        print options_help_text(RaphstoBaumWelchTrainer.OPTIONS, intro="Training options for Raphael and Stoddard HMMs")
        sys.exit(0)
    opts = ModuleOption.process_option_string(options.opts)

    if len(arguments) < 2:
        print >> sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]

    print >> sys.stderr, "Raphsto training beginning at %s" % datetime.now().isoformat(" ")
    # Create a logger to output the progress of the training to stdout or a file
    if options.progress_out is not None:
        stdout = False
        logfile = options.progress_out
        print >> sys.stderr, "Outputing logging info to %s" % logfile
    else:
        stdout = True
        logfile = None
        print >> sys.stderr, "Outputing logging to stdout"
    logger = create_logger(name="raphsto_train", filename=logfile, stdout=stdout)
    logger.info("Raphael and Stoddard HMM model training")

    if options.model_type not in MODEL_TYPES:
        print >> sys.stderr, "Model type must be one of: %s" % ", ".join(mt for mt in MODEL_TYPES)
        sys.exit(1)
    model_cls = MODEL_TYPES[options.model_type]

    if options.chord_set == "help":
        print "Available chord sets: %s" % ", ".join(constants.CHORD_SETS.keys())
        sys.exit(0)
    elif options.chord_set is not None:
        # Check this chord set exists
        if options.chord_set not in constants.CHORD_SETS:
            print >> sys.stderr, "Chord set '%s' does not exist" % options.chord_set
            sys.exit(1)
        else:
            logger.info("Using chord set '%s'" % options.chord_set)

    # Read in the training data
    midis = InputSourceFile(filename)
    handlers = midis.get_handlers()
    logger.info("Reading in %d midi files..." % len(midis.inputs))
    training_data = []
    for i, mh in enumerate(handlers):
        logger.info("%s: %s" % (i, midis.inputs[i][0]))
        emissions = mh.get_emission_stream()[0]
        if options.max_length is not None and len(emissions) > options.max_length:
            logger.info("Truncating file %d to %d chunks (was %d)" % (i, options.max_length, len(emissions)))
            emissions = emissions[: options.max_length]
        if options.split_length is not None:
            logger.info("Splitting sequence %d into sequence no longer " "than %d chunks" % (i, options.split_length))
            # Split up the sequence if it's too long
            while len(emissions) > options.split_length:
                training_data.append(emissions[: options.split_length])
                emissions = emissions[options.split_length :]
        training_data.append(emissions)

    if options.min_length is not None:
        # Make sure there are no sequences under the minimum length
        # Just throw away any that are
        before_chuck = len(training_data)
        training_data = [seq for seq in training_data if len(seq) >= options.min_length]
        if len(training_data) != before_chuck:
            logger.info(
                "Threw away %d short sequences (below %d chunks)"
                % ((before_chuck - len(training_data)), options.min_length)
            )

    logger.info(
        "Training on %d sequences. Lengths: %s"
        % (len(training_data), ", ".join(str(len(seq)) for seq in training_data))
    )

    if options.partitions is not None:
        parts = holdout_partition(training_data, options.partitions)
        models = [("%s%d" % (model_name, num), data) for num, data in enumerate(parts)]
    else:
        models = [(model_name, training_data)]

    # Number of processes to use
    if options.processes == -1:
        # Special value: means number of training sequences (one process per sequence)
        processes = len(training_data)
    else:
        processes = options.processes

    for part_name, data in models:
        # Instantiate a fresh model with this name
        logger.info("Training model '%s' on %d midis" % (part_name, len(data)))
        if options.init_model is not None:
            logger.info("Initializing using parameters from model '%s'" % options.init_model)
            # Load an already trained model as initialization
            model = model_cls.initialize_existing_model(options.init_model, model_name=part_name)
        else:
            # TODO: make these probs an option
            ctype_params = (0.5, 0.3, 0.2)
            logger.info("Initializing to naive chord types using parameters: " "%s, %s, %s" % ctype_params)
            init_kwargs = {"model_name": part_name}
            if options.chord_set is not None:
                # Specify a chord set for the model
                init_kwargs["chord_set"] = options.chord_set
            model = model_cls.initialize_chord_types(ctype_params, **init_kwargs)

            # Initialize the chord transition probabilities if given
            if options.init_ctrans is not None:
                logger.info("Initializing chord transition distribution to %s" % options.init_ctrans)
                model.set_chord_transition_probabilities(options.init_ctrans)
        # Retrain it with the loaded data
        trainer = model_cls.get_trainer()(model, options=opts)
        trainer.train(data, logger=logger, processes=processes, save_intermediate=True)
    print >> sys.stderr, "Training terminating at %s" % datetime.now().isoformat(" ")
Example no. 5
def __init__(self, grammar, input, options={}, dict_cutoff=5, *args, **kwargs):
    super(CandcTagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)

    if type(self) == CandcTagger:
        raise NotImplementedError, "Tried to instantiate CandcTagger "\
            "directly. You should use one of its subclasses."
    self.tag_batch_ratio = self.options['batch']
    model = self.options['model'].split('.')

    # Check that candc is available for supertagging
    if not os.path.exists(settings.CANDC.BASE_PATH):
        raise CandcConfigurationError, "The C&C parser base "\
            "directory %s does not exist" % settings.CANDC.BASE_PATH
    if not os.path.exists(settings.CANDC.MODELS_PATH):
        raise CandcConfigurationError, "The C&C parser models "\
            "directory %s does not exist" % settings.CANDC.MODELS_PATH
    candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command)
    if not os.path.exists(candc_cmd):
        raise CandcConfigurationError, "The C&C supertagger command "\
            "%s does not exist. Have you built it?" % candc_cmd
    # Check the model exists
    candc_model = os.path.join(settings.CANDC.MODELS_PATH, *(model))
    if not os.path.exists(candc_model):
        raise CandcConfigurationError, "The C&C model given (%s) "\
            "doesn't exist." % candc_model

    # Create a logger to dump the output to
    logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model))
    candc_logger = create_logger(filename=logfile)
    self.logger.info("Logging C&C output to %s" % logfile)
    # Note in the log what we're trying to tag
    candc_logger.info("Tagging: %s" % " ".join([str(crd) for crd in self.input]))

    # Read in the list of tags to smooth over
    self.tag_list = read_tag_list(os.path.join(candc_model, "tags"))

    # Read in extra options
    opts_filename = os.path.join(candc_model, "jpopts")
    if not os.path.exists(opts_filename):
        self.extra_opts = {}
    else:
        with open(opts_filename, 'r') as opts_file:
            self.extra_opts = dict(
                [line.strip("\n").split(":", 1)
                    for line in opts_file.readlines()])
    # Pull the chord mapping out of the options
    self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None))

    # Spawn a process to do the tagging
    candc_command = [candc_cmd, "--model", candc_model,
                    "--dict_cutoff", "%d" % dict_cutoff]+self.extra_args
    self.tagger = Popen(candc_command,
                        stdin=PIPE, stdout=PIPE, stderr=PIPE)
    candc_logger.info("C&C command: %s" % " ".join(candc_command))

    self.tokens = self.input
    # Build some observations from the tokens
    observations = [
        interval_observation_from_chord_string_pair(ch1,ch2,type_mapping=self.chordmap)
            for ch1,ch2 in group_pairs(self.tokens+[None])
    ]
    # Add a dummy POS tag to each input item
    self.observations = ["%s|C" % t for t in observations]
    candc_logger.info("Input: %s" % " ".join(self.observations))

    # Run the tagger on this input
    try:
        tagger_out, tagger_err = self.tagger.communicate(" ".join(self.observations))
    except OSError, err:
        logger.error("Could not run the C&C supertagger (%s)" % err)
        candc_logger.error("Error: %s" % err)
        # Output the actual error that the command returned
        error = self.tagger.stderr.read()
        logger.error("C&C returned the error: %s" % error)
        candc_logger.error("C&C error: %s" % error)
        raise CandcTaggingError, "error running the C&C supertagger: %s" % error
Example no. 6
def main():
    usage = "%prog [options] <model_name> <input-file>"
    description = "Trains a model for the RaphSto chord labelling "\
        "algorithm on a file that contains a list of midi files with "\
        "training options"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.")
    parser.add_option('--opts', dest="opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.")
    parser.add_option('--proc', '--processes', dest="processes", action="store", type="int", help="number of parallel processes to spawn for the training. Use -1 to spawn one per training sequence (after splitting: see split_length)", default=1)
    parser.add_option('--max-length', dest="max_length", action="store", type="int", help="limits the length of the training midi sequences in chunks")
    parser.add_option('--split-length', dest="split_length", action="store", type="int", help="limits the length of the training midi sequences in chunks, but instead of throwing away everything after the first N chunks, splits it off as if it were starting a new sequence. This is good for multiprocessing, since many short sequences can be multitasked, whilst few long ones cannot")
    parser.add_option('--min-length', dest="min_length", action="store", type="int", help="ignores any sequences under this number of chunks. This is useful with --split-length, which can leave very short sequences from the end of a split sequence")
    parser.add_option('--progress-out', dest="progress_out", action="store", help="output logging info to a file instead of the command line")
    parser.add_option('--init-model', dest="init_model", action="store", help="initialize the model using parameters from an already trained model")
    parser.add_option('--init-ctrans', dest="init_ctrans", action="store", help="initialize the chord transition distribution using these parameters. Comma-separated list of params given as C0->C1-P, where C0 and C1 are chords (I, II, etc) and P is a float probability")
    parser.add_option('--chord-set', dest="chord_set", action="store", help="use a chord set other than the default. Use value 'help' to see a list. Has no effect in combination with --init-model, since the old model's chord set will be used")
    parser.add_option('-m', '--model-type', dest="model_type", action="store", help="select a model type: one of %s (default: standard)" % ", ".join(mt for mt in MODEL_TYPES.keys()), default="standard")
    options, arguments = parse_args_with_config(parser)
    
    if options.opts is not None and options.opts == "help":
        print options_help_text(RaphstoBaumWelchTrainer.OPTIONS, intro="Training options for Raphael and Stoddard HMMs")
        sys.exit(0)
    opts = ModuleOption.process_option_string(options.opts)
    
    if len(arguments) < 2:
        print >>sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]
    
    print >>sys.stderr, "Raphsto training beginning at %s" % datetime.now().isoformat(' ')
    # Create a logger to output the progress of the training to stdout or a file
    if options.progress_out is not None:
        stdout = False
        logfile = options.progress_out
        print >>sys.stderr, "Outputing logging info to %s" % logfile
    else:
        stdout = True
        logfile = None
        print >>sys.stderr, "Outputing logging to stdout"
    logger = create_logger(name="raphsto_train", filename=logfile, stdout=stdout)
    logger.info("Raphael and Stoddard HMM model training")
        
    if options.model_type not in MODEL_TYPES:
        print >>sys.stderr, "Model type must be one of: %s" % ", ".join(mt for mt in MODEL_TYPES)
        sys.exit(1)
    model_cls = MODEL_TYPES[options.model_type]
    
    if options.chord_set == "help":
        print "Available chord sets: %s" % ", ".join(constants.CHORD_SETS.keys())
        sys.exit(0)
    elif options.chord_set is not None:
        # Check this chord set exists
        if options.chord_set not in constants.CHORD_SETS:
            print >>sys.stderr, "Chord set '%s' does not exist" % options.chord_set
            sys.exit(1)
        else:
            logger.info("Using chord set '%s'" % options.chord_set)
    
    
    # Read in the training data
    midis = InputSourceFile(filename)
    handlers = midis.get_handlers()
    logger.info("Reading in %d midi files..." % len(midis.inputs))
    training_data = []
    for i,mh in enumerate(handlers):
        logger.info("%s: %s" % (i,midis.inputs[i][0]))
        emissions = mh.get_emission_stream()[0]
        if options.max_length is not None and len(emissions) > options.max_length:
            logger.info("Truncating file %d to %d chunks (was %d)" % \
                                    (i,options.max_length,len(emissions)))
            emissions = emissions[:options.max_length]
        if options.split_length is not None:
            logger.info("Splitting sequence %d into sequence no longer "\
                                "than %d chunks" % (i,options.split_length))
            # Split up the sequence if it's too long
            while len(emissions) > options.split_length:
                training_data.append(emissions[:options.split_length])
                emissions = emissions[options.split_length:]
        training_data.append(emissions)
    
    if options.min_length is not None:
        # Make sure there are no sequences under the minimum length
        # Just throw away any that are
        before_chuck = len(training_data)
        training_data = [seq for seq in training_data if len(seq) >= options.min_length]
        if len(training_data) != before_chuck:
            logger.info("Threw away %d short sequences (below %d chunks)" % \
                    ((before_chuck-len(training_data)), options.min_length))
    
    logger.info("Training on %d sequences. Lengths: %s" % \
                    (len(training_data), 
                     ", ".join(str(len(seq)) for seq in training_data)))
    
    if options.partitions is not None:
        parts = holdout_partition(training_data, options.partitions)
        models = [("%s%d" % (model_name,num),data) for num,data in enumerate(parts)]
    else:
        models = [(model_name,training_data)]
        
    # Number of processes to use
    if options.processes == -1:
        # Special value: means number of training sequences (one process per sequence)
        processes = len(training_data)
    else:
        processes = options.processes
    
    for part_name,data in models:
        # Instantiate a fresh model with this name
        logger.info("Training model '%s' on %d midis" % (part_name, len(data)))
        if options.init_model is not None:
            logger.info("Initializing using parameters from model '%s'" % \
                options.init_model)
            # Load an already trained model as initialization
            model = model_cls.initialize_existing_model(options.init_model, \
                model_name=part_name)
        else:
            # TODO: make these probs an option
            ctype_params = (0.5, 0.3, 0.2)
            logger.info("Initializing to naive chord types using parameters: "\
                "%s, %s, %s" % ctype_params)
            init_kwargs = { 'model_name' : part_name }
            if options.chord_set is not None:
                # Specify a chord set for the model
                init_kwargs['chord_set'] = options.chord_set
            model = model_cls.initialize_chord_types(ctype_params, **init_kwargs)
            
            # Initialize the chord transition probabilities if given
            if options.init_ctrans is not None:
                logger.info("Initializing chord transition distribution to %s" \
                    % options.init_ctrans)
                model.set_chord_transition_probabilities(options.init_ctrans)
        # Retrain it with the loaded data
        trainer = model_cls.get_trainer()(model, options=opts)
        trainer.train(data, logger=logger, processes=processes, save_intermediate=True)
    print >>sys.stderr, "Training terminating at %s" % datetime.now().isoformat(' ')
Example no. 7
def main():
    usage = "%prog [options] <model_name> <in-file>"
    description = "Trains a chord labeling model using the given "\
        "input data. The data file may be a stored SequenceIndex file, or "\
        "any other type of bulk data file."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.")
    parser.add_option('--opts', dest="training_opts", action="append", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.")
    # File input options
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Same filetypes as jazzparser", default='bulk-db')
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    # Logging output
    parser.add_option('--log', dest="log", action="store", help="file to output training logs to. Specify a base filename; <modelname>.log will be added to the end")
    options, arguments = parse_args_with_config(parser)
    
    grammar = Grammar()
    
    # Handle any training options that were given on the command line
    if options.training_opts is None:
        training_opts = {}
    elif "help" in [opt.lower() for opt in options.training_opts]:
        print options_help_text(HPChordLabeler.TRAINING_OPTIONS, intro="Training options:")
        sys.exit(0)
    else:
        training_opts = ModuleOption.process_option_string(options.training_opts)
        
    if len(arguments) < 2:
        print >>sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]
    
    # Load the sequence data
    # Only allow bulk types
    input_data = command_line_input(filename=filename, 
                                    filetype=options.filetype, 
                                    options=options.file_options,
                                    allowed_types=get_input_type_names(single=False, bulk=True))
    
    # Only partition the chord data, not the MIDI data
    if options.partitions is not None and not \
            (isinstance(input_data, MidiTaggerTrainingBulkInput) and \
             input_data.chords is not None):
        print >>sys.stderr, "Can only partition chord data and no chord data "\
            "was supplied"
        sys.exit(1)
    
    if options.partitions:
        # The input includes chord training data
        parts = input_data.chords.get_partitions(options.partitions)[1]
        models = [("%s%d" % (model_name,num),chord_data) \
            for num,chord_data in enumerate(parts)]
    else:
        models = [(model_name,None)]
    
    for part_name,chord_data in models:
        if options.log is not None:
            # Prepare a logger
            logfile = "%s%s.log" % (options.log, part_name)
            print "Logging output to file %s" % logfile
            logger = create_logger(filename=logfile)
        else:
            logger = None
        
        # Create a fresh model with this name
        model = HPChordLabeler.train(input_data, part_name, 
                                     logger=logger, 
                                     options=training_opts,
                                     chord_data=chord_data)
        print "Trained model %s" % (part_name)
Example no. 8
def main():
    usage = "%prog [options] <model-type> <model_name> <in-file>"
    description = "Trains a supertagging model using the given "\
        "input data. Specify a model type (baseline1, etc) and a name to "\
        "identify it. The data file may be a stored SequenceIndex file, or "\
        "any other type of bulk data file. "\
        "This can only be used with the follow types of models: %s" % ", ".join(TRAINABLE_MODELS)
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        '-p',
        '--partitions',
        dest="partitions",
        action="store",
        type="int",
        help=
        "train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number."
    )
    parser.add_option(
        '--opts',
        dest="training_opts",
        action="store",
        help=
        "options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type."
    )
    # File input options
    parser.add_option(
        "--filetype",
        "--ft",
        dest="filetype",
        action="store",
        help=
        "select the file type for the input file. Same filetypes as jazzparser",
        default='bulk-db')
    parser.add_option(
        "--file-options",
        "--fopt",
        dest="file_options",
        action="store",
        help=
        "options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options."
    )
    # Logging output
    parser.add_option(
        '--log',
        dest="log",
        action="store",
        help=
        "file to output training logs to. Specify a base filename; <modelname>.log will be added to the end"
    )
    options, arguments = parse_args_with_config(parser)

    grammar = Grammar()

    # Get the model type first: we might not need the other args
    if len(arguments) == 0:
        print >> sys.stderr, "You must specify a model type, a model name and an input data file as arguments"
        sys.exit(1)
    model_type = arguments[0]

    if model_type not in TRAINABLE_MODELS:
        print >>sys.stderr, "'%s' is not a valid model type. Available taggers are: %s" % \
            (model_type, ", ".join(TRAINABLE_MODELS))
        sys.exit(1)
    if model_type not in TAGGERS:
        print >>sys.stderr, "'%s' isn't a registered model type. Check that "\
            "the name in TRAINABLE_MODELS is correct" % model_type
        sys.exit(1)

    tagger_cls = get_tagger(model_type)
    if not issubclass(tagger_cls, ModelTagger):
        print >> sys.stderr, "'%s' tagger cannot be trained with this script. Only model taggers can be." % (
            tagger_cls.__name__)
        sys.exit(1)
    model_cls = tagger_cls.MODEL_CLASS

    # Handle any training options that were given on the command line
    if options.training_opts is None:
        training_opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(model_cls.TRAINING_OPTIONS,
                                intro="Training options for %s" %
                                model_cls.__name__)
        sys.exit(0)
    else:
        training_opts = ModuleOption.process_option_dict(
            ModuleOption.process_option_string(options.training_opts),
            model_cls.TRAINING_OPTIONS)

    # Get the rest of the args
    if len(arguments) < 3:
        print >> sys.stderr, "You must specify a model type, a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[2])
    model_name = arguments[1]

    # Load the sequence data
    # Only allow bulk types
    input_data = command_line_input(filename=filename,
                                    filetype=options.filetype,
                                    options=options.file_options,
                                    allowed_types=get_input_type_names(
                                        single=False, bulk=True))

    if options.partitions is not None and options.partitions > 1:
        parts = input_data.get_partitions(options.partitions)[1]
        models = [(tagger_cls.partition_model_name(model_name,num),seqs) for \
                                                num,seqs in enumerate(parts)]
    else:
        models = [(model_name, input_data)]

    for part_name, seqs in models:
        # Instantiate a fresh model with this name
        model = model_cls(part_name, options=training_opts)
        if options.log is not None:
            # Prepare a logger
            logfile = "%s%s.log" % (options.log, part_name)
            print "Logging output to file %s" % logfile
            logger = create_logger(filename=logfile)
        else:
            logger = None

        # Train the model with the loaded data
        model.train(seqs, logger=logger)
        model.save()
        print "Trained model %s" % (part_name)
Example no. 9
# Process each input one by one
all_results = []
jobs = []

for input in input_getter:
    if input:
        # Get an identifier for this input
        input_identifier = name_getter.next()
        print "Processing input: %s (%s)" % (input, input_identifier)

        # Get a filename for a logger for this input
        if parse_logger_dir:
            parse_logger = os.path.join(parse_logger_dir, "%s.log" % \
                                                slugify(input_identifier))
            print >>sys.stderr, "Logging parser progress to %s" % parse_logger
            logger = create_logger(filename=parse_logger)
        else:
            logger = create_plain_stderr_logger()

        # Catch any errors and continue to the next input, instead of giving up
        try:
            if isinstance(input, str):
                input = input.rstrip("\n")
                if len(input) == 0:
                    return
                input = ChordInput.from_string(input)

            logger.info("Tagging sequence (%d timesteps)" % len(input))
            # Prepare a suitable tagger component
            tagger = tagger_cls(grammar, input, options=topts.copy(), logger=logger)
Example no. 10
    # Process each input one by one
    all_results = []
    jobs = []

    for input in input_getter:
        if input:
            # Get an identifier for this input
            input_identifier = name_getter.next()
            print "Processing input: %s (%s)" % (input, input_identifier)

            # Get a filename for a logger for this input
            if parse_logger_dir:
                parse_logger = os.path.join(parse_logger_dir, "%s.log" % \
                                                    slugify(input_identifier))
                print >> sys.stderr, "Logging parser progress to %s" % parse_logger
                logger = create_logger(filename=parse_logger)
            else:
                logger = create_plain_stderr_logger()

            # Catch any errors and continue to the next input, instead of giving up
            try:
                if isinstance(input, str):
                    input = input.rstrip("\n")
                    if len(input) == 0:
                        return
                    input = ChordInput.from_string(input)

                logger.info("Tagging sequence (%d timesteps)" % len(input))
                # Prepare a suitable tagger component
                tagger = tagger_cls(grammar,
                                    input,
Example no. 11
def main():
    usage = "%prog [<options>] <model-name> <training-input>"
    description = "Training of PCFG models."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", \
        help="Number of partitions to divide the data into. "\
            "For train, divides the input file, trains a model on each "\
            "partition's complement and appends partition number to "\
            "the model names. For del, appends partition numbers to model "\
            "names and deletes all the models. Recache does similarly. "\
            "Has no effect for parse.")
    parser.add_option(
        '--opts',
        dest="training_opts",
        action="store",
        help=
        "options to pass to the model trainer. Type '--opts help' for a list of options"
    )
    parser.add_option("--debug",
                      dest="debug",
                      action="store_true",
                      help="Output verbose logging information to stderr")
    parser.add_option("-g",
                      "--grammar",
                      dest="grammar",
                      action="store",
                      help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)

    if options.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.WARN
    # Create a logger for training
    logger = create_logger(log_level=log_level, name="training", stderr=True)

    # Load a grammar
    grammar = get_grammar(options.grammar)
    # Get the pcfg model class for the formalism
    PcfgModel = grammar.formalism.PcfgModel

    # Parse the option string
    if options.training_opts is None:
        opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(PcfgModel.TRAINING_OPTIONS,
                                intro="Training options for PCFGs")
        sys.exit(0)
    else:
        opts = ModuleOption.process_option_dict(
            ModuleOption.process_option_string(options.training_opts),
            PcfgModel.TRAINING_OPTIONS)

    if len(arguments) == 0:
        print >> sys.stderr, "Specify a model name"
        models = PcfgModel.list_models()
        print >> sys.stderr, "Available models: %s" % ", ".join(models)
        sys.exit(1)
    model_name = arguments[0]
    print "Model base name:", model_name

    if options.partitions is not None:
        parts = [(i, "%s%d" % (model_name, i))
                 for i in range(options.partitions)]
    else:
        parts = [(None, model_name)]

    if len(arguments) < 2:
        print >> sys.stderr, "Specify an input file to read sequence data from"
        sys.exit(1)
    # Read in the training data from the given file
    seqs = SequenceIndex.from_file(arguments[1])

    if options.partitions is not None:
        # Prepare each training partition
        datasets = holdout_partition(seqs.sequences, options.partitions)
    else:
        datasets = [seqs.sequences]

    for dataset, (parti, part_model) in zip(datasets, parts):
        # Train the named model on the sequence data
        model = PcfgModel.train(part_model,
                                dataset,
                                opts,
                                grammar=grammar,
                                logger=logger)
        model.save()
        print "Trained model", part_model
Example no. 12
def do_parse(grammar, tagger_cls, parser_cls, input, topts, popts, backoff, 
        npopts, options, identifier, multiprocessing=False, 
        logfile=None, partition=None):
    """
    Function called for each input to do tagging and parsing and return the 
    results. It's a separate function so that we can hand it over to worker 
    processes to do multiprocessing.
    
    @type logfile: str
    @param logfile: filename to send logging output to. If None, will log 
        to stderr
    
    """
    # If the input's a string, preprocess it
    if isinstance(input, str):
        input = input.rstrip("\n")
        if len(input) == 0:
            return
        input = ChordInput.from_string(input)
    
    print "Processing input: %s (%s)" % (input, identifier)
        
    if logfile is None:
        # Sending logging output to stderr
        logger = create_plain_stderr_logger()
    else:
        logger = create_logger(filename=logfile)
        print "Logging parser progress to %s" % logfile
    
    # Prepare an initial response
    # We'll fill in some values of this later
    response = {
        'tagger' : None,
        'parser' : None,
        'input' : input,
        'error' : None,
        'messages' : [],
        'time' : None,
        'identifier' : identifier,
        'results' : None,
        'timed_out' : False,
    }
    tagger = None
    parser = None
    messages = []
    
    if options.short_progress:
        # Only output the short form of the progress reports
        progress = 2
    elif options.long_progress:
        progress = 1
    else:
        progress = 0
    
    # Start a timer now to time the parse
    timer = ExecutionTimer(clock=True)
    
    # Catch any errors and continue to the next input, instead of giving up
    try:
        ######### Do that parsing thang
        logger.info("Tagging sequence (%d timesteps)" % len(input))
        
        # Prepare a suitable tagger component
        tagger = tagger_cls(grammar, input, options=topts.copy(), logger=logger)
        if not multiprocessing:
            response['tagger'] = tagger
        
        # Create a parser using this tagger
        parser = parser_cls(grammar, tagger, options=popts.copy(), 
                                backoff=backoff, 
                                backoff_options=npopts.copy(),
                                logger=logger)
        if not multiprocessing:
            response['parser'] = parser
        try:
            # Parse to produce a list of results
            results = parser.parse(derivations=options.derivations, summaries=progress)
        except (KeyboardInterrupt, Exception), err:
            if multiprocessing:
                # Don't go interactive if we're in a subprocess
                # Instead, just return with an error
                response.update({
                    'error' : exception_tuple(str_tb=True),
                })
                return response
            else:
                # Drop into the shell
                if type(err) == KeyboardInterrupt:
                    print "Dropping out on keyboard interrupt"
                    print "Entering shell: use 'chart' command to see current state of parse"
                elif options.error_shell:
                    print >> sys.stderr, "Error parsing %s" % str(input)
                    print >> sys.stderr, "The error was:"
                    traceback.print_exc(file=sys.stderr)
                # If we keyboard interrupted, always go into the shell, so 
                #  the user can see how far we got
                if options.error_shell or type(err) == KeyboardInterrupt:
                    # Instead of exiting, enter the interactive shell
                    print 
                    from jazzparser.shell import interactive_shell
                    env = {}
                    env.update(globals())
                    env.update(locals())
                    interactive_shell(parser.chart.parses,options,tagger,parser,
                                grammar.formalism,env,input_data=input)
                    return
                else:
                    raise
    except (KeyboardInterrupt, Exception), err:
        if multiprocessing:
            response.update({
                'error' : exception_tuple(str_tb=True),
            })
            return response
        else:
            if type(err) == KeyboardInterrupt:
                print "Exiting on keyboard interrupt"
                sys.exit(1)
            else:
                response.update({
                    'error' : exception_tuple(str_tb=True),
                    'messages' : messages,
                    'time' : timer.get_time(),
                })
                return response
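
do_parse never raises out of a worker process: exceptions are captured with exception_tuple(str_tb=True) and shipped back inside the response dict, because live traceback objects can't be pickled across the process boundary. A sketch of what such a helper plausibly does; the real jazzparser function may package the exception differently:

import sys
import traceback

def exception_tuple(str_tb=False):
    """Capture the current exception in a form that is safe to return
    from a worker process: the type name, the message and, optionally,
    the traceback rendered as a string. Hypothetical reconstruction of
    the helper used above.
    """
    exc_type, exc_value, tb = sys.exc_info()
    if str_tb:
        tb = "".join(traceback.format_exception(exc_type, exc_value, tb))
    return (exc_type.__name__, str(exc_value), tb)

try:
    1 / 0
except Exception:
    print exception_tuple(str_tb=True)[2]
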