def main(): usage = "%prog [options] <model-type> <model_name> <in-file>" description = "Trains a supertagging model using the given "\ "input data. Specify a model type (baseline1, etc) and a name to "\ "identify it. The data file may be a stored SequenceIndex file, or "\ "any other type of bulk data file. "\ "This can only be used with the follow types of models: %s" % ", ".join(TRAINABLE_MODELS) parser = OptionParser(usage=usage, description=description) parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.") parser.add_option('--opts', dest="training_opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.") # File input options parser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Same filetypes as jazzparser", default='bulk-db') parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.") # Logging output parser.add_option('--log', dest="log", action="store", help="file to output training logs to. Specify a base filename; <modelname>.log will be added to the end") options, arguments = parse_args_with_config(parser) grammar = Grammar() # Get the model type first: we might not need the other args if len(arguments) == 0: print >>sys.stderr, "You must specify a model type, a model name and an input data file as arguments" model_type = arguments[0] if model_type not in TRAINABLE_MODELS: print >>sys.stderr, "'%s' is not a valid model type. 
Available taggers are: %s" % \ (model_type, ", ".join(TRAINABLE_MODELS)) sys.exit(1) if model_type not in TAGGERS: print >>sys.stderr, "'%s' isn't a registered model type. Check that "\ "the name in TRAINABLE_MODELS is correct" % model_type sys.exit(1) tagger_cls = get_tagger(model_type) if not issubclass(tagger_cls, ModelTagger): print >>sys.stderr, "'%s' tagger cannot be trained with this script. Only model taggers can be." % (tagger_cls.__name__) sys.exit(1) model_cls = tagger_cls.MODEL_CLASS # Handle any training options that were given on the command line if options.training_opts is None: training_opts = {} elif options.training_opts.lower() == "help": print options_help_text(model_cls.TRAINING_OPTIONS, intro="Training options for %s" % model_cls.__name__) sys.exit(0) else: training_opts = ModuleOption.process_option_dict( ModuleOption.process_option_string(options.training_opts), model_cls.TRAINING_OPTIONS) # Get the rest of the args if len(arguments) < 3: print >>sys.stderr, "You must specify a model type, a model name and an input data file as arguments" sys.exit(1) filename = os.path.abspath(arguments[2]) model_name = arguments[1] # Load the sequence data # Only allow bulk types input_data = command_line_input(filename=filename, filetype=options.filetype, options=options.file_options, allowed_types=get_input_type_names(single=False, bulk=True)) if options.partitions is not None and options.partitions > 1: parts = input_data.get_partitions(options.partitions)[1] models = [(tagger_cls.partition_model_name(model_name,num),seqs) for \ num,seqs in enumerate(parts)] else: models = [(model_name,input_data)] for part_name,seqs in models: # Instantiate a fresh model with this name model = model_cls(part_name, options=training_opts) if options.log is not None: # Prepare a logger logfile = "%s%s.log" % (options.log, part_name) print "Logging output to file %s" % logfile logger = create_logger(filename=logfile) else: logger = None # Train the model with the loaded 
data model.train(seqs, logger=logger) model.save() print "Trained model %s" % (part_name)
def main(): usage = "%prog [<options>] <model-name> <training-input>" description = "Training of PCFG models." parser = OptionParser(usage=usage, description=description) parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", \ help="Number of partitions to divide the data into. "\ "For train, divides the input file, trains a model on each "\ "partition's complement and appends partition number to "\ "the model names. For del, appends partition numbers to model "\ "names and deletes all the models. Recache does similarly. "\ "Has no effect for parse.") parser.add_option('--opts', dest="training_opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options") parser.add_option("--debug", dest="debug", action="store_true", help="Output verbose logging information to stderr") parser.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.") options, arguments = parse_args_with_config(parser) if options.debug: log_level = logging.DEBUG else: log_level = logging.WARN # Create a logger for training logger = create_logger(log_level = log_level, name = "training", stderr = True) # Load a grammar grammar = get_grammar(options.grammar) # Get the pcfg model class for the formalism PcfgModel = grammar.formalism.PcfgModel # Parse the option string if options.training_opts is None: opts = {} elif options.training_opts.lower() == "help": print options_help_text(PcfgModel.TRAINING_OPTIONS, intro="Training options for PCFGs") sys.exit(0) else: opts = ModuleOption.process_option_dict( ModuleOption.process_option_string(options.training_opts), PcfgModel.TRAINING_OPTIONS) if len(arguments) == 0: print >>sys.stderr, "Specify a model name" models = PcfgModel.list_models() print >>sys.stderr, "Available models: %s" % ", ".join(models) sys.exit(1) model_name = arguments[0] print "Model base name:", model_name if options.partitions is not None: parts = [(i, 
"%s%d" % (model_name, i)) for i in range(options.partitions)] else: parts = [(None, model_name)] if len(arguments) < 2: print >>sys.stderr, "Specify an input file to read sequence data from" sys.exit(1) # Read in the training data from the given file seqs = SequenceIndex.from_file(arguments[1]) if options.partitions is not None: # Prepare each training partition datasets = holdout_partition(seqs.sequences, options.partitions) else: datasets = [seqs.sequences] for dataset,(parti,part_model) in zip(datasets,parts): # Train the named model on the sequence data model = PcfgModel.train(part_model, dataset, opts, grammar=grammar, logger=logger) model.save() print "Trained model", part_model
def main(): usage = "%prog [options] <model_name> <in-file>" description = "Trains a chord labeling model using the given "\ "input data. The data file may be a stored SequenceIndex file, or "\ "any other type of bulk data file." parser = OptionParser(usage=usage, description=description) parser.add_option( '-p', '--partitions', dest="partitions", action="store", type="int", help= "train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number." ) parser.add_option( '--opts', dest="training_opts", action="append", help= "options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type." ) # File input options parser.add_option( "--filetype", "--ft", dest="filetype", action="store", help= "select the file type for the input file. Same filetypes as jazzparser", default='bulk-db') parser.add_option( "--file-options", "--fopt", dest="file_options", action="store", help= "options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options." ) # Logging output parser.add_option( '--log', dest="log", action="store", help= "file to output training logs to. 
Specify a base filename; <modelname>.log will be added to the end" ) options, arguments = parse_args_with_config(parser) grammar = Grammar() # Handle any training options that were given on the command line if options.training_opts is None: training_opts = {} elif "help" in [opt.lower() for opt in options.training_opts]: print options_help_text(HPChordLabeler.TRAINING_OPTIONS, intro="Training options:") sys.exit(0) else: training_opts = ModuleOption.process_option_string( options.training_opts) if len(arguments) < 2: print >> sys.stderr, "You must specify a model name and an input data file as arguments" sys.exit(1) filename = os.path.abspath(arguments[1]) model_name = arguments[0] # Load the sequence data # Only allow bulk types input_data = command_line_input(filename=filename, filetype=options.filetype, options=options.file_options, allowed_types=get_input_type_names( single=False, bulk=True)) # Only partition the chord data, not the MIDI data if options.partitions is not None and not \ (isinstance(input_data, MidiTaggerTrainingBulkInput) and \ input_data.chords is not None): print >>sys.stderr, "Can only partition chord data and no chord data "\ "was supplied" sys.exit(1) if options.partitions: # The input includes chord training data parts = input_data.chords.get_partitions(options.partitions)[1] models = [("%s%d" % (model_name,num),chord_data) \ for num,chord_data in enumerate(parts)] else: models = [(model_name, None)] for part_name, chord_data in models: if options.log is not None: # Prepare a logger logfile = "%s%s.log" % (options.log, part_name) print "Logging output to file %s" % logfile logger = create_logger(filename=logfile) else: logger = None # Create a fresh model with this name model = HPChordLabeler.train(input_data, part_name, logger=logger, options=training_opts, chord_data=chord_data) print "Trained model %s" % (part_name)
def main():
    """Command-line entry point: train a Raphael & Stoddard HMM chord
    labelling model.

    Expects a model name and an input file (a list of midi files with
    training options) as positional arguments.
    """
    usage = "%prog [options] <model_name> <input-file>"
    description = (
        "Trains a model for the RaphSto chord labelling "
        "algorithm on a file that contains a list of midi files with "
        "training options"
    )
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        "-p", "--partitions", dest="partitions", action="store", type="int",
        help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.",
    )
    parser.add_option(
        "--opts", dest="opts", action="store",
        help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.",
    )
    parser.add_option(
        "--proc", "--processes", dest="processes", action="store", type="int",
        help="number of parallel processes to spawn for the training. Use -1 to spawn one per training sequence (after splitting: see split_length)",
        default=1,
    )
    parser.add_option(
        "--max-length", dest="max_length", action="store", type="int",
        help="limits the length of the training midi sequences in chunks",
    )
    parser.add_option(
        "--split-length", dest="split_length", action="store", type="int",
        help="limits the length of the training midi sequences in chunks, but instead of throwing away everything after the first N chunks, splits it off as if it were starting a new sequence. This is good for multiprocessing, since many short sequences can be multitasked, whilst few long ones cannot",
    )
    parser.add_option(
        "--min-length", dest="min_length", action="store", type="int",
        help="ignores any sequences under this number of chunks. This is useful with --split-length, which can leave very short sequences from the end of a split sequence",
    )
    parser.add_option(
        "--progress-out", dest="progress_out", action="store",
        help="output logging info to a file instead of the command line",
    )
    parser.add_option(
        "--init-model", dest="init_model", action="store",
        help="initialize the model using parameters from an already trained model",
    )
    parser.add_option(
        "--init-ctrans", dest="init_ctrans", action="store",
        help="initialize the chord transition distribution using these parameters. Comma-separated list of params given as C0->C1-P, where C0 and C1 are chords (I, II, etc) and P is a float probability",
    )
    parser.add_option(
        "--chord-set", dest="chord_set", action="store",
        help="use a chord set other than the default. Use value 'help' to see a list. Has no effect in combination with --init-model, since the old model's chord set will be used",
    )
    parser.add_option(
        "-m", "--model-type", dest="model_type", action="store",
        help="select a model type: one of %s (default: standard)" % ", ".join(mt for mt in MODEL_TYPES.keys()),
        default="standard",
    )
    options, arguments = parse_args_with_config(parser)

    # '--opts help': show the trainer's options and stop
    if options.opts is not None and options.opts == "help":
        print options_help_text(RaphstoBaumWelchTrainer.OPTIONS, intro="Training options for Raphael and Stoddard HMMs")
        sys.exit(0)
    opts = ModuleOption.process_option_string(options.opts)

    if len(arguments) < 2:
        print >> sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]

    print >> sys.stderr, "Raphsto training beginning at %s" % datetime.now().isoformat(" ")
    # Create a logger to output the progress of the training to stdout or a file
    if options.progress_out is not None:
        stdout = False
        logfile = options.progress_out
        print >> sys.stderr, "Outputing logging info to %s" % logfile
    else:
        stdout = True
        logfile = None
        print >> sys.stderr, "Outputing logging to stdout"
    logger = create_logger(name="raphsto_train", filename=logfile, stdout=stdout)
    logger.info("Raphael and Stoddard HMM model training")

    if options.model_type not in MODEL_TYPES:
        print >> sys.stderr, "Model type must be one of: %s" % ", ".join(mt for mt in MODEL_TYPES)
        sys.exit(1)
    model_cls = MODEL_TYPES[options.model_type]

    if options.chord_set == "help":
        # Just list the available chord sets and stop
        print "Available chord sets: %s" % ", ".join(constants.CHORD_SETS.keys())
        sys.exit(0)
    elif options.chord_set is not None:
        # Check this chord set exists
        if options.chord_set not in constants.CHORD_SETS:
            print >> sys.stderr, "Chord set '%s' does not exist" % options.chord_set
            sys.exit(1)
        else:
            logger.info("Using chord set '%s'" % options.chord_set)

    # Read in the training data
    midis = InputSourceFile(filename)
    handlers = midis.get_handlers()
    logger.info("Reading in %d midi files..." % len(midis.inputs))
    training_data = []
    for i, mh in enumerate(handlers):
        logger.info("%s: %s" % (i, midis.inputs[i][0]))
        # One stream of emission chunks per midi file
        emissions = mh.get_emission_stream()[0]
        if options.max_length is not None and len(emissions) > options.max_length:
            # Keep only the first max_length chunks of this sequence
            logger.info("Truncating file %d to %d chunks (was %d)" % (i, options.max_length, len(emissions)))
            emissions = emissions[: options.max_length]
        if options.split_length is not None:
            logger.info("Splitting sequence %d into sequence no longer " "than %d chunks" % (i, options.split_length))
            # Split up the sequence if it's too long
            while len(emissions) > options.split_length:
                training_data.append(emissions[: options.split_length])
                emissions = emissions[options.split_length :]
        training_data.append(emissions)

    if options.min_length is not None:
        # Make sure there are no sequences under the minimum length
        # Just throw away any that are
        before_chuck = len(training_data)
        training_data = [seq for seq in training_data if len(seq) >= options.min_length]
        if len(training_data) != before_chuck:
            logger.info(
                "Threw away %d short sequences (below %d chunks)"
                % ((before_chuck - len(training_data)), options.min_length)
            )

    logger.info(
        "Training on %d sequences. Lengths: %s"
        % (len(training_data), ", ".join(str(len(seq)) for seq in training_data))
    )

    if options.partitions is not None:
        # One model per partition, each trained on the partition's complement
        parts = holdout_partition(training_data, options.partitions)
        models = [("%s%d" % (model_name, num), data) for num, data in enumerate(parts)]
    else:
        models = [(model_name, training_data)]

    # Number of processes to use
    if options.processes == -1:
        # Special value: means number of training sequences (one process per sequence)
        processes = len(training_data)
    else:
        processes = options.processes

    for part_name, data in models:
        # Instantiate a fresh model with this name
        logger.info("Training model '%s' on %d midis" % (part_name, len(data)))
        if options.init_model is not None:
            logger.info("Initializing using parameters from model '%s'" % options.init_model)
            # Load an already trained model as initialization
            model = model_cls.initialize_existing_model(options.init_model, model_name=part_name)
        else:
            # TODO: make these probs an option
            ctype_params = (0.5, 0.3, 0.2)
            logger.info("Initializing to naive chord types using parameters: " "%s, %s, %s" % ctype_params)
            init_kwargs = {"model_name": part_name}
            if options.chord_set is not None:
                # Specify a chord set for the model
                init_kwargs["chord_set"] = options.chord_set
            model = model_cls.initialize_chord_types(ctype_params, **init_kwargs)
        # Initialize the chord transition probabilities if given
        if options.init_ctrans is not None:
            logger.info("Initializing chord transition distribution to %s" % options.init_ctrans)
            model.set_chord_transition_probabilities(options.init_ctrans)
        # Retrain it with the loaded data
        trainer = model_cls.get_trainer()(model, options=opts)
        trainer.train(data, logger=logger, processes=processes, save_intermediate=True)
    print >> sys.stderr, "Training terminating at %s" % datetime.now().isoformat(" ")
def __init__(self, grammar, input, options={}, dict_cutoff=5, *args, **kwargs): super(CandcTagger, self).__init__(grammar, input, options, *args, **kwargs) process_chord_input(self) if type(self) == CandcTagger: raise NotImplementedError, "Tried to instantiate CandcTagger "\ "directly. You should use one of its subclasses." self.tag_batch_ratio = self.options['batch'] model = self.options['model'].split('.') # Check that candc is available for supertagging if not os.path.exists(settings.CANDC.BASE_PATH): raise CandcConfigurationError, "The C&C parser base "\ "directory %s does not exist" % settings.CANDC.BASE_PATH if not os.path.exists(settings.CANDC.MODELS_PATH): raise CandcConfigurationError, "The C&C parser models "\ "directory %s does not exist" % settings.CANDC.MODELS_PATH candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command) if not os.path.exists(candc_cmd): raise CandcConfigurationError, "The C&C supertagger command "\ "%s does not exist. Have you built it?" % candc_cmd # Check the model exists candc_model = os.path.join(settings.CANDC.MODELS_PATH, *(model)) if not os.path.exists(candc_model): raise CandcConfigurationError, "The C&C model given (%s) "\ "doesn't exist." 
% candc_model # Create a logger to dump the output to logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model)) candc_logger = create_logger(filename=logfile) self.logger.info("Logging C&C output to %s" % logfile) # Note in the log what we're trying to tag candc_logger.info("Tagging: %s" % " ".join([str(crd) for crd in self.input])) # Read in the list of tags to smooth over self.tag_list = read_tag_list(os.path.join(candc_model, "tags")) # Read in extra options opts_filename = os.path.join(candc_model, "jpopts") if not os.path.exists(opts_filename): self.extra_opts = {} else: with open(opts_filename, 'r') as opts_file: self.extra_opts = dict( [line.strip("\n").split(":", 1) for line in opts_file.readlines()]) # Pull the chord mapping out of the options self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None)) # Spawn a process to do the tagging candc_command = [candc_cmd, "--model", candc_model, "--dict_cutoff", "%d" % dict_cutoff]+self.extra_args self.tagger = Popen(candc_command, stdin=PIPE, stdout=PIPE, stderr=PIPE) candc_logger.info("C&C command: %s" % " ".join(candc_command)) self.tokens = self.input # Build some observations from the tokens observations = [ interval_observation_from_chord_string_pair(ch1,ch2,type_mapping=self.chordmap) for ch1,ch2 in group_pairs(self.tokens+[None]) ] # Add a dummy POS tag to each input item self.observations = ["%s|C" % t for t in observations] candc_logger.info("Input: %s" % " ".join(self.observations)) # Run the tagger on this input try: tagger_out, tagger_err = self.tagger.communicate(" ".join(self.observations)) except OSError, err: logger.error("Could not run the C&C supertagger (%s)" % err) candc_logger.error("Error: %s" % err) # Output the actual error that the command returned error = self.tagger.stderr.read() logger.error("C&C returned the error: %s" % error) candc_logger.error("C&C error: %s" % error) raise CandcTaggingError, "error running the C&C supertagger: %s" % error
def main():
    """Command-line entry point: train a Raphael & Stoddard HMM chord
    labelling model.

    Expects a model name and an input file (a list of midi files with
    training options) as positional arguments.
    """
    usage = "%prog [options] <model_name> <input-file>"
    description = "Trains a model for the RaphSto chord labelling "\
        "algorithm on a file that contains a list of midi files with "\
        "training options"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.")
    parser.add_option('--opts', dest="opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.")
    parser.add_option('--proc', '--processes', dest="processes", action="store", type="int", help="number of parallel processes to spawn for the training. Use -1 to spawn one per training sequence (after splitting: see split_length)", default=1)
    parser.add_option('--max-length', dest="max_length", action="store", type="int", help="limits the length of the training midi sequences in chunks")
    parser.add_option('--split-length', dest="split_length", action="store", type="int", help="limits the length of the training midi sequences in chunks, but instead of throwing away everything after the first N chunks, splits it off as if it were starting a new sequence. This is good for multiprocessing, since many short sequences can be multitasked, whilst few long ones cannot")
    parser.add_option('--min-length', dest="min_length", action="store", type="int", help="ignores any sequences under this number of chunks. This is useful with --split-length, which can leave very short sequences from the end of a split sequence")
    parser.add_option('--progress-out', dest="progress_out", action="store", help="output logging info to a file instead of the command line")
    parser.add_option('--init-model', dest="init_model", action="store", help="initialize the model using parameters from an already trained model")
    parser.add_option('--init-ctrans', dest="init_ctrans", action="store", help="initialize the chord transition distribution using these parameters. Comma-separated list of params given as C0->C1-P, where C0 and C1 are chords (I, II, etc) and P is a float probability")
    parser.add_option('--chord-set', dest="chord_set", action="store", help="use a chord set other than the default. Use value 'help' to see a list. Has no effect in combination with --init-model, since the old model's chord set will be used")
    parser.add_option('-m', '--model-type', dest="model_type", action="store", help="select a model type: one of %s (default: standard)" % ", ".join(mt for mt in MODEL_TYPES.keys()), default="standard")
    options, arguments = parse_args_with_config(parser)

    # '--opts help': show the trainer's options and stop
    if options.opts is not None and options.opts == "help":
        print options_help_text(RaphstoBaumWelchTrainer.OPTIONS, intro="Training options for Raphael and Stoddard HMMs")
        sys.exit(0)
    opts = ModuleOption.process_option_string(options.opts)

    if len(arguments) < 2:
        print >>sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]

    print >>sys.stderr, "Raphsto training beginning at %s" % datetime.now().isoformat(' ')
    # Create a logger to output the progress of the training to stdout or a file
    if options.progress_out is not None:
        stdout = False
        logfile = options.progress_out
        print >>sys.stderr, "Outputing logging info to %s" % logfile
    else:
        stdout = True
        logfile = None
        print >>sys.stderr, "Outputing logging to stdout"
    logger = create_logger(name="raphsto_train", filename=logfile, stdout=stdout)
    logger.info("Raphael and Stoddard HMM model training")

    if options.model_type not in MODEL_TYPES:
        print >>sys.stderr, "Model type must be one of: %s" % ", ".join(mt for mt in MODEL_TYPES)
        sys.exit(1)
    model_cls = MODEL_TYPES[options.model_type]

    if options.chord_set == "help":
        # Just list the available chord sets and stop
        print "Available chord sets: %s" % ", ".join(constants.CHORD_SETS.keys())
        sys.exit(0)
    elif options.chord_set is not None:
        # Check this chord set exists
        if options.chord_set not in constants.CHORD_SETS:
            print >>sys.stderr, "Chord set '%s' does not exist" % options.chord_set
            sys.exit(1)
        else:
            logger.info("Using chord set '%s'" % options.chord_set)

    # Read in the training data
    midis = InputSourceFile(filename)
    handlers = midis.get_handlers()
    logger.info("Reading in %d midi files..." % len(midis.inputs))
    training_data = []
    for i,mh in enumerate(handlers):
        logger.info("%s: %s" % (i,midis.inputs[i][0]))
        # One stream of emission chunks per midi file
        emissions = mh.get_emission_stream()[0]
        if options.max_length is not None and len(emissions) > options.max_length:
            # Keep only the first max_length chunks of this sequence
            logger.info("Truncating file %d to %d chunks (was %d)" % \
                (i,options.max_length,len(emissions)))
            emissions = emissions[:options.max_length]
        if options.split_length is not None:
            logger.info("Splitting sequence %d into sequence no longer "\
                "than %d chunks" % (i,options.split_length))
            # Split up the sequence if it's too long
            while len(emissions) > options.split_length:
                training_data.append(emissions[:options.split_length])
                emissions = emissions[options.split_length:]
        training_data.append(emissions)

    if options.min_length is not None:
        # Make sure there are no sequences under the minimum length
        # Just throw away any that are
        before_chuck = len(training_data)
        training_data = [seq for seq in training_data if len(seq) >= options.min_length]
        if len(training_data) != before_chuck:
            logger.info("Threw away %d short sequences (below %d chunks)" % \
                ((before_chuck-len(training_data)), options.min_length))

    logger.info("Training on %d sequences. Lengths: %s" % \
        (len(training_data), ", ".join(str(len(seq)) for seq in training_data)))

    if options.partitions is not None:
        # One model per partition, each trained on the partition's complement
        parts = holdout_partition(training_data, options.partitions)
        models = [("%s%d" % (model_name,num),data) for num,data in enumerate(parts)]
    else:
        models = [(model_name,training_data)]

    # Number of processes to use
    if options.processes == -1:
        # Special value: means number of training sequences (one process per sequence)
        processes = len(training_data)
    else:
        processes = options.processes

    for part_name,data in models:
        # Instantiate a fresh model with this name
        logger.info("Training model '%s' on %d midis" % (part_name, len(data)))
        if options.init_model is not None:
            logger.info("Initializing using parameters from model '%s'" % \
                options.init_model)
            # Load an already trained model as initialization
            model = model_cls.initialize_existing_model(options.init_model, \
                model_name=part_name)
        else:
            # TODO: make these probs an option
            ctype_params = (0.5, 0.3, 0.2)
            logger.info("Initializing to naive chord types using parameters: "\
                "%s, %s, %s" % ctype_params)
            init_kwargs = { 'model_name' : part_name }
            if options.chord_set is not None:
                # Specify a chord set for the model
                init_kwargs['chord_set'] = options.chord_set
            model = model_cls.initialize_chord_types(ctype_params, **init_kwargs)
        # Initialize the chord transition probabilities if given
        if options.init_ctrans is not None:
            logger.info("Initializing chord transition distribution to %s" \
                % options.init_ctrans)
            model.set_chord_transition_probabilities(options.init_ctrans)
        # Retrain it with the loaded data
        trainer = model_cls.get_trainer()(model, options=opts)
        trainer.train(data, logger=logger, processes=processes, save_intermediate=True)
    print >>sys.stderr, "Training terminating at %s" % datetime.now().isoformat(' ')
def main():
    """Command-line entry point: train an HP chord labeling model.

    Takes a model name and a bulk input data file as positional arguments.
    """
    usage = "%prog [options] <model_name> <in-file>"
    description = "Trains a chord labeling model using the given "\
        "input data. The data file may be a stored SequenceIndex file, or "\
        "any other type of bulk data file."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.")
    parser.add_option('--opts', dest="training_opts", action="append", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.")
    # File input options
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Same filetypes as jazzparser", default='bulk-db')
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    # Logging output
    parser.add_option('--log', dest="log", action="store", help="file to output training logs to. Specify a base filename; <modelname>.log will be added to the end")
    options, arguments = parse_args_with_config(parser)

    grammar = Grammar()

    # Handle any training options that were given on the command line
    # --opts may be repeated (action="append"), so this is a list
    if options.training_opts is None:
        training_opts = {}
    elif "help" in [opt.lower() for opt in options.training_opts]:
        print options_help_text(HPChordLabeler.TRAINING_OPTIONS, intro="Training options:")
        sys.exit(0)
    else:
        training_opts = ModuleOption.process_option_string(options.training_opts)

    if len(arguments) < 2:
        print >>sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]

    # Load the sequence data
    # Only allow bulk types
    input_data = command_line_input(filename=filename,
                                    filetype=options.filetype,
                                    options=options.file_options,
                                    allowed_types=get_input_type_names(single=False, bulk=True))

    # Only partition the chord data, not the MIDI data
    if options.partitions is not None and not \
            (isinstance(input_data, MidiTaggerTrainingBulkInput) and \
             input_data.chords is not None):
        print >>sys.stderr, "Can only partition chord data and no chord data "\
            "was supplied"
        sys.exit(1)

    if options.partitions:
        # The input includes chord training data: one model per partition
        parts = input_data.chords.get_partitions(options.partitions)[1]
        models = [("%s%d" % (model_name,num),chord_data) \
            for num,chord_data in enumerate(parts)]
    else:
        models = [(model_name,None)]

    for part_name,chord_data in models:
        if options.log is not None:
            # Prepare a logger writing to <base><modelname>.log
            logfile = "%s%s.log" % (options.log, part_name)
            print "Logging output to file %s" % logfile
            logger = create_logger(filename=logfile)
        else:
            logger = None
        # Create a fresh model with this name
        model = HPChordLabeler.train(input_data, part_name, logger=logger,
                                     options=training_opts,
                                     chord_data=chord_data)
        print "Trained model %s" % (part_name)
def main(): usage = "%prog [options] <model-type> <model_name> <in-file>" description = "Trains a supertagging model using the given "\ "input data. Specify a model type (baseline1, etc) and a name to "\ "identify it. The data file may be a stored SequenceIndex file, or "\ "any other type of bulk data file. "\ "This can only be used with the follow types of models: %s" % ", ".join(TRAINABLE_MODELS) parser = OptionParser(usage=usage, description=description) parser.add_option( '-p', '--partitions', dest="partitions", action="store", type="int", help= "train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number." ) parser.add_option( '--opts', dest="training_opts", action="store", help= "options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type." ) # File input options parser.add_option( "--filetype", "--ft", dest="filetype", action="store", help= "select the file type for the input file. Same filetypes as jazzparser", default='bulk-db') parser.add_option( "--file-options", "--fopt", dest="file_options", action="store", help= "options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options." ) # Logging output parser.add_option( '--log', dest="log", action="store", help= "file to output training logs to. Specify a base filename; <modelname>.log will be added to the end" ) options, arguments = parse_args_with_config(parser) grammar = Grammar() # Get the model type first: we might not need the other args if len(arguments) == 0: print >> sys.stderr, "You must specify a model type, a model name and an input data file as arguments" model_type = arguments[0] if model_type not in TRAINABLE_MODELS: print >>sys.stderr, "'%s' is not a valid model type. 
Available taggers are: %s" % \ (model_type, ", ".join(TRAINABLE_MODELS)) sys.exit(1) if model_type not in TAGGERS: print >>sys.stderr, "'%s' isn't a registered model type. Check that "\ "the name in TRAINABLE_MODELS is correct" % model_type sys.exit(1) tagger_cls = get_tagger(model_type) if not issubclass(tagger_cls, ModelTagger): print >> sys.stderr, "'%s' tagger cannot be trained with this script. Only model taggers can be." % ( tagger_cls.__name__) sys.exit(1) model_cls = tagger_cls.MODEL_CLASS # Handle any training options that were given on the command line if options.training_opts is None: training_opts = {} elif options.training_opts.lower() == "help": print options_help_text(model_cls.TRAINING_OPTIONS, intro="Training options for %s" % model_cls.__name__) sys.exit(0) else: training_opts = ModuleOption.process_option_dict( ModuleOption.process_option_string(options.training_opts), model_cls.TRAINING_OPTIONS) # Get the rest of the args if len(arguments) < 3: print >> sys.stderr, "You must specify a model type, a model name and an input data file as arguments" sys.exit(1) filename = os.path.abspath(arguments[2]) model_name = arguments[1] # Load the sequence data # Only allow bulk types input_data = command_line_input(filename=filename, filetype=options.filetype, options=options.file_options, allowed_types=get_input_type_names( single=False, bulk=True)) if options.partitions is not None and options.partitions > 1: parts = input_data.get_partitions(options.partitions)[1] models = [(tagger_cls.partition_model_name(model_name,num),seqs) for \ num,seqs in enumerate(parts)] else: models = [(model_name, input_data)] for part_name, seqs in models: # Instantiate a fresh model with this name model = model_cls(part_name, options=training_opts) if options.log is not None: # Prepare a logger logfile = "%s%s.log" % (options.log, part_name) print "Logging output to file %s" % logfile logger = create_logger(filename=logfile) else: logger = None # Train the model with the 
loaded data model.train(seqs, logger=logger) model.save() print "Trained model %s" % (part_name)
# Process each input one by one all_results = [] jobs = [] for input in input_getter: if input: # Get an identifier for this input input_identifier = name_getter.next() print "Processing input: %s (%s)" % (input, input_identifier) # Get a filename for a logger for this input if parse_logger_dir: parse_logger = os.path.join(parse_logger_dir, "%s.log" % \ slugify(input_identifier)) print >>sys.stderr, "Logging parser progress to %s" % parse_logger logger = create_logger(filename=parse_logger) else: logger = create_plain_stderr_logger() # Catch any errors and continue to the next input, instead of giving up try: if isinstance(input, str): input = input.rstrip("\n") if len(input) == 0: return input = ChordInput.from_string(input) logger.info("Tagging sequence (%d timesteps)" % len(input)) # Prepare a suitable tagger component tagger = tagger_cls(grammar, input, options=topts.copy(), logger=logger)
# Process each input one by one all_results = [] jobs = [] for input in input_getter: if input: # Get an identifier for this input input_identifier = name_getter.next() print "Processing input: %s (%s)" % (input, input_identifier) # Get a filename for a logger for this input if parse_logger_dir: parse_logger = os.path.join(parse_logger_dir, "%s.log" % \ slugify(input_identifier)) print >> sys.stderr, "Logging parser progress to %s" % parse_logger logger = create_logger(filename=parse_logger) else: logger = create_plain_stderr_logger() # Catch any errors and continue to the next input, instead of giving up try: if isinstance(input, str): input = input.rstrip("\n") if len(input) == 0: return input = ChordInput.from_string(input) logger.info("Tagging sequence (%d timesteps)" % len(input)) # Prepare a suitable tagger component tagger = tagger_cls(grammar, input,
def main(): usage = "%prog [<options>] <model-name> <training-input>" description = "Training of PCFG models." parser = OptionParser(usage=usage, description=description) parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", \ help="Number of partitions to divide the data into. "\ "For train, divides the input file, trains a model on each "\ "partition's complement and appends partition number to "\ "the model names. For del, appends partition numbers to model "\ "names and deletes all the models. Recache does similarly. "\ "Has no effect for parse.") parser.add_option( '--opts', dest="training_opts", action="store", help= "options to pass to the model trainer. Type '--opts help' for a list of options" ) parser.add_option("--debug", dest="debug", action="store_true", help="Output verbose logging information to stderr") parser.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.") options, arguments = parse_args_with_config(parser) if options.debug: log_level = logging.DEBUG else: log_level = logging.WARN # Create a logger for training logger = create_logger(log_level=log_level, name="training", stderr=True) # Load a grammar grammar = get_grammar(options.grammar) # Get the pcfg model class for the formalism PcfgModel = grammar.formalism.PcfgModel # Parse the option string if options.training_opts is None: opts = {} elif options.training_opts.lower() == "help": print options_help_text(PcfgModel.TRAINING_OPTIONS, intro="Training options for PCFGs") sys.exit(0) else: opts = ModuleOption.process_option_dict( ModuleOption.process_option_string(options.training_opts), PcfgModel.TRAINING_OPTIONS) if len(arguments) == 0: print >> sys.stderr, "Specify a model name" models = PcfgModel.list_models() print >> sys.stderr, "Available models: %s" % ", ".join(models) sys.exit(1) model_name = arguments[0] print "Model base name:", model_name if options.partitions is not None: parts = [(i, 
"%s%d" % (model_name, i)) for i in range(options.partitions)] else: parts = [(None, model_name)] if len(arguments) < 2: print >> sys.stderr, "Specify an input file to read sequence data from" sys.exit(1) # Read in the training data from the given file seqs = SequenceIndex.from_file(arguments[1]) if options.partitions is not None: # Prepare each training partition datasets = holdout_partition(seqs.sequences, options.partitions) else: datasets = [seqs.sequences] for dataset, (parti, part_model) in zip(datasets, parts): # Train the named model on the sequence data model = PcfgModel.train(part_model, dataset, opts, grammar=grammar, logger=logger) model.save() print "Trained model", part_model
def do_parse(grammar, tagger_cls, parser_cls, input, topts, popts, backoff,
        npopts, options, identifier, multiprocessing=False,
        logfile=None, partition=None):
    """
    Function called for each input to do tagging and parsing and return the
    results. It's a separate function so that we can hand it over to worker
    processes to do multiprocessing.

    @param grammar: grammar passed through to the tagger and parser
    @param tagger_cls: class instantiated to tag the input
    @param parser_cls: class instantiated (with the tagger) to parse
    @param input: the input to process; a str is stripped and converted
        with ChordInput.from_string first
    @param topts: dict of tagger options (copied before use)
    @param popts: dict of parser options (copied before use)
    @param backoff: backoff component handed to the parser
    @param npopts: dict of backoff options (copied before use)
    @param options: command-line options object; short_progress,
        long_progress, derivations and error_shell are read here
    @param identifier: name used to identify this input in output and in
        the returned response
    @type multiprocessing: bool
    @param multiprocessing: if True, never goes interactive on errors and
        leaves tagger/parser out of the response (so it stays picklable --
        presumably; verify against caller)
    @type logfile: str
    @param logfile: filename to send logging output to. If None, will log
        to stderr
    @param partition: unused in the visible body -- TODO confirm whether
        callers rely on it
    """
    # If the input's a string, preprocess it
    if isinstance(input, str):
        input = input.rstrip("\n")
        # Empty input: nothing to do (returns None, not a response dict)
        if len(input) == 0:
            return
        input = ChordInput.from_string(input)

    print "Processing input: %s (%s)" % (input, identifier)

    if logfile is None:
        # Sending logging output to stderr
        logger = create_plain_stderr_logger()
    else:
        logger = create_logger(filename=logfile)
        print "Logging parser progress to %s" % logfile

    # Prepare an initial response
    # We'll fill in some values of this later
    response = {
        'tagger' : None,
        'parser' : None,
        'input' : input,
        'error' : None,
        'messages' : [],
        'time' : None,
        'identifier' : identifier,
        'results' : None,
        'timed_out' : False,
    }
    tagger = None
    parser = None
    messages = []

    if options.short_progress:
        # Only output the short form of the progress reports
        progress = 2
    elif options.long_progress:
        progress = 1
    else:
        progress = 0

    # Start a timer now to time the parse
    timer = ExecutionTimer(clock=True)

    # Catch any errors and continue to the next input, instead of giving up
    try:
        ######### Do that parsing thang
        logger.info("Tagging sequence (%d timesteps)" % len(input))
        # Prepare a suitable tagger component
        tagger = tagger_cls(grammar, input, options=topts.copy(),
                            logger=logger)
        # Keep the tagger out of the response when multiprocessing
        if not multiprocessing:
            response['tagger'] = tagger

        # Create a parser using this tagger
        parser = parser_cls(grammar, tagger, options=popts.copy(),
                            backoff=backoff,
                            backoff_options=npopts.copy(),
                            logger=logger)
        if not multiprocessing:
            response['parser'] = parser

        try:
            # Parse to produce a list of results
            results = parser.parse(derivations=options.derivations,
                                   summaries=progress)
        except (KeyboardInterrupt, Exception), err:
            if multiprocessing:
                # Don't go interactive if we're in a subprocess
                # Instead, just return with an error
                response.update({
                    'error' : exception_tuple(str_tb=True),
                })
                return response
            else:
                # Drop into the shell
                if type(err) == KeyboardInterrupt:
                    print "Dropping out on keyboard interrupt"
                    print "Entering shell: use 'chart' command to see current state of parse"
                elif options.error_shell:
                    print >>sys.stderr, "Error parsing %s" % str(input)
                    print >>sys.stderr, "The error was:"
                    traceback.print_exc(file=sys.stderr)
                # If we keyboard interrupted, always go into the shell, so
                # the user can see how far we got
                if options.error_shell or type(err) == KeyboardInterrupt:
                    # Instead of exiting, enter the interactive shell
                    print
                    from jazzparser.shell import interactive_shell
                    env = {}
                    env.update(globals())
                    env.update(locals())
                    interactive_shell(parser.chart.parses, options, tagger,
                                      parser, grammar.formalism, env,
                                      input_data=input)
                    return
                else:
                    # No shell requested: let the outer handler deal with it
                    raise
    except (KeyboardInterrupt, Exception), err:
        if multiprocessing:
            # Subprocess: report the error in the response
            response.update({
                'error' : exception_tuple(str_tb=True),
            })
            return response
        else:
            if type(err) == KeyboardInterrupt:
                print "Exiting on keyboard interrupt"
                sys.exit(1)
            else:
                response.update({
                    'error' : exception_tuple(str_tb=True),
                    'messages' : messages,
                    'time' : timer.get_time(),
                })
                return response
    # NOTE(review): no success-path return is visible here -- ``results``
    # is never stored in response['results'] and the function falls off
    # the end (returning None) when parsing succeeds. This chunk is
    # probably truncated; confirm against the full file before relying on
    # the return value.