Example #1
def main():
    usage = "%prog [options] <in-file>"
    parser = OptionParser(usage=usage)
    parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", default=DEFAULT_PARTITIONS, help="the number of partitions to use (default: %d)" % DEFAULT_PARTITIONS)
    parser.add_option("--ids", dest="ids", action="store_true", help="don't output any files - just print out a list of the ids of the sequences in each partition")
    options, arguments = parser.parse_args()
        
    if len(arguments) == 0:
        print >>sys.stderr, "You must specify an input data file"
        sys.exit(1)
    filename = os.path.abspath(arguments[0])
    
    # Read in the data file
    seqs = SequenceIndex.from_file(filename)
    
    part_pattern = "%s.part%%d" % filename
    heldout_pattern = "%s.heldout_part%%d" % filename
    # Divide the data up into partitions, with their complements
    parts = zip(partition(seqs.sequences, options.partitions), holdout_partition(seqs.sequences, options.partitions))
    # Save each partition and its complement
    for i,(part,heldout) in enumerate(parts):
        if options.ids:
            # Just print out a list of the ids in the partition
            print " ".join(["%d" % s.id for s in part])
        else:
            save_sequences(part_pattern % i, part)
            save_sequences(heldout_pattern % i, heldout)
            print >>sys.stderr, "Wrote partition %d to %s and %s" % (i,part_pattern % i,heldout_pattern % i)
Example #2
def main():
    usage = "%prog [options] <model-type> <model_name> <in-file>"
    description = "Trains a backoff builder model using the given "\
        "input data. Specify a model type (ngram, etc) and a name to "\
        "identify it. The data file should be a stored SequenceIndex file."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.")
    parser.add_option('--opts', dest="training_opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.")
    # File input options
    parser.add_option("--filetype", "--ft", dest="filetype", action="store", help="select the file type for the input file. Same filetypes as jazzparser", default='bulk-db')
    parser.add_option("--file-options", "--fopt", dest="file_options", action="store", help="options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options.")
    options, arguments = parse_args_with_config(parser)
    
    if len(arguments) < 3:
        print >>sys.stderr, "You must specify a model type, a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[2])
    model_type = arguments[0]
    model_name = arguments[1]
    
    builder_cls = get_backoff_builder(model_type)
    model_cls = builder_cls.MODEL_CLASS
    
    # Load the sequence data from a dbinput file
    input_data = command_line_input(filename=filename, 
                                    filetype=options.filetype, 
                                    options=options.file_options,
                                    allowed_types=['bulk-db', 'bulk-db-annotated'])
    
    # Handle any training options that were given on the command line
    if options.training_opts is None:
        training_opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(model_cls.TRAINING_OPTIONS, intro="Training options for %s" % model_cls.__name__)
        sys.exit(0)
    else:
        training_opts = ModuleOption.process_option_dict(
                            ModuleOption.process_option_string(options.training_opts), 
                            model_cls.TRAINING_OPTIONS)
        
    if options.partitions is not None:
        parts = holdout_partition(input_data, options.partitions)
        models = [(builder_cls.partition_model_name(model_name,num),seqs) for \
                        num,seqs in enumerate(parts)]
    else:
        models = [(model_name,input_data)]
    
    for part_name,seqs in models:
        # Instantiate a fresh model with this name
        model = model_cls(part_name, options=training_opts)
        # Train it with the loaded data
        model.train(seqs)
        model.save()
        print "Trained model %s" % (part_name)
Example #3
def main():
    usage = "%prog [options] <in-file>"
    parser = OptionParser(usage=usage)
    parser.add_option("-p",
                      "--partitions",
                      dest="partitions",
                      action="store",
                      type="int",
                      default=DEFAULT_PARTITIONS,
                      help="the number of partitions to use (default: %d)" %
                      DEFAULT_PARTITIONS)
    parser.add_option(
        "--ids",
        dest="ids",
        action="store_true",
        help=
        "don't output any files - just print out a list of the ids of the sequences in each partition"
    )
    options, arguments = parser.parse_args()

    if len(arguments) == 0:
        print >> sys.stderr, "You must specify an input data file"
        sys.exit(1)
    filename = os.path.abspath(arguments[0])

    # Read in the data file
    seqs = SequenceIndex.from_file(filename)

    part_pattern = "%s.part%%d" % filename
    heldout_pattern = "%s.heldout_part%%d" % filename
    # Divide the data up into partitions, with their complements
    parts = zip(partition(seqs.sequences, options.partitions),
                holdout_partition(seqs.sequences, options.partitions))
    # Save each partition and its complement
    for i, (part, heldout) in enumerate(parts):
        if options.ids:
            # Just print out a list of the ids in the partition
            print " ".join(["%d" % s.id for s in part])
        else:
            save_sequences(part_pattern % i, part)
            save_sequences(heldout_pattern % i, heldout)
            print >> sys.stderr, "Wrote partition %d to %s and %s" % (
                i, part_pattern % i, heldout_pattern % i)
Example #4
from apps.sequences.datautils import save_pickled_data
from apps.sequences.models import ChordSequence
from django.db.models import Q
from jazzparser.utils.data import holdout_partition, partition
import os.path, sys

NUM_PARTITIONS = 10
FILENAME = "partition"

# Build a list of the sequences to put in each partition
# Only include fully annotated sequences
print >> sys.stderr, "Building list of fully annotated sequences"
seqs = [
    seq.id for seq in ChordSequence.objects.filter(analysis_omitted=False)
    if seq.fully_annotated
]
partitions = zip(partition(seqs, NUM_PARTITIONS),
                 holdout_partition(seqs, NUM_PARTITIONS))

for i, parts in enumerate(partitions):
    part, rest = parts
    # Output two files for each partition
    part_file = "%s-%d" % (FILENAME, i)
    held_file = "%s-%d-heldout" % (FILENAME, i)
    print >> sys.stderr, "Outputing partition %d to %s and %s" % (i, part_file,
                                                                  held_file)
    # Output the partition's file
    query = Q(id__in=part)
    save_pickled_data(part_file, query)
    # Output the rest of the data
    query = Q(id__in=rest)
    save_pickled_data(held_file, query)
Example #5
from apps.sequences.datautils import save_pickled_data
from apps.sequences.models import ChordSequence
from django.db.models import Q
from jazzparser.utils.data import holdout_partition, partition
import os.path, sys

NUM_PARTITIONS = 10
FILENAME = "partition"

# Build a list of the sequences to put in each partition
# Only include fully annotated sequences
print >>sys.stderr, "Building list of fully annotated sequences"
seqs = [seq.id for seq in 
                ChordSequence.objects.filter(analysis_omitted=False)
                if seq.fully_annotated]
partitions = zip(partition(seqs, NUM_PARTITIONS), holdout_partition(seqs, NUM_PARTITIONS))

for i,parts in enumerate(partitions):
    part, rest = parts
    # Output two files for each partition
    part_file = "%s-%d" % (FILENAME, i)
    held_file = "%s-%d-heldout" % (FILENAME, i)
    print >>sys.stderr, "Outputing partition %d to %s and %s" % (i, part_file, held_file)
    # Output the partition's file
    query = Q(id__in=part)
    save_pickled_data(part_file, query)
    # Output the rest of the data
    query = Q(id__in=rest)
    save_pickled_data(held_file, query)
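save_pickled_data receives a filename and a Django Q object. A sketch of what it is assumed to do (the real implementation is in apps.sequences.datautils): filter ChordSequence by the query and pickle the matching rows to a file.

import cPickle as pickle
from apps.sequences.models import ChordSequence

def save_pickled_data(filename, query):
    # Hypothetical: fetch the sequences selected by the Q object and
    # dump them to a pickle file
    sequences = list(ChordSequence.objects.filter(query))
    f = open(filename, "wb")
    try:
        pickle.dump(sequences, f)
    finally:
        f.close()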
Example #6
def main():
    usage = "%prog [<options>] <model-name> <training-input>"
    description = "Training of PCFG models."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", \
        help="Number of partitions to divide the data into. "\
            "For train, divides the input file, trains a model on each "\
            "partition's complement and appends partition number to "\
            "the model names. For del, appends partition numbers to model "\
            "names and deletes all the models. Recache does similarly. "\
            "Has no effect for parse.")
    parser.add_option('--opts', dest="training_opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options")
    parser.add_option("--debug", dest="debug", action="store_true", help="Output verbose logging information to stderr")
    parser.add_option("-g", "--grammar", dest="grammar", action="store", help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)
    
    if options.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.WARN
    # Create a logger for training
    logger = create_logger(log_level = log_level,
                  name = "training",
                  stderr = True)
    
    # Load a grammar
    grammar = get_grammar(options.grammar)
    # Get the pcfg model class for the formalism
    PcfgModel = grammar.formalism.PcfgModel
        
    # Parse the option string
    if options.training_opts is None:
        opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(PcfgModel.TRAINING_OPTIONS, 
                                            intro="Training options for PCFGs")
        sys.exit(0)
    else:
        opts = ModuleOption.process_option_dict(
                    ModuleOption.process_option_string(options.training_opts),
                    PcfgModel.TRAINING_OPTIONS)
    
    if len(arguments) == 0:
        print >>sys.stderr, "Specify a model name"
        models = PcfgModel.list_models()
        print >>sys.stderr, "Available models: %s" % ", ".join(models)
        sys.exit(1)
    model_name = arguments[0]
    print "Model base name:", model_name
    
    if options.partitions is not None:
        parts = [(i, "%s%d" % (model_name, i)) for i in range(options.partitions)]
    else:
        parts = [(None, model_name)]
    
    if len(arguments) < 2:
        print >>sys.stderr, "Specify an input file to read sequence data from"
        sys.exit(1)
    # Read in the training data from the given file
    seqs = SequenceIndex.from_file(arguments[1])
    
    if options.partitions is not None:
        # Prepare each training partition
        datasets = holdout_partition(seqs.sequences, options.partitions)
    else:
        datasets = [seqs.sequences]
        
    for dataset,(parti,part_model) in zip(datasets,parts):
        # Train the named model on the sequence data
        model = PcfgModel.train(part_model, dataset, opts, grammar=grammar, 
                                logger=logger)
        model.save()
        print "Trained model", part_model
Example #7
def main():
    usage = "%prog [options] <model_name> <input-file>"
    description = (
        "Trains a model for the RaphSto chord labelling "
        "algorithm on a file that contains a list of midi files with "
        "training options"
    )
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        "-p",
        "--partitions",
        dest="partitions",
        action="store",
        type="int",
        help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.",
    )
    parser.add_option(
        "--opts",
        dest="opts",
        action="store",
        help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.",
    )
    parser.add_option(
        "--proc",
        "--processes",
        dest="processes",
        action="store",
        type="int",
        help="number of parallel processes to spawn for the training. Use -1 to spawn one per training sequence (after splitting: see split_length)",
        default=1,
    )
    parser.add_option(
        "--max-length",
        dest="max_length",
        action="store",
        type="int",
        help="limits the length of the training midi sequences in chunks",
    )
    parser.add_option(
        "--split-length",
        dest="split_length",
        action="store",
        type="int",
        help="limits the length of the training midi sequences in chunks, but instead of throwing away everything after the first N chunks, splits it off as if it were starting a new sequence. This is good for multiprocessing, since many short sequences can be multitasked, whilst few long ones cannot",
    )
    parser.add_option(
        "--min-length",
        dest="min_length",
        action="store",
        type="int",
        help="ignores any sequences under this number of chunks. This is useful with --split-length, which can leave very short sequences from the end of a split sequence",
    )
    parser.add_option(
        "--progress-out",
        dest="progress_out",
        action="store",
        help="output logging info to a file instead of the command line",
    )
    parser.add_option(
        "--init-model",
        dest="init_model",
        action="store",
        help="initialize the model using parameters from an already trained model",
    )
    parser.add_option(
        "--init-ctrans",
        dest="init_ctrans",
        action="store",
        help="initialize the chord transition distribution using these parameters. Comma-separated list of params given as C0->C1-P, where C0 and C1 are chords (I, II, etc) and P is a float probability",
    )
    parser.add_option(
        "--chord-set",
        dest="chord_set",
        action="store",
        help="use a chord set other than the default. Use value 'help' to see a list. Has no effect in combination with --init-model, since the old model's chord set will be used",
    )
    parser.add_option(
        "-m",
        "--model-type",
        dest="model_type",
        action="store",
        help="select a model type: one of %s (default: standard)" % ", ".join(mt for mt in MODEL_TYPES.keys()),
        default="standard",
    )
    options, arguments = parse_args_with_config(parser)

    if options.opts == "help":
        print options_help_text(RaphstoBaumWelchTrainer.OPTIONS, intro="Training options for Raphael and Stoddard HMMs")
        sys.exit(0)
    opts = ModuleOption.process_option_string(options.opts)

    if len(arguments) < 2:
        print >> sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]

    print >> sys.stderr, "Raphsto training beginning at %s" % datetime.now().isoformat(" ")
    # Create a logger to output the progress of the training to stdout or a file
    if options.progress_out is not None:
        stdout = False
        logfile = options.progress_out
        print >> sys.stderr, "Outputing logging info to %s" % logfile
    else:
        stdout = True
        logfile = None
        print >> sys.stderr, "Outputing logging to stdout"
    logger = create_logger(name="raphsto_train", filename=logfile, stdout=stdout)
    logger.info("Raphael and Stoddard HMM model training")

    if options.model_type not in MODEL_TYPES:
        print >> sys.stderr, "Model type must be one of: %s" % ", ".join(mt for mt in MODEL_TYPES)
        sys.exit(1)
    model_cls = MODEL_TYPES[options.model_type]

    if options.chord_set == "help":
        print "Available chord sets: %s" % ", ".join(constants.CHORD_SETS.keys())
        sys.exit(0)
    elif options.chord_set is not None:
        # Check this chord set exists
        if options.chord_set not in constants.CHORD_SETS:
            print >> sys.stderr, "Chord set '%s' does not exist" % options.chord_set
            sys.exit(1)
        else:
            logger.info("Using chord set '%s'" % options.chord_set)

    # Read in the training data
    midis = InputSourceFile(filename)
    handlers = midis.get_handlers()
    logger.info("Reading in %d midi files..." % len(midis.inputs))
    training_data = []
    for i, mh in enumerate(handlers):
        logger.info("%s: %s" % (i, midis.inputs[i][0]))
        emissions = mh.get_emission_stream()[0]
        if options.max_length is not None and len(emissions) > options.max_length:
            logger.info("Truncating file %d to %d chunks (was %d)" % (i, options.max_length, len(emissions)))
            emissions = emissions[: options.max_length]
        if options.split_length is not None:
            logger.info("Splitting sequence %d into sequence no longer " "than %d chunks" % (i, options.split_length))
            # Split up the sequence if it's too long
            while len(emissions) > options.split_length:
                training_data.append(emissions[: options.split_length])
                emissions = emissions[options.split_length :]
        training_data.append(emissions)

    if options.min_length is not None:
        # Make sure there are no sequences under the minimum length
        # Just throw away any that are
        before_chuck = len(training_data)
        training_data = [seq for seq in training_data if len(seq) >= options.min_length]
        if len(training_data) != before_chuck:
            logger.info(
                "Threw away %d short sequences (below %d chunks)"
                % ((before_chuck - len(training_data)), options.min_length)
            )

    logger.info(
        "Training on %d sequences. Lengths: %s"
        % (len(training_data), ", ".join(str(len(seq)) for seq in training_data))
    )

    if options.partitions is not None:
        parts = holdout_partition(training_data, options.partitions)
        models = [("%s%d" % (model_name, num), data) for num, data in enumerate(parts)]
    else:
        models = [(model_name, training_data)]

    # Number of processes to use
    if options.processes == -1:
        # Special value: means number of training sequences (one process per sequence)
        processes = len(training_data)
    else:
        processes = options.processes

    for part_name, data in models:
        # Instantiate a fresh model with this name
        logger.info("Training model '%s' on %d midis" % (part_name, len(data)))
        if options.init_model is not None:
            logger.info("Initializing using parameters from model '%s'" % options.init_model)
            # Load an already trained model as initialization
            model = model_cls.initialize_existing_model(options.init_model, model_name=part_name)
        else:
            # TODO: make these probs an option
            ctype_params = (0.5, 0.3, 0.2)
            logger.info("Initializing to naive chord types using parameters: " "%s, %s, %s" % ctype_params)
            init_kwargs = {"model_name": part_name}
            if options.chord_set is not None:
                # Specify a chord set for the model
                init_kwargs["chord_set"] = options.chord_set
            model = model_cls.initialize_chord_types(ctype_params, **init_kwargs)

            # Initialize the chord transition probabilities if given
            if options.init_ctrans is not None:
                logger.info("Initializing chord transition distribution to %s" % options.init_ctrans)
                model.set_chord_transition_probabilities(options.init_ctrans)
        # Retrain it with the loaded data
        trainer = model_cls.get_trainer()(model, options=opts)
        trainer.train(data, logger=logger, processes=processes, save_intermediate=True)
    print >> sys.stderr, "Training terminating at %s" % datetime.now().isoformat(" ")
Example #8
def main():
    usage = "%prog [options] <model_name> <input-file>"
    description = "Trains a model for the RaphSto chord labelling "\
        "algorithm on a file that contains a list of midi files with "\
        "training options"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option('-p', '--partitions', dest="partitions", action="store", type="int", help="train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number.")
    parser.add_option('--opts', dest="opts", action="store", help="options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type.")
    parser.add_option('--proc', '--processes', dest="processes", action="store", type="int", help="number of parallel processes to spawn for the training. Use -1 to spawn one per training sequence (after splitting: see split_length)", default=1)
    parser.add_option('--max-length', dest="max_length", action="store", type="int", help="limits the length of the training midi sequences in chunks")
    parser.add_option('--split-length', dest="split_length", action="store", type="int", help="limits the length of the training midi sequences in chunks, but instead of throwing away everything after the first N chunks, splits it off as if it were starting a new sequence. This is good for multiprocessing, since many short sequences can be multitasked, whilst few long ones cannot")
    parser.add_option('--min-length', dest="min_length", action="store", type="int", help="ignores any sequences under this number of chunks. This is useful with --split-length, which can leave very short sequences from the end of a split sequence")
    parser.add_option('--progress-out', dest="progress_out", action="store", help="output logging info to a file instead of the command line")
    parser.add_option('--init-model', dest="init_model", action="store", help="initialize the model using parameters from an already trained model")
    parser.add_option('--init-ctrans', dest="init_ctrans", action="store", help="initialize the chord transition distribution using these parameters. Comma-separated list of params given as C0->C1-P, where C0 and C1 are chords (I, II, etc) and P is a float probability")
    parser.add_option('--chord-set', dest="chord_set", action="store", help="use a chord set other than the default. Use value 'help' to see a list. Has no effect in combination with --init-model, since the old model's chord set will be used")
    parser.add_option('-m', '--model-type', dest="model_type", action="store", help="select a model type: one of %s (default: standard)" % ", ".join(mt for mt in MODEL_TYPES.keys()), default="standard")
    options, arguments = parse_args_with_config(parser)
    
    if options.opts == "help":
        print options_help_text(RaphstoBaumWelchTrainer.OPTIONS, intro="Training options for Raphael and Stoddard HMMs")
        sys.exit(0)
    opts = ModuleOption.process_option_string(options.opts)
    
    if len(arguments) < 2:
        print >>sys.stderr, "You must specify a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[1])
    model_name = arguments[0]
    
    print >>sys.stderr, "Raphsto training beginning at %s" % datetime.now().isoformat(' ')
    # Create a logger to output the progress of the training to stdout or a file
    if options.progress_out is not None:
        stdout = False
        logfile = options.progress_out
        print >>sys.stderr, "Outputing logging info to %s" % logfile
    else:
        stdout = True
        logfile = None
        print >>sys.stderr, "Outputing logging to stdout"
    logger = create_logger(name="raphsto_train", filename=logfile, stdout=stdout)
    logger.info("Raphael and Stoddard HMM model training")
        
    if options.model_type not in MODEL_TYPES:
        print >>sys.stderr, "Model type must be one of: %s" % ", ".join(mt for mt in MODEL_TYPES)
        sys.exit(1)
    model_cls = MODEL_TYPES[options.model_type]
    
    if options.chord_set == "help":
        print "Available chord sets: %s" % ", ".join(constants.CHORD_SETS.keys())
        sys.exit(0)
    elif options.chord_set is not None:
        # Check this chord set exists
        if options.chord_set not in constants.CHORD_SETS:
            print >>sys.stderr, "Chord set '%s' does not exist" % options.chord_set
            sys.exit(1)
        else:
            logger.info("Using chord set '%s'" % options.chord_set)
    
    
    # Read in the training data
    midis = InputSourceFile(filename)
    handlers = midis.get_handlers()
    logger.info("Reading in %d midi files..." % len(midis.inputs))
    training_data = []
    for i,mh in enumerate(handlers):
        logger.info("%s: %s" % (i,midis.inputs[i][0]))
        emissions = mh.get_emission_stream()[0]
        if options.max_length is not None and len(emissions) > options.max_length:
            logger.info("Truncating file %d to %d chunks (was %d)" % \
                                    (i,options.max_length,len(emissions)))
            emissions = emissions[:options.max_length]
        if options.split_length is not None:
            logger.info("Splitting sequence %d into sequence no longer "\
                                "than %d chunks" % (i,options.split_length))
            # Split up the sequence if it's too long
            while len(emissions) > options.split_length:
                training_data.append(emissions[:options.split_length])
                emissions = emissions[options.split_length:]
        training_data.append(emissions)
    
    if options.min_length is not None:
        # Make sure there are no sequences under the minimum length
        # Just throw away any that are
        before_chuck = len(training_data)
        training_data = [seq for seq in training_data if len(seq) >= options.min_length]
        if len(training_data) != before_chuck:
            logger.info("Threw away %d short sequences (below %d chunks)" % \
                    ((before_chuck-len(training_data)), options.min_length))
    
    logger.info("Training on %d sequences. Lengths: %s" % \
                    (len(training_data), 
                     ", ".join(str(len(seq)) for seq in training_data)))
    
    if options.partitions is not None:
        parts = holdout_partition(training_data, options.partitions)
        models = [("%s%d" % (model_name,num),data) for num,data in enumerate(parts)]
    else:
        models = [(model_name,training_data)]
        
    # Number of processes to use
    if options.processes == -1:
        # Special value: means number of training sequences (one process per sequence)
        processes = len(training_data)
    else:
        processes = options.processes
    
    for part_name,data in models:
        # Instantiate a fresh model with this name
        logger.info("Training model '%s' on %d midis" % (part_name, len(data)))
        if options.init_model is not None:
            logger.info("Initializing using parameters from model '%s'" % \
                options.init_model)
            # Load an already trained model as initialization
            model = model_cls.initialize_existing_model(options.init_model, \
                model_name=part_name)
        else:
            # TODO: make these probs an option
            ctype_params = (0.5, 0.3, 0.2)
            logger.info("Initializing to naive chord types using parameters: "\
                "%s, %s, %s" % ctype_params)
            init_kwargs = { 'model_name' : part_name }
            if options.chord_set is not None:
                # Specify a chord set for the model
                init_kwargs['chord_set'] = options.chord_set
            model = model_cls.initialize_chord_types(ctype_params, **init_kwargs)
            
            # Initialize the chord transition probabilities if given
            if options.init_ctrans is not None:
                logger.info("Initializing chord transition distribution to %s" \
                    % options.init_ctrans)
                model.set_chord_transition_probabilities(options.init_ctrans)
        # Retrain it with the loaded data
        trainer = model_cls.get_trainer()(model, options=opts)
        trainer.train(data, logger=logger, processes=processes, save_intermediate=True)
    print >>sys.stderr, "Training terminating at %s" % datetime.now().isoformat(' ')
Example #9
def train_model(model,
                data_filename,
                holdout_partitions=0,
                train_params={},
                chordmap=None):
    """
    Train a C&C model by calling the C&C supertagger training routine.
    
    model should be a model name to train/retrain.
    data_filename should be the path to a training data file in the 
    hybrid C&C format.
    train_params is an optional dict of (string) parameter values to feed
    to the C&C trainer. Only certain parameter values will be allowed. 
    These will override the default parameters in settings.CANDC. Set
    a parameter to None or an empty string to use C&C's default.
    
    """
    command = os.path.join(settings.CANDC.BASE_PATH, "bin", "train_super")

    # Process parameters that we'll use for training
    params = settings.CANDC.DEFAULT_TRAINING_PARAMS.copy()
    params.update(train_params)
    extra_args = []
    # Prepare the args for the training command with these parameters
    for key, val in params.items():
        if val is not None and val != "":
            extra_args.append('--%s' % key)
            extra_args.append(str(val))

    def _train(dest_model, filename):
        """ Train a model using the train_super command. """
        model_path = os.path.join(settings.CANDC.MODELS_PATH, *(dest_model))
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        # Store a list of possible tags so we can smooth over unseen ones
        generate_tag_list(os.path.join(model_path, "tags"))
        # Run the training
        command_args = [
            command,
            "--model",
            model_path,
            "--comment",
            "\"Model trained automatically on chord data in file %s\"" %
            data_filename,
            "--input",
            filename,
            # Tell C&C to put the tagdict in the model dir
            # Doc says this is the default, but it isn't...
            "--super-tagdict",
            "//tagdict"
        ] + extra_args
        print "Running: %s" % " ".join(command_args)
        trainer = Popen(command_args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        trainer.wait()
        if trainer.returncode != 0:
            raise CandcTrainingError, "There was an error training a "\
                    "supertagger model from the file %s: \n%s" % (data_filename, trainer.stderr.read())
        else:
            print "Trained model %s:\n%s" % (
                model_path, remove_ansi_colors(
                    trainer.stderr.read()).strip("\n"))

    # Read the data in from the given filename
    in_file = open(data_filename, 'r')
    lines = in_file.readlines()
    in_file.close()
    # Convert into the C&C training format
    lines = training_data_to_candc(lines)

    model = model.split(".")

    if holdout_partitions:
        # Split up the data into n partitions and train on every
        #  n-1 subset of them.

        # Build the lists with each partition held out
        partitions = holdout_partition(lines, holdout_partitions)
        # Train on each partitioned set
        for i, partition in enumerate(partitions):
            print "Training partition %d" % i
            temp_file = NamedTemporaryFile()
            temp_file.write("\n".join(partition))
            temp_file.flush()
            # Train the model on this part of the data
            _train(model + ["partition-%s" % i], temp_file.name)
            temp_file.close()
    else:
        temp_file = NamedTemporaryFile()
        temp_file.write("\n".join(lines))
        temp_file.flush()
        # Just train on the whole data
        _train(model, temp_file.name)
        temp_file.close()
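remove_ansi_colors is assumed to strip ANSI colour escape codes from the C&C trainer's output before it is printed. A minimal sketch of such a helper:

import re

ANSI_ESCAPE = re.compile(r'\x1b\[[0-9;]*m')

def remove_ansi_colors(text):
    # Remove ANSI SGR (colour/style) escape sequences
    return ANSI_ESCAPE.sub('', text)

print(remove_ansi_colors('\x1b[31merror\x1b[0m'))  # error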
Example #10
def main():
    usage = "%prog [options] <model-type> <model_name> <in-file>"
    description = "Trains a backoff builder model using the given "\
        "input data. Specify a model type (ngram, etc) and a name to "\
        "identify it. The data file should be a stored SequenceIndex file."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        '-p',
        '--partitions',
        dest="partitions",
        action="store",
        type="int",
        help=
        "train a number of partitions of the given data. Trains a model on the complement of each partition, so it can be tested on the partition. The models will be named <NAME>n, where <NAME> is the model name and n the partition number."
    )
    parser.add_option(
        '--opts',
        dest="training_opts",
        action="store",
        help=
        "options to pass to the model trainer. Type '--opts help' for a list of options for a particular model type."
    )
    # File input options
    parser.add_option(
        "--filetype",
        "--ft",
        dest="filetype",
        action="store",
        help=
        "select the file type for the input file. Same filetypes as jazzparser",
        default='bulk-db')
    parser.add_option(
        "--file-options",
        "--fopt",
        dest="file_options",
        action="store",
        help=
        "options for the input file. Type '--fopt help', using '--ft <type>' to select file type, for a list of available options."
    )
    options, arguments = parse_args_with_config(parser)

    if len(arguments) < 3:
        print >> sys.stderr, "You must specify a model type, a model name and an input data file as arguments"
        sys.exit(1)
    filename = os.path.abspath(arguments[2])
    model_type = arguments[0]
    model_name = arguments[1]

    builder_cls = get_backoff_builder(model_type)
    model_cls = builder_cls.MODEL_CLASS

    # Load the sequence data from a dbinput file
    input_data = command_line_input(
        filename=filename,
        filetype=options.filetype,
        options=options.file_options,
        allowed_types=['bulk-db', 'bulk-db-annotated'])

    # Handle any training options that were given on the command line
    if options.training_opts is None:
        training_opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(model_cls.TRAINING_OPTIONS,
                                intro="Training options for %s" %
                                model_cls.__name__)
        sys.exit(0)
    else:
        training_opts = ModuleOption.process_option_dict(
            ModuleOption.process_option_string(options.training_opts),
            model_cls.TRAINING_OPTIONS)

    if options.partitions is not None:
        parts = holdout_partition(input_data, options.partitions)
        models = [(builder_cls.partition_model_name(model_name,num),seqs) for \
                        num,seqs in enumerate(parts)]
    else:
        models = [(model_name, input_data)]

    for part_name, seqs in models:
        # Instantiate a fresh model with this name
        model = model_cls(part_name, options=training_opts)
        # Train it with the loaded data
        model.train(seqs)
        model.save()
        print "Trained model %s" % (part_name)
Example #11
def main():
    usage = "%prog [<options>] <model-name> <training-input>"
    description = "Training of PCFG models."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-p", "--partitions", dest="partitions", action="store", type="int", \
        help="Number of partitions to divide the data into. "\
            "For train, divides the input file, trains a model on each "\
            "partition's complement and appends partition number to "\
            "the model names. For del, appends partition numbers to model "\
            "names and deletes all the models. Recache does similarly. "\
            "Has no effect for parse.")
    parser.add_option(
        '--opts',
        dest="training_opts",
        action="store",
        help=
        "options to pass to the model trainer. Type '--opts help' for a list of options"
    )
    parser.add_option("--debug",
                      dest="debug",
                      action="store_true",
                      help="Output verbose logging information to stderr")
    parser.add_option("-g",
                      "--grammar",
                      dest="grammar",
                      action="store",
                      help="use the named grammar instead of the default.")
    options, arguments = parse_args_with_config(parser)

    if options.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.WARN
    # Create a logger for training
    logger = create_logger(log_level=log_level, name="training", stderr=True)

    # Load a grammar
    grammar = get_grammar(options.grammar)
    # Get the pcfg model class for the formalism
    PcfgModel = grammar.formalism.PcfgModel

    # Parse the option string
    if options.training_opts is None:
        opts = {}
    elif options.training_opts.lower() == "help":
        print options_help_text(PcfgModel.TRAINING_OPTIONS,
                                intro="Training options for PCFGs")
        sys.exit(0)
    else:
        opts = ModuleOption.process_option_dict(
            ModuleOption.process_option_string(options.training_opts),
            PcfgModel.TRAINING_OPTIONS)

    if len(arguments) == 0:
        print >> sys.stderr, "Specify a model name"
        models = PcfgModel.list_models()
        print >> sys.stderr, "Available models: %s" % ", ".join(models)
        sys.exit(1)
    model_name = arguments[0]
    print "Model base name:", model_name

    if options.partitions is not None:
        parts = [(i, "%s%d" % (model_name, i))
                 for i in range(options.partitions)]
    else:
        parts = [(None, model_name)]

    if len(arguments) < 2:
        print >> sys.stderr, "Specify an input file to read sequence data from"
        sys.exit(1)
    # Read in the training data from the given file
    seqs = SequenceIndex.from_file(arguments[1])

    if options.partitions is not None:
        # Prepare each training partition
        datasets = holdout_partition(seqs.sequences, options.partitions)
    else:
        datasets = [seqs.sequences]

    for dataset, (parti, part_model) in zip(datasets, parts):
        # Train the named model on the sequence data
        model = PcfgModel.train(part_model,
                                dataset,
                                opts,
                                grammar=grammar,
                                logger=logger)
        model.save()
        print "Trained model", part_model
Example #12
def train_model(model, data_filename, holdout_partitions=0, train_params={}, 
                    chordmap=None):
    """
    Train a C&C model by calling the C&C supertagger training routine.
    
    model should be a model name to train/retrain.
    data_filename should be the path to a training data file in the 
    hybrid C&C format.
    train_params is an optional dict of (string) parameter values to feed
    to the C&C trainer. Only certain parameter values will be allowed. 
    These will override the default parameters in settings.CANDC. Set
    a parameter to None or an empty string to use C&C's default.
    
    """
    command = os.path.join(settings.CANDC.BASE_PATH, "bin", "train_super")
    
    # Process parameters that we'll use for training
    params = settings.CANDC.DEFAULT_TRAINING_PARAMS.copy()
    params.update(train_params)
    extra_args = []
    # Prepare the args for the training command with these parameters
    for key,val in params.items():
        if val is not None and val != "":
            extra_args.append('--%s' % key)
            extra_args.append(str(val))
    
    def _train(dest_model, filename):
        """ Train a model using the train_super command. """
        model_path = os.path.join(settings.CANDC.MODELS_PATH, *(dest_model))
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        # Store a list of possible tags so we can smooth over unseen ones
        generate_tag_list(os.path.join(model_path, "tags"))
        # Run the training
        command_args = [command, "--model", model_path, "--comment", 
                        "\"Model trained automatically on chord data in file %s\"" % data_filename,
                        "--input", filename,
                        # Tell C&C to put the tagdict in the model dir
                        # Doc says this is the default, but it isn't...
                        "--super-tagdict", "//tagdict"] + extra_args
        print "Running: %s" % " ".join(command_args)
        trainer = Popen(command_args, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        trainer.wait()
        if trainer.returncode != 0:
            raise CandcTrainingError, "There was an error training a "\
                    "supertagger model from the file %s: \n%s" % (data_filename, trainer.stderr.read())
        else:
            print "Trained model %s:\n%s" % (model_path, remove_ansi_colors(trainer.stderr.read()).strip("\n"))
    
    # Read the data in from the given filename
    in_file = open(data_filename, 'r')
    lines = in_file.readlines()
    in_file.close()
    # Convert into the C&C training format
    lines = training_data_to_candc(lines)
    
    model = model.split(".")
    
    if holdout_partitions:
        # Split up the data into n partitions and train on every 
        #  n-1 subset of them.
        
        # Build the lists with each partition held out
        partitions = holdout_partition(lines, holdout_partitions)
        # Train on each partitioned set
        for i,partition in enumerate(partitions):
            print "Training partition %d" % i
            temp_file = NamedTemporaryFile()
            temp_file.write("\n".join(partition))
            temp_file.flush()
            # Train the model on this part of the data
            _train(model+["partition-%s" % i], temp_file.name)
            temp_file.close()
    else:
        temp_file = NamedTemporaryFile()
        temp_file.write("\n".join(lines))
        temp_file.flush()
        # Just train on the whole data
        _train(model, temp_file.name)
        temp_file.close()