def test_detect_input_type(self):
    """
    Check that C{detect_input_type} recognizes each kind of wrapped
    input and that the reported datatype name matches the name looked
    up from the object's type via C{input_type_name}.
    """
    # Load some input: DbInput
    dbi = DbInput.from_file(DB_SEQUENCES_FILE, {"index": 0})
    # Run it through the preprocessor
    datatype, obj = detect_input_type(dbi)
    # Get the datatype from the type name lists
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Do the same with ChordInput
    ci = ChordInput.from_file(CHORDS_FILE, options={"roman": True})
    datatype, obj = detect_input_type(ci)
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Try some bulk input
    bulk = DbBulkInput.from_file(DB_SEQUENCES_FILE)
    datatype, obj = detect_input_type(bulk, allow_bulk=True)
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Try restricting the allowed type
    datatype, obj = detect_input_type(ci, allowed=["chords"])
    # And this one should get rejected.
    # Note: assertRaises takes *args/**kwargs directly; previously a
    # tuple and a dict were passed as two positional arguments, so the
    # function under test received the wrong arguments entirely.
    self.assertRaises(InputTypeError, detect_input_type, ci, allowed="db")
def test_detect_input_type(self):
    """
    Check that C{detect_input_type} recognizes each kind of wrapped
    input and that the reported datatype name matches the name looked
    up from the object's type via C{input_type_name}.
    """
    # Load some input: DbInput
    dbi = DbInput.from_file(DB_SEQUENCES_FILE, {'index': 0})
    # Run it through the preprocessor
    datatype, obj = detect_input_type(dbi)
    # Get the datatype from the type name lists
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Do the same with ChordInput
    ci = ChordInput.from_file(CHORDS_FILE, options={'roman': True})
    datatype, obj = detect_input_type(ci)
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Try some bulk input
    bulk = DbBulkInput.from_file(DB_SEQUENCES_FILE)
    datatype, obj = detect_input_type(bulk, allow_bulk=True)
    datatype2 = input_type_name(type(obj))
    self.assertEqual(datatype, datatype2)

    # Try restricting the allowed type
    datatype, obj = detect_input_type(ci, allowed=['chords'])
    # And this one should get rejected.
    # Note: assertRaises takes *args/**kwargs directly; previously a
    # tuple and a dict were passed as two positional arguments, so the
    # function under test received the wrong arguments entirely.
    self.assertRaises(InputTypeError, detect_input_type, ci, allowed='db')
def __init__(self, grammar, input, options=None, original_input=None,
             logger=None):
    """
    The tagger must have reference to the grammar being used to parse
    the input. It must also be given the full input when instantiated.
    The format of this input will depend on the tagger: for example,
    it might be a string or a MIDI file.

    @param options: tagger-specific options dict, validated through
        C{check_options}. Defaults to no options.
    @param original_input: the input in its original, unprocessed form.
        This will usually be a string. This is optional, but in some
        circumstances things might fall apart if it hasn't been given.
        E.g. using a backoff model as backoff from a tagging model
        requires the original input to be passed to the backoff model.
    @param logger: optional progress logger. Logging will be sent to
        this during initialization of the tagger and tagging. If not
        given, the logging will be lost. Subclasses may access the
        logger (or a dummy logger if none was given) in C{self.logger}.
    """
    # Avoid the shared mutable default-argument pitfall: the previous
    # default was a module-level {} shared between all calls
    if options is None:
        options = {}
    self.grammar = grammar
    # Check the formalism is one that's allowed by this tagger
    formalism = self.grammar.formalism.get_name()
    if formalism not in self.COMPATIBLE_FORMALISMS:
        # Call-form raise: valid in both Python 2 and Python 3
        raise TaggerLoadError("Formalism '%s' cannot be used with "
                              "tagger '%s'" % (formalism, self.name))
    # Check what input type we've received and preprocess it
    datatype, input = detect_input_type(input, allowed=self.INPUT_TYPES, \
                errmess=" for use with tagger '%s'" % self.name)
    # Store this for the subclass to use as appropriate
    self.input = input
    if original_input is None:
        self.original_input = input
    else:
        self.original_input = original_input
    # Subclasses may redefine self.input to taste
    # We keep the original wrapped input somewhere where it's sure to remain
    self.wrapped_input = input
    # Initialize using tagger-specific options
    self.options = type(self).check_options(options)
    if logger is not None:
        self.logger = logger
    else:
        self.logger = create_dummy_logger()
def __init__(self, input, options=None, logger=None):
    """
    Wrap and preprocess C{input} ready for the subclass to use.

    @param options: tagger-specific options dict, validated through
        C{check_options}. Defaults to no options.
    @param logger: optional progress logger; if not given, logging
        goes to stderr via a plain stderr logger.
    """
    # Avoid the shared mutable default-argument pitfall: the previous
    # default was a module-level {} shared between all calls
    if options is None:
        options = {}
    # Initialize using tagger-specific options
    self.options = type(self).check_options(options)
    # Check what input type we've received and preprocess it
    datatype, input = detect_input_type(input, allowed=self.INPUT_TYPES)
    # Store this for the subclass to use as appropriate
    self.input = input
    self.original_input = input
    # Subclasses may redefine self.input to taste
    # We keep the original wrapped input somewhere where it's sure to remain
    self.wrapped_input = input
    # Make sure we have some logger
    if logger is None:
        # Output to stderr instead
        self.logger = create_plain_stderr_logger()
    else:
        self.logger = logger
def train(self, inputs, grammar=None, logger=None):
    """
    Train the chord-class HMM on MIDI data, optionally bootstrapping
    the transition distribution from annotated chord sequences, then
    refine it with Baum-Welch EM retraining.

    @type inputs: L{jazzparser.data.input.MidiTaggerTrainingBulkInput} or
        list of L{jazzparser.data.input.Input}s
    @param inputs: training MIDI data. Annotated chord sequences should
        also be given (though this is optional) by loading a bulk db
        input file in the MidiTaggerTrainingBulkInput.
    @param grammar: grammar to build the chord classes from; the
        default grammar is loaded if none is given.
    @param logger: optional progress logger, passed through to the
        Baum-Welch trainer.
    """
    if grammar is None:
        from jazzparser.grammar import get_grammar
        # Load the default grammar
        grammar = get_grammar()

    if len(inputs) == 0:
        # No data - nothing to do
        return

    # Check the type of one of the inputs - no guarantee they're all the
    # same, but there's something seriously weird going on if they're not
    input_type = detect_input_type(inputs[0], allowed=['segmidi'])

    # Get the chord training data too if it's been given
    if isinstance(inputs, MidiTaggerTrainingBulkInput) and \
            inputs.chords is not None:
        chord_inputs = inputs.chords
    else:
        chord_inputs = None

    # Initialize the emission distribution for chord classes
    self.hmm = ChordClassHmm.initialize_chord_classes(
                    self.options['ccprob'],
                    self.options['maxnotes'],
                    grammar,
                    metric=self.options['metric'],
                    illegal_transitions=self.options['illegal_transitions'],
                    fixed_root_transitions=self.options['fixed_roots'])

    if chord_inputs:
        # If chord training data was given, initially train transition
        # distribution from this
        self.hmm.add_history("Training initial transition distribution "\
            "from annotated chord data")
        self.hmm.train_transition_distribution(chord_inputs, grammar, \
            contprob=self.options['contprob'])
    else:
        # Otherwise it gets left as a uniform distribution
        self.hmm.add_history("No annotated chord training data given. "\
            "Transition distribution initialized to uniform.")

    # Get a Baum-Welch trainer to do the EM retraining
    # Pull out the options to pass to the trainer: only those option
    # names the trainer itself declares
    bw_opt_names = [opt.name for opt in ChordClassBaumWelchTrainer.OPTIONS]
    bw_opts = dict([(name,val) for (name,val) in self.options.items() \
                        if name in bw_opt_names])
    retrainer = ChordClassBaumWelchTrainer(self.hmm, options=bw_opts)
    # Prepare a callback to save the model's state between EM iterations
    def _get_save_callback():
        def _save_callback():
            self.save()
        return _save_callback
    save_callback = _get_save_callback()
    # Do the Baum-Welch training
    retrainer.train(inputs, logger=logger, save_callback=save_callback)

    # Record a human-readable summary of the training configuration
    self.model_description = """\
Initial chord class emission prob: %(ccprob)f
Initial self-transition prob: %(contprob)s
Metrical model: %(metric)s
""" % \
        { 'ccprob' : self.options['ccprob'],
          'metric' : self.options['metric'],
          'contprob' : self.options['contprob'],
        }
def train(data, name, logger=None, options=None, chord_data=None):
    """
    Initializes and trains an HMM in a supervised fashion using the given
    training data.

    @param data: bulk training input: either segmented MIDI data
        (optionally carrying annotated chord sequences) or a
        L{DbBulkInput} of chord corpus data only.
    @param name: name to give the trained model.
    @param logger: optional progress logger; a dummy logger is used if
        none is given.
    @param options: training options dict, processed through
        C{HPChordLabeler.process_training_options}. Defaults to no
        options.
    @param chord_data: explicit chord corpus data, overriding any chord
        data bundled with midi training input.
    @return: the trained L{HPChordLabeler} model.
    @raise ModelTrainError: if the training data set is empty.
    """
    if len(data) == 0:
        # Call-form raise: valid in both Python 2 and Python 3
        raise ModelTrainError("empty training data set")
    # Prepare a dummy logger if none was given
    if logger is None:
        logger = create_dummy_logger()
    # Avoid the shared mutable default-argument pitfall: the previous
    # default was a module-level {} shared between all calls
    if options is None:
        options = {}
    # Process the options dict
    options = HPChordLabeler.process_training_options(options)

    # Work out what kind of input data we've got
    # It should be a bulk input type: check what type the first input is
    input_type = detect_input_type(data[0],
                                   allowed=['segmidi', 'db-annotated'])

    logger.info(">>> Beginning training of HP chord labeler model '%s'" % name)

    # If we got midi tagger training data, it may include chord data as well
    if isinstance(data, MidiTaggerTrainingBulkInput) and \
            data.chords is not None:
        if chord_data is None:
            # Use the chord data in the input data
            logger.info("Midi training data; chord corpus data available")
            chord_inputs = data.chords
        else:
            # Use the chord data that was given explicitly
            chord_inputs = chord_data
        midi_inputs = data
    elif isinstance(data, DbBulkInput):
        logger.info("Only chord corpus training data")
        # This was only chord input, no midi data
        chord_inputs = data
        midi_inputs = None
    else:
        chord_inputs = chord_data
        # Presumably this is another form of midi training data
        midi_inputs = data
        logger.info("Midi training data; no chord data was included")

    # Get the chord vocab from the options
    logger.info("Model chord vocabulary: %s" % options['vocab'])
    vocab, vocab_mapping = CHORD_VOCABS[options['vocab']]

    # Initialize a model according to the chord types
    logger.info("Initializing emission distributions to favour chord "\
        "notes with chord probability %s" % (options['chordprob']))
    model = HPChordLabeler.initialize_chords(options['chordprob'], \
                                             options['maxnotes'], vocab, \
                                             vocab_mapping, name=name)

    # If we have chord training data, use this to train the transition dist
    if chord_inputs is not None:
        logger.info("Training using chord data")
        # Construct the trees implicit in the annotations to get the
        # key of every chord
        logger.info("Preparing key data for annotated chord sequences")
        input_keys = [keys_for_sequence(dbinput) for dbinput in chord_inputs]
        # Run the supervised training of the transition distribution
        logger.info("Training transition distribution on chord sequences")
        model.train_transition_distribution(chord_inputs, input_keys)

    if midi_inputs is not None:
        logger.info("Training using midi data")
        # Preprocess the midi inputs so they're ready for the model training
        emissions = [midi_to_emission_stream(seq, remove_empty=False)[0] \
                        for seq in midi_inputs]
        # Use the midi data to train emission number dist
        logger.info("Training emission number distribution")
        model.train_emission_number_distribution(emissions)

        ####### EM unsupervised training on the midi data
        # Pull out the options to pass to the trainer
        # These are a subset of the model training options
        bw_opt_names = [opt.name for opt in HPBaumWelchTrainer.OPTIONS]
        bw_opts = dict([(name,val) for (name,val) in options.items() \
                            if name in bw_opt_names])
        # Create a Baum-Welch trainer
        trainer = HPBaumWelchTrainer(model, bw_opts)
        # Do the Baum-Welch training
        model = trainer.train(emissions, logger=logger)
    logger.info("Training complete")
    return model