def train(self, data, grammar=None, logger=None):
    if grammar is None:
        from jazzparser.grammar import get_grammar
        # Load the default grammar
        grammar = get_grammar()

    model = HmmPathNgram.train(data, self.options['estimator'], grammar,
                               cutoff=self.options['cutoff'],
                               chord_map=self.options['chord_mapping'],
                               order=self.options['n'],
                               backoff_orders=self.options['backoff'])
    self.model = model

    # Add some model-specific info into the descriptive text
    # so we know how it was trained
    est_name = get_estimator_name(self.options['estimator'])
    self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % {
        'est' : est_name,
        'seqs' : len(data),
        'samples' : sum([len(s) for s in data], 0),
        'order' : self.options['n'],
        'backoff' : self.options['backoff'],
        'cutoff' : self.options['cutoff'],
    }
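# Illustrative sketch (not part of the original source): the "Zero-count
# threshold" reported in the description above suggests that counts below
# `cutoff` are discarded before the estimator smooths the distribution.
# This standalone sketch shows that interplay using nltk's real FreqDist and
# WittenBellProbDist; the pruning step is an assumption about how
# HmmPathNgram.train applies its `cutoff` argument, not confirmed by the
# source.
from nltk.probability import FreqDist, WittenBellProbDist

samples = ["I"] * 5 + ["IV"] * 2 + ["V"]
fd = FreqDist(samples)
cutoff = 2
# Keep only samples whose count reaches the threshold
pruned = FreqDist([s for s in samples if fd[s] >= cutoff])
est = WittenBellProbDist(pruned, bins=10)   # smooth over 10 possible samples
print est.prob("I"), est.prob("V")          # "V" fell below the cutoff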
def description(self):
    buff = StringIO()

    def _fdist_str(fd):
        return "FDist<%d>: %s" % (fd.N(),
                    ", ".join("%s:%d" % pr for pr in fd.items()))

    def _cfd_str(cfd):
        fds = [(cond, cfd[cond]) for cond in cfd.conditions()]
        # Sort by N of each FD
        fds = reversed(sorted(fds, key=lambda (c, fd): fd.N()))
        return "\n".join("%s: %s" % (cond, _fdist_str(fd))
                                        for (cond, fd) in fds)

    print >>buff, "Parent distribution:"
    print >>buff, _fdist_str(self._parent_counts)
    print >>buff
    print >>buff, "Expansion type distribution:"
    print >>buff, _cfd_str(self._expansion_type_counts)
    print >>buff
    print >>buff, "Head expansion distribution:"
    print >>buff, _cfd_str(self._head_expansion_counts)
    print >>buff
    print >>buff, "Non-head expansion distribution:"
    print >>buff, _cfd_str(self._non_head_expansion_counts)
    print >>buff
    print >>buff, "Lexical expansion distribution:"
    print >>buff, _cfd_str(self._lexical_counts)
    print >>buff
    print >>buff, "Possible words: %d" % self.word_bins
    print >>buff, "Possible categories: %d" % self.cat_bins
    print >>buff
    print >>buff, "Estimator: %s" % get_estimator_name(self._estimator)
    print >>buff, "Frequency cutoff: %d" % self.cutoff
    return buff.getvalue()
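# Illustrative sketch (not part of the original source): the
# `lambda (c, fd): fd.N()` in _cfd_str uses Python 2 tuple-parameter
# unpacking, which Python 3 removed. The same sort-conditions-by-total-count
# pattern can be written portably by indexing the pair instead. The toy
# (condition, sample) pairs are made up for illustration; ConditionalFreqDist
# and FreqDist.N() are nltk's real API.
from nltk.probability import ConditionalFreqDist

cfd = ConditionalFreqDist([("NP", "dog"), ("NP", "cat"),
                           ("NP", "dog"), ("VP", "runs")])
# Sort by N of each FD, largest first
fds = sorted(((cond, cfd[cond]) for cond in cfd.conditions()),
             key=lambda pair: pair[1].N(), reverse=True)
for cond, fd in fds:
    print("%s: FDist<%d>" % (cond, fd.N()))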
def train(self, sequences, grammar=None, logger=None):
    if grammar is None:
        from jazzparser.grammar import get_grammar
        # Load the default grammar
        grammar = get_grammar()

    # We can only train on annotated chord sequence input
    if not isinstance(sequences, (DbBulkInput, AnnotatedDbBulkInput)):
        raise TaggerTrainingError("can only train ngram-multi model "
            "on bulk db chord input (bulk-db or bulk-db-annotated). Got "
            "input of type '%s'" % type(sequences).__name__)

    if self.options['backoff_cutoff'] is None:
        backoff_kwargs = {}
    else:
        backoff_kwargs = {'cutoff' : self.options['backoff_cutoff']}

    # Get all the possible pos tags from the grammar
    schemata = grammar.pos_tags
    # Build the emission domain to include all the observations that
    # theoretically could occur, not just those that are seen -
    # we might not see all interval/chord type pairs in the data.
    chord_types = list(set(self.options['chord_mapping'].values()))

    self.model = MultiChordNgramModel.train(
                            sequences,
                            schemata,
                            chord_types,
                            self.options['estimator'],
                            cutoff=self.options['cutoff'],
                            chord_map=self.options['chord_mapping'],
                            order=self.options['n'],
                            backoff_orders=self.options['backoff'],
                            backoff_kwargs=backoff_kwargs)

    # Add some model-specific info into the descriptive text
    # so we know how it was trained
    est_name = get_estimator_name(self.options['estimator'])
    self.model_description = """\
Order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d\
""" % {
        'est' : est_name,
        'seqs' : len(sequences),
        'cutoff' : self.options['cutoff'],
        'chordmap' : self.options['chord_mapping'].name,
        'order' : self.options['n'],
        'backoff' : self.options['backoff'],
    }
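# Illustrative sketch (not part of the original source): how the distinct
# emission chord types fall out of the chord mapping above. The mapping sends
# many surface chord labels to a small vocabulary of types, and the emission
# domain only needs the distinct targets. The mapping contents here are
# hypothetical; the real one comes from self.options['chord_mapping'].
chord_mapping = {'maj7' : 'M', '6' : 'M', 'm7' : 'm',
                 'm6' : 'm', '7' : '7', '9' : '7'}
chord_types = list(set(chord_mapping.values()))
print sorted(chord_types)   # ['7', 'M', 'm']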
def train(self, sequences, grammar=None, logger=None):
    from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
    if grammar is None:
        from jazzparser.grammar import get_grammar
        # Load the default grammar
        grammar = get_grammar()

    N = self.options['n']
    backoff = self.options['backoff']
    chordmap = self.options['chord_mapping']
    self.chordmap = chordmap
    self.chordmap_name = chordmap.name

    # Get data in the form of lists of (observation,tag) pairs
    training_data = [[(observation_from_chord_pair(c1, c2, chordmap), c1cat)
                          for ((c1, c2), c1cat) in zip(
                                group_pairs(seq, none_final=True),
                                seq.categories)]
                     for seq in sequences]

    # Get all the possible pos tags from the grammar
    label_dom = grammar.pos_tags
    # Build the emission domain to include all the observations that
    # theoretically could occur, not just those that are seen -
    # we might not see all interval/chord type pairs in the data.
    chord_types = chordmap.values()
    emission_dom = sum([["%d-%s" % (interval, chord)
                            for chord in chord_types]
                        for interval in range(12)], [])

    # Ignore unlabelled data
    ignores = ['']

    if self.options['backoff_cutoff'] is None:
        backoff_kwargs = {}
    else:
        backoff_kwargs = {'cutoff' : self.options['backoff_cutoff']}

    # Precompute the transition matrix and store it along with the model
    self.model = PrecomputedNgramModel.train(
                            self.options['n'],
                            training_data,
                            label_dom,
                            emission_dom=emission_dom,
                            cutoff=self.options['cutoff'],
                            backoff_order=self.options['backoff'],
                            estimator=self.options['estimator'],
                            ignore_list=ignores,
                            backoff_kwargs=backoff_kwargs)

    # Add some model-specific info into the descriptive text
    # so we know how it was trained
    est_name = get_estimator_name(self.options['estimator'])
    self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % {
        'est' : est_name,
        'seqs' : len(training_data),
        'samples' : len(sum(training_data, [])),
        'order' : self.options['n'],
        'backoff' : self.options['backoff'],
        'cutoff' : self.options['cutoff'],
        'chordmap' : self.chordmap_name,
    }
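# Illustrative sketch (not part of the original source): the emission domain
# built above is the full cross product of the 12 semitone root intervals
# with the mapped chord types, encoded as "interval-type" strings, so pairs
# never seen in training still appear in the domain. Chord types are
# hypothetical, as in the previous sketch.
chord_types = ['M', 'm', '7']
emission_dom = sum([["%d-%s" % (interval, chord) for chord in chord_types]
                        for interval in range(12)], [])
print len(emission_dom)     # 12 * 3 = 36 observation symbols
print emission_dom[:4]      # ['0-M', '0-m', '0-7', '1-M']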