def _load_model(data):
    """ Reconstruct an NgramTaggerModel from its picklable dict form. """
    from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
    model = PrecomputedNgramModel.from_picklable_dict(data['model'])
    name = data['name']
    chordmap = data.get("chordmap", None)
    return NgramTaggerModel(name, model=model, chordmap=chordmap)
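
# Usage sketch (not from the original source): _load_model expects the dict
# written out when the model was saved. Assuming the ngram model exposes a
# to_picklable_dict() counterpart to from_picklable_dict() -- an assumption,
# so check the actual save code -- the round trip would look roughly like:
#
#   data = {
#       'model' : trained.to_picklable_dict(),  # hypothetical counterpart method
#       'name' : 'trigram-tagger',              # hypothetical model name
#       'chordmap' : chordmap,                  # optional: missing key defaults to None
#   }
#   tagger_model = _load_model(data)
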
def train(self, sequences, grammar=None, logger=None):
    from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
    if grammar is None:
        from jazzparser.grammar import get_grammar
        # Load the default grammar
        grammar = get_grammar()

    N = self.options['n']
    backoff = self.options['backoff']
    chordmap = self.options['chord_mapping']
    self.chordmap = chordmap
    self.chordmap_name = chordmap.name

    # Get data in the form of lists of (observation,tag) pairs
    training_data = [[(observation_from_chord_pair(c1, c2, chordmap), c1cat) \
                        for ((c1, c2), c1cat) in zip(group_pairs(seq, none_final=True),
                                                     seq.categories)]
                                for seq in sequences]

    # Get all the possible pos tags from the grammar
    label_dom = grammar.pos_tags
    # Build the emission domain to include all the observations that
    #  theoretically could occur, not just those that are seen -
    #  we might not see all interval/chord type pairs in the data.
    chord_types = chordmap.values()
    emission_dom = sum([["%d-%s" % (interval, chord) for chord in chord_types]
                            for interval in range(12)], [])

    # Ignore unlabelled data
    ignores = ['']

    if self.options['backoff_cutoff'] is None:
        backoff_kwargs = {}
    else:
        backoff_kwargs = {'cutoff': self.options['backoff_cutoff']}

    # Precompute the transition matrix and store it along with the model
    self.model = PrecomputedNgramModel.train(
                        self.options['n'],
                        training_data,
                        label_dom,
                        emission_dom=emission_dom,
                        cutoff=self.options['cutoff'],
                        backoff_order=self.options['backoff'],
                        estimator=self.options['estimator'],
                        ignore_list=ignores,
                        backoff_kwargs=backoff_kwargs)

    # Add some model-specific info into the descriptive text
    #  so we know how it was trained
    est_name = get_estimator_name(self.options['estimator'])
    self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
        {
            'est' : est_name,
            'seqs' : len(training_data),
            'samples' : len(sum(training_data, [])),
            'order' : self.options['n'],
            'backoff' : self.options['backoff'],
            'cutoff' : self.options['cutoff'],
            'chordmap' : self.chordmap_name,
        }
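
# Worked example (sketch, not from the original source): the emission domain
# built in train() above enumerates every interval/chord-type pair as a
# "%d-%s" string. With a chord mapping whose values were, say, ['', 'm', '7'],
# it would contain '0-', '0-m', '0-7', '1-', ..., '11-7': 12 * len(chord_types)
# observations in total, so interval/type pairs never seen in training still
# receive smoothed probability mass from the estimator.
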
    (3, 2, 0),
    # Trigram with cutoff
    # This probably ought to improve things (over those below)
    (3, 2, 2),
]

for order, backoff, cutoff in PARAMS:
    print "*******************************"
    print "Order:", order
    print "Backoff:", backoff
    print "Smoothing: witten-bell"
    print "Cutoff:", cutoff

    model = PrecomputedNgramModel.train(order, training_data,
                                        label_dom=list(CHARS),
                                        emission_dom=list(CHARS),
                                        backoff_order=backoff,
                                        estimator=witten_bell_estimator,
                                        cutoff=cutoff,
                                        backoff_kwargs={'cutoff': 0})

    #~ # Take a look at some of the distributions
    #~ print "Some emission distributions"
    #~ print "%d labels, showing 10\n" % len(model.emission_dist.conditions())
    #~ show_dist(model.emission_dist)
    #~
    #~ print "\nSome transition distributions"
    #~ print "%d conditions, showing 5\n" % len(model.label_dist.conditions())
    #~ show_dist(model.label_dist, limit=5)

    # Try decoding the test data
    correct = 0