Пример #1
0
    def train(self, data, grammar=None, logger=None):
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()

        model = HmmPathNgram.train(data,
                                   self.options['estimator'],
                                   grammar,
                                   cutoff=self.options['cutoff'],
                                   chord_map=self.options['chord_mapping'],
                                   order=self.options['n'],
                                   backoff_orders=self.options['backoff'])
        self.model = model

        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(data),
                'samples' : sum([len(s) for s in data], 0),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
            }
Пример #2
0
 def description(self):
     """
     Return a multi-line, human-readable summary of the model's counts.

     The text contains, in order: the parent distribution; the expansion
     type, head expansion, non-head expansion and lexical expansion
     distributions (one line per condition, ordered by descending
     ``fd.N()``); the word and category bin counts; the estimator name
     and the frequency cutoff.

     The output is written with ``buff.write`` and the sort key uses
     indexing rather than a tuple parameter, so the method parses on
     Python 3 as well as Python 2. The text produced is unchanged.
     """
     buff = StringIO()

     def _emit(text=""):
         # Write one line of output, terminated by a newline
         buff.write(text)
         buff.write("\n")

     def _fdist_str(fd):
         # Total count followed by comma-separated "sample:count" pairs
         total = fd.N()
         pairs = ", ".join("%s:%d" % pr for pr in fd.items())
         return "FDist<%d>: %s" % (total, pairs)

     def _cfd_str(cfd):
         fds = [(cond, cfd[cond]) for cond in cfd.conditions()]
         # Stable ascending sort by N of each FD, then reverse. This keeps
         #  the same tie ordering as reversed(sorted(...)).
         fds.sort(key=lambda pair: pair[1].N())
         fds.reverse()
         return "\n".join("%s: %s" % (cond, _fdist_str(fd))
                          for (cond, fd) in fds)

     _emit("Parent distribution:")
     _emit(_fdist_str(self._parent_counts))
     _emit()

     # Each conditional distribution gets a title, its dump and a blank line
     conditional_sections = (
         ("Expansion type distribution:", self._expansion_type_counts),
         ("Head expansion distribution:", self._head_expansion_counts),
         ("Non-head expansion distribution:", self._non_head_expansion_counts),
         ("Lexical expansion distribution:", self._lexical_counts),
     )
     for title, cfd in conditional_sections:
         _emit(title)
         _emit(_cfd_str(cfd))
         _emit()

     _emit("Possible words: %d" % self.word_bins)
     _emit("Possible categories: %d" % self.cat_bins)
     _emit()
     _emit("Estimator: %s" % get_estimator_name(self._estimator))
     _emit("Frequency cutoff: %d" % self.cutoff)
     return buff.getvalue()
Пример #3
0
    def train(self, data, grammar=None, logger=None):
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()
        
        model = HmmPathNgram.train(data, self.options['estimator'], grammar, 
                                   cutoff=self.options['cutoff'], 
                                   chord_map=self.options['chord_mapping'],
                                   order=self.options['n'],
                                   backoff_orders=self.options['backoff'])
        self.model = model
        
        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(data),
                'samples' : sum([len(s) for s in data], 0),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
            }
Пример #4
0
    def description(self):
        """
        Return a multi-line, human-readable summary of the model's counts.

        The text contains, in order: the parent distribution; the expansion
        type, head expansion, non-head expansion and lexical expansion
        distributions (one line per condition, ordered by descending
        ``fd.N()``); the word and category bin counts; the estimator name
        and the frequency cutoff.

        The output is written with ``buff.write`` and the sort key uses
        indexing rather than a tuple parameter, so the method parses on
        Python 3 as well as Python 2. The text produced is unchanged.
        """
        buff = StringIO()

        def _emit(text=""):
            # Write one line of output, terminated by a newline
            buff.write(text)
            buff.write("\n")

        def _fdist_str(fd):
            # Total count followed by comma-separated "sample:count" pairs
            total = fd.N()
            pairs = ", ".join("%s:%d" % pr for pr in fd.items())
            return "FDist<%d>: %s" % (total, pairs)

        def _cfd_str(cfd):
            fds = [(cond, cfd[cond]) for cond in cfd.conditions()]
            # Stable ascending sort by N of each FD, then reverse. This
            #  keeps the same tie ordering as reversed(sorted(...)).
            fds.sort(key=lambda pair: pair[1].N())
            fds.reverse()
            return "\n".join("%s: %s" % (cond, _fdist_str(fd))
                             for (cond, fd) in fds)

        _emit("Parent distribution:")
        _emit(_fdist_str(self._parent_counts))
        _emit()

        # Each conditional distribution gets a title, its dump and a
        #  blank line
        conditional_sections = (
            ("Expansion type distribution:", self._expansion_type_counts),
            ("Head expansion distribution:", self._head_expansion_counts),
            ("Non-head expansion distribution:",
             self._non_head_expansion_counts),
            ("Lexical expansion distribution:", self._lexical_counts),
        )
        for title, cfd in conditional_sections:
            _emit(title)
            _emit(_cfd_str(cfd))
            _emit()

        _emit("Possible words: %d" % self.word_bins)
        _emit("Possible categories: %d" % self.cat_bins)
        _emit()
        _emit("Estimator: %s" % get_estimator_name(self._estimator))
        _emit("Frequency cutoff: %d" % self.cutoff)
        return buff.getvalue()
Пример #5
0
    def train(self, sequences, grammar=None, logger=None):
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()
        
        # We can only train on annotated chord sequence input
        if not isinstance(sequences, (DbBulkInput, AnnotatedDbBulkInput)):
            raise TaggerTrainingError, "can only train ngram-multi model "\
                "on bulk db chord input (bulk-db or bulk-db-annotated). Got "\
                "input of type '%s'" % type(sequences).__name__
        
        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff' : self.options['backoff_cutoff']}
        
        # Get all the possible pos tags from the grammar
        schemata = grammar.pos_tags
        # Build the emission domain to include all the observations that 
        #  theoretically could occur, not just those that are seen - 
        #  we might not see all interval/chord type pairs in the data.
        chord_types = list(set(self.options['chord_mapping'].values()))
        
        self.model = MultiChordNgramModel.train(
                                    sequences,
                                    schemata,
                                    chord_types,
                                    self.options['estimator'], 
                                    cutoff=self.options['cutoff'],
                                    chord_map=self.options['chord_mapping'],
                                    order=self.options['n'],
                                    backoff_orders=self.options['backoff'],
                                    backoff_kwargs=backoff_kwargs)
        
        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(sequences),
                'cutoff' : self.options['cutoff'],
                'chordmap' : self.options['chord_mapping'].name,
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
            }
Пример #6
0
    def train(self, sequences, grammar=None, logger=None):
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()
        
        N = self.options['n']
        backoff = self.options['backoff']
        chordmap = self.options['chord_mapping']
        self.chordmap = chordmap
        self.chordmap_name = chordmap.name
        
        # Get data in the form of lists of (observation,tag) pairs
        training_data = [[(observation_from_chord_pair(c1, c2, chordmap), c1cat) \
                                for ((c1,c2),c1cat) in zip(group_pairs(seq, none_final=True),seq.categories)]
                                    for seq in sequences]
        # Get all the possible pos tags from the grammar
        label_dom = grammar.pos_tags
        # Build the emission domain to include all the observations that 
        #  theoretically could occur, not just those that are seen - 
        #  we might not see all interval/chord type pairs in the data.
        chord_types = chordmap.values()
        emission_dom = sum([["%d-%s" % (interval,chord) for chord in chord_types] for interval in range(12)], [])
        
        # Ignore unlabelled data
        ignores = ['']
        
        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff' : self.options['backoff_cutoff']}
        
        # Precompute the transition matrix and store it along with the model
        self.model = PrecomputedNgramModel.train(
                            self.options['n'],
                            training_data,
                            label_dom,
                            emission_dom=emission_dom,
                            cutoff=self.options['cutoff'],
                            backoff_order=self.options['backoff'],
                            estimator=self.options['estimator'],
                            ignore_list=ignores,
                            backoff_kwargs=backoff_kwargs)
        
        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(training_data),
                'samples' : len(sum(training_data, [])),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
                'chordmap' : self.chordmap_name,
            }
Пример #7
0
    def train(self, sequences, grammar=None, logger=None):
        from jazzparser.utils.nltk.ngram import PrecomputedNgramModel
        if grammar is None:
            from jazzparser.grammar import get_grammar
            # Load the default grammar
            grammar = get_grammar()

        N = self.options['n']
        backoff = self.options['backoff']
        chordmap = self.options['chord_mapping']
        self.chordmap = chordmap
        self.chordmap_name = chordmap.name

        # Get data in the form of lists of (observation,tag) pairs
        training_data = [[(observation_from_chord_pair(c1, c2, chordmap), c1cat) \
                                for ((c1,c2),c1cat) in zip(group_pairs(seq, none_final=True),seq.categories)]
                                    for seq in sequences]
        # Get all the possible pos tags from the grammar
        label_dom = grammar.pos_tags
        # Build the emission domain to include all the observations that
        #  theoretically could occur, not just those that are seen -
        #  we might not see all interval/chord type pairs in the data.
        chord_types = chordmap.values()
        emission_dom = sum(
            [["%d-%s" % (interval, chord) for chord in chord_types]
             for interval in range(12)], [])

        # Ignore unlabelled data
        ignores = ['']

        if self.options['backoff_cutoff'] is None:
            backoff_kwargs = {}
        else:
            backoff_kwargs = {'cutoff': self.options['backoff_cutoff']}

        # Precompute the transition matrix and store it along with the model
        self.model = PrecomputedNgramModel.train(
            self.options['n'],
            training_data,
            label_dom,
            emission_dom=emission_dom,
            cutoff=self.options['cutoff'],
            backoff_order=self.options['backoff'],
            estimator=self.options['estimator'],
            ignore_list=ignores,
            backoff_kwargs=backoff_kwargs)

        # Add some model-specific info into the descriptive text
        #  so we know how it was trained
        est_name = get_estimator_name(self.options['estimator'])
        self.model_description = """\
Model order: %(order)d
Backoff orders: %(backoff)d
Probability estimator: %(est)s
Zero-count threshold: %(cutoff)d
Chord mapping: %(chordmap)s
Training sequences: %(seqs)d
Training samples: %(samples)d\
""" % \
            {
                'est' : est_name,
                'seqs' : len(training_data),
                'samples' : len(sum(training_data, [])),
                'order' : self.options['n'],
                'backoff' : self.options['backoff'],
                'cutoff' : self.options['cutoff'],
                'chordmap' : self.chordmap_name,
            }