def __init__(self, grammar, input, options=None, *args, **kwargs):
    """ Tags using an ngram model backed by NLTK. """
    # Use None as the default to avoid sharing a mutable default dict
    if options is None:
        options = {}
    super(NgramTagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)

    #### Tag the input sequence ####
    self._tagged_data = []
    self._batch_ranges = []

    # Group the input into pairs to get observations
    inpairs = group_pairs(self.input, none_final=True)
    # Convert the pairs into observations
    observations = [
        observation_from_chord_pair(pair[0], pair[1], self.model.chordmap)
        for pair in inpairs]

    # Use the ngram model to get tag probabilities for each input by
    # computing the forward probability matrix
    if self.options['decode'] == "viterbi":
        probabilities = self.model.viterbi_probabilities(observations)
    elif self.options['decode'] == "forward":
        probabilities = self.model.forward_probabilities(observations)
    else:
        probabilities = self.model.forward_backward_probabilities(observations)

    word_tag_probs = []

    for index, probs in enumerate(probabilities):
        features = {
            'duration': self.durations[index],
            'time': self.times[index],
        }
        word_signs = []
        # Now assign a probability to each tag, given the observation
        for tag in self.model.tags:
            # Read a full sign out of the grammar
            sign = self.grammar.get_sign_for_word_by_tag(
                self.input[index], tag, extra_features=features)
            if sign is not None:
                # Read off the probability from the matrix
                probability = probs[tag]
                word_signs.append((sign, tag, probability))

        # Shuffle first so that equal probabilities end up in a random
        # order after the (stable) sort
        random.shuffle(word_signs)
        # Now sort by probability, most probable first
        word_signs.sort(key=lambda x: x[2], reverse=True)

        self._tagged_data.append(word_signs)
        # Store the list of probabilities for tags, which we'll use
        # after we've tagged every word to work out the sizes
        # of the tag batches
        word_tag_probs.append([p for __, __, p in word_signs])

    if self.options['best']:
        # Only return one sign for each word
        self._batch_ranges = [[(0, 1)] for i in range(len(self.input))]
    else:
        # Work out the number of tags to return in each batch
        batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
        # So far, this has assigned a probability to every possible
        # tag. We don't want the tagger ever to return the least
        # probable batch of tags, unless it's the only one.
        #batch_sizes = [batches[:-1] if len(batches) > 1 else batches
        #               for batches in batch_sizes]
        # Transform these into a form that's easier to use for getting the signs
        self._batch_ranges = [
            [(sum(batches[:i]), sum(batches[:i+1])) for i in range(len(batches))]
            for batches in batch_sizes]
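# A minimal standalone sketch of the batch-range transformation used in
# NgramTagger.__init__ above and again in MultiChordNgramTagger.__init__
# below (hypothetical helper name, not part of this module).
# beamed_batch_sizes yields, for each word, a list of batch sizes such as
# [3, 2, 1]; these are turned into cumulative (start, end) index pairs into
# that word's probability-sorted sign list:
#
#     def batch_sizes_to_ranges(batches):
#         starts = [sum(batches[:i]) for i in range(len(batches))]
#         return [(start, start + size) for start, size in zip(starts, batches)]
#
#     batch_sizes_to_ranges([3, 2, 1])   # -> [(0, 3), (3, 5), (5, 6)]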
def __init__(self, grammar, input, options=None, *args, **kwargs):
    """
    Tags using an ngram model backed by NLTK, additionally producing
    signs that span several chords by combining adjacent repetitions
    of the same (root, schema) state.

    """
    # Use None as the default to avoid sharing a mutable default dict
    if options is None:
        options = {}
    super(MultiChordNgramTagger, self).__init__(grammar, input, options,
                                                *args, **kwargs)
    process_chord_input(self)

    #### Tag the input sequence ####
    self._tagged_times = []
    self._tagged_spans = []
    self._batch_ranges = []
    word_tag_probs = []

    # Map the chord types as the model requires
    chord_map = self.model.chordmap
    if isinstance(self.wrapped_input, ChordInput):
        chords = self.wrapped_input.to_db_input().chords
        observations = [(chord.root, chord_map[chord.type])
                        for chord in chords]
        self.input = chords
    elif isinstance(self.wrapped_input, DbInput):
        observations = [(chord.root, chord_map[chord.type])
                        for chord in self.wrapped_input.chords]
    elif isinstance(self.wrapped_input, WeightedChordLabelInput):
        observations = lattice_to_emissions(input, chord_map=chord_map)
    else:
        # Otherwise observations would be undefined below
        raise TypeError("unsupported input type: %s" %
                        type(self.wrapped_input).__name__)

    # Use the ngram model to get tag probabilities for each input by
    # computing the forward probability matrix
    if self.options['decode'] == "forward":
        probabilities = self.model.forward_probabilities(observations)
    else:
        probabilities = self.model.forward_backward_probabilities(observations)

    # Filter out zero-probability states and order by descending probability
    probabilities = [
        sorted([(state, prob) for (state, prob) in timestep.items()
                if prob > 0.0],
               key=lambda x: x[1], reverse=True)
        for timestep in probabilities]

    for index, probs in enumerate(probabilities):
        features = {
            'duration': self.durations[index],
            'time': self.times[index],
        }
        word_signs = []
        for (state, prob) in probs:
            root, schema = state
            # Instantiate a sign for this state
            features['root'] = root
            signs = self.grammar.get_signs_for_tag(schema, features)
            # There should only be one of these
            if not signs:
                continue
            sign = signs[0]
            word_signs.append((sign, (root, schema), prob))

        self._tagged_times.append(word_signs)
        # Store the list of probabilities for tags, which we'll use
        # after we've tagged every word to work out the sizes
        # of the tag batches
        word_tag_probs.append([p for __, __, p in word_signs])

    if self.options['best']:
        # Only return one sign for each word
        batch_ranges = [[(0, 1)] for i in range(len(self.input))]
    else:
        # Work out the number of tags to return in each batch
        batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio,
                                         max_batch=self.options['max_batch'])
        # Transform these into a form that's easier to use for getting the signs
        batch_ranges = [
            [(sum(batches[:i]), sum(batches[:i+1])) for i in range(len(batches))]
            for batches in batch_sizes]

    # Step through the batches, combining adjacent repetitions of
    # identical (root, schema) pairs into longer spans
    def prob_combiner(probs):
        return sum(probs, 0.0) / float(len(probs))
    combiner = SpanCombiner()
    added = True
    offset = 0
    while added:
        added = False
        batch_spans = []
        for time in range(len(batch_ranges)):
            if offset < len(batch_ranges[time]):
                start, end = batch_ranges[time][offset]
                for sign_offset in range(start, end):
                    sign, (root, schema), prob = \
                        self._tagged_times[time][sign_offset]
                    added = True
                    # Add the length-1 span
                    batch_spans.append(
                        (time, time+1, (sign, (root, schema), prob)))
                    # Add this to the combiner to see if it combines
                    # with anything we've previously added
                    combined = combiner.combine_edge(
                        (time, time+1, (root, schema)),
                        properties=prob,
                        prop_combiner=prob_combiner)
                    # Add each additional span with the same sign
                    for (span_start, span_end) in combined:
                        # Look up the probability of the combined categories
                        new_prob = combiner.edge_properties[
                            (span_start, span_end, (root, schema))]
                        # Set timing properties of this spanning category
                        features = {
                            'duration': sum(
                                self.durations[span_start:span_end]),
                            'time': self.times[span_start],
                            'root': root,
                        }
                        # Technically there could be multiple of these,
                        # though in fact there never are
                        new_signs = self.grammar.get_signs_for_tag(
                            schema, features)
                        for new_sign in new_signs:
                            batch_spans.append(
                                (span_start, span_end,
                                 (new_sign, (root, schema), new_prob)))
        self._tagged_spans.append(batch_spans)
        offset += 1
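# The while/offset loop above walks the batches breadth-first: iteration 0
# emits every word's most probable batch of signs, iteration 1 every word's
# second batch, and so on until no word has a batch left. A rough sketch of
# that traversal, with hypothetical names and the span combination omitted:
#
#     def batches_breadth_first(batch_ranges):
#         offset = 0
#         while any(offset < len(ranges) for ranges in batch_ranges):
#             yield [ranges[offset] if offset < len(ranges) else None
#                    for ranges in batch_ranges]
#             offset += 1
#
# Within each iteration, the assumption (from its use here) is that
# SpanCombiner.combine_edge records the new edge and returns the
# (start, end) pairs of any longer spans formed with adjacent edges
# carrying the same (root, schema) label; each combined span's probability
# is the prob_combiner average of its edges, e.g. edges with probabilities
# 0.4 and 0.2 yield a spanning category with probability 0.3.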