def __init__(self, grammar, input, options=None, *args, **kwargs):
    """
    Build the tagger and eagerly tag the whole input sequence.

    For every chord in the input, each tag known to the model is given
    the probability P(tag | chord pair) (via
    ``model.get_prob_cat_given_chord_pair``) and looked up in the
    grammar; the resulting ``(sign, tag, probability)`` triples are
    stored per word in ``self._tagged_data``, sorted by descending
    probability. ``self._batch_ranges`` records, per word, the
    ``(start, end)`` index ranges used to hand the signs back in
    batches.

    @param grammar: grammar used to look up signs for word/tag pairs
    @param input: input sequence to tag (chord input)
    @param options: optional dict of tagger options; defaults to an
        empty dict. (Previously a mutable ``{}`` default, which is
        shared between calls — replaced by a ``None`` sentinel.)
    """
    # Avoid the shared-mutable-default pitfall while preserving the
    # old behaviour of passing an empty dict to the superclass.
    if options is None:
        options = {}
    super(Baseline3Tagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)

    #### Tag the input sequence ####
    self._tagged_data = []
    self._batch_ranges = []
    # Group the input into pairs
    inpairs = group_pairs(self.input, none_final=True)
    # Get all the possible signs from the grammar
    for index, pair in enumerate(inpairs):
        features = {
            'duration': self.durations[index],
            'time': self.times[index],
        }
        word_signs = []
        # Now assign a probability to each tag, given the observation
        for tag in self.model.category_count.keys():
            sign = self.grammar.get_sign_for_word_by_tag(
                            self.input[index], tag, extra_features=features)
            if sign is not None:
                probability = self.model.get_prob_cat_given_chord_pair(tag, *pair)
                word_signs.append((sign, tag, probability))
        # Sort by descending probability. (The original wrapped the
        # list in a redundant identity comprehension before sorting;
        # that no-op has been removed.)
        word_signs = list(reversed(sorted(word_signs, key=lambda x: x[2])))
        self._tagged_data.append(word_signs)

        # Work out the sizes of the batches to return these in
        batches = batch_sizes([p for __, __, p in word_signs], self.batch_ratio)
        # Transform these into a form that's easier to use for getting the signs
        so_far = 0
        batch_ranges = []
        for batch in batches:
            batch_ranges.append((so_far, so_far + batch))
            so_far += batch
        self._batch_ranges.append(batch_ranges)
def _tags_from_output(self, output):
    """
    Parse the C&C tagger's textual output into per-word sign lists.

    Each non-empty line of ``output`` is tab-separated: column 2 holds
    the number of (tag, probability) result pairs that follow in
    alternating columns from index 3 onwards. Tags from
    ``self.tag_list`` that the tagger did not propose are added with
    probability 0. Optionally (``unseen_tag_prob`` option) a reserved
    probability mass is redistributed equally over each word's tags,
    and (``ignore-unknown`` option) tags missing from the grammar are
    dropped with a warning.

    @param output: raw stdout text produced by the C&C tagger
    @return: list (one entry per input word) of lists of
        ``(sign, tag, probability)`` triples, sorted by descending
        probability; also sets ``self.batch_sizes`` as a side effect
    @raise CandcTaggingError: if the number of tagged lines does not
        match ``self.input_length``
    """
    tags = []
    # Split up the output text to extract tags and probabilities
    for line in output.split("\n"):
        line = line.strip()
        if len(line):
            cols = line.split("\t")
            num_results = int(cols[2])
            results = []
            all_tags = []
            # Get the tags and probs from the output
            for result_num in range(num_results):
                cat = cols[3 + result_num * 2]
                prob = float(cols[4 + result_num * 2])
                results.append((cat, prob))
                all_tags.append(cat)
            # Check all the tags are covered and add them with 0 prob if not
            for tag in self.tag_list:
                if tag not in all_tags:
                    results.append((tag, 0.0))
            tags.append(list(reversed(sorted(results, key=lambda x: x[1]))))
    if len(tags) != self.input_length:
        # Call-form raise: equivalent to the old `raise X, msg` but
        # valid in both Python 2 and 3
        raise CandcTaggingError("C&C output did not give a correct "
                                "set of tags: %s" % output)

    # Redistribute the tag probability to account for unseen tags
    if self.options['unseen_tag_prob'] > 0.0:
        unseen_prob = self.options['unseen_tag_prob']
        # Scale down everything that has a probability
        prob_scale = 1.0 - unseen_prob
        for i in range(len(tags)):
            # Add reserved mass equally to every tag
            prob_add = unseen_prob / len(tags[i])
            tags[i] = [(tag, (prob * prob_scale + prob_add)) for
                       tag, prob in tags[i]]

    skip_tags = []
    # Work out what tags we're going to ignore altogether
    if self.options['ignore-unknown']:
        for tag_sequence in tags:
            for tag, prob in tag_sequence:
                if tag not in self.grammar.families:
                    # This tag's not in the grammar: just ignore it
                    # NOTE: a tag appearing at several words is warned
                    # about (and appended) once per word, as before
                    skip_tags.append(tag)
                    logger.warn("Ignoring tag '%s', which is not in "
                                "the grammar." % tag)

    # (A large block of commented-out smoothing code, marked by its
    # author as already handled above, has been removed here.)

    signs = [[] for i in range(self.input_length)]
    # Get an actual sign for each word/tag combination
    for index, word in enumerate(self.tokens):
        for (tag, prob) in tags[index]:
            if tag not in skip_tags:
                # Consult the grammar to get a suitable sign if we can
                sign = self.grammar.get_sign_for_word_by_tag(
                    word, tag, extra_features={
                        'time': self.times[index],
                        'duration': self.durations[index]
                    })
                signs[index].append((sign, tag, prob))

    self.batch_sizes = []
    for results in signs:
        # Work out the batches that these should be returned in
        self.batch_sizes.append(batch_sizes([p for __, __, p in results],
                                            self.tag_batch_ratio))
    return signs