def __init__(self, input, options={}, grammar=None, *args, **kwargs):
    super(HmmPathBuilder, self).__init__(input, options, *args, **kwargs)
    process_chord_input(self)

    if grammar is None:
        self.grammar = get_grammar()
    else:
        self.grammar = grammar

    #### Tag the input sequence ####
    self._tagged_data = []
    chord_map = self.model.model.chord_map

    # Map the chord types as the model requires
    if isinstance(self.wrapped_input, ChordInput):
        chords = self.wrapped_input.to_db_input().chords
        observations = [(chord.root, chord_map[chord.type])
                        for chord in chords]
        self.input = chords
    elif isinstance(self.wrapped_input, DbInput):
        observations = [(chord.root, chord_map[chord.type])
                        for chord in self.wrapped_input.chords]
    elif isinstance(self.wrapped_input, WeightedChordLabelInput):
        observations = lattice_to_emissions(input, chord_map=chord_map)

    # Use the ngram model to decode the n-best state sequences for the
    # observations by Viterbi decoding
    path_probs = self.model.viterbi_paths(observations, self.options['paths'])

    self._paths = [
        self.grammar.formalism.backoff_states_to_lf(zip(states, self.times))
        for states, prob in path_probs]
    # Set the probability on each result
    for path, (states, prob) in zip(self._paths, path_probs):
        path.probability = prob
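# A minimal sketch of the decode-and-attach pattern used above, with stand-in
# objects rather than the real model and grammar (viterbi_paths and
# backoff_states_to_lf are assumed from the code, not shown): each of the
# n-best state sequences becomes one result object, and the path probability
# reported by the decoder is attached to the corresponding result.
class _Path(object):
    """Hypothetical stand-in for a logical-form result."""
    def __init__(self, states):
        self.states = states
        self.probability = None

def _attach_probabilities(path_probs):
    # path_probs: list of (states, prob) pairs, as an n-best decoder returns
    paths = [_Path(states) for states, prob in path_probs]
    for path, (states, prob) in zip(paths, path_probs):
        path.probability = prob
    return paths

# Example: two decoded state sequences with their path probabilities
_demo = _attach_probabilities([(['I', 'IV', 'V'], 0.6), (['I', 'II', 'V'], 0.3)])
assert _demo[0].probability == 0.6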
def __init__(self, grammar, input, options={}, *args, **kwargs):
    super(Baseline3Tagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)

    #### Tag the input sequence ####
    self._tagged_data = []
    self._batch_ranges = []
    # Group the input into pairs
    inpairs = group_pairs(self.input, none_final=True)
    # Get all the possible signs from the grammar
    for index, pair in enumerate(inpairs):
        features = {
            'duration' : self.durations[index],
            'time' : self.times[index],
        }
        word_signs = []
        # Now assign a probability to each tag, given the observation
        for tag in self.model.category_count.keys():
            sign = self.grammar.get_sign_for_word_by_tag(
                self.input[index], tag, extra_features=features)
            if sign is not None:
                probability = self.model.get_prob_cat_given_chord_pair(tag, *pair)
                word_signs.append((sign, tag, probability))

        # Sort by probability, highest first
        word_signs = list(reversed(sorted(word_signs, key=lambda x: x[2])))
        self._tagged_data.append(word_signs)

        # Work out the sizes of the batches to return these in
        batches = batch_sizes([p for __, __, p in word_signs], self.batch_ratio)
        # Transform these into a form that's easier to use for getting the signs
        so_far = 0
        batch_ranges = []
        for batch in batches:
            batch_ranges.append((so_far, so_far + batch))
            so_far += batch
        self._batch_ranges.append(batch_ranges)
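# A minimal sketch of the batch-range computation above: batch_sizes (not
# shown here) splits a descending probability list into batch counts, and the
# running-sum loop turns those counts into (start, end) index ranges into the
# sorted sign list. Illustrated with hand-picked batch sizes.
def _ranges_from_batches(batches):
    so_far = 0
    ranges = []
    for batch in batches:
        ranges.append((so_far, so_far + batch))
        so_far += batch
    return ranges

# E.g. batches of 2, 1 and 3 signs cover indices 0-1, 2 and 3-5
assert _ranges_from_batches([2, 1, 3]) == [(0, 2), (2, 3), (3, 6)]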
def __init__(self, grammar, input, options={}, *args, **kwargs):
    """
    Tags using an ngram model backed by NLTK.

    """
    super(NgramTagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)

    #### Tag the input sequence ####
    self._tagged_data = []
    self._batch_ranges = []
    # Group the input into pairs to get observations
    inpairs = group_pairs(self.input, none_final=True)
    # Convert the pairs into observations
    observations = [observation_from_chord_pair(pair[0], pair[1], self.model.chordmap)
                        for pair in inpairs]

    # Use the ngram model to get tag probabilities for each input,
    # decoding with the selected algorithm
    if self.options['decode'] == "viterbi":
        probabilities = self.model.viterbi_probabilities(observations)
    elif self.options['decode'] == "forward":
        probabilities = self.model.forward_probabilities(observations)
    else:
        probabilities = self.model.forward_backward_probabilities(observations)

    word_tag_probs = []

    for index, probs in enumerate(probabilities):
        features = {
            'duration' : self.durations[index],
            'time' : self.times[index],
        }
        word_signs = []
        # Now assign a probability to each tag, given the observation
        for tag in self.model.tags:
            # Read a full sign out of the grammar
            sign = self.grammar.get_sign_for_word_by_tag(
                self.input[index], tag, extra_features=features)
            if sign is not None:
                # Read off the probability from the matrix
                probability = probs[tag]
                word_signs.append((sign, tag, probability))

        # Shuffle the list first to make sure equal probabilities are
        # randomly ordered
        random.shuffle(word_signs)
        # Now sort by probability, highest first
        word_signs = list(reversed(sorted(word_signs, key=lambda x: x[2])))

        self._tagged_data.append(word_signs)

        # Store the list of probabilities for tags, which we'll use
        # after we've tagged every word to work out the sizes
        # of the tag batches
        word_tag_probs.append([p for __, __, p in word_signs])

    if self.options['best']:
        # Only return one for each word
        self._batch_ranges = [[(0, 1)] for i in range(len(self.input))]
    else:
        # Work out the number of tags to return in each batch
        batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
        # So far, this has assigned a probability to every possible
        # tag. We don't want the tagger ever to return the least
        # probable batch of tags, unless it's the only one.
        #batch_sizes = [batches[:-1] if len(batches) > 1 else batches for batches in batch_sizes]
        # Transform these into a form that's easier to use for getting the signs
        self._batch_ranges = [
            [(sum(batches[:i]), sum(batches[:i+1])) for i in range(len(batches))]
            for batches in batch_sizes]
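# A minimal sketch of the shuffle-then-sort step above: Python's sort is
# stable, so shuffling first means items with equal probability end up in a
# random order after sorting, rather than in insertion order. The tuples here
# are hypothetical (sign, tag, prob) stand-ins.
import random

def _rank_signs(word_signs):
    random.shuffle(word_signs)
    # Highest probability first; tied items keep their (randomised) order
    return list(reversed(sorted(word_signs, key=lambda x: x[2])))

_ranked = _rank_signs([('signA', 'T1', 0.5), ('signB', 'T2', 0.5), ('signC', 'T3', 0.2)])
assert _ranked[-1][1] == 'T3'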
def __init__(self, *args, **kwargs):
    super(FullTagger, self).__init__(*args, **kwargs)
    process_chord_input(self)
def __init__(self, grammar, input, options={}, dict_cutoff=5, *args, **kwargs):
    super(CandcTagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)

    if type(self) == CandcTagger:
        raise NotImplementedError, "Tried to instantiate CandcTagger "\
            "directly. You should use one of its subclasses."
    self.tag_batch_ratio = self.options['batch']
    model = self.options['model'].split('.')

    # Check that C&C is available for supertagging
    if not os.path.exists(settings.CANDC.BASE_PATH):
        raise CandcConfigurationError, "The C&C parser base "\
            "directory %s does not exist" % settings.CANDC.BASE_PATH
    if not os.path.exists(settings.CANDC.MODELS_PATH):
        raise CandcConfigurationError, "The C&C parser models "\
            "directory %s does not exist" % settings.CANDC.MODELS_PATH
    candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command)
    if not os.path.exists(candc_cmd):
        raise CandcConfigurationError, "The C&C supertagger command "\
            "%s does not exist. Have you built it?" % candc_cmd
    # Check the model exists
    candc_model = os.path.join(settings.CANDC.MODELS_PATH, *(model))
    if not os.path.exists(candc_model):
        raise CandcConfigurationError, "The C&C model given (%s) "\
            "doesn't exist." % candc_model

    # Create a logger to dump the output to
    logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model))
    candc_logger = create_logger(filename=logfile)
    self.logger.info("Logging C&C output to %s" % logfile)
    # Note in the log what we're trying to tag
    candc_logger.info("Tagging: %s" % " ".join([str(crd) for crd in self.input]))

    # Read in the list of tags to smooth over
    self.tag_list = read_tag_list(os.path.join(candc_model, "tags"))

    # Read in extra options
    opts_filename = os.path.join(candc_model, "jpopts")
    if not os.path.exists(opts_filename):
        self.extra_opts = {}
    else:
        with open(opts_filename, 'r') as opts_file:
            self.extra_opts = dict(
                [line.strip("\n").split(":", 1)
                    for line in opts_file.readlines()])
    # Pull the chord mapping out of the options
    self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None))

    # Spawn a process to do the tagging
    candc_command = [candc_cmd, "--model", candc_model,
                     "--dict_cutoff", "%d" % dict_cutoff] + self.extra_args
    self.tagger = Popen(candc_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    candc_logger.info("C&C command: %s" % " ".join(candc_command))

    self.tokens = self.input
    # Build some observations from the tokens
    observations = [
        interval_observation_from_chord_string_pair(
            ch1, ch2, type_mapping=self.chordmap)
        for ch1, ch2 in group_pairs(self.tokens + [None])]
    # Add a dummy POS tag to each input item
    self.observations = ["%s|C" % t for t in observations]
    candc_logger.info("Input: %s" % " ".join(self.observations))

    # Run the tagger on this input
    try:
        tagger_out, tagger_err = self.tagger.communicate(" ".join(self.observations))
    except OSError, err:
        logger.error("Could not run the C&C supertagger (%s)" % err)
        candc_logger.error("Error: %s" % err)
        # Output the actual error that the command returned
        error = self.tagger.stderr.read()
        logger.error("C&C returned the error: %s" % error)
        candc_logger.error("C&C error: %s" % error)
        raise CandcTaggingError, "error running the C&C supertagger: %s" % error
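# A minimal sketch of the "jpopts" parsing above, assuming the same
# one-option-per-line "key:value" format implied by the code. Splitting on
# the first colon only means values may themselves contain colons.
def _parse_opts(lines):
    return dict([line.strip("\n").split(":", 1) for line in lines])

assert _parse_opts(["chordmap:small\n", "path:/a:b\n"]) == \
    {"chordmap": "small", "path": "/a:b"}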
def __init__(self, grammar, input, options={}, *args, **kwargs):
    super(MultiChordNgramTagger, self).__init__(grammar, input, options, *args, **kwargs)
    process_chord_input(self)

    #### Tag the input sequence ####
    self._tagged_times = []
    self._tagged_spans = []
    self._batch_ranges = []
    word_tag_probs = []

    # Map the chord types as the model requires
    chord_map = self.model.chordmap
    if isinstance(self.wrapped_input, ChordInput):
        chords = self.wrapped_input.to_db_input().chords
        observations = [(chord.root, chord_map[chord.type])
                        for chord in chords]
        self.input = chords
    elif isinstance(self.wrapped_input, DbInput):
        observations = [(chord.root, chord_map[chord.type])
                        for chord in self.wrapped_input.chords]
    elif isinstance(self.wrapped_input, WeightedChordLabelInput):
        observations = lattice_to_emissions(input, chord_map=chord_map)

    # Use the ngram model to get tag probabilities for each input by
    # computing the forward (or forward-backward) probability matrix
    if self.options['decode'] == "forward":
        probabilities = self.model.forward_probabilities(observations)
    else:
        probabilities = self.model.forward_backward_probabilities(observations)

    # Filter out zero-probability states and order by descending probability
    probabilities = [
        reversed(sorted(
            [(state, prob) for (state, prob) in timestep.items() if prob > 0.0],
            key=lambda x: x[1]))
        for timestep in probabilities]

    for index, probs in enumerate(probabilities):
        features = {
            'duration' : self.durations[index],
            'time' : self.times[index],
        }
        word_signs = []
        for (state, prob) in probs:
            root, schema = state
            # Instantiate a sign for this state
            features['root'] = root
            signs = self.grammar.get_signs_for_tag(schema, features)
            # There should only be one of these
            if not signs:
                continue
            else:
                sign = signs[0]
            word_signs.append((sign, (root, schema), prob))

        self._tagged_times.append(word_signs)
        # Store the list of probabilities for tags, which we'll use
        # after we've tagged every word to work out the sizes
        # of the tag batches
        word_tag_probs.append([p for __, __, p in word_signs])

    if self.options['best']:
        # Only return one for each word
        batch_ranges = [[(0, 1)] for i in range(len(self.input))]
    else:
        # Work out the number of tags to return in each batch
        batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio,
                                         max_batch=self.options['max_batch'])
        # Transform these into a form that's easier to use for getting the signs
        batch_ranges = [
            [(sum(batches[:i]), sum(batches[:i+1])) for i in range(len(batches))]
            for batches in batch_sizes]

    # Step through, adding each batch, to see which spans we should also add
    # to combine repetitions of identical (schema, root) pairs
    def prob_combiner(probs):
        return sum(probs, 0.0) / float(len(probs))
    combiner = SpanCombiner()
    added = True
    offset = 0
    while added:
        added = False
        batch_spans = []
        for time in range(len(batch_ranges)):
            if offset < len(batch_ranges[time]):
                start, end = batch_ranges[time][offset]
                for sign_offset in range(start, end):
                    sign, (root, schema), prob = self._tagged_times[time][sign_offset]
                    added = True
                    # Add the length-1 span
                    batch_spans.append((time, time+1, (sign, (root, schema), prob)))
                    # Add this to the combiner to see if it combines
                    # with anything we've previously added
                    combined = combiner.combine_edge(
                        (time, time+1, (root, schema)),
                        properties=prob,
                        prop_combiner=prob_combiner)
                    # Add each additional span with the same sign
                    for (span_start, span_end) in combined:
                        # Set the probability of the combined categories
                        new_prob = combiner.edge_properties[
                            (span_start, span_end, (root, schema))]
                        # Set timing properties of this spanning category
                        features = {
                            'duration' : sum(self.durations[span_start:span_end]),
                            'time' : self.times[span_start],
                            'root' : root,
                        }
                        # Technically there could be multiple of these,
                        # though in fact there never are
                        new_signs = \
                            self.grammar.get_signs_for_tag(schema, features)
                        for new_sign in new_signs:
                            batch_spans.append(
                                (span_start, span_end,
                                 (new_sign, (root, schema), new_prob)))
        self._tagged_spans.append(batch_spans)
        offset += 1
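# A minimal sketch of the probability combination used above: when adjacent
# spans carrying the same (root, schema) pair are merged by the SpanCombiner
# (whose API is assumed from the code, not shown), the merged span's
# probability is the arithmetic mean of the per-timestep probabilities.
def prob_combiner(probs):
    return sum(probs, 0.0) / float(len(probs))

# E.g. merging three timesteps tagged with the same schema at 0.6, 0.4, 0.5
assert abs(prob_combiner([0.6, 0.4, 0.5]) - 0.5) < 1e-9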