Example #1
def __init__(self, input, options={}, grammar=None, *args, **kwargs):
    super(HmmPathBuilder, self).__init__(input, options, *args, **kwargs)
    process_chord_input(self)

    if grammar is None:
        self.grammar = get_grammar()
    else:
        self.grammar = grammar

    #### Tag the input sequence ####
    self._tagged_data = []

    # Build (root, mapped chord type) observations from whichever input
    #  type we were given
    chord_map = self.model.model.chord_map
    if isinstance(self.wrapped_input, ChordInput):
        chords = self.wrapped_input.to_db_input().chords
        observations = [(chord.root, chord_map[chord.type])
                        for chord in chords]
        self.input = chords
    elif isinstance(self.wrapped_input, DbInput):
        observations = [(chord.root, chord_map[chord.type])
                        for chord in self.wrapped_input.chords]
    elif isinstance(self.wrapped_input, WeightedChordLabelInput):
        observations = lattice_to_emissions(input, chord_map=chord_map)
    else:
        # Without this, `observations` would be undefined below
        raise TypeError("unsupported input type: %s" %
                        type(self.wrapped_input).__name__)

    # Use the ngram model to decode the most probable state paths
    #  (Viterbi), together with their path probabilities
    path_probs = self.model.viterbi_paths(observations, self.options['paths'])

    self._paths = [
        self.grammar.formalism.backoff_states_to_lf(zip(states, self.times))
        for states, prob in path_probs]
    # Set the probability on each result
    for path, (states, prob) in zip(self._paths, path_probs):
        path.probability = prob
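
The observation encoding and the shape of the decoder output are worth seeing in isolation. Below is a minimal sketch with toy values and a hypothetical Chord class (not the project's real classes); viterbi_paths is assumed to return (state sequence, probability) pairs, so attaching onset times to states is a plain zip.

    # Toy sketch: hypothetical Chord class and made-up values
    chord_map = {'maj7': 'M7', 'min7': 'm7'}

    class Chord(object):
        def __init__(self, root, type):
            self.root, self.type = root, type

    chords = [Chord(0, 'maj7'), Chord(5, 'min7')]
    observations = [(c.root, chord_map[c.type]) for c in chords]
    assert observations == [(0, 'M7'), (5, 'm7')]

    # viterbi_paths() is assumed to return (state_sequence, prob) pairs;
    # pairing states with onset times is then a simple zip
    path_probs = [(['I', 'IV'], 0.6), (['I', 'ii'], 0.3)]
    times = [0, 2]
    timed = [list(zip(states, times)) for states, prob in path_probs]
    assert timed[0] == [('I', 0), ('IV', 2)]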
Example #2
def __init__(self, grammar, input, options={}, *args, **kwargs):
    super(Baseline3Tagger, self).__init__(grammar, input, options, *args,
                                          **kwargs)
    process_chord_input(self)

    #### Tag the input sequence ####
    self._tagged_data = []
    self._batch_ranges = []
    # Group the input into pairs
    inpairs = group_pairs(self.input, none_final=True)
    # Get all the possible signs from the grammar
    for index, pair in enumerate(inpairs):
        features = {
            'duration': self.durations[index],
            'time': self.times[index],
        }
        word_signs = []
        # Now assign a probability to each tag, given the observation
        for tag in self.model.category_count.keys():
            sign = self.grammar.get_sign_for_word_by_tag(
                self.input[index], tag, extra_features=features)
            if sign is not None:
                probability = self.model.get_prob_cat_given_chord_pair(
                    tag, *pair)
                word_signs.append((sign, tag, probability))
        # Sort the signs by descending probability
        word_signs.sort(key=lambda x: x[2], reverse=True)
        self._tagged_data.append(word_signs)

        # Work out the sizes of the batches to return these in
        batches = batch_sizes([p for __, __, p in word_signs],
                              self.batch_ratio)
        # Transform these into (start, end) ranges, which are easier to
        #  use for getting the signs
        so_far = 0
        batch_ranges = []
        for batch in batches:
            batch_ranges.append((so_far, so_far + batch))
            so_far += batch
        self._batch_ranges.append(batch_ranges)
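
The final loop converts batch sizes into half-open index ranges into the probability-sorted sign list. A minimal standalone sketch of that transformation (the helper name is hypothetical; batch_sizes itself is assumed to split a descending probability list into groups):

    def ranges_from_batches(batches):
        # Turn batch sizes into half-open (start, end) index ranges
        ranges, so_far = [], 0
        for batch in batches:
            ranges.append((so_far, so_far + batch))
            so_far += batch
        return ranges

    assert ranges_from_batches([2, 3, 1]) == [(0, 2), (2, 5), (5, 6)]

Examples #3 and #6 below compute the same ranges with a prefix-sum comprehension, (sum(batches[:i]), sum(batches[:i+1])); the two forms are equivalent.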
Example #3
def __init__(self, grammar, input, options={}, *args, **kwargs):
    """
    Tags using an ngram model backed by NLTK.
    """
    super(NgramTagger, self).__init__(grammar, input, options, *args,
                                      **kwargs)
    process_chord_input(self)

    #### Tag the input sequence ####
    self._tagged_data = []
    self._batch_ranges = []
    # Group the input into pairs to get observations
    inpairs = group_pairs(self.input, none_final=True)
    # Convert the pairs into observations
    observations = [
        observation_from_chord_pair(pair[0], pair[1], self.model.chordmap)
        for pair in inpairs]

    # Use the ngram model to get tag probabilities for each input by
    #  computing the forward probability matrix
    if self.options['decode'] == "viterbi":
        probabilities = self.model.viterbi_probabilities(observations)
    elif self.options['decode'] == "forward":
        probabilities = self.model.forward_probabilities(observations)
    else:
        probabilities = self.model.forward_backward_probabilities(
            observations)

    word_tag_probs = []

    for index, probs in enumerate(probabilities):
        features = {
            'duration': self.durations[index],
            'time': self.times[index],
        }
        word_signs = []
        # Now assign a probability to each tag, given the observation
        for tag in self.model.tags:
            # Read a full sign out of the grammar
            sign = self.grammar.get_sign_for_word_by_tag(
                self.input[index], tag, extra_features=features)
            if sign is not None:
                # Read off the probability from the matrix
                probability = probs[tag]
                word_signs.append((sign, tag, probability))

        # Shuffle first so that equal probabilities end up in random
        #  order: the sort below is stable
        random.shuffle(word_signs)
        # Now sort by descending probability
        word_signs.sort(key=lambda x: x[2], reverse=True)
        self._tagged_data.append(word_signs)

        # Store the list of probabilities for tags, which we'll use
        #  after we've tagged every word to work out the sizes
        #  of the tag batches
        word_tag_probs.append([p for __, __, p in word_signs])

    if self.options['best']:
        # Only return one sign for each word
        self._batch_ranges = [[(0, 1)] for i in range(len(self.input))]
    else:
        # Work out the number of tags to return in each batch
        batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio)
        # So far, this has assigned a probability to every possible
        #  tag. We don't want the tagger ever to return the least
        #  probable batch of tags, unless it's the only one.
        #batch_sizes = [batches[:-1] if len(batches) > 1 else batches for batches in batch_sizes]
        # Transform these into (start, end) ranges, which are easier to
        #  use for getting the signs
        self._batch_ranges = [
            [(sum(batches[:i]), sum(batches[:i + 1]))
             for i in range(len(batches))]
            for batches in batch_sizes]
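
The shuffle-then-sort step is a standard trick for random tie-breaking: Python's sort is stable, so shuffling first leaves equal-probability entries in random relative order after the descending sort. A minimal sketch with toy tuples:

    import random

    word_signs = [('signA', 'T1', 0.4), ('signB', 'T2', 0.4),
                  ('signC', 'T3', 0.2)]
    random.shuffle(word_signs)
    word_signs.sort(key=lambda x: x[2], reverse=True)
    # The two 0.4 entries appear in random order, always ahead of 0.2
    assert word_signs[2] == ('signC', 'T3', 0.2)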
Example #4
def __init__(self, *args, **kwargs):
    super(FullTagger, self).__init__(*args, **kwargs)
    process_chord_input(self)
Example #5
def __init__(self, grammar, input, options={}, dict_cutoff=5, *args,
             **kwargs):
    super(CandcTagger, self).__init__(grammar, input, options, *args,
                                      **kwargs)
    process_chord_input(self)

    if type(self) == CandcTagger:
        raise NotImplementedError("Tried to instantiate CandcTagger "
                                  "directly. You should use one of its "
                                  "subclasses.")
    self.tag_batch_ratio = self.options['batch']
    model = self.options['model'].split('.')

    # Check that C&C is available for supertagging
    if not os.path.exists(settings.CANDC.BASE_PATH):
        raise CandcConfigurationError("The C&C parser base "
            "directory %s does not exist" % settings.CANDC.BASE_PATH)
    if not os.path.exists(settings.CANDC.MODELS_PATH):
        raise CandcConfigurationError("The C&C parser models "
            "directory %s does not exist" % settings.CANDC.MODELS_PATH)
    candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command)
    if not os.path.exists(candc_cmd):
        raise CandcConfigurationError("The C&C supertagger command "
            "%s does not exist. Have you built it?" % candc_cmd)
    # Check the model exists
    candc_model = os.path.join(settings.CANDC.MODELS_PATH, *model)
    if not os.path.exists(candc_model):
        raise CandcConfigurationError("The C&C model given (%s) "
            "doesn't exist." % candc_model)

    # Create a logger to dump the C&C output to
    logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model))
    candc_logger = create_logger(filename=logfile)
    self.logger.info("Logging C&C output to %s" % logfile)
    # Note in the log what we're trying to tag
    candc_logger.info("Tagging: %s" %
                      " ".join([str(crd) for crd in self.input]))

    # Read in the list of tags to smooth over
    self.tag_list = read_tag_list(os.path.join(candc_model, "tags"))

    # Read in extra options
    opts_filename = os.path.join(candc_model, "jpopts")
    if not os.path.exists(opts_filename):
        self.extra_opts = {}
    else:
        with open(opts_filename, 'r') as opts_file:
            self.extra_opts = dict(
                line.strip("\n").split(":", 1)
                for line in opts_file)
    # Pull the chord mapping out of the options
    self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None))

    # Spawn a process to do the tagging
    candc_command = [candc_cmd, "--model", candc_model,
                     "--dict_cutoff", "%d" % dict_cutoff] + self.extra_args
    self.tagger = Popen(candc_command,
                        stdin=PIPE, stdout=PIPE, stderr=PIPE)
    candc_logger.info("C&C command: %s" % " ".join(candc_command))

    self.tokens = self.input
    # Build some observations from the tokens
    observations = [
        interval_observation_from_chord_string_pair(
            ch1, ch2, type_mapping=self.chordmap)
        for ch1, ch2 in group_pairs(self.tokens + [None])]
    # Add a dummy POS tag to each input item
    self.observations = ["%s|C" % t for t in observations]
    candc_logger.info("Input: %s" % " ".join(self.observations))

    # Run the tagger on this input
    try:
        tagger_out, tagger_err = self.tagger.communicate(
            " ".join(self.observations))
    except OSError as err:
        logger.error("Could not run the C&C supertagger (%s)" % err)
        candc_logger.error("Error: %s" % err)
        # Output the actual error that the command returned
        error = self.tagger.stderr.read()
        logger.error("C&C returned the error: %s" % error)
        candc_logger.error("C&C error: %s" % error)
        raise CandcTaggingError(
            "error running the C&C supertagger: %s" % error)
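
group_pairs appears throughout these examples. A plausible sketch of its behaviour, inferred from the call sites (an assumption, not the project's actual implementation): it yields consecutive pairs, and none_final=True appends a final (last_item, None) pair so every input item starts exactly one pair. This example achieves the same padding manually with self.tokens + [None].

    def group_pairs(seq, none_final=False):
        # Pair each item with its successor; optionally pad with None so
        # the last item also starts a pair
        items = list(seq) + ([None] if none_final else [])
        return list(zip(items, items[1:]))

    assert group_pairs(['C', 'F', 'G'], none_final=True) == \
        [('C', 'F'), ('F', 'G'), ('G', None)]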
Example #6
def __init__(self, grammar, input, options={}, *args, **kwargs):
    super(MultiChordNgramTagger, self).__init__(grammar, input, options,
                                                *args, **kwargs)
    process_chord_input(self)

    #### Tag the input sequence ####
    self._tagged_times = []
    self._tagged_spans = []
    self._batch_ranges = []
    word_tag_probs = []

    # Map the chord types as the model requires
    chord_map = self.model.chordmap

    if isinstance(self.wrapped_input, ChordInput):
        chords = self.wrapped_input.to_db_input().chords
        observations = [(chord.root, chord_map[chord.type])
                        for chord in chords]
        self.input = chords
    elif isinstance(self.wrapped_input, DbInput):
        observations = [(chord.root, chord_map[chord.type])
                        for chord in self.wrapped_input.chords]
    elif isinstance(self.wrapped_input, WeightedChordLabelInput):
        observations = lattice_to_emissions(input, chord_map=chord_map)
    else:
        # Without this, `observations` would be undefined below
        raise TypeError("unsupported input type: %s" %
                        type(self.wrapped_input).__name__)

    # Use the ngram model to get tag probabilities for each input by
    #  computing the forward probability matrix
    if self.options['decode'] == "forward":
        probabilities = self.model.forward_probabilities(observations)
    else:
        probabilities = self.model.forward_backward_probabilities(
            observations)

    # Filter out zero-probability states and order by descending
    #  probability
    probabilities = [
        sorted([(state, prob) for (state, prob) in timestep.items()
                if prob > 0.0],
               key=lambda x: x[1], reverse=True)
        for timestep in probabilities]

    for index, probs in enumerate(probabilities):
        features = {
            'duration': self.durations[index],
            'time': self.times[index],
        }

        word_signs = []
        for (state, prob) in probs:
            root, schema = state
            # Instantiate a sign for this state
            features['root'] = root
            signs = self.grammar.get_signs_for_tag(schema, features)
            # There should only be one of these
            if not signs:
                continue
            sign = signs[0]
            word_signs.append((sign, (root, schema), prob))

        self._tagged_times.append(word_signs)

        # Store the list of probabilities for tags, which we'll use
        #  after we've tagged every word to work out the sizes
        #  of the tag batches
        word_tag_probs.append([p for __, __, p in word_signs])

    if self.options['best']:
        # Only return one sign for each word
        batch_ranges = [[(0, 1)] for i in range(len(self.input))]
    else:
        # Work out the number of tags to return in each batch
        batch_sizes = beamed_batch_sizes(word_tag_probs, self.batch_ratio,
                                         max_batch=self.options['max_batch'])
        # Transform these into (start, end) ranges, which are easier to
        #  use for getting the signs
        batch_ranges = [
            [(sum(batches[:i]), sum(batches[:i + 1]))
             for i in range(len(batches))]
            for batches in batch_sizes]

    # Step through the batches, combining repetitions of identical
    #  (root, schema) pairs on adjacent timesteps into longer spans
    def prob_combiner(probs):
        return sum(probs, 0.0) / float(len(probs))
    combiner = SpanCombiner()
    added = True
    offset = 0
    while added:
        added = False
        batch_spans = []
        for time in range(len(batch_ranges)):
            if offset < len(batch_ranges[time]):
                start, end = batch_ranges[time][offset]
                for sign_offset in range(start, end):
                    sign, (root, schema), prob = \
                        self._tagged_times[time][sign_offset]
                    added = True
                    # Add the length-1 span
                    batch_spans.append(
                        (time, time + 1, (sign, (root, schema), prob)))
                    # Add this to the combiner to see if it combines
                    #  with anything we've previously added
                    combined = combiner.combine_edge(
                        (time, time + 1, (root, schema)),
                        properties=prob,
                        prop_combiner=prob_combiner)
                    # Add each additional span with the same sign
                    for (span_start, span_end) in combined:
                        # Get the probability of the combined categories
                        new_prob = combiner.edge_properties[
                            (span_start, span_end, (root, schema))]
                        # Set timing properties of this spanning category
                        features = {
                            'duration': sum(
                                self.durations[span_start:span_end]),
                            'time': self.times[span_start],
                            'root': root,
                        }
                        # Technically there could be multiple of these,
                        #  though in fact there never are
                        new_signs = self.grammar.get_signs_for_tag(
                            schema, features)
                        for new_sign in new_signs:
                            batch_spans.append(
                                (span_start, span_end,
                                 (new_sign, (root, schema), new_prob)))
        self._tagged_spans.append(batch_spans)
        offset += 1
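
The SpanCombiner step merges repeats of the same (root, schema) label on adjacent timesteps into one longer span, with prob_combiner averaging the probabilities of the parts. A minimal sketch of that merging idea (a hypothetical helper, not the project's SpanCombiner, which also handles non-contiguous combination orders):

    def combine_adjacent(spans):
        # spans: (start, end, label, prob) tuples sorted by start time.
        # Merge runs of adjacent spans with the same label, averaging
        # their probabilities.
        merged = []  # entries: [start, end, label, prob_sum, count]
        for start, end, label, prob in spans:
            if merged and merged[-1][1] == start and merged[-1][2] == label:
                merged[-1][1] = end
                merged[-1][3] += prob
                merged[-1][4] += 1
            else:
                merged.append([start, end, label, prob, 1])
        return [(s, e, lab, p / float(n)) for s, e, lab, p, n in merged]

    assert combine_adjacent([(0, 1, ('C', 'I'), 0.6),
                             (1, 2, ('C', 'I'), 0.4)]) == \
        [(0, 2, ('C', 'I'), 0.5)]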