示例#1
0
 def _load_model(data):
     """
     Rebuild a MultiChordNgramTaggerModel from its stored dictionary form.
     
     Reads the 'model' and 'name' entries of C{data}. The optional 
     'chordmap' entry is handed to C{get_chord_mapping} (as C{None} when 
     it is absent).
     
     @type data: dict
     @param data: stored representation of the tagger model
     @return: the reconstructed MultiChordNgramTaggerModel
     
     """
     from .model import MultiChordNgramModel
     
     # Restore the underlying n-gram model first, then wrap it up together 
     # with its name and chord mapping
     ngram_model = MultiChordNgramModel.from_picklable_dict(data['model'])
     return MultiChordNgramTaggerModel(
         data['name'],
         model=ngram_model,
         chordmap=get_chord_mapping(data.get('chordmap', None)))
示例#2
0
 def __init__(self, model_name, model=None, chordmap=None, *args, **kwargs):
     """
     Tagging model built around an n-gram model. The n-gram model is 
     represented, trained and evaluated using NLTK.
     
     @param model_name: name of the model, passed on to the superclass
     @param model: the n-gram model to wrap (may be None)
     @param chordmap: chord mapping to use; when None, whatever 
         C{get_chord_mapping()} returns with no arguments is used instead
     
     Any further positional and keyword arguments are forwarded to the 
     superclass constructor.
     
     """
     super(MultiChordNgramTaggerModel, self).__init__(model_name, *args, **kwargs)
     self.model = model
     # Only look up the default mapping when the caller didn't give one
     self.chordmap = get_chord_mapping() if chordmap is None else chordmap
示例#3
0
 def _load_model(name, data):
     """
     Rebuild a HalfspanPcfgModel called C{name} from its stored 
     dictionary form.
     
     The 'lexical' entry of C{data} defaults to True and the 'chordmap' 
     entry to None when missing; every other entry read here is required.
     
     @param name: name to give the model
     @type data: dict
     @param data: stored representation of the model
     @return: the reconstructed HalfspanPcfgModel
     
     """
     # Plain settings
     cutoff = data['cutoff']
     cat_bins = data['cat_bins']
     estimator = data['estimator']
     lexical = data.get('lexical', True)
     chordmap = get_chord_mapping(data.get('chordmap', None))
     # Count tables are stored as dicts and need converting back to objects
     parent_counts = dict_to_object(data['parents'])
     expansion_type_counts = dict_to_object(data['expansions'])
     head_expansion_counts = dict_to_object(data['heads'])
     non_head_expansion_counts = dict_to_object(data['non_heads'])
     lexical_counts = dict_to_object(data['words'])
     grammar = data['grammar']
     
     return HalfspanPcfgModel(
         name=name,
         cutoff=cutoff,
         cat_bins=cat_bins,
         estimator=estimator,
         lexical=lexical,
         chordmap=chordmap,
         parent_counts=parent_counts,
         expansion_type_counts=expansion_type_counts,
         head_expansion_counts=head_expansion_counts,
         non_head_expansion_counts=non_head_expansion_counts,
         lexical_counts=lexical_counts,
         grammar=grammar,
     )
示例#4
0
 def _load_model(name, data):
     """
     Construct a HalfspanPcfgModel named C{name} out of the dictionary 
     C{data} that it was stored as.
     
     'lexical' falls back to True and 'chordmap' to None if they are not 
     in C{data}. All the other keys read below must be present.
     
     @param name: name to give the model
     @type data: dict
     @param data: stored representation of the model
     @return: the reconstructed HalfspanPcfgModel
     
     """
     # Constructor argument name -> key under which its counts were stored
     count_tables = [
         ('parent_counts', 'parents'),
         ('expansion_type_counts', 'expansions'),
         ('head_expansion_counts', 'heads'),
         ('non_head_expansion_counts', 'non_heads'),
         ('lexical_counts', 'words'),
     ]
     
     params = {
         'name': name,
         'cutoff': data['cutoff'],
         'cat_bins': data['cat_bins'],
         'estimator': data['estimator'],
         'lexical': data.get('lexical', True),
         'chordmap': get_chord_mapping(data.get('chordmap', None)),
     }
     # The counts are stored in dict form: turn them back into objects
     for argument, key in count_tables:
         params[argument] = dict_to_object(data[key])
     params['grammar'] = data['grammar']
     
     return HalfspanPcfgModel(**params)
示例#5
0
 def __init__(self, model_name, model=None, chordmap=None, *args, **kwargs):
     """
     An n-gram model to be used as a tagging model. Uses NLTK to 
     represent, train and evaluate the n-gram model.
     
     @param model_name: name of the model, passed on to the superclass
     @param model: the underlying n-gram model (may be None)
     @param chordmap: value handed to C{get_chord_mapping} to obtain the 
         chord mapping. It is also kept, unchanged, as C{chordmap_name}.
     @raise TaggingModelError: if the 'backoff' option is not strictly 
         smaller than the 'n' option
     
     Any further positional and keyword arguments are forwarded to the 
     superclass constructor.
     
     """
     super(NgramTaggerModel, self).__init__(model_name, *args, **kwargs)
     self.model = model
     
     self.chordmap = get_chord_mapping(chordmap)
     self.chordmap_name = chordmap
     
     # self.options is not set in this method: presumably the superclass 
     # constructor provides it
     if self.options['n'] <= self.options['backoff']:
         # This is not allowed
         # We can only back off n-1 orders for an n-gram model
         # The exception is instantiated explicitly: the old 
         # "raise Class, message" form is a syntax error on Python 3
         raise TaggingModelError(
             "tried to load an n-gram model with "
             "more orders of backoff than are possible (backing off "
             "%d orders on a %d-gram model)" %
                 (self.options['backoff'], self.options['n']))
示例#6
0
    def __init__(self, model_name, model=None, chordmap=None, *args, **kwargs):
        """
        An n-gram model to be used as a tagging model. Uses NLTK to 
        represent, train and evaluate the n-gram model.

        @param model_name: name of the model, passed on to the superclass
        @param model: the underlying n-gram model (may be None)
        @param chordmap: value handed to C{get_chord_mapping} to obtain the 
            chord mapping. It is also kept, unchanged, as C{chordmap_name}.
        @raise TaggingModelError: if the 'backoff' option is not strictly 
            smaller than the 'n' option

        Any further positional and keyword arguments are forwarded to the 
        superclass constructor.

        """
        super(NgramTaggerModel, self).__init__(model_name, *args, **kwargs)
        self.model = model

        self.chordmap = get_chord_mapping(chordmap)
        self.chordmap_name = chordmap

        # self.options is not set in this method: presumably the superclass 
        # constructor provides it
        if self.options['n'] <= self.options['backoff']:
            # This is not allowed
            # We can only back off n-1 orders for an n-gram model
            # The exception is instantiated explicitly: the old 
            # "raise Class, message" form is a syntax error on Python 3
            raise TaggingModelError(
                "tried to load an n-gram model with "
                "more orders of backoff than are possible (backing off "
                "%d orders on a %d-gram model)" %
                    (self.options['backoff'], self.options['n']))
示例#7
0
    def from_picklable_dict(cls, data):
        """
        Build an instance of C{cls} from the dictionary form C{data}.

        A non-None 'backoff_model' entry is itself a dictionary of the 
        same form and is converted recursively. The 'history' entry is 
        optional and defaults to the empty string.

        @type data: dict
        @param data: stored representation of the model
        @return: the reconstructed instance of C{cls}

        """
        from jazzparser.utils.nltk.storage import dict_to_object

        stored_backoff = data['backoff_model']
        backoff_model = None if stored_backoff is None \
                            else cls.from_picklable_dict(stored_backoff)

        # Keys of the count tables, in the order the constructor takes them
        count_keys = (
            'point_transition_counts',
            'fn_transition_counts',
            'type_emission_counts',
            'subst_emission_counts',
        )

        # Collect the positional constructor arguments in order
        positional = [data['order']]
        positional.extend(dict_to_object(data[key]) for key in count_keys)
        positional.extend([
            data['estimator'],
            backoff_model,
            get_chord_mapping(data['chord_map']),
            data['vector_dom'],
            data['point_dom'],
        ])
        return cls(*positional, history=data.get('history', ''))
示例#8
0
 def from_picklable_dict(cls, data):
     """
     Reconstruct an instance of C{cls} from C{data}, the dictionary it 
     was stored as.
     
     If 'backoff_model' is not None it is taken to be another such 
     dictionary and is converted by a recursive call. 'history' may be 
     omitted, in which case the empty string is used.
     
     @type data: dict
     @param data: stored representation of the model
     @return: the reconstructed instance of C{cls}
     
     """
     from jazzparser.utils.nltk.storage import dict_to_object
     
     # Deal with the (optional) chain of backoff models first
     backoff_model = None
     if data['backoff_model'] is not None:
         backoff_model = cls.from_picklable_dict(data['backoff_model'])
     
     order = data['order']
     # The count tables were stored as dicts: convert them back to objects
     point_transitions = dict_to_object(data['point_transition_counts'])
     fn_transitions = dict_to_object(data['fn_transition_counts'])
     type_emissions = dict_to_object(data['type_emission_counts'])
     subst_emissions = dict_to_object(data['subst_emission_counts'])
     estimator = data['estimator']
     chord_mapping = get_chord_mapping(data['chord_map'])
     vector_dom = data['vector_dom']
     point_dom = data['point_dom']
     history = data.get('history', '')
     
     return cls(order, point_transitions, fn_transitions, type_emissions,
                subst_emissions, estimator, backoff_model, chord_mapping,
                vector_dom, point_dom, history=history)
示例#9
0
 def __init__(self, grammar, input, options=None, dict_cutoff=5, *args, **kwargs):
     """
     Check the C&C installation and model, start the C&C supertagger 
     process and feed it the observations built from the input chords.
     
     @param grammar: passed on to the superclass constructor
     @param input: passed on to the superclass constructor
     @type options: dict
     @param options: tagger options, passed on to the superclass 
         constructor. Defaults to a fresh empty dict for each call.
     @type dict_cutoff: int
     @param dict_cutoff: value given to the C&C command as its 
         C{--dict_cutoff} argument
     @raise NotImplementedError: if CandcTagger itself is instantiated, 
         rather than a subclass
     @raise CandcConfigurationError: if the C&C base directory, models 
         directory, tagger command or the requested model doesn't exist
     @raise CandcTaggingError: if running the tagger raises an OSError
     
     Any further positional and keyword arguments are forwarded to the 
     superclass constructor.
     
     """
     if options is None:
         # Don't share one default dict between all the instances
         options = {}
     super(CandcTagger, self).__init__(grammar, input, options, *args, **kwargs)
     process_chord_input(self)
     
     # Deliberately an exact type check: subclasses must get through
     if type(self) is CandcTagger:
         raise NotImplementedError("Tried to instantiate CandcTagger "
             "directly. You should use one of its subclasses.")
     self.tag_batch_ratio = self.options['batch']
     # The model name is dot-separated: each part is a path component
     model = self.options['model'].split('.')
     
     # Check that candc is available for supertagging
     if not os.path.exists(settings.CANDC.BASE_PATH):
         raise CandcConfigurationError("The C&C parser base "
             "directory %s does not exist" % settings.CANDC.BASE_PATH)
     if not os.path.exists(settings.CANDC.MODELS_PATH):
         raise CandcConfigurationError("The C&C parser models "
             "directory %s does not exist" % settings.CANDC.MODELS_PATH)
     candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command)
     if not os.path.exists(candc_cmd):
         raise CandcConfigurationError("The C&C supertagger command "
             "%s does not exist. Have you built it?" % candc_cmd)
     # Check the model exists
     candc_model = os.path.join(settings.CANDC.MODELS_PATH, *(model))
     if not os.path.exists(candc_model):
         raise CandcConfigurationError("The C&C model given (%s) "
             "doesn't exist." % candc_model)
     
     # Create a logger to dump the output to
     logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model))
     candc_logger = create_logger(filename=logfile)
     self.logger.info("Logging C&C output to %s" % logfile)
     # Note in the log what we're trying to tag
     candc_logger.info("Tagging: %s" % " ".join([str(crd) for crd in self.input]))
     
     # Read in the list of tags to smooth over
     self.tag_list = read_tag_list(os.path.join(candc_model, "tags"))
     
     # Read in extra options: one "key:value" pair per line
     opts_filename = os.path.join(candc_model, "jpopts")
     if not os.path.exists(opts_filename):
         self.extra_opts = {}
     else:
         with open(opts_filename, 'r') as opts_file:
             # Skip blank lines, which can't be split into a pair
             self.extra_opts = dict(
                 line.strip("\n").split(":", 1) 
                     for line in opts_file if line.strip())
     # Pull the chord mapping out of the options
     self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None))
     
     # Spawn a process to do the tagging
     candc_command = [candc_cmd, "--model", candc_model, 
                     "--dict_cutoff", "%d" % dict_cutoff]+self.extra_args
     self.tagger = Popen(candc_command, 
                         stdin=PIPE, stdout=PIPE, stderr=PIPE)
     candc_logger.info("C&C command: %s" % " ".join(candc_command))
         
     self.tokens = self.input
     # Build some observations from the tokens
     # Each chord is paired with its successor (None after the last one)
     observations = [
         interval_observation_from_chord_string_pair(ch1,ch2,type_mapping=self.chordmap) 
             for ch1,ch2 in group_pairs(self.tokens+[None])
     ]
     # Add a dummy POS tag to each input item
     self.observations = ["%s|C" % t for t in observations]
     candc_logger.info("Input: %s" % " ".join(self.observations))
     
     # Run the tagger on this input
     try:
         tagger_out, tagger_err = self.tagger.communicate(" ".join(self.observations))
     except OSError as err:
         logger.error("Could not run the C&C supertagger (%s)" % err)
         candc_logger.error("Error: %s" % err)
         # Output the actual error that the command returned
         error = self.tagger.stderr.read()
         logger.error("C&C returned the error: %s" % error)
         candc_logger.error("C&C error: %s" % error)
         raise CandcTaggingError("error running the C&C supertagger: %s" % error)