def _load_model(data):
    """
    Rebuild a MultiChordNgramTaggerModel from its stored dictionary form.

    C{data} must provide the entries 'model' (handed to
    C{MultiChordNgramModel.from_picklable_dict}) and 'name'. The
    'chordmap' entry is optional: when it is missing, None is passed to
    C{get_chord_mapping}.

    """
    # Function-level import, as in the original code
    from .model import MultiChordNgramModel

    ngram_model = MultiChordNgramModel.from_picklable_dict(data['model'])
    model_name = data['name']
    mapping = get_chord_mapping(data.get('chordmap'))
    return MultiChordNgramTaggerModel(
        model_name,
        model=ngram_model,
        chordmap=mapping)
def __init__(self, model_name, model=None, chordmap=None, *args, **kwargs):
    """
    Set up an n-gram model that is to be used as a tagging model.

    NLTK is used to represent, train and evaluate the n-gram model.

    C{model_name} and any extra positional/keyword arguments are passed
    straight on to the superclass constructor. C{model} is stored as
    given. If C{chordmap} is None, the result of calling
    C{get_chord_mapping()} with no arguments is stored instead.

    """
    super(MultiChordNgramTaggerModel, self).__init__(
        model_name, *args, **kwargs)
    self.model = model
    # Only look up a mapping ourselves when the caller did not supply one
    self.chordmap = get_chord_mapping() if chordmap is None else chordmap
def _load_model(name, data):
    """
    Construct a HalfspanPcfgModel called C{name} from stored data.

    Required entries of C{data}: 'cutoff', 'cat_bins', 'estimator',
    'parents', 'expansions', 'heads', 'non_heads', 'words' and 'grammar'.
    Optional entries: 'lexical' (True if absent) and 'chordmap' (None is
    passed to C{get_chord_mapping} if absent). The five count tables are
    each converted with C{dict_to_object}.

    """
    # Values are pulled out in the same order the constructor receives them
    cutoff = data['cutoff']
    cat_bins = data['cat_bins']
    estimator = data['estimator']
    lexical = data.get('lexical', True)
    chordmap = get_chord_mapping(data.get('chordmap'))

    parent_counts = dict_to_object(data['parents'])
    expansion_type_counts = dict_to_object(data['expansions'])
    head_expansion_counts = dict_to_object(data['heads'])
    non_head_expansion_counts = dict_to_object(data['non_heads'])
    lexical_counts = dict_to_object(data['words'])

    grammar = data['grammar']

    return HalfspanPcfgModel(
        name=name,
        cutoff=cutoff,
        cat_bins=cat_bins,
        estimator=estimator,
        lexical=lexical,
        chordmap=chordmap,
        parent_counts=parent_counts,
        expansion_type_counts=expansion_type_counts,
        head_expansion_counts=head_expansion_counts,
        non_head_expansion_counts=non_head_expansion_counts,
        lexical_counts=lexical_counts,
        grammar=grammar)
def _load_model(name, data):
    """
    Build the HalfspanPcfgModel named C{name} out of a stored dictionary.

    'lexical' falls back to True and 'chordmap' to None (which is then
    given to C{get_chord_mapping}) when C{data} lacks them; every other
    entry read below must be present. Count tables are passed through
    C{dict_to_object} before being handed to the model.

    """
    # Collect the constructor's keyword arguments, then apply them in one go
    init_kwargs = {
        'name': name,
        'cutoff': data['cutoff'],
        'cat_bins': data['cat_bins'],
        'estimator': data['estimator'],
        'lexical': data.get('lexical', True),
        'chordmap': get_chord_mapping(data.get('chordmap')),
        'parent_counts': dict_to_object(data['parents']),
        'expansion_type_counts': dict_to_object(data['expansions']),
        'head_expansion_counts': dict_to_object(data['heads']),
        'non_head_expansion_counts': dict_to_object(data['non_heads']),
        'lexical_counts': dict_to_object(data['words']),
        'grammar': data['grammar'],
    }
    return HalfspanPcfgModel(**init_kwargs)
def __init__(self, model_name, model=None, chordmap=None, *args, **kwargs):
    """
    Set up an n-gram model that is to be used as a tagging model.

    NLTK is used to represent, train and evaluate the n-gram model.

    C{chordmap} is kept as given in C{self.chordmap_name}; the result of
    passing it to C{get_chord_mapping} is stored as C{self.chordmap}.

    Raises C{TaggingModelError} if the 'backoff' option is not strictly
    smaller than the 'n' option.

    """
    super(NgramTaggerModel, self).__init__(model_name, *args, **kwargs)
    self.model = model
    self.chordmap = get_chord_mapping(chordmap)
    self.chordmap_name = chordmap

    order = self.options['n']
    backoff_orders = self.options['backoff']
    if order <= backoff_orders:
        # Not allowed: an n-gram model can back off at most n-1 orders
        raise TaggingModelError(
            "tried to load an n-gram model with "
            "more orders of backoff than are possible (backing off "
            "%d orders on a %d-gram model)" % (backoff_orders, order))
def from_picklable_dict(cls, data):
    """
    Create an instance of C{cls} from the dictionary C{data}.

    If C{data['backoff_model']} is not None it is itself converted by a
    recursive call to this method; otherwise no backoff model is used.
    The four count tables are converted with C{dict_to_object} and the
    'chord_map' entry is passed through C{get_chord_mapping}. 'history'
    is optional and defaults to the empty string.

    """
    from jazzparser.utils.nltk.storage import dict_to_object

    # Rebuild the chain of backoff models first, if one was stored
    stored_backoff = data['backoff_model']
    backoff_model = None
    if stored_backoff is not None:
        backoff_model = cls.from_picklable_dict(stored_backoff)

    # Gather the constructor arguments in the order they are passed below
    order = data['order']
    count_keys = (
        'point_transition_counts',
        'fn_transition_counts',
        'type_emission_counts',
        'subst_emission_counts',
    )
    point_transitions, fn_transitions, type_emissions, subst_emissions = [
        dict_to_object(data[key]) for key in count_keys
    ]
    estimator = data['estimator']
    chord_mapping = get_chord_mapping(data['chord_map'])
    vector_dom = data['vector_dom']
    point_dom = data['point_dom']
    history = data.get('history', '')

    return cls(
        order,
        point_transitions,
        fn_transitions,
        type_emissions,
        subst_emissions,
        estimator,
        backoff_model,
        chord_mapping,
        vector_dom,
        point_dom,
        history=history)
def __init__(self, grammar, input, options={}, dict_cutoff=5, *args, **kwargs): super(CandcTagger, self).__init__(grammar, input, options, *args, **kwargs) process_chord_input(self) if type(self) == CandcTagger: raise NotImplementedError, "Tried to instantiate CandcTagger "\ "directly. You should use one of its subclasses." self.tag_batch_ratio = self.options['batch'] model = self.options['model'].split('.') # Check that candc is available for supertagging if not os.path.exists(settings.CANDC.BASE_PATH): raise CandcConfigurationError, "The C&C parser base "\ "directory %s does not exist" % settings.CANDC.BASE_PATH if not os.path.exists(settings.CANDC.MODELS_PATH): raise CandcConfigurationError, "The C&C parser models "\ "directory %s does not exist" % settings.CANDC.MODELS_PATH candc_cmd = os.path.join(settings.CANDC.BASE_PATH, "bin", self.command) if not os.path.exists(candc_cmd): raise CandcConfigurationError, "The C&C supertagger command "\ "%s does not exist. Have you built it?" % candc_cmd # Check the model exists candc_model = os.path.join(settings.CANDC.MODELS_PATH, *(model)) if not os.path.exists(candc_model): raise CandcConfigurationError, "The C&C model given (%s) "\ "doesn't exist." 
% candc_model # Create a logger to dump the output to logfile = os.path.join(settings.CANDC.LOG_DIRECTORY, "-".join(model)) candc_logger = create_logger(filename=logfile) self.logger.info("Logging C&C output to %s" % logfile) # Note in the log what we're trying to tag candc_logger.info("Tagging: %s" % " ".join([str(crd) for crd in self.input])) # Read in the list of tags to smooth over self.tag_list = read_tag_list(os.path.join(candc_model, "tags")) # Read in extra options opts_filename = os.path.join(candc_model, "jpopts") if not os.path.exists(opts_filename): self.extra_opts = {} else: with open(opts_filename, 'r') as opts_file: self.extra_opts = dict( [line.strip("\n").split(":", 1) for line in opts_file.readlines()]) # Pull the chord mapping out of the options self.chordmap = get_chord_mapping(self.extra_opts.get('chordmap', None)) # Spawn a process to do the tagging candc_command = [candc_cmd, "--model", candc_model, "--dict_cutoff", "%d" % dict_cutoff]+self.extra_args self.tagger = Popen(candc_command, stdin=PIPE, stdout=PIPE, stderr=PIPE) candc_logger.info("C&C command: %s" % " ".join(candc_command)) self.tokens = self.input # Build some observations from the tokens observations = [ interval_observation_from_chord_string_pair(ch1,ch2,type_mapping=self.chordmap) for ch1,ch2 in group_pairs(self.tokens+[None]) ] # Add a dummy POS tag to each input item self.observations = ["%s|C" % t for t in observations] candc_logger.info("Input: %s" % " ".join(self.observations)) # Run the tagger on this input try: tagger_out, tagger_err = self.tagger.communicate(" ".join(self.observations)) except OSError, err: logger.error("Could not run the C&C supertagger (%s)" % err) candc_logger.error("Error: %s" % err) # Output the actual error that the command returned error = self.tagger.stderr.read() logger.error("C&C returned the error: %s" % error) candc_logger.error("C&C error: %s" % error) raise CandcTaggingError, "error running the C&C supertagger: %s" % error