def sentence_ruleapplication(sentence, filtering=None, timeout=None, **kwargs):
    """
    Apply rules to the source graph of a single sentence.

    :param sentence: Sentence object with a source-side graph attached
    :param filtering: Optional filter object; sentences for which
        filtering.filter() returns falsy are skipped entirely
    :param timeout: Optional number of seconds after which rule application
        is aborted via the `to` timeout context manager
    :param kwargs: Extra keyword arguments passed through to ruleapplication()
    :return: Tuple (sentence.id, applied_rules dict). The dict is empty when
        the sentence was filtered out, has no graph, or timed out.
    """
    applied_rules = dict()

    # Filtered-out sentences are reported and returned with an empty result.
    if filtering is not None and not filtering.filter(sentence):
        logging.info('Skipping sentence with id %s (%s) due to filtering (graph size %d)' % (
            sentence.id, sentence.orig_id, len(sentence.source.graph)))
        return sentence.id, applied_rules

    if sentence.source.graph is not None:
        logging.debug('Starting rule application for sentence %s' % sentence.id)
        try:
            if timeout:
                # Bound the runtime of rule application; to.TimeoutError is
                # raised when the budget is exceeded.
                with to.timeout(seconds=timeout):
                    applied_rules = ruleapplication(sentence.source.graph, **kwargs)
            else:
                applied_rules = ruleapplication(sentence.source.graph, **kwargs)
            logging.info('Extracted %d possible source sides for sentence with id %s (%s)' % (
                len(applied_rules), sentence.id, sentence.orig_id))
        except to.TimeoutError:
            # logging.warning: `logging.warn` is a deprecated alias.
            # On timeout we fall through and return the (empty) result.
            logging.warning('Rule application for sentence with id %s (%s) failed due to timeout after %d seconds' % (
                sentence.id, sentence.orig_id, timeout))
    return sentence.id, applied_rules
def top_level_pruning(input_dir, output_dir, prune_threshold, n_best, openfst, max_duration=600):
    """
    Read lattices from disk and prune + union them with n shortest paths.

    :param input_dir: Input lattice directory path
    :param output_dir: Output lattice directory path
    :param prune_threshold: Integer indicating how much to prune
    :param n_best: Integer indicating how many best outputs to store
    :param openfst: OpenFST utility functions object
    :param max_duration: Maximum number of seconds allowed for pruning one FSA
        before retrying with a halved n_best
    :return:
    """
    # Iterate over files in input directory; sorted() makes the processing
    # order deterministic across runs.
    for fst_name in sorted(os.listdir(input_dir)):
        if not fst_name.endswith(".fst"):
            continue
        filename_in = os.path.join(input_dir, fst_name)
        filename_out = os.path.join(output_dir, fst_name)

        # Read a sentence lattice back into memory
        logging.info('Reading lattice stored at %s' % (filename_in, ))
        fsa = openfst.read_fst(filename_in)

        # Compute its n_best_list
        start_time = time.time()
        logging.info(
            'FSA has %d states and %d arcs before top level pruning.'
            % (openfst.num_states(fsa), openfst.num_arcs(fsa)))
        try:
            with to.timeout(seconds=max_duration):
                fsa = fst_pruning(fsa, prune_threshold, n_best, openfst)
        except to.TimeoutError:
            # logging.warning: `logging.warn` is a deprecated alias.
            logging.warning(
                'Top-level pruning of FSA %s was terminated due to exceeding the timeout limit of %d seconds'
                % (filename_in, max_duration))
            # Retry once with half the n-best size to make the pruning cheaper.
            # NOTE(review): the retry runs without a timeout guard — presumably
            # acceptable because the smaller n_best converges; confirm.
            reduced_n_best = int(n_best / 2.0)
            logging.info('Performing TLP with reduced n_best size of %d.' % (reduced_n_best, ))
            fsa = fst_pruning(fsa, prune_threshold, reduced_n_best, openfst)
        logging.info(
            'FSA has %d states and %d arcs after top level pruning, performed in %.3f seconds.'
            % (openfst.num_states(fsa), openfst.num_arcs(fsa), time.time() - start_time))

        logging.info('Writing lattice to disk at %s' % (filename_out, ))
        openfst.write_fst(fsa, filename_out)
def dataset_translation_rulextraction(sentence_collection, idx=True, filtering=None, timeout=None, **kwargs):
    """
    Extract translation rules from every eligible sentence in a collection.

    :param sentence_collection: Iterable of Sentence objects; their graphs are
        loaded up front via read_graphs()
    :param idx: If True use target.tokenized_idx, otherwise target.tokenized_text
    :param filtering: Optional filter object; sentences rejected by
        filtering.filter() are skipped
    :param timeout: Optional per-sentence timeout in seconds for rulextraction()
    :param kwargs: Extra keyword arguments passed through to rulextraction()
    :return: Flat list of rules extracted from all processed sentences
    """
    read_graphs(sentence_collection, idx=idx)
    rules = list()
    for sentence in sentence_collection:
        if idx:
            tok = sentence.target.tokenized_idx
        else:
            tok = sentence.target.tokenized_text

        # Filtered-out sentences are logged and skipped entirely.
        if filtering is not None and not filtering.filter(sentence):
            logging.info(
                'Skipping sentence with id %s (%s) due to filtering (source token number %d, graph size %d)'
                % (sentence.id, sentence.orig_id,
                   len(sentence.source.tokenized_text), len(sentence.source.graph)))
            continue

        # Rule extraction needs a graph, target tokens and a source-graph-to-
        # target-text alignment (sgtt) to be present.
        if sentence.source.graph is not None and tok is not None \
                and sentence.alignment is not None and sentence.alignment.sgtt is not None:
            logging.debug('Starting rule extraction for sentence %s' % sentence.id)
            # NOTE(review): assumes each sgtt entry is (plain, index) where
            # `index` is itself a key/value pair suitable for dict() — confirm.
            alignments = dict(index for plain, index in sentence.alignment.sgtt)
            alignment_dict = utility.create_alignment_dict(
                alignments, sentence.source.graph)
            # Python 2 `unicode` conversion of the target tokens.
            tok = [unicode(x) for x in tok]
            try:
                if timeout:
                    # Bound the runtime of rule extraction for this sentence.
                    with to.timeout(seconds=timeout):
                        sentence_rules = rulextraction(sentence.source.graph, tok, alignment_dict, **kwargs)
                else:
                    sentence_rules = rulextraction(sentence.source.graph, tok, alignment_dict, **kwargs)
                rules.extend(sentence_rules)
                logging.info(
                    'Extracted %d rules from sentence with id %s (%s)'
                    % (len(sentence_rules), sentence.id, sentence.orig_id))
            except to.TimeoutError:
                # logging.warning: `logging.warn` is a deprecated alias.
                logging.warning(
                    'Rule extraction for sentence with id %s (%s) failed due to timeout after %d seconds'
                    % (sentence.id, sentence.orig_id, timeout))
                continue
    return rules
def decode_sentence(self, index, sentence, grammar_path, coverage_path, feature_weights_dict,
                    local_pruning_lm, openfst, translation_lats, alilats_dir, hiero_dir, hiero_lats,
                    prune_reference_shortest_path):
    """
    Decode a single sentence in alignment mode and save its lattice to disk.

    :param index: Sentence index in the dataset
    :param sentence: Sentence object
    :param grammar_path: Path to the sentence specific grammars
    :param coverage_path: Path to the sentence specific coverages
    :param feature_weights_dict: Dictionary of feature names and their weights
    :param local_pruning_lm: Language model FSA for local pruning
    :param openfst: OpenFST utility functions object
    :param translation_lats: Directory path storing translation lattices
    :param alilats_dir: Directory path for storing alignment lattices
    :param hiero_lats: Directory path storing Hiero translation lattices - used for hiero only mode
    :param hiero_dir: Hiero RTN directory path
    :param prune_reference_shortest_path: Number of best reference hypotheses extracted from translation FSA
    :return: None; the resulting lattice is written to alilats_dir as <id>.fst
        (an empty FST when decoding failed or was filtered with no hiero fallback)
    """
    logging.info('Starting decoding of sentence with index %d and id %s' % (index, sentence.id))
    start_time = time.time()

    if self.hiero_only:
        # Forced hiero-only decoding, regardless of filtering or HSST setup.
        logging.info(
            'Starting decoding of sentence %s in hiero only mode.'
            % (sentence.id, ))
        sentence_lattice = self.hiero_only_mode(
            sentence, hiero_dir, hiero_lats, prune_reference_shortest_path, openfst)
    elif self.filtering is not None and not self.filtering.filter(sentence):
        # If sentence is filtered out for HSST decoding, try to use hiero
        logging.info('Sentence %s filtered out of HSST decoding.' % (sentence.id, ))
        if hiero_dir is None:
            # NOTE(review): unlike the exception handlers below, this branch
            # does not consult self.hiero_backoff and returns without writing
            # any lattice to disk — confirm this asymmetry is intended.
            logging.info('Returning empty result for sentence %s.' % (sentence.id, ))
            return
        else:
            logging.info(
                'Starting decoding of sentence %s in hiero only mode.'
                % (sentence.id, ))
            sentence_lattice = self.hiero_only_mode(
                sentence, hiero_dir, hiero_lats, prune_reference_shortest_path, openfst)
    else:
        # Sentence-specific hiero RTN subdirectory (only when hiero is available).
        hiero_subdir = None
        if hiero_dir is not None:
            hiero_subdir = os.path.join(hiero_dir, sentence.id)
        try:
            # Load sentence specific grammar
            logging.info('Loading grammar for sentence %s from %s.' % (sentence.id, grammar_path))
            sentence_grammar = utility.load_sentence_specific_grammar(
                sentence.id, grammar_path)
            logging.info('Loading coverage for sentence %s from %s.' % (sentence.id, coverage_path))
            sentence_coverage = utility.load_sentence_specific_coverage(
                sentence.id, coverage_path)
            # Everything inside this context is bounded by self.timeout seconds.
            with to.timeout(seconds=self.timeout):
                # Create reference and reference substring FSAs
                reference_fsa, reference_subs_fsa = self.create_reference_acceptors(
                    sentence.id, translation_lats, openfst, prune_reference_shortest_path)
                if hiero_subdir is None:
                    logging.info(
                        'Starting decoding of sentence %s in HSST only mode.'
                        % (sentence.id, ))
                    sentence_lattice = self.hsst_only_mode(
                        sentence, sentence_grammar, sentence_coverage, feature_weights_dict,
                        local_pruning_lm, reference_fsa, reference_subs_fsa, openfst)
                else:
                    logging.info(
                        'Starting decoding of sentence %s in hiero+HSST mode.'
                        % (sentence.id, ))
                    sentence_lattice = self.hsst_hiero_mode(
                        sentence, sentence_grammar, sentence_coverage, feature_weights_dict,
                        local_pruning_lm, reference_fsa, reference_subs_fsa, openfst,
                        hiero_subdir)
        except to.TimeoutError:
            # Decoding exceeded the time budget: either emit an empty FST or,
            # when hiero backoff is enabled, retry in hiero-only mode.
            logging.warn(
                'Decoding of sentence %s (%s) was terminated due to exceeding the timeout limit of %d seconds'
                % (sentence.id, sentence.orig_id, self.timeout))
            if hiero_dir is None or not self.hiero_backoff:
                sentence_lattice = openfst.create_empty_fst()
            else:
                # Relaunch decoding using Hiero-only mode
                logging.info(
                    'Starting decoding of sentence %s in hiero only mode.'
                    % (sentence.id, ))
                sentence_lattice = self.hiero_only_mode(
                    sentence, hiero_dir, hiero_lats, prune_reference_shortest_path, openfst)
        except MemoryError:
            # Same fallback policy as on timeout.
            logging.error(
                'Decoding of sentence %s (%s) was terminated due to exceeding allowed memory after %d seconds'
                % (sentence.id, sentence.orig_id, time.time() - start_time))
            if hiero_dir is None or not self.hiero_backoff:
                sentence_lattice = openfst.create_empty_fst()
            else:
                # Relaunch decoding using Hiero-only mode
                logging.info(
                    'Starting decoding of sentence %s in hiero only mode.'
                    % (sentence.id, ))
                sentence_lattice = self.hiero_only_mode(
                    sentence, hiero_dir, hiero_lats, prune_reference_shortest_path, openfst)
        except MissingFileException:
            # A required grammar/coverage/lattice file was absent; same
            # fallback policy as on timeout.
            logging.exception('Missing file')
            if hiero_dir is None or not self.hiero_backoff:
                sentence_lattice = openfst.create_empty_fst()
            else:
                # Relaunch decoding using Hiero-only mode
                logging.info(
                    'Starting decoding of sentence %s in hiero only mode.'
                    % (sentence.id, ))
                sentence_lattice = self.hiero_only_mode(
                    sentence, hiero_dir, hiero_lats, prune_reference_shortest_path, openfst)

    # If decoding failed, skip storing lattice to the disk
    if sentence_lattice is None:
        logging.error(
            'Decoding of sentence with index %d and id %s failed in %.3f seconds'
            % (index, sentence.id, time.time() - start_time))
        # An empty FST is still written below so downstream tools find a file.
        sentence_lattice = openfst.create_empty_fst()

    logging.info(
        'Decoding of sentence with index %d and id %s finished in %.3f seconds'
        % (index, sentence.id, time.time() - start_time))

    # Store lattice to disk
    current_time = time.time()
    lattice_path = os.path.join(alilats_dir, '%s.fst' % sentence.id)
    openfst.store_lattice(sentence_lattice, lattice_path)
    logging.info('Lattice stored to disk as %s in %.3f seconds' % (lattice_path, time.time() - current_time))
def decode_sentence(self, index, sentence, grammar_path, coverage_path, feature_weights_dict,
                    local_pruning_lm, openfst, sentence_id_map, lats_dir, hiero_dir):
    """
    Decode a single sentence and save its lattice to disk (without LM applied).

    :param index: Sentence index in the dataset
    :param sentence: Sentence object
    :param grammar_path: Path to the sentence specific grammars
    :param coverage_path: Path to the sentence specific coverages
    :param feature_weights_dict: Dictionary of feature names and their weights
    :param local_pruning_lm: Language model FSA for local pruning
    :param openfst: OpenFST utility functions object
    :param sentence_id_map: Dictionary for tracking decoded sentence ids for applying LM
    :param lats_dir: Directory path for storing lattices
    :param hiero_dir: Hiero RTN directory path
    :return: Sentence(sentence.id) when decoding was attempted but produced no
        lattice; Sentence(sentence.id, result=Result()) on success (lattice
        stored to lats_dir)
    """
    logging.info('Starting decoding of sentence with index %d and id %s' % (index, sentence.id))
    start_time = time.time()

    if self.hiero_only:
        # Forced hiero-only decoding, regardless of filtering or HSST setup.
        logging.info('Starting decoding of sentence %s in hiero only mode.' % (sentence.id,))
        sentence_lattice = self.hiero_only_mode(sentence, hiero_dir, openfst)
    elif self.filtering is not None and not self.filtering.filter(sentence):
        # If sentence is filtered out for HSST decoding, try to use hiero
        logging.info('Sentence %s filtered out of HSST decoding.' % (sentence.id,))
        if hiero_dir is None or not self.hiero_backoff:
            # No hiero fallback possible: return an empty result sentence.
            logging.info('Returning empty result for sentence %s.' % (sentence.id,))
            return Sentence(sentence.id)
        else:
            logging.info('Starting decoding of sentence %s in hiero only mode.' % (sentence.id,))
            sentence_lattice = self.hiero_only_mode(sentence, hiero_dir, openfst)
    else:
        try:
            # Load sentence specific grammar
            logging.info('Loading grammar for sentence %s from %s.' % (sentence.id, grammar_path))
            sentence_grammar = utility.load_sentence_specific_grammar(sentence.id, grammar_path)
            logging.info('Loading coverage for sentence %s from %s.'
                         % (sentence.id, coverage_path))
            sentence_coverage = utility.load_sentence_specific_coverage(sentence.id, coverage_path)
            # Everything inside this context is bounded by self.timeout seconds.
            with to.timeout(seconds=self.timeout):
                if hiero_dir is None:
                    logging.info('Starting decoding of sentence %s in HSST only mode.' % (sentence.id,))
                    sentence_lattice = self.hsst_only_mode(
                        sentence,
                        sentence_grammar,
                        sentence_coverage,
                        feature_weights_dict,
                        local_pruning_lm,
                        openfst
                    )
                else:
                    logging.info('Starting decoding of sentence %s in hiero+HSST mode.' % (sentence.id,))
                    sentence_lattice = self.hsst_hiero_mode(
                        sentence,
                        sentence_grammar,
                        sentence_coverage,
                        feature_weights_dict,
                        local_pruning_lm,
                        openfst,
                        hiero_dir
                    )
        except to.TimeoutError:
            # Decoding exceeded the time budget: either give up with an empty
            # result sentence or, when hiero backoff is enabled, retry in
            # hiero-only mode.
            logging.warn('Decoding of sentence %s (%s) was terminated due to exceeding the timeout limit of %d seconds' % (
                sentence.id,
                sentence.orig_id,
                self.timeout
            ))
            if hiero_dir is None or not self.hiero_backoff:
                # Return empty result sentence to indicate decoding was attempted
                logging.error('Decoding of sentence %s (%s) was terminated due to timeout and no backup possible. Sentence will not be decoded.' % (
                    sentence.id,
                    sentence.orig_id
                ))
                return Sentence(sentence.id)
            else:
                # Relaunch decoding using Hiero-only mode
                logging.info('Starting decoding of sentence %s in hiero only mode.' % (sentence.id,))
                sentence_lattice = self.hiero_only_mode(sentence, hiero_dir, openfst)
        except MemoryError:
            # Same fallback policy as on timeout.
            logging.error(
                'Decoding of sentence %s (%s) was terminated due to exceeding allowed memory after %d seconds' % (
                    sentence.id, sentence.orig_id, time.time() - start_time))
            if hiero_dir is None or not self.hiero_backoff:
                logging.error('Decoding of sentence %s (%s) was terminated due to exceeding maximum memory and no backup possible. Sentence will not be decoded.' % (
                    sentence.id,
                    sentence.orig_id
                ))
                return Sentence(sentence.id)
            else:
                # Relaunch decoding using Hiero-only mode
                logging.info('Starting decoding of sentence %s in hiero only mode.'
                             % (sentence.id,))
                sentence_lattice = self.hiero_only_mode(sentence, hiero_dir, openfst)

    # If decoding failed, skip storing lattice to the disk
    if sentence_lattice is None:
        logging.info('Decoding of sentence with index %d and id %s failed in %.3f seconds' % (
            index, sentence.id, time.time() - start_time))
        # Return empty result sentence to indicate decoding was attempted
        return Sentence(sentence.id)

    logging.info('Decoding of sentence with index %d and id %s finished in %.3f seconds' % (
        index, sentence.id, time.time() - start_time))

    # Store sentence lattice to disk with a mapped name to support applylm tool.
    helpers.store_sentence_lattice_to_disk(sentence_lattice, sentence, sentence_id_map, lats_dir, openfst)

    # Return sentence marking it as successfully decoded
    return Sentence(sentence.id, result=Result())