Example #1
0
def sentence_ruleapplication(sentence, filtering=None, timeout=None, **kwargs):
    """
    Apply translation rules to a single sentence's source graph.
    :param sentence: Sentence object exposing .id, .orig_id and .source.graph
    :param filtering: Optional filter object; sentences rejected by filtering.filter() are skipped
    :param timeout: Optional per-sentence timeout in seconds for rule application
    :param kwargs: Extra keyword arguments forwarded to ruleapplication()
    :return: Tuple (sentence id, dict of applied rules); the dict is empty when
             the sentence was filtered out, has no graph, or timed out
    """
    applied_rules = dict()

    if filtering is not None and not filtering.filter(sentence):
        logging.info('Skipping sentence with id %s (%s) due to filtering (graph size %d)' % (
            sentence.id, sentence.orig_id, len(sentence.source.graph)))
        return sentence.id, applied_rules

    if sentence.source.graph is not None:
        logging.debug('Starting rule application for sentence %s' % sentence.id)

        try:
            if timeout:
                with to.timeout(seconds=timeout):
                    applied_rules = ruleapplication(sentence.source.graph, **kwargs)
            else:
                applied_rules = ruleapplication(sentence.source.graph, **kwargs)

            logging.info('Extracted %d possible source sides for sentence with id %s (%s)' % (
                len(applied_rules), sentence.id, sentence.orig_id))

        except to.TimeoutError:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning('Rule application for sentence with id %s (%s) failed due to timeout after %d seconds' % (
                sentence.id, sentence.orig_id, timeout))

    return sentence.id, applied_rules
Example #2
0
def top_level_pruning(input_dir,
                      output_dir,
                      prune_threshold,
                      n_best,
                      openfst,
                      max_duration=600):
    """
    Read lattices from disk and prune + union them with n shortest paths.
    :param input_dir: Input lattice directory path
    :param output_dir: Output lattice directory path
    :param prune_threshold: Integer indicating how much to prune
    :param n_best: Integer indicating how many best outputs to store
    :param openfst: OpenFST utility functions object
    :param max_duration: Timeout in seconds for pruning a single lattice
    :return:
    """

    # Iterate over files in input directory (sorted for deterministic order)
    for name in sorted(os.listdir(input_dir)):
        if not name.endswith(".fst"):
            continue

        filename_in = os.path.join(input_dir, name)
        filename_out = os.path.join(output_dir, name)

        # Read a sentence lattice back into memory
        logging.info('Reading lattice stored at %s' % (filename_in, ))
        fsa = openfst.read_fst(filename_in)

        start_time = time.time()

        logging.info(
            'FSA has %d states and %d arcs before top level pruning.' %
            (openfst.num_states(fsa), openfst.num_arcs(fsa)))

        try:
            with to.timeout(seconds=max_duration):
                fsa = fst_pruning(fsa, prune_threshold, n_best, openfst)

        except to.TimeoutError:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning(
                'Top-level pruning of FSA %s was terminated due to exceeding the timeout limit of %d seconds'
                % (filename_in, max_duration))

            # Retry with a halved n-best size. NOTE(review): the retry itself
            # is not bounded by a timeout, so it may still run long.
            reduced_n_best = n_best // 2
            logging.info('Performing TLP with reduced n_best size of %d.' %
                         (reduced_n_best, ))
            fsa = fst_pruning(fsa, prune_threshold, reduced_n_best, openfst)

        logging.info(
            'FSA has %d states and %d arcs after top level pruning, performed in %.3f seconds.'
            % (openfst.num_states(fsa), openfst.num_arcs(fsa),
               time.time() - start_time))

        logging.info('Writing lattice to disk at %s' % (filename_out, ))
        openfst.write_fst(fsa, filename_out)
Example #3
0
def dataset_translation_rulextraction(sentence_collection,
                                      idx=True,
                                      filtering=None,
                                      timeout=None,
                                      **kwargs):
    """
    Extract translation rules from every sentence in a collection.
    :param sentence_collection: Iterable of Sentence objects; their graphs are
                                loaded in place via read_graphs()
    :param idx: If True, use the idx-tokenized target side, otherwise the
                plain tokenized text
    :param filtering: Optional filter object; rejected sentences are skipped
    :param timeout: Optional per-sentence timeout in seconds for rule extraction
    :param kwargs: Extra keyword arguments forwarded to rulextraction()
    :return: List of all rules extracted across the collection
    """
    read_graphs(sentence_collection, idx=idx)

    rules = list()

    for sentence in sentence_collection:
        # Pick the target-side tokenization requested by the caller
        if idx:
            tok = sentence.target.tokenized_idx
        else:
            tok = sentence.target.tokenized_text

        if filtering is not None and not filtering.filter(sentence):
            logging.info(
                'Skipping sentence with id %s (%s) due to filtering (source token number %d, graph size %d)'
                % (sentence.id, sentence.orig_id,
                   len(sentence.source.tokenized_text),
                   len(sentence.source.graph)))
            continue

        # Rule extraction needs a source graph, target tokens and an alignment
        if sentence.source.graph is not None and tok is not None and sentence.alignment is not None and sentence.alignment.sgtt is not None:
            logging.debug('Starting rule extraction for sentence %s' %
                          sentence.id)

            # NOTE(review): each sgtt entry appears to be (plain, index) where
            # index itself is a key/value pair consumed by dict() — confirm
            # against the alignment producer.
            alignments = dict(index
                              for plain, index in sentence.alignment.sgtt)
            alignment_dict = utility.create_alignment_dict(
                alignments, sentence.source.graph)

            # Python 2 unicode(): normalise all tokens to unicode strings
            tok = [unicode(x) for x in tok]

            try:
                if timeout:
                    with to.timeout(seconds=timeout):
                        sentence_rules = rulextraction(sentence.source.graph,
                                                       tok, alignment_dict,
                                                       **kwargs)
                else:
                    sentence_rules = rulextraction(sentence.source.graph, tok,
                                                   alignment_dict, **kwargs)

                rules.extend(sentence_rules)
                logging.info(
                    'Extracted %d rules from sentence with id %s (%s)' %
                    (len(sentence_rules), sentence.id, sentence.orig_id))

            except to.TimeoutError:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning(
                    'Rule extraction for sentence with id %s (%s) failed due to timeout after %d seconds'
                    % (sentence.id, sentence.orig_id, timeout))
                continue

    return rules
Example #4
0
    def decode_sentence(self, index, sentence, grammar_path, coverage_path,
                        feature_weights_dict, local_pruning_lm, openfst,
                        translation_lats, alilats_dir, hiero_dir, hiero_lats,
                        prune_reference_shortest_path):
        """
        Decode a single sentence in alignment mode and save its lattice to disk.
        :param index: Sentence index in the dataset
        :param sentence: Sentence object
        :param grammar_path: Path to the sentence specific grammars
        :param coverage_path: Path to the sentence specific coverages
        :param feature_weights_dict: Dictionary of feature names and their weights
        :param local_pruning_lm: Language model FSA for local pruning
        :param openfst: OpenFST utility functions object
        :param translation_lats: Directory path storing translation lattices
        :param alilats_dir: Directory path for storing alignment lattices
        :param hiero_dir: Hiero RTN directory path
        :param hiero_lats: Directory path storing Hiero translation lattices - used for hiero only mode
        :param prune_reference_shortest_path: Number of best reference hypotheses extracted from translation FSA
        :return:
        """

        logging.info('Starting decoding of sentence with index %d and id %s' %
                     (index, sentence.id))
        start_time = time.time()

        def _hiero_backoff():
            # Shared failure handler: fall back to hiero-only decoding when a
            # hiero directory is available and backoff is enabled, otherwise
            # return an empty FST.
            if hiero_dir is None or not self.hiero_backoff:
                return openfst.create_empty_fst()

            # Relaunch decoding using Hiero-only mode
            logging.info(
                'Starting decoding of sentence %s in hiero only mode.'
                % (sentence.id, ))
            return self.hiero_only_mode(sentence, hiero_dir, hiero_lats,
                                        prune_reference_shortest_path,
                                        openfst)

        if self.hiero_only:
            logging.info(
                'Starting decoding of sentence %s in hiero only mode.' %
                (sentence.id, ))
            sentence_lattice = self.hiero_only_mode(
                sentence, hiero_dir, hiero_lats, prune_reference_shortest_path,
                openfst)

        elif self.filtering is not None and not self.filtering.filter(
                sentence):
            # If sentence is filtered out for HSST decoding, try to use hiero
            logging.info('Sentence %s filtered out of HSST decoding.' %
                         (sentence.id, ))

            if hiero_dir is None:
                # NOTE(review): unlike the exception handlers below, this path
                # does not consult self.hiero_backoff and returns None (no
                # lattice is written) — confirm this asymmetry is intended.
                logging.info('Returning empty result for sentence %s.' %
                             (sentence.id, ))
                return

            else:
                logging.info(
                    'Starting decoding of sentence %s in hiero only mode.' %
                    (sentence.id, ))
                sentence_lattice = self.hiero_only_mode(
                    sentence, hiero_dir, hiero_lats,
                    prune_reference_shortest_path, openfst)

        else:

            hiero_subdir = None
            if hiero_dir is not None:
                hiero_subdir = os.path.join(hiero_dir, sentence.id)

            try:
                # Load sentence specific grammar
                logging.info('Loading grammar for sentence %s from %s.' %
                             (sentence.id, grammar_path))
                sentence_grammar = utility.load_sentence_specific_grammar(
                    sentence.id, grammar_path)
                logging.info('Loading coverage for sentence %s from %s.' %
                             (sentence.id, coverage_path))
                sentence_coverage = utility.load_sentence_specific_coverage(
                    sentence.id, coverage_path)

                with to.timeout(seconds=self.timeout):
                    # Create reference and reference substring FSAs
                    reference_fsa, reference_subs_fsa = self.create_reference_acceptors(
                        sentence.id, translation_lats, openfst,
                        prune_reference_shortest_path)

                    if hiero_subdir is None:
                        logging.info(
                            'Starting decoding of sentence %s in HSST only mode.'
                            % (sentence.id, ))
                        sentence_lattice = self.hsst_only_mode(
                            sentence, sentence_grammar, sentence_coverage,
                            feature_weights_dict, local_pruning_lm,
                            reference_fsa, reference_subs_fsa, openfst)

                    else:
                        logging.info(
                            'Starting decoding of sentence %s in hiero+HSST mode.'
                            % (sentence.id, ))
                        sentence_lattice = self.hsst_hiero_mode(
                            sentence, sentence_grammar, sentence_coverage,
                            feature_weights_dict, local_pruning_lm,
                            reference_fsa, reference_subs_fsa, openfst,
                            hiero_subdir)

            except to.TimeoutError:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning(
                    'Decoding of sentence %s (%s) was terminated due to exceeding the timeout limit of %d seconds'
                    % (sentence.id, sentence.orig_id, self.timeout))
                sentence_lattice = _hiero_backoff()

            except MemoryError:
                logging.error(
                    'Decoding of sentence %s (%s) was terminated due to exceeding allowed memory after %d seconds'
                    %
                    (sentence.id, sentence.orig_id, time.time() - start_time))
                sentence_lattice = _hiero_backoff()

            except MissingFileException:
                logging.exception('Missing file')
                sentence_lattice = _hiero_backoff()

        # If decoding failed, log the error and store an empty lattice instead
        if sentence_lattice is None:
            logging.error(
                'Decoding of sentence with index %d and id %s failed in %.3f seconds'
                % (index, sentence.id, time.time() - start_time))
            sentence_lattice = openfst.create_empty_fst()

        logging.info(
            'Decoding of sentence with index %d and id %s finished in %.3f seconds'
            % (index, sentence.id, time.time() - start_time))

        # Store lattice to disk
        current_time = time.time()
        lattice_path = os.path.join(alilats_dir, '%s.fst' % sentence.id)
        openfst.store_lattice(sentence_lattice, lattice_path)

        logging.info('Lattice stored to disk as %s in %.3f seconds' %
                     (lattice_path, time.time() - current_time))
Example #5
0
    def decode_sentence(self, index, sentence, grammar_path, coverage_path, feature_weights_dict, local_pruning_lm,
                        openfst, sentence_id_map, lats_dir, hiero_dir):
        """
        Decode a single sentence and save its lattice to disk (without LM applied).
        :param index: Sentence index in the dataset
        :param sentence: Sentence object
        :param grammar_path: Path to the sentence specific grammars
        :param coverage_path: Path to the sentence specific coverages
        :param feature_weights_dict: Dictionary of feature names and their weights
        :param local_pruning_lm: Language model FSA for local pruning
        :param openfst: OpenFST utility functions object
        :param sentence_id_map: Dictionary for tracking decoded sentence ids for applying LM
        :param lats_dir: Directory path for storing lattices
        :param hiero_dir: Hiero RTN directory path
        :return: Sentence object; it carries a Result only when decoding succeeded
        """

        logging.info('Starting decoding of sentence with index %d and id %s' % (index, sentence.id))
        start_time = time.time()

        def _hiero_only_lattice():
            # Shared entry point for hiero-only decoding (used as the primary
            # mode and as the backoff after filtering/timeout/memory failures)
            logging.info('Starting decoding of sentence %s in hiero only mode.' % (sentence.id,))
            return self.hiero_only_mode(sentence, hiero_dir, openfst)

        if self.hiero_only:
            sentence_lattice = _hiero_only_lattice()

        elif self.filtering is not None and not self.filtering.filter(sentence):
            # If sentence is filtered out for HSST decoding, try to use hiero
            logging.info('Sentence %s filtered out of HSST decoding.' % (sentence.id,))

            if hiero_dir is None or not self.hiero_backoff:
                logging.info('Returning empty result for sentence %s.' % (sentence.id,))
                return Sentence(sentence.id)

            else:
                sentence_lattice = _hiero_only_lattice()

        else:

            try:
                # Load sentence specific grammar
                logging.info('Loading grammar for sentence %s from %s.' % (sentence.id, grammar_path))
                sentence_grammar = utility.load_sentence_specific_grammar(sentence.id, grammar_path)
                logging.info('Loading coverage for sentence %s from %s.' % (sentence.id, coverage_path))
                sentence_coverage = utility.load_sentence_specific_coverage(sentence.id, coverage_path)

                with to.timeout(seconds=self.timeout):
                    if hiero_dir is None:
                        logging.info('Starting decoding of sentence %s in HSST only mode.' % (sentence.id,))
                        sentence_lattice = self.hsst_only_mode(
                            sentence,
                            sentence_grammar,
                            sentence_coverage,
                            feature_weights_dict,
                            local_pruning_lm,
                            openfst
                        )

                    else:
                        logging.info('Starting decoding of sentence %s in hiero+HSST mode.' % (sentence.id,))
                        sentence_lattice = self.hsst_hiero_mode(
                            sentence,
                            sentence_grammar,
                            sentence_coverage,
                            feature_weights_dict,
                            local_pruning_lm,
                            openfst,
                            hiero_dir
                        )

            except to.TimeoutError:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning('Decoding of sentence %s (%s) was terminated due to exceeding the timeout limit of %d seconds' % (
                    sentence.id,
                    sentence.orig_id,
                    self.timeout
                ))

                if hiero_dir is None or not self.hiero_backoff:
                    # Return empty result sentence to indicate decoding was attempted
                    logging.error('Decoding of sentence %s (%s) was terminated due to timeout and no backup possible. Sentence will not be decoded.' % (
                        sentence.id,
                        sentence.orig_id
                    ))
                    return Sentence(sentence.id)

                else:
                    # Relaunch decoding using Hiero-only mode
                    sentence_lattice = _hiero_only_lattice()

            except MemoryError:
                logging.error(
                    'Decoding of sentence %s (%s) was terminated due to exceeding allowed memory after %d seconds' % (
                        sentence.id, sentence.orig_id, time.time() - start_time))

                if hiero_dir is None or not self.hiero_backoff:
                    logging.error('Decoding of sentence %s (%s) was terminated due to exceeding maximum memory and no backup possible. Sentence will not be decoded.' % (
                        sentence.id,
                        sentence.orig_id
                    ))
                    return Sentence(sentence.id)

                else:
                    # Relaunch decoding using Hiero-only mode
                    sentence_lattice = _hiero_only_lattice()

        # If decoding failed, skip storing lattice to the disk
        if sentence_lattice is None:
            # Failure deserves an error-level record, consistent with the
            # alignment-mode decoder
            logging.error('Decoding of sentence with index %d and id %s failed in %.3f seconds' % (
                index, sentence.id, time.time() - start_time))

            # Return empty result sentence to indicate decoding was attempted
            return Sentence(sentence.id)

        logging.info('Decoding of sentence with index %d and id %s finished in %.3f seconds' % (
            index, sentence.id, time.time() - start_time))

        # Store sentence lattice to disk with a mapped name to support applylm tool.
        helpers.store_sentence_lattice_to_disk(sentence_lattice, sentence, sentence_id_map, lats_dir, openfst)

        # Return sentence marking it as successfully decoded
        return Sentence(sentence.id, result=Result())