Example #1
    def __init__(self,
                 context: mx.context.Context,
                 inputs: str,
                 references: str,
                 model: str,
                 max_input_len: int,
                 beam_size: int = C.DEFAULT_BEAM_SIZE,
                 limit: int = -1) -> None:
        self.context = context
        self.max_input_len = max_input_len
        self.beam_size = beam_size
        self.model = model
        with smart_open(inputs) as inputs_fin, smart_open(references) as references_fin:
            input_sentences = inputs_fin.readlines()
            target_sentences = references_fin.readlines()
            assert len(input_sentences) == len(target_sentences), "Number of input sentences does not match number of reference sentences"
            if limit <= 0:
                limit = len(input_sentences)
            if limit < len(input_sentences):
                self.input_sentences, self.target_sentences = zip(
                    *random.sample(list(zip(input_sentences, target_sentences)),
                                   limit))
            else:
                self.input_sentences, self.target_sentences = input_sentences, target_sentences

        logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d)",
                    max_input_len, beam_size, model, len(self.input_sentences))

        with smart_open(os.path.join(self.model, C.DECODE_REF_NAME), 'w') as trg_out, \
                smart_open(os.path.join(self.model, C.DECODE_IN_NAME), 'w') as src_out:
            trg_out.writelines(self.target_sentences)
            src_out.writelines(self.input_sentences)
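
This version samples with Python's module-level RNG, so the decoded subset changes between runs; the later revisions (Examples #2 and #6) switch to a seeded generator so that metrics stay comparable across independent runs. A standalone sketch of that pattern (toy data, purely illustrative):

import random

pairs = list(zip(['a', 'b', 'c', 'd'], ['A', 'B', 'C', 'D']))
# random.sample(pairs, 2) would draw a different subset on every run;
# a dedicated seeded generator returns the same subset every time.
srcs, refs = zip(*random.Random(42).sample(pairs, 2))
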
Example #2
    def __init__(self,
                 context: mx.context.Context,
                 inputs: str,
                 references: str,
                 model: str,
                 max_input_len: int,
                 beam_size: int = C.DEFAULT_BEAM_SIZE,
                 bucket_width_source: int = 10,
                 bucket_width_target: int = 10,
                 length_penalty_alpha: float = 1.0,
                 length_penalty_beta: float = 0.0,
                 softmax_temperature: Optional[float] = None,
                 max_output_length_num_stds: int = C.DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH,
                 ensemble_mode: str = 'linear',
                 sample_size: int = -1,
                 random_seed: int = 42) -> None:
        self.context = context
        self.max_input_len = max_input_len
        self.max_output_length_num_stds = max_output_length_num_stds
        self.ensemble_mode = ensemble_mode
        self.beam_size = beam_size
        self.bucket_width_source = bucket_width_source
        self.bucket_width_target = bucket_width_target
        self.length_penalty_alpha = length_penalty_alpha
        self.length_penalty_beta = length_penalty_beta
        self.softmax_temperature = softmax_temperature
        self.model = model
        with smart_open(inputs) as inputs_fin, smart_open(references) as references_fin:
            input_sentences = inputs_fin.readlines()
            target_sentences = references_fin.readlines()
            check_condition(len(input_sentences) == len(target_sentences), "Number of input sentences does not match number of reference sentences")
            if sample_size <= 0:
                sample_size = len(input_sentences)
            if sample_size < len(input_sentences):
                # custom random number generator to guarantee the same samples across runs in order to be able to
                # compare metrics across independent runs
                random_gen = random.Random(random_seed)
                self.input_sentences, self.target_sentences = zip(
                    *random_gen.sample(list(zip(input_sentences, target_sentences)),
                                       sample_size))
            else:
                self.input_sentences, self.target_sentences = input_sentences, target_sentences

        logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d)",
                    max_input_len, beam_size, model, len(self.input_sentences))

        with smart_open(os.path.join(self.model, C.DECODE_REF_NAME), 'w') as trg_out, \
                smart_open(os.path.join(self.model, C.DECODE_IN_NAME), 'w') as src_out:
            trg_out.writelines(self.target_sentences)
            src_out.writelines(self.input_sentences)
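
A minimal construction sketch for this revision (the paths and GPU context are hypothetical); with a fixed random_seed, two independent training runs are evaluated on the identical 500-pair subset:

import mxnet as mx

decoder = CheckpointDecoder(context=mx.gpu(0),
                            inputs='dev.src.gz',
                            references='dev.trg.gz',
                            model='model_dir',
                            max_input_len=100,
                            sample_size=500,
                            random_seed=13)
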
Example #3
    def decode_and_evaluate(self, checkpoint: int) -> Dict[str, float]:
        """
        Decodes the data set and evaluates the translations for a given checkpoint.

        :param checkpoint: Checkpoint to load parameters from.
        :return: Mapping of metric names to scores.
        """
        translator = sockeye.inference.Translator(self.context, 'linear',
                                                  *sockeye.inference.load_models(self.context,
                                                                                 self.max_input_len,
                                                                                 self.beam_size,
                                                                                 [self.model],
                                                                                 [checkpoint]))

        output_name = os.path.join(self.model, C.DECODE_OUT_NAME % checkpoint)
        with smart_open(output_name, 'w') as output:
            handler = sockeye.output_handler.StringOutputHandler(output)
            translations = []
            for sent_id, input_sentence in enumerate(self.input_sentences):
                trans_input = translator.make_input(sent_id, input_sentence)
                trans_output = translator.translate(trans_input)
                handler.handle(trans_input, trans_output)
                translations.append(trans_output.translation)
        logger.info("Checkpoint [%d] %d translations saved to '%s'", checkpoint, len(translations), output_name)
        # TODO(fhieber): eventually add more metrics (METEOR etc.)
        return {"bleu-val": sockeye.bleu.corpus_bleu(translations, self.target_sentences)}
Example #4
def read_lexicon(path: str, vocab_source: Dict[str, int],
                 vocab_target: Dict[str, int]) -> np.ndarray:
    """
    Loads lexical translation probabilities from a tab-separated translation table of format: src, trg, logprob.
    Source words unknown to vocab_source are discarded.
    Target words unknown to vocab_target contribute to p(unk|source_word).
    See Incorporating Discrete Translation Lexicons into Neural Machine Translation, Section 3.1 & Equation 5
    (https://arxiv.org/pdf/1606.02006.pdf).

    :param path: Path to lexicon file.
    :param vocab_source: Source vocabulary.
    :param vocab_target: Target vocabulary.
    :return: Lexicon array. Shape: (vocab_source_size, vocab_target_size).
    """
    assert C.UNK_SYMBOL in vocab_source
    assert C.UNK_SYMBOL in vocab_target
    src_unk_id = vocab_source[C.UNK_SYMBOL]
    trg_unk_id = vocab_target[C.UNK_SYMBOL]
    lexicon = np.zeros((len(vocab_source), len(vocab_target)))
    n = 0
    with smart_open(path) as fin:
        for line in fin:
            src, trg, logprob = line.rstrip('\n').split("\t")
            prob = np.exp(float(logprob))
            src_id = vocab_source.get(src, src_unk_id)
            trg_id = vocab_target.get(trg, trg_unk_id)
            if src_id == src_unk_id:
                continue
            if trg_id == trg_unk_id:
                lexicon[src_id, trg_unk_id] += prob
            else:
                lexicon[src_id, trg_id] = prob
            n += 1
    logger.info("Loaded lexicon from '%s' with %d entries", path, n)
    return lexicon
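
An illustrative sketch of the expected file contents and the resulting array (toy vocabularies; this assumes C.UNK_SYMBOL is '<unk>'):

# lexicon.tsv (tab-separated):
#   haus<TAB>house<TAB>-0.22
#   haus<TAB>home<TAB>-1.61
#   haus<TAB>xyzzy<TAB>-4.00   <- unknown target word: mass added to p(unk|haus)
vocab_source = {'<unk>': 0, 'haus': 1}
vocab_target = {'<unk>': 0, 'house': 1, 'home': 2}
lex = read_lexicon('lexicon.tsv', vocab_source, vocab_target)
# lex[1, 1] == exp(-0.22), lex[1, 2] == exp(-1.61),
# and lex[1, 0] accumulates exp(-4.00); unknown *source* words are skipped.
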
Example #5
def make_inputs(
    input_file: Optional[str],
    translator: inference.Translator,
    input_is_json: bool,
    input_factors: Optional[List[str]] = None
) -> Generator[inference.TranslatorInput, None, None]:
    """
    Generates TranslatorInput instances from input. If input_file is None, reads from stdin; when the model uses
    more than one factor, factors are expected to be attached to each token, separated by '|'.
    If input_file is given, reads from that file; for a model with num_source_factors > 1,
    num_source_factors - 1 additional factor files must be passed via input_factors.

    :param input_file: The source file (possibly None).
    :param translator: Translator that will translate each line of input.
    :param input_is_json: Whether the input is in json format.
    :param input_factors: Source factor files.
    :return: TranslatorInput objects.
    """
    if input_file is None:
        check_condition(
            input_factors is None,
            "Translating from STDIN, not expecting any factor files.")
        for sentence_id, line in enumerate(sys.stdin, 1):
            if input_is_json:
                yield inference.make_input_from_json_string(
                    sentence_id=sentence_id, json_string=line)
            else:
                # GRN: each input line carries a surface string and a
                # linearized graph, separated by a single tab.
                surface, graph = line.split('\t')
                yield inference.make_input_from_factored_string(
                    sentence_id=sentence_id,
                    factored_string=surface,
                    graph=graph,
                    translator=translator)
    else:
        input_factors = [] if input_factors is None else input_factors
        inputs = [input_file] + input_factors
        check_condition(
            translator.num_source_factors == len(inputs),
            "Model(s) require %d factors, but %d given (through --input and --input-factors)."
            % (translator.num_source_factors, len(inputs)))
        with ExitStack() as exit_stack:
            streams = [
                exit_stack.enter_context(data_io.smart_open(i)) for i in inputs
            ]
            for sentence_id, lines in enumerate(zip(*streams), 1):
                if input_is_json:
                    yield inference.make_input_from_json_string(
                        sentence_id=sentence_id, json_string=lines[0])
                else:
                    yield inference.make_input_from_multiple_strings(
                        sentence_id=sentence_id, strings=list(lines))
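
With the GRN branch above, every stdin line must carry the surface string and a linearized graph separated by a single tab. An illustrative invocation (the translator object and the graph notation are assumptions, not part of this code):

# echo -e 'the cat sat\t(s / sit :ARG0 (c / cat))' | <translate entrypoint>
for trans_input in make_inputs(input_file=None,
                               translator=translator,
                               input_is_json=False):
    ...  # each TranslatorInput pairs the surface tokens with its graph string
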
Example #6
    def __init__(self,
                 context: mx.context.Context,
                 inputs: str,
                 references: str,
                 model: str,
                 max_input_len: int,
                 beam_size: int = C.DEFAULT_BEAM_SIZE,
                 limit: int = -1) -> None:
        self.context = context
        self.max_input_len = max_input_len
        self.beam_size = beam_size
        self.model = model
        with smart_open(inputs) as inputs_fin, smart_open(
                references) as references_fin:
            input_sentences = inputs_fin.readlines()
            target_sentences = references_fin.readlines()
            check_condition(
                len(input_sentences) == len(target_sentences),
                "Number of input sentences does not match number of reference sentences")
            if limit <= 0:
                limit = len(input_sentences)
            if limit < len(input_sentences):
                # custom random number generator to guarantee the same samples across runs in order to be able to
                # compare metrics across independent runs
                random_gen = random.Random(42)
                self.input_sentences, self.target_sentences = zip(
                    *random_gen.sample(
                        list(zip(input_sentences, target_sentences)), limit))
            else:
                self.input_sentences, self.target_sentences = input_sentences, target_sentences

        logger.info(
            "Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d)",
            max_input_len, beam_size, model, len(self.input_sentences))

        with smart_open(os.path.join(self.model, C.DECODE_REF_NAME), 'w') as trg_out, \
                smart_open(os.path.join(self.model, C.DECODE_IN_NAME), 'w') as src_out:
            trg_out.writelines(self.target_sentences)
            src_out.writelines(self.input_sentences)
Example #7
def build_from_paths(paths: List[str], num_words: int = 50000, min_count: int = 1) -> Dict[str, int]:
    """
    Creates a vocabulary from paths to files in sentence-per-line format. A sentence is just a whitespace-delimited
    list of tokens. Note that special symbols like the beginning of sentence (BOS) symbol will be added to the
    vocabulary.

    :param paths: List of paths to files with one sentence per line.
    :param num_words: Maximum number of words in the vocabulary.
    :param min_count: Minimum occurrences of words to be included in the vocabulary.
    :return: Word-to-id mapping.
    """
    with ExitStack() as stack:
        logger.info("Building vocabulary from dataset(s): %s", paths)
        files = (stack.enter_context(smart_open(path)) for path in paths)
        return build_vocab(chain(*files), num_words, min_count)
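
A usage sketch (file names hypothetical): building one shared vocabulary over both sides of the training data, capped at 30k types with a minimum count of 2:

vocab = build_from_paths(['train.src', 'train.trg'], num_words=30000, min_count=2)
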
Example #8
def build_from_path(path: str,
                    num_words: int = 50000,
                    min_count: int = 1) -> Dict[str, int]:
    """
    Creates a vocabulary from a path to a file in sentence-per-line format. A sentence is just a whitespace-delimited
    list of tokens. Note that special symbols like the beginning of sentence (BOS) symbol will be added to the
    vocabulary.
    
    :param path: Path to file with one sentence per line.
    :param num_words: Maximum number of words in the vocabulary.
    :param min_count: Minimum occurrences of words to be included in the vocabulary.
    :return: Word-to-id mapping.
    """
    with smart_open(path) as data:
        logger.info("Building vocabulary from dataset: %s", path)
        return build_vocab(data, num_words, min_count)
Example #9
    def decode_and_evaluate(self,
                            checkpoint: Optional[int] = None,
                            output_name: str = os.devnull,
                            speed_percentile: int = 99) -> Dict[str, float]:
        """
        Decodes the data set and evaluates the translations for a given checkpoint.

        :param checkpoint: Checkpoint to load parameters from.
        :param output_name: Filename to write translations to. Defaults to /dev/null.
        :param speed_percentile: Percentile to compute for sec/sent. Default: p99.
        :return: Mapping of metric names to scores.
        """
        models, vocab_source, vocab_target = load_models(self.context,
                                                         self.max_input_len,
                                                         self.beam_size,
                                                         [self.model],
                                                         [checkpoint],
                                                         softmax_temperature=self.softmax_temperature,
                                                         max_output_length_num_stds=self.max_output_length_num_stds)
        translator = Translator(self.context,
                                self.ensemble_mode,
                                self.bucket_width_source,
                                self.bucket_width_target,
                                LengthPenalty(self.length_penalty_alpha, self.length_penalty_beta),
                                models,
                                vocab_source,
                                vocab_target)
        trans_wall_times = np.zeros((len(self.input_sentences),))
        with smart_open(output_name, 'w') as output:
            handler = sockeye.output_handler.StringOutputHandler(output)
            translations = []
            for i, input_sentence in enumerate(self.input_sentences):
                tic = time.time()
                trans_input = translator.make_input(i, input_sentence)
                trans_output = translator.translate(trans_input)
                handler.handle(trans_input, trans_output)
                trans_wall_time = time.time() - tic
                trans_wall_times[i] = trans_wall_time
                translations.append(trans_output.translation)
        percentile_sec_per_sent = np.percentile(trans_wall_times, speed_percentile)

        # TODO(fhieber): eventually add more metrics (METEOR etc.)
        return {C.BLEU_VAL: sockeye.bleu.corpus_bleu(translations, self.target_sentences),
                C.SPEED_PCT % speed_percentile: percentile_sec_per_sent}
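
A usage sketch of this extended version (the checkpoint number and output path are illustrative; the exact metric keys come from C.BLEU_VAL and C.SPEED_PCT):

metrics = decoder.decode_and_evaluate(checkpoint=7,
                                      output_name='decode.out.00007',
                                      speed_percentile=90)
# -> BLEU on the sampled references plus the p90 per-sentence decode time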