Example #1
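All snippets below assume the same set of names: `os`, `pytest`, the `prepoperator` and `tu` modules, and `TranslationUnit`. A plausible common import block follows; the `nmtwizard.preprocess` module path matches SYSTRAN's nmt-wizard-docker layout and is an assumption here, so adjust it if these examples come from a different codebase.

# Assumed imports (module paths are an assumption, see the note above).
import os

import pytest

from nmtwizard.preprocess import prepoperator
from nmtwizard.preprocess import tu
from nmtwizard.preprocess.tu import TranslationUnit
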
def test_tokenization_with_vocabulary_restriction(tmpdir):
    sp_model_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "corpus",
        "resources",
        "subword",
        "en_de.sp",
    )
    config = {
        "source":
        "en",
        "target":
        "de",
        "preprocess": [
            {
                "op": "tokenization",
                "source": {
                    "mode": "none",
                    "sp_model_path": sp_model_path,
                    "restrict_subword_vocabulary": True,
                },
                "target": {
                    "mode": "none",
                    "sp_model_path": sp_model_path,
                },
            },
        ],
    }

    process_type = prepoperator.ProcessType.INFERENCE
    example = tu.TranslationUnit("World", "World")

    with pytest.raises(ValueError, match="restrict_subword_vocabulary"):
        prepoperator.Pipeline(config, process_type)

    vocab_path = str(tmpdir.join("vocab.txt"))
    with open(vocab_path, "w") as vocab_file:
        vocab_file.write("# Comment\n")
        vocab_file.write("▁Wor 0.0224656\n")
    config.update({
        "vocabulary": {
            "source": {
                "path": vocab_path,
            },
            "target": {
                "path": vocab_path,
            },
        },
    })

    pipeline = prepoperator.Pipeline(config, process_type)
    tu_list, _ = pipeline(([example], {}))

    assert tu_list[0].src_tok.tokens[0] == ["▁Wor", "l", "d"]
    assert tu_list[0].tgt_tok.tokens[0] == ["▁World"]
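Note on this test: building the pipeline with `restrict_subword_vocabulary: True` but no `vocabulary` section must fail with a ValueError naming the offending option. Once a vocabulary file is supplied, the source-side SentencePiece segmentation is constrained to the listed pieces, so "World" is forced into ["▁Wor", "l", "d"], while the unrestricted target side keeps the single piece ["▁World"].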
Example #2
def _process_batch(
        pipeline,
        tu_batch,
        options=None,
        # Arguments below are used to rebuild the pipeline, if required.
        config=None,
        process_type=None,
        exit_step=None,
        override_label=None,
        shared_state=None,
):
    """Rebuilds the pipeline if required and processes a batch of TUs."""
    if pipeline is None or override_label != pipeline.override_label:
        if pipeline is None:
            logger.info('Building processing pipeline')
        else:
            logger.info('Rebuilding processing pipeline for label %s', override_label)
        pipeline = prepoperator.Pipeline(
            config,
            process_type,
            preprocess_exit_step=exit_step,
            override_label=override_label,
            shared_state=shared_state)

    tu_list, batch_meta = tu_batch
    base_name = batch_meta.get('base_name')
    logger.info(
        'Processing %d samples%s',
        len(tu_list),
        ' from %s' % base_name if base_name is not None else '',
    )

    tu_list, batch_meta = pipeline(tu_batch, options=options)
    outputs = [tu.export(pipeline.process_type) for tu in tu_list]
    return (outputs, batch_meta), pipeline
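The helper returns the (possibly rebuilt) pipeline together with the batch outputs so the caller can cache it across batches: the pipeline is reconstructed only when none exists yet or when the batch's `override_label` differs from the cached pipeline's.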
Example #3
def test_tokenization_with_lang():
    tokenization_config = {
        "mode": "aggressive",
        "case_markup": True,
        "soft_case_regions": True,
    }
    config = {
        "source":
        "el",
        "target":
        "en",
        "preprocess": [{
            "op": "tokenization",
            "source": tokenization_config,
            "target": tokenization_config,
        }],
    }

    example = tu.TranslationUnit("ΣΙΓΜΑ ΤΕΛΙΚΟΣ")
    pipeline = prepoperator.Pipeline(config,
                                     prepoperator.ProcessType.INFERENCE)
    tu_list, _ = pipeline(([example], {}))

    assert tu_list[0].src_tok.tokens[0] == [
        "⦅mrk_begin_case_region_U⦆",
        "σιγμα",
        "τελικος",
        "⦅mrk_end_case_region_U⦆",
    ]
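With `case_markup` and `soft_case_regions` enabled, the all-uppercase Greek input is lowercased and the uppercase span is delimited by the `⦅mrk_begin_case_region_U⦆`/`⦅mrk_end_case_region_U⦆` placeholder tokens. The test name suggests the point is language-aware casing: the `el` source code from the config is presumably forwarded to the tokenizer so Greek-specific rules (such as final-sigma handling) apply.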
Example #4
def test_tokenization_with_inference_config(tmpdir):
    config = {
        "source":
        "en",
        "target":
        "de",
        "preprocess": [
            {
                "op": "tokenization",
                "source": {
                    "mode": "aggressive",
                },
                "target": {
                    "mode": "aggressive",
                },
            },
        ],
    }

    process_type = prepoperator.ProcessType.INFERENCE
    example = tu.TranslationUnit("2,000", "2,000")

    pipeline = prepoperator.Pipeline(config, process_type)

    tu_list, _ = pipeline(([example], {}))

    assert tu_list[0].src_tok.tokens[0] == ["2", ",", "000"]
    assert tu_list[0].tgt_tok.tokens[0] == ["2", ",", "000"]

    config["inference"] = {
        "overrides": {
            "tokenization_1": {
                "source": {
                    "mode": "none"
                }
            }
        }
    }
    pipeline = prepoperator.Pipeline(config, process_type)

    example = tu.TranslationUnit("2,000", "2,000")
    tu_list, _ = pipeline(([example], {}))

    assert tu_list[0].src_tok.tokens[0] == ["2,000"]
    assert tu_list[0].tgt_tok.tokens[0] == ["2", ",", "000"]
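The `inference.overrides` section patches operator configurations at inference time. The key `tokenization_1` appears to address the operator by name and 1-based position in the `preprocess` list; since the override only replaces the source-side mode with "none", the source string passes through untokenized while the target side still splits "2,000" aggressively.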
Example #5
    def process_input(self,
                      source,
                      target=None,
                      target_name=None,
                      metadata=None,
                      config=None,
                      options=None):
        """Processes one translation example at inference.

        Args:
          source: In preprocess, a string. In postprocess, a (possibly multipart)
            list of tokens.
          target: In preprocess, a string. In postprocess, a (possibly multipart)
            list of tokens.
          target_name: The name of the target that is passed during inference.
          metadata: Additional metadata of the input.
          config: The configuration for this example.
          options: A dictionary with operators options.

        Returns:
          - In preprocess, a tuple (source_tokens, target_tokens, metadata).
          - In postprocess, a string (the postprocessed target)
        """
        # This method should be thread-safe as the inference server is starting a new
        # thread for each request.

        # Rebuild pipeline if the example has its own configuration.
        if config is not None:
            pipeline = prepoperator.Pipeline(
                config,
                self._pipeline_type,
                shared_state=self._global_shared_state.get(),
            )
        else:
            pipeline = self._pipeline

        tu = TranslationUnit(
            source=source,
            metadata=metadata,
            source_tokenizer=pipeline.start_state.get('src_tokenizer'),
        )

        if target is not None:
            tu.add_target(
                target,
                name=target_name,
                tokenizer=pipeline.start_state.get('tgt_tokenizer'))

        tu_batch = ([tu], {})
        tu_batch = pipeline(tu_batch, options=options)
        tu = tu_batch[0][0]

        if self._postprocess:
            return tu.tgt_detok
        src_tokens = tu.src_tok.tokens
        tgt_tokens = tu.tgt_tok.tokens if tu.tgt_tok is not None else [None for _ in src_tokens]
        return src_tokens, tgt_tokens, tu.metadata
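A minimal usage sketch for the preprocessing direction, assuming `processor` is an instance of the class this method belongs to and was built with `postprocess=False`; the input string is illustrative:

# Hypothetical call: preprocess a single source string with no target.
src_tokens, tgt_tokens, metadata = processor.process_input("Hello world.")
# src_tokens is a (possibly multipart) list of token lists; tgt_tokens
# mirrors its structure, holding None entries when no target was given.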
Example #6
    def __init__(self, config, postprocess=False):
        pipeline_type = (prepoperator.ProcessType.POSTPROCESS
                         if postprocess
                         else prepoperator.ProcessType.INFERENCE)
        super().__init__(config, pipeline_type)
        self._postprocess = postprocess
        # Build a generic pipeline that will be used in process_input.
        self._pipeline = prepoperator.Pipeline(
            self._config,
            self._pipeline_type,
            shared_state=self._global_shared_state.get(),
        )
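This constructor builds one generic pipeline up front; `process_input` (Example #5) reuses it for every request that does not carry its own `config`, so only per-request configurations trigger a pipeline rebuild.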
Example #7
def _run_pipeline(config, process_type, tu_list):
    if isinstance(tu_list, str):
        tu_list = tu.TranslationUnit(tu_list)
    if not isinstance(tu_list, list):
        tu_list = [tu_list]
    if isinstance(config, list):
        config = {
            "source": "xx",
            "target": "yy",
            "preprocess": config,
        }
    pipeline = prepoperator.Pipeline(config, process_type)
    tu_list, _ = pipeline((tu_list, {}))
    return tu_list
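A sketch of how this test helper might be invoked; the operator list and input string are illustrative and mirror the tokenization examples above:

# Hypothetical invocation: the bare operator list is wrapped in a full config
# with placeholder "xx"/"yy" language codes, and the raw string becomes a
# single TranslationUnit.
tu_list = _run_pipeline(
    [{
        "op": "tokenization",
        "source": {"mode": "aggressive"},
        "target": {"mode": "aggressive"},
    }],
    prepoperator.ProcessType.INFERENCE,
    "2,000",
)
assert tu_list[0].src_tok.tokens[0] == ["2", ",", "000"]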
Example #8
def test_align_perplexity_invalid_config(mode, lower, upper):
    config = {
        "source":
        "en",
        "target":
        "de",
        "preprocess": [{
            "op": "align_perplexity_filter",
            mode: {
                "lower": lower,
                "upper": upper,
            }
        }]
    }
    with pytest.raises(ValueError, match="align_perplexity_filter"):
        prepoperator.Pipeline(config, prepoperator.ProcessType.TRAINING)
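The `mode`, `lower` and `upper` arguments are test parameters (presumably supplied via `pytest.mark.parametrize`) covering bound combinations the operator must reject. The ValueError is expected when the pipeline is built, i.e., the configuration is validated before any data is processed.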
Example #9
def test_tokenization_with_non_iso_639_lang():
    config = {
        "source":
        "en-GB",
        "target":
        "en-US",
        "preprocess": [{
            "op": "tokenization",
            "source": {
                "mode": "none"
            },
            "target": {
                "mode": "none"
            },
        }],
    }

    # Should not throw an exception.
    prepoperator.Pipeline(config, prepoperator.ProcessType.INFERENCE)
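Region-qualified codes such as "en-GB"/"en-US" are not plain two-letter ISO 639-1 identifiers; the test asserts that pipeline construction tolerates them instead of failing on language lookup.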
Example #10
def _process_batch(
    pipeline,
    tu_batch,
    options=None,
    # Arguments below are used to rebuild the pipeline, if required.
    config=None,
    process_type=None,
    exit_step=None,
    shared_state=None,
):
    """Rebuilds the pipeline if required and processes a batch of TUs."""
    override_label = _get_corpus_label(tu_batch)
    if pipeline is None or override_label != pipeline.override_label:
        if override_label is None:
            logger.info("Building default processing pipeline")
        else:
            logger.info("Building processing pipeline for label %s",
                        override_label)
        pipeline = prepoperator.Pipeline(
            config,
            process_type,
            preprocess_exit_step=exit_step,
            override_label=override_label,
            shared_state=shared_state,
        )

    tu_list, batch_meta = tu_batch
    base_name = _get_corpus_name(tu_batch)
    logger.info(
        "Processing %d samples%s",
        len(tu_list),
        " from %s" % base_name if base_name is not None else "",
    )

    tu_list, batch_meta = pipeline(tu_batch, options=options)
    outputs = [tu.export(pipeline.process_type) for tu in tu_list]
    return (outputs, batch_meta), pipeline
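This variant of `_process_batch` (compare Example #2) no longer receives `override_label` as an argument: it derives the label and corpus name from the batch metadata via `_get_corpus_label` and `_get_corpus_name`. The caching contract is the same, with the pipeline returned to the caller and rebuilt only when the label changes.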
Example #11
    def build_pipeline(self, config):
        return prepoperator.Pipeline(
            config,
            self._pipeline_type,
            shared_state=self._global_shared_state.get(),
        )
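A thin factory that binds the processor's pipeline type and global shared state; presumably the hook used wherever a pipeline must be (re)built with the processor's state, as in the per-request rebuild of Example #5.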