def test_tokenization_with_vocabulary_restriction(tmpdir):
    """Checks subword vocabulary restriction: it must be rejected without a
    vocabulary in the config, and once a vocabulary is provided it must be
    applied on the side that enabled it (source) only."""
    sp_model_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "corpus",
        "resources",
        "subword",
        "en_de.sp",
    )
    config = {
        "source": "en",
        "target": "de",
        "preprocess": [
            {
                "op": "tokenization",
                "source": {
                    "mode": "none",
                    "sp_model_path": sp_model_path,
                    "restrict_subword_vocabulary": True,
                },
                "target": {
                    "mode": "none",
                    "sp_model_path": sp_model_path,
                },
            },
        ],
    }
    process_type = prepoperator.ProcessType.INFERENCE
    example = tu.TranslationUnit("World", "World")

    # Enabling "restrict_subword_vocabulary" without a "vocabulary" section
    # is a configuration error.
    with pytest.raises(ValueError, match="restrict_subword_vocabulary"):
        prepoperator.Pipeline(config, process_type)

    vocab_path = str(tmpdir.join("vocab.txt"))
    # Fix: open with an explicit UTF-8 encoding. The vocabulary entry contains
    # the non-ASCII subword marker "▁", which raises UnicodeEncodeError when
    # the platform default encoding (e.g. cp1252 on Windows) cannot encode it.
    with open(vocab_path, "w", encoding="utf-8") as vocab_file:
        vocab_file.write("# Comment\n")
        vocab_file.write("▁Wor 0.0224656\n")
    config.update(
        {
            "vocabulary": {
                "source": {
                    "path": vocab_path,
                },
                "target": {
                    "path": vocab_path,
                },
            },
        }
    )
    pipeline = prepoperator.Pipeline(config, process_type)
    tu_list, _ = pipeline(([example], {}))
    # Source tokens are restricted to the vocabulary ("▁World" is absent, so
    # the word is split); the target side is not restricted.
    assert tu_list[0].src_tok.tokens[0] == ["▁Wor", "l", "d"]
    assert tu_list[0].tgt_tok.tokens[0] == ["▁World"]
def _process_batch(
    pipeline,
    tu_batch,
    options=None,
    # The arguments below allow reconstructing the pipeline when needed.
    config=None,
    process_type=None,
    exit_step=None,
    override_label=None,
    shared_state=None,
):
    """Processes one batch of TUs, first (re)building the pipeline if the
    override label changed or no pipeline exists yet."""
    needs_rebuild = pipeline is None or pipeline.override_label != override_label
    if needs_rebuild:
        if pipeline is not None:
            logger.info('Rebuilding processing pipeline for label %s', override_label)
        else:
            logger.info('Building processing pipeline')
        pipeline = prepoperator.Pipeline(
            config,
            process_type,
            preprocess_exit_step=exit_step,
            override_label=override_label,
            shared_state=shared_state,
        )
    unit_list, meta = tu_batch
    corpus_name = meta.get('base_name')
    origin = ' from %s' % corpus_name if corpus_name is not None else ''
    logger.info('Processing %d samples%s', len(unit_list), origin)
    unit_list, meta = pipeline(tu_batch, options=options)
    exported = [unit.export(pipeline.process_type) for unit in unit_list]
    return (exported, meta), pipeline
def test_tokenization_with_lang():
    """A fully uppercase Greek input should be lowercased and wrapped in
    uppercase case-region markup tokens."""
    shared_tok_config = {
        "mode": "aggressive",
        "case_markup": True,
        "soft_case_regions": True,
    }
    config = {
        "source": "el",
        "target": "en",
        "preprocess": [
            {
                "op": "tokenization",
                "source": shared_tok_config,
                "target": shared_tok_config,
            }
        ],
    }
    unit = tu.TranslationUnit("ΣΙΓΜΑ ΤΕΛΙΚΟΣ")
    pipeline = prepoperator.Pipeline(config, prepoperator.ProcessType.INFERENCE)
    processed, _ = pipeline(([unit], {}))
    expected = [
        "⦅mrk_begin_case_region_U⦆",
        "σιγμα",
        "τελικος",
        "⦅mrk_end_case_region_U⦆",
    ]
    assert processed[0].src_tok.tokens[0] == expected
def test_tokenization_with_inference_config(tmpdir):
    """Inference-time overrides should replace the source tokenization of the
    targeted operator while leaving the target tokenization untouched."""
    config = {
        "source": "en",
        "target": "de",
        "preprocess": [
            {
                "op": "tokenization",
                "source": {"mode": "aggressive"},
                "target": {"mode": "aggressive"},
            },
        ],
    }
    process_type = prepoperator.ProcessType.INFERENCE

    # Baseline: both sides are aggressively tokenized.
    unit = tu.TranslationUnit("2,000", "2,000")
    pipeline = prepoperator.Pipeline(config, process_type)
    processed, _ = pipeline(([unit], {}))
    assert processed[0].src_tok.tokens[0] == ["2", ",", "000"]
    assert processed[0].tgt_tok.tokens[0] == ["2", ",", "000"]

    # Override the source tokenization of the first tokenization operator.
    config["inference"] = {
        "overrides": {
            "tokenization_1": {"source": {"mode": "none"}},
        },
    }
    pipeline = prepoperator.Pipeline(config, process_type)
    unit = tu.TranslationUnit("2,000", "2,000")
    processed, _ = pipeline(([unit], {}))
    assert processed[0].src_tok.tokens[0] == ["2,000"]
    assert processed[0].tgt_tok.tokens[0] == ["2", ",", "000"]
def process_input(self, source, target=None, target_name=None, metadata=None,
                  config=None, options=None):
    """Runs the pipeline on a single translation example at inference.

    Args:
      source: In preprocess, a string. In postprocess, a (possibly multipart)
        list of tokens.
      target: In preprocess, a string. In postprocess, a (possibly multipart)
        list of tokens.
      target_name: The name of the target that is passed during inference.
      metadata: Additional metadata of the input.
      config: The configuration for this example.
      options: A dictionary with operators options.

    Returns:
      - In preprocess, a tuple (source_tokens, target_tokens, metadata).
      - In postprocess, a string (the postprocessed target).
    """
    # Must stay thread-safe: the inference server spawns one thread per
    # request, so no shared state is mutated here.
    if config is None:
        pipeline = self._pipeline
    else:
        # The example carries its own configuration: build a dedicated pipeline.
        pipeline = prepoperator.Pipeline(
            config,
            self._pipeline_type,
            shared_state=self._global_shared_state.get(),
        )
    unit = TranslationUnit(
        source=source,
        metadata=metadata,
        source_tokenizer=pipeline.start_state.get('src_tokenizer'),
    )
    if target is not None:
        unit.add_target(
            target,
            name=target_name,
            tokenizer=pipeline.start_state.get('tgt_tokenizer'),
        )
    processed_batch = pipeline(([unit], {}), options=options)
    unit = processed_batch[0][0]
    if self._postprocess:
        return unit.tgt_detok
    source_tokens = unit.src_tok.tokens
    if unit.tgt_tok is None:
        target_tokens = [None] * len(source_tokens)
    else:
        target_tokens = unit.tgt_tok.tokens
    return source_tokens, target_tokens, unit.metadata
def __init__(self, config, postprocess=False):
    """Initializes the processor and its generic pipeline.

    Args:
      config: The processing configuration.
      postprocess: If True, run in postprocessing mode instead of inference.
    """
    if postprocess:
        pipeline_type = prepoperator.ProcessType.POSTPROCESS
    else:
        pipeline_type = prepoperator.ProcessType.INFERENCE
    super().__init__(config, pipeline_type)
    self._postprocess = postprocess
    # Generic pipeline reused by process_input for examples that do not
    # provide their own configuration.
    self._pipeline = prepoperator.Pipeline(
        self._config,
        self._pipeline_type,
        shared_state=self._global_shared_state.get(),
    )
def _run_pipeline(config, process_type, tu_list):
    """Builds a pipeline and runs it on the given translation units.

    Accepts flexible inputs for convenience: tu_list may be a raw string or a
    single TU, and config may be a bare list of operator configurations.
    """
    if isinstance(tu_list, str):
        tu_list = tu.TranslationUnit(tu_list)
    if not isinstance(tu_list, list):
        tu_list = [tu_list]
    if isinstance(config, list):
        # Wrap a bare operator list into a minimal full configuration.
        config = {"source": "xx", "target": "yy", "preprocess": config}
    pipeline = prepoperator.Pipeline(config, process_type)
    processed, _ = pipeline((tu_list, {}))
    return processed
def test_align_perplexity_invalid_config(mode, lower, upper):
    """Building a pipeline with an invalid perplexity range must raise."""
    operator_config = {
        "op": "align_perplexity_filter",
        mode: {
            "lower": lower,
            "upper": upper,
        },
    }
    config = {
        "source": "en",
        "target": "de",
        "preprocess": [operator_config],
    }
    with pytest.raises(ValueError, match="align_perplexity_filter"):
        prepoperator.Pipeline(config, prepoperator.ProcessType.TRAINING)
def test_tokenization_with_non_iso_639_lang():
    """Language tags with a region suffix (e.g. en-GB) must be accepted when
    building the pipeline — no exception expected."""
    config = {
        "source": "en-GB",
        "target": "en-US",
        "preprocess": [
            {
                "op": "tokenization",
                "source": {"mode": "none"},
                "target": {"mode": "none"},
            }
        ],
    }
    prepoperator.Pipeline(config, prepoperator.ProcessType.INFERENCE)
def _process_batch(
    pipeline,
    tu_batch,
    options=None,
    # The arguments below allow reconstructing the pipeline when needed.
    config=None,
    process_type=None,
    exit_step=None,
    shared_state=None,
):
    """Processes one batch of TUs, (re)building the pipeline whenever the
    batch's corpus label differs from the current pipeline's label."""
    label = _get_corpus_label(tu_batch)
    needs_rebuild = pipeline is None or pipeline.override_label != label
    if needs_rebuild:
        if label is not None:
            logger.info("Building processing pipeline for label %s", label)
        else:
            logger.info("Building default processing pipeline")
        pipeline = prepoperator.Pipeline(
            config,
            process_type,
            preprocess_exit_step=exit_step,
            override_label=label,
            shared_state=shared_state,
        )
    unit_list, meta = tu_batch
    corpus_name = _get_corpus_name(tu_batch)
    origin = " from %s" % corpus_name if corpus_name is not None else ""
    logger.info("Processing %d samples%s", len(unit_list), origin)
    unit_list, meta = pipeline(tu_batch, options=options)
    exported = [unit.export(pipeline.process_type) for unit in unit_list]
    return (exported, meta), pipeline
def build_pipeline(self, config):
    """Creates a new processing pipeline for the given configuration."""
    shared_state = self._global_shared_state.get()
    return prepoperator.Pipeline(
        config,
        self._pipeline_type,
        shared_state=shared_state,
    )