def test_identity_filter():
    config = [
        {
            "op": "identity_filter",
            "min_characters": 0,
        },
    ]
    assert _is_filtered(config, tu.TranslationUnit("Hello world!", "Hello world!"))
    assert not _is_filtered(config, tu.TranslationUnit("Hello world!", "Hello world"))
    config[0]["min_characters"] = 20
    assert not _is_filtered(config, tu.TranslationUnit("Hello world!", "Hello world!"))
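# The filter tests in this file rely on an `_is_filtered` helper that is not
# shown in this excerpt. A minimal sketch, assuming it simply runs the training
# pipeline via `_run_pipeline` (defined further down) and reports whether the
# unit was dropped; the actual helper in the suite may differ.
def _is_filtered(config, single_tu):
    tu_list = _run_pipeline(config, prepoperator.ProcessType.TRAINING, single_tu)
    return len(tu_list) == 0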
def _get_translation_units(self, files):
    # Read the source, optional target, and annotation files in lockstep, and
    # yield each line as many times as the random sampling map requests.
    src_file = files["source"]
    tgt_file = files.get("target")
    annotations = {
        key: f for key, f in files.items() if key not in ("source", "target")
    }
    for i in range(self._file.lines_count):
        src_line = src_file.readline()
        tgt_line = tgt_file.readline() if tgt_file else None
        annot_lines = {}
        for key, annot_file in annotations.items():
            annot_lines[key] = annot_file.readline()
        num_samples = self._file.random_sample.get(i, 0)
        if num_samples == 0:
            continue
        src_line = src_line.strip()
        if tgt_line:
            tgt_line = tgt_line.strip()
        for key, line in annot_lines.items():
            annot_lines[key] = line.strip()
        while num_samples > 0:
            yield tu.TranslationUnit(
                source=src_line, target=tgt_line, annotations=annot_lines
            )
            num_samples -= 1
def test_align_perplexity_percent_threshold(lower, upper, log_probs, expected_log_probs):
    if expected_log_probs is None:
        expected_log_probs = log_probs
    tu_list = []
    tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=True)
    for log_prob in log_probs:
        single_tu = tu.TranslationUnit(
            "a b c", "a b c", source_tokenizer=tokenizer, target_tokenizer=tokenizer
        )
        single_tu.set_alignment(
            _MockAligner(forward_log_prob=log_prob, backward_log_prob=log_prob)
        )
        tu_list.append(single_tu)
    config = {
        "source": "en",
        "target": "fr",
        "preprocess": [
            {
                "op": "align_perplexity_filter",
                "percent_threshold": {
                    "lower": lower,
                    "upper": upper,
                },
            }
        ],
    }
    tu_list = _run_pipeline(config, prepoperator.ProcessType.TRAINING, tu_list)
    assert len(tu_list) == len(expected_log_probs)
    for single_tu, log_prob in zip(tu_list, expected_log_probs):
        assert single_tu.alignment_log_probs[0][0] == log_prob
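# `_MockAligner` is not defined in this excerpt. A minimal stand-in consistent
# with how the alignment tests use it, assuming the aligner interface is an
# object whose align() method returns the forward/backward log probabilities
# together with (here, empty) alignments; the real mock and the exact return
# format may differ.
class _MockAligner:
    def __init__(self, forward_log_prob=0, backward_log_prob=0):
        self.forward_log_prob = forward_log_prob
        self.backward_log_prob = backward_log_prob

    def align(self, src_tok, tgt_tok):
        return {
            "forward_log_prob": self.forward_log_prob,
            "backward_log_prob": self.backward_log_prob,
            "alignments": [],
        }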
def test_tokenization_with_lang():
    tokenization_config = {
        "mode": "aggressive",
        "case_markup": True,
        "soft_case_regions": True,
    }
    config = {
        "source": "el",
        "target": "en",
        "preprocess": [
            {
                "op": "tokenization",
                "source": tokenization_config,
                "target": tokenization_config,
            }
        ],
    }
    example = tu.TranslationUnit("ΣΙΓΜΑ ΤΕΛΙΚΟΣ")
    pipeline = prepoperator.Pipeline(config, prepoperator.ProcessType.INFERENCE)
    tu_list, _ = pipeline(([example], {}))
    assert tu_list[0].src_tok.tokens[0] == [
        "⦅mrk_begin_case_region_U⦆",
        "σιγμα",
        "τελικος",
        "⦅mrk_end_case_region_U⦆",
    ]
def test_parentheses_filter(src, tgt, filtered, expected):
    config = [
        {
            "op": "tokenization",
            "source": {"mode": "conservative", "joiner_annotate": True},
            "target": {"mode": "conservative", "joiner_annotate": True},
        },
        {
            "op": "parentheses",
            "side": "both",
            "type": [["(", ")"], ["<", ">"]],
        },
    ]
    TU = tu.TranslationUnit(src, tgt)
    assert filtered == _is_filtered(config, TU)
    if not filtered:
        result_src = TU.src_detok
        result_tgt = TU.tgt_detok
        if expected[0] is None:
            assert src == result_src
        else:
            assert expected[0] == result_src
        if expected[1] is None:
            assert tgt == result_tgt
        else:
            assert expected[1] == result_tgt
def __call__(self):
    files = [utils.open_file(path) for path in self._files]
    try:
        tu_list = []
        # Postprocess: source and target files are read in parallel, with the
        # number of parts per unit driven by the stored metadata.
        if len(self._files) > 1:
            for meta in self._metadata:
                # TODO: prefix, features
                num_parts = len(meta)
                src_lines = [
                    next(files[0]).strip().split() for _ in range(num_parts)
                ]
                tgt_lines = [
                    next(files[1]).strip().split() for _ in range(num_parts)
                ]
                tu_list.append(
                    tu.TranslationUnit(
                        source=src_lines,
                        target=tgt_lines,
                        metadata=meta,
                        source_tokenizer=self._source_tokenizer,
                        target_tokenizer=self._target_tokenizer,
                    )
                )
                if len(tu_list) == self._batch_size:
                    yield tu_list, {}
                    tu_list = []
        # Preprocess: only a source file is available.
        else:
            for line in files[0]:
                tu_list.append(tu.TranslationUnit(source=line))
                if len(tu_list) == self._batch_size:
                    yield tu_list, {}
                    tu_list = []
        # Flush the last, possibly partial, batch.
        if tu_list:
            yield tu_list, {}
    finally:
        for f in files:
            f.close()
def test_tokenization_with_inference_config(tmpdir):
    config = {
        "source": "en",
        "target": "de",
        "preprocess": [
            {
                "op": "tokenization",
                "source": {"mode": "aggressive"},
                "target": {"mode": "aggressive"},
            },
        ],
    }
    process_type = prepoperator.ProcessType.INFERENCE
    example = tu.TranslationUnit("2,000", "2,000")
    pipeline = prepoperator.Pipeline(config, process_type)
    tu_list, _ = pipeline(([example], {}))
    assert tu_list[0].src_tok.tokens[0] == ["2", ",", "000"]
    assert tu_list[0].tgt_tok.tokens[0] == ["2", ",", "000"]
    config["inference"] = {
        "overrides": {
            "tokenization_1": {
                "source": {"mode": "none"},
            },
        },
    }
    pipeline = prepoperator.Pipeline(config, process_type)
    example = tu.TranslationUnit("2,000", "2,000")
    tu_list, _ = pipeline(([example], {}))
    assert tu_list[0].src_tok.tokens[0] == ["2,000"]
    assert tu_list[0].tgt_tok.tokens[0] == ["2", ",", "000"]
def test_tokenization_with_vocabulary_restriction(tmpdir):
    sp_model_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "corpus",
        "resources",
        "subword",
        "en_de.sp",
    )
    config = {
        "source": "en",
        "target": "de",
        "preprocess": [
            {
                "op": "tokenization",
                "source": {
                    "mode": "none",
                    "sp_model_path": sp_model_path,
                    "restrict_subword_vocabulary": True,
                },
                "target": {
                    "mode": "none",
                    "sp_model_path": sp_model_path,
                },
            },
        ],
    }
    process_type = prepoperator.ProcessType.INFERENCE
    example = tu.TranslationUnit("World", "World")
    with pytest.raises(ValueError, match="restrict_subword_vocabulary"):
        pipeline = prepoperator.Pipeline(config, process_type)
    vocab_path = str(tmpdir.join("vocab.txt"))
    with open(vocab_path, "w") as vocab_file:
        vocab_file.write("# Comment\n")
        vocab_file.write("▁Wor 0.0224656\n")
    config.update(
        {
            "vocabulary": {
                "source": {"path": vocab_path},
                "target": {"path": vocab_path},
            },
        }
    )
    pipeline = prepoperator.Pipeline(config, process_type)
    tu_list, _ = pipeline(([example], {}))
    assert tu_list[0].src_tok.tokens[0] == ["▁Wor", "l", "d"]
    assert tu_list[0].tgt_tok.tokens[0] == ["▁World"]
def _run_pipeline(config, process_type, tu_list):
    if isinstance(tu_list, str):
        tu_list = tu.TranslationUnit(tu_list)
    if not isinstance(tu_list, list):
        tu_list = [tu_list]
    if isinstance(config, list):
        config = {
            "source": "xx",
            "target": "yy",
            "preprocess": config,
        }
    pipeline = prepoperator.Pipeline(config, process_type)
    tu_list, _ = pipeline((tu_list, {}))
    return tu_list
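# Illustrative companion example (not from the original suite): _run_pipeline
# wraps a bare operator list into a full config with placeholder "xx"/"yy"
# language codes, and a single unit into a list. Following the identity_filter
# semantics asserted in test_identity_filter above, the source == target pair
# below is expected to be dropped.
def test_run_pipeline_accepts_bare_inputs():
    result = _run_pipeline(
        [{"op": "identity_filter", "min_characters": 0}],
        prepoperator.ProcessType.TRAINING,
        tu.TranslationUnit("same", "same"),
    )
    assert result == []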
def test_align_perplexity_hard_threshold(
    lower, upper, src_length, tgt_length, fwd_log_prob, bwd_log_prob, filtered
):
    config = [
        {
            "op": "align_perplexity_filter",
            "hard_threshold": {
                "lower": lower,
                "upper": upper,
            },
        }
    ]
    tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=True)
    single_tu = tu.TranslationUnit(
        " ".join(str(i) for i in range(src_length)),
        " ".join(str(i) for i in range(tgt_length)),
        source_tokenizer=tokenizer,
        target_tokenizer=tokenizer,
    )
    single_tu.set_alignment(
        _MockAligner(forward_log_prob=fwd_log_prob, backward_log_prob=bwd_log_prob)
    )
    assert filtered == _is_filtered(config, single_tu)
def _get_translation_units(self, files):
    source_file = files["source"]
    target_file = files["target"]
    # Each metadata entry describes one multi-part translation: read the
    # corresponding number of lines from both sides.
    for meta in self._metadata:
        # TODO: features
        num_parts = len(meta)
        src_lines = [next(source_file).strip().split(" ") for _ in range(num_parts)]
        tgt_lines = [next(target_file).strip().split(" ") for _ in range(num_parts)]
        if self._target_score_type is not None:
            # When the target carries a score, it replaces the stored metadata.
            score = _extract_score(tgt_lines, self._target_score_type)
            meta = [{"score": score}]
        yield tu.TranslationUnit(
            source=src_lines,
            target=tgt_lines,
            metadata=meta,
            source_tokenizer=self._source_tokenizer,
            target_tokenizer=self._target_tokenizer,
        )
def test_length_filter(filter_config, filtered):
    filter_config["op"] = "length_filter"
    config = [
        {
            "op": "tokenization",
            "source": {"mode": "conservative", "joiner_annotate": True},
            "target": {"mode": "conservative", "joiner_annotate": True},
        },
        filter_config,
    ]
    source = "Hello world!"
    target = "Bonjour le monde !"
    assert filtered == _is_filtered(config, tu.TranslationUnit(source, target))
def test_length_filter_empty_target():
    config = [
        {
            "op": "tokenization",
            "source": {"mode": "conservative", "joiner_annotate": True},
            "target": {"mode": "conservative", "joiner_annotate": True},
        },
        {
            "op": "length_filter",
            "min_words_ratio": 0.7,
            "max_words_ratio": 2,
        },
    ]
    source = "Hello"
    target = ""
    assert _is_filtered(config, tu.TranslationUnit(source, target))
def _get_samples():
    # Closure over src_file, tgt_file, annotations, and self._file from the
    # enclosing scope; yields each sampled line the requested number of times.
    for i in range(self._file.lines_count):
        src_line = src_file.readline()
        tgt_line = tgt_file.readline()
        annot_lines = {}
        for key, annot_file in annotations.items():
            annot_lines[key] = annot_file.readline()
        num_samples = self._file.random_sample.get(i, 0)
        if num_samples == 0:
            continue
        src_line = src_line.strip()
        tgt_line = tgt_line.strip()
        for key, line in annot_lines.items():
            annot_lines[key] = line.strip()
        while num_samples > 0:
            yield tu.TranslationUnit(
                source=src_line, target=tgt_line, annotations=annot_lines
            )
            num_samples -= 1
def _get_translation_units(self, files):
    source_file = files["source"]
    # If no target file is given, pair every source line with None.
    target_file = files.get("target", itertools.repeat(None))
    for source, target in zip(source_file, target_file):
        yield tu.TranslationUnit(source=source, target=target)