def test_segment_alphabet():
    tokenizer = pyonmttok.Tokenizer(mode="aggressive", segment_alphabet=["Han"])
    tokens, _ = tokenizer.tokenize("測試 abc")
    assert tokens == ["測", "試", "abc"]

    tokenizer = pyonmttok.Tokenizer(mode="aggressive", segment_alphabet=None)
    tokens, _ = tokenizer.tokenize("測試 abc")
    assert tokens == ["測試", "abc"]
def test_invalid_annotation():
    with pytest.raises(ValueError):
        pyonmttok.Tokenizer("conservative", joiner_annotate=True, spacer_annotate=True)
    with pytest.raises(ValueError):
        pyonmttok.Tokenizer("conservative", joiner_new=True)
    with pytest.raises(ValueError):
        pyonmttok.Tokenizer("conservative", spacer_new=True)
def main(): print("Applies an OpenNMT model to translate a TXT file") start_time = datetime.datetime.now() init_logging(True) model_name, input_filename, translated_file, tokenizer_models, translation_models = read_parameters( ) model_path = os.path.join(translation_models, model_name) openNMT = CTranslate(model_path) if (model_name == 'eng-cat'): src_model_path = os.path.join(tokenizer_models, "en_m.model") tgt_model_path = os.path.join(tokenizer_models, "ca_m.model") else: src_model_path = os.path.join(tokenizer_models, "ca_m.model") tgt_model_path = os.path.join(tokenizer_models, "en_m.model") openNMT.tokenizer_source = pyonmttok.Tokenizer( mode="none", sp_model_path=src_model_path) openNMT.tokenizer_target = pyonmttok.Tokenizer( mode="none", sp_model_path=tgt_model_path) target_filename_review = "translated-review.txt" with open(input_filename, encoding='utf-8', mode='r') as tf_en,\ open(translated_file, encoding='utf-8', mode='w') as tf_ca,\ open(target_filename_review, encoding='utf-8', mode='w') as tf_ca_review: en_strings = tf_en.readlines() translated = 0 errors = 0 for src in en_strings: src = src.replace('\n', '') try: tgt = openNMT.translate_splitted(src) except Exception as e: logging.error(str(e)) logging.error("Processing: {0}".format(src)) errors = errors + 1 tf_ca.write("{0}\n".format("Error")) continue translated = translated + 1 tf_ca.write("{0}\n".format(tgt)) tf_ca_review.write("{0}\n{1}\n\n".format(src, tgt)) logging.debug('Source: ' + str(src)) logging.debug('Target: ' + str(tgt)) print("Sentences translated: {0}".format(translated)) print("Sentences unable to translate {0} (NMT errors)".format(errors)) print("Time used {0}".format(str(datetime.datetime.now() - start_time)))
def test_deepcopy(use_constructor):
    text = "Hello World!"
    tok1 = pyonmttok.Tokenizer("aggressive", joiner_annotate=True)
    tokens1, _ = tok1.tokenize(text)

    if use_constructor:
        tok2 = pyonmttok.Tokenizer(tok1)
    else:
        tok2 = copy.deepcopy(tok1)

    tokens2, _ = tok2.tokenize(text)
    assert tokens1 == tokens2

    del tok1
    tokens2, _ = tok2.tokenize(text)
    assert tokens1 == tokens2
def __init__(self, export_dir):
    '''Load translation model and sentencepiece models'''
    imported = tf.saved_model.load(export_dir)
    self._translate_fn = imported.signatures["serving_default"]
    sp_jpn_model_path = os.path.join(export_dir, "assets.extra", "sp.jpn.model")
    sp_eng_model_path = os.path.join(export_dir, "assets.extra", "sp.eng.model")
    self.jpn_tokenizer = pyonmttok.Tokenizer("none", sp_model_path=sp_jpn_model_path)
    self.eng_tokenizer = pyonmttok.Tokenizer("none", sp_model_path=sp_eng_model_path)
def init_core(self, bpe_path):
    import pyonmttok

    if os.path.isdir(bpe_path):
        files = [(file.split('_')[-1], file) for file in os.listdir(bpe_path)]
        return {
            key: pyonmttok.Tokenizer(
                'conservative',
                joiner_annotate=True,
                bpe_model_path=os.path.join(bpe_path, file))
            for key, file in files
        }
    return pyonmttok.Tokenizer('conservative', joiner_annotate=True, bpe_model_path=bpe_path)
def test_random_seed():
    pyonmttok.set_random_seed(42)

    tokenizer = pyonmttok.Tokenizer(
        "none",
        sp_model_path=os.path.join(_DATA_DIR, "sp-models", "wmtende.model"),
        sp_nbest_size=10,
        sp_alpha=0.1)
    assert tokenizer.tokenize("appealing")[0] == ["▁app", "e", "al", "ing"]

    tokenizer = pyonmttok.Tokenizer(
        "conservative",
        bpe_model_path=os.path.join(_DATA_DIR, "bpe-models", "testcode.v0.1"),
        bpe_dropout=0.3)
    assert tokenizer.tokenize("improvement")[0] == ["i", "m", "pr", "ove", "m", "e", "n", "t"]
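# Note on the test above: sp_nbest_size/sp_alpha (SentencePiece sampling) and
# bpe_dropout produce randomized subword segmentations, so the expected token
# lists only hold because pyonmttok.set_random_seed(42) fixes the random state
# beforehand.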
def test_file(tmpdir, tokens_delimiter):
    tokenizer = pyonmttok.Tokenizer(
        "aggressive",
        joiner_annotate=True,
        joiner_new=True,
        case_feature=True,
    )
    text = "Hello WORLD!"
    expected_tokens = ["hello￨C", "world￨U", "￭￨N", "!￨N"]
    input_path = str(tmpdir.join("input.txt"))
    output_path = str(tmpdir.join("output.txt"))
    with open(input_path, "w", encoding="utf-8") as input_file:
        input_file.write(text)
        input_file.write("\n")

    tokenizer.tokenize_file(input_path, output_path, tokens_delimiter=tokens_delimiter)
    assert os.path.exists(output_path)
    with open(output_path, encoding="utf-8") as output_file:
        assert output_file.readline() == tokens_delimiter.join(expected_tokens) + "\n"

    os.remove(input_path)
    tokenizer.detokenize_file(output_path, input_path, tokens_delimiter=tokens_delimiter)
    assert os.path.exists(input_path)
    with open(input_path, encoding="utf-8") as input_file:
        assert input_file.readline() == text + "\n"
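# A minimal sketch of file tokenization outside the test above; the file paths
# are hypothetical placeholders, and the num_threads argument assumes a
# pyonmttok version whose tokenize_file supports multi-threading.
tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True)
tokenizer.tokenize_file("corpus.txt", "corpus.tok.txt", num_threads=4)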
def tokenize_mn(segment):
    tokenizer = pyonmttok.Tokenizer("aggressive", segment_numbers=True, joiner_annotate=True)
    segment = protect(segment)
    tokens, features = tokenizer.tokenize(segment)
    tokenized = " ".join(tokens)
    unprotected = unprotect(tokenized).replace("%0020", " ")
    return unprotected
def _translate_sentence(stub, model_name, text):
    print(text)
    tokenizer = pyonmttok.Tokenizer("conservative")
    _default = 10.0
    output = translate(stub, model_name, [text], tokenizer, timeout=_default)
    print(output[0])
    return output[0]
def tokenize(segment):
    tokenizer = pyonmttok.Tokenizer("aggressive", segment_numbers=False, joiner_annotate=False)
    segment = protect(segment)
    tokens, features = tokenizer.tokenize(segment)
    tokenized = " ".join(tokens)
    unprotected = unprotect(tokenized)
    return unprotected
def build_tokenizer(args):
    """Builds a tokenizer based on user arguments."""
    args = args.copy()
    args.pop('vocabulary', None)
    args.pop('build_subword', None)
    args.pop('build_vocabulary', None)
    return pyonmttok.Tokenizer(**args)
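# Example call for build_tokenizer above; the option values are hypothetical.
# Keys such as 'vocabulary' are dropped before the remaining options are
# forwarded to pyonmttok.Tokenizer.
tokenizer = build_tokenizer({
    "mode": "aggressive",
    "joiner_annotate": True,
    "vocabulary": "vocab.txt",
})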
def __init__(self):
    self._morph = pymorphy2.MorphAnalyzer()
    with open(os.path.join(FILES_PATH, 'introduction_words.txt'), 'r', encoding='utf-8') as inwords:
        self._introduction_words_lst = inwords.read().split('\n')
    self._tokenizer = pyonmttok.Tokenizer('aggressive')
def main():
    parser = argparse.ArgumentParser(description="Translation client example")
    parser.add_argument("--model_name", required=True, help="model name")
    parser.add_argument("--sentencepiece_model", required=True, help="path to the SentencePiece model")
    parser.add_argument("--host", default="localhost", help="model server host")
    parser.add_argument("--port", type=int, default=9000, help="model server port")
    parser.add_argument("--timeout", type=float, default=10.0, help="request timeout")
    args = parser.parse_args()

    channel = grpc.insecure_channel("%s:%d" % (args.host, args.port))
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    tokenizer = pyonmttok.Tokenizer("none", sp_model_path=args.sentencepiece_model)

    while True:
        text = input("Source: ")
        output = translate(stub, args.model_name, [text], tokenizer, timeout=args.timeout)
        print("Target: %s" % output[0])
        print("")
def __init__(self, **kwargs):
    case_feature = kwargs.get("case_feature")
    if case_feature:
        raise ValueError("case_feature is not supported with OpenNMT-tf")
    kwargs.setdefault("mode", "conservative")
    self._config = kwargs
    self._tokenizer = pyonmttok.Tokenizer(**kwargs)
def test_bpe_case_insensitive_issue_147():
    tokenizer = pyonmttok.Tokenizer(
        "conservative",
        bpe_model_path=os.path.join(_DATA_DIR, "bpe-models", "issue-147.txt"))
    tokenizer.tokenize(
        "𝘛𝘩𝘦𝘳𝘦'𝘴 𝘯𝘰𝘵𝘩𝘪𝘯𝘨 𝘮𝘰𝘳𝘦 𝘨𝘭𝘢𝘮𝘰𝘳𝘰𝘶𝘴 𝘵𝘩𝘢𝘯 𝘭𝘰𝘰𝘬𝘪𝘯𝘨 𝘵𝘰𝘸𝘢𝘳𝘥𝘴 𝘵𝘩𝘦 𝘧𝘶𝘵𝘶𝘳𝘦")
def main():
    parser = argparse.ArgumentParser(description="Translation client example")
    parser.add_argument("--model_name", required=True, help="model name")
    parser.add_argument("--sentencepiece_model", required=True, help="path to the SentencePiece model")
    parser.add_argument("--host", default="localhost", help="model server host")
    parser.add_argument("--port", type=int, default=9000, help="model server port")
    parser.add_argument("--timeout", type=float, default=100000.0, help="request timeout")
    args = parser.parse_args()

    channel = grpc.insecure_channel("%s:%d" % (args.host, args.port))
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    tokenizer = pyonmttok.Tokenizer("none", sp_model_path=args.sentencepiece_model)

    sample = ["Hello world! My name is John. I live on the West coast.",]
    batch_output = translate(stub, args.model_name, sample * 16, tokenizer, timeout=args.timeout)

    for bs in (1, 2, 4, 8, 16, 32, 64):
        print("batch_size: " + str(bs))
        batch_input = sample * bs
        print(len(batch_input))
        start = datetime.datetime.now()
        for i in range(0, 50):
            batch_output = translate(stub, args.model_name, batch_input, tokenizer, timeout=args.timeout)
            # for input_text, output_text in zip(batch_input, batch_output):
            #     print("{} ||| {}".format(input_text, output_text))
        end = datetime.datetime.now()
        elapsed = end - start
        print(elapsed.seconds, ":", elapsed.microseconds)
def test_align_perplexity_percent_threshold(lower, upper, log_probs, expected_log_probs):
    if expected_log_probs is None:
        expected_log_probs = log_probs

    tu_list = []
    tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=True)
    for log_prob in log_probs:
        single_tu = tu.TranslationUnit(
            "a b c", "a b c", source_tokenizer=tokenizer, target_tokenizer=tokenizer)
        single_tu.set_alignment(
            _MockAligner(forward_log_prob=log_prob, backward_log_prob=log_prob))
        tu_list.append(single_tu)

    config = {
        "source": "en",
        "target": "fr",
        "preprocess": [{
            "op": "align_perplexity_filter",
            "percent_threshold": {
                "lower": lower,
                "upper": upper,
            }
        }]
    }

    tu_list = _run_pipeline(config, prepoperator.ProcessType.TRAINING, tu_list)
    assert len(tu_list) == len(expected_log_probs)
    for single_tu, log_prob in zip(tu_list, expected_log_probs):
        assert single_tu.alignment_log_probs[0][0] == log_prob
def tokenize_m(segment): tokenizer = pyonmttok.Tokenizer("space", segment_numbers=False, joiner_annotate=True) seg_list = jieba.cut(segment, cut_all=False) tokenized = " ■".join(list(seg_list)) return (tokenized)
def tokenize_m(segment): tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True, segment_numbers=False, segment_alphabet=["Han"], segment_alphabet_change=True) segment=protect(segment) tokens, features = tokenizer.tokenize(segment) tokenized=" ".join(tokens) unprotected=unprotect(tokenized).replace("%0020"," ") return(unprotected)
def create_tokenizer(config):
    """Creates a new OpenNMT tokenizer.

    Args:
      config: A dictionary of tokenization options.

    Returns:
      A ``pyonmttok.Tokenizer``.
    """
    def _set(kwargs, key):
        if key in config:
            value = config[key]
            if isinstance(value, six.string_types):
                value = tf.compat.as_bytes(value)
            kwargs[key] = value

    kwargs = {}
    _set(kwargs, "bpe_model_path")
    _set(kwargs, "sp_model_path")
    _set(kwargs, "joiner")
    _set(kwargs, "joiner_annotate")
    _set(kwargs, "joiner_new")
    _set(kwargs, "spacer_annotate")
    _set(kwargs, "case_feature")
    _set(kwargs, "no_substitution")
    _set(kwargs, "segment_case")
    _set(kwargs, "segment_numbers")
    _set(kwargs, "segment_alphabet_change")
    _set(kwargs, "segment_alphabet")
    return pyonmttok.Tokenizer(config.get("mode", "conservative"), **kwargs)
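# Example usage of create_tokenizer above; the configuration values and the
# BPE model path are hypothetical placeholders.
tokenizer = create_tokenizer({
    "mode": "aggressive",
    "joiner_annotate": True,
    "segment_numbers": True,
    "bpe_model_path": "/path/to/codes.bpe",
})
tokens, _ = tokenizer.tokenize("Hello world!")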
def __init__(self, export_dir):
    imported = tf.saved_model.load(export_dir)
    self._translate_fn = imported.signatures["serving_default"]
    # https://github.com/OpenNMT/Tokenizer/blob/1ae0877a733268c9a3ef5fc063d4c1f0b6dfe2f7/docs/options.md
    self._tokenizer = pyonmttok.Tokenizer(
        "conservative",
        joiner_annotate=True,
        segment_numbers=True
    )
    # https://github.com/OpenNMT/Tokenizer/tree/master/bindings/python
def detokenize_mn(segment):
    tokenizer = pyonmttok.Tokenizer("aggressive", segment_numbers=True, joiner_annotate=False)
    detok = tokenizer.detokenize(segment.split(" "))
    return detok
def __init__(self):
    with open(os.path.join(FILES_PATH, 'abbreviations.json'), 'r', encoding='utf-8') as abbs:
        self._abbs = json.load(abbs)
    self._morph = pymorphy2.MorphAnalyzer()
    self._tokenizer = pyonmttok.Tokenizer('aggressive')
def test_detok_with_ranges():
    tokenizer = pyonmttok.Tokenizer("conservative")
    text, ranges = tokenizer.detokenize_with_ranges(["a", "b"])
    assert text == "a b"
    assert len(ranges) == 2
    assert ranges[0] == (0, 0)
    assert ranges[1] == (2, 2)
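# A minimal sketch showing how the inclusive character ranges returned by
# detokenize_with_ranges (as asserted in the test above) can slice the
# detokenized text back into tokens.
tokenizer = pyonmttok.Tokenizer("conservative")
text, ranges = tokenizer.detokenize_with_ranges(["Hello", "World"])
start, end = ranges[0]
assert text[start:end + 1] == "Hello"
start, end = ranges[1]
assert text[start:end + 1] == "World"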
def __init__(self, **kwargs):
    self._config = copy.deepcopy(kwargs)
    mode = "conservative"
    if "mode" in kwargs:
        mode = kwargs["mode"]
        del kwargs["mode"]
    self._tokenizer = pyonmttok.Tokenizer(mode, **kwargs)
def test_token_api_with_subword():
    tokenizer = pyonmttok.Tokenizer(
        "conservative",
        case_markup=True,
        joiner_annotate=True,
        bpe_model_path=os.path.join(_DATA_DIR, "bpe-models", "codes_suffix_case_insensitive.fr"))
    text = "BONJOUR MONDE"

    def _check_subword(tokens):
        assert len(tokens) == 5
        assert tokens[0].type == pyonmttok.TokenType.LEADING_SUBWORD   # bon
        assert tokens[1].type == pyonmttok.TokenType.TRAILING_SUBWORD  # j
        assert tokens[2].type == pyonmttok.TokenType.TRAILING_SUBWORD  # our
        assert tokens[3].type == pyonmttok.TokenType.LEADING_SUBWORD   # mon
        assert tokens[4].type == pyonmttok.TokenType.TRAILING_SUBWORD  # de

    tokens = tokenizer.tokenize(text, as_token_objects=True)
    _check_subword(tokens)

    serialized_tokens, _ = tokenizer.serialize_tokens(tokens)
    # Deserialization should not lose subword information.
    tokens = tokenizer.deserialize_tokens(serialized_tokens)
    _check_subword(tokens)
    assert serialized_tokens == tokenizer.serialize_tokens(tokens)[0]
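# A minimal sketch of inspecting the pyonmttok.Token objects returned with
# as_token_objects=True; the input sentence is arbitrary and the printed
# values depend on the tokenization options.
tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=True)
for token in tokenizer.tokenize("Hello world!", as_token_objects=True):
    print(token.surface, token.type, token.join_left, token.join_right)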
def build_tokenizer(self, tokenizer_opt):
    """Build tokenizer described by `tokenizer_opt`."""
    if "type" not in tokenizer_opt:
        raise ValueError("Missing mandatory tokenizer option 'type'")

    if tokenizer_opt['type'] == 'sentencepiece':
        if "model" not in tokenizer_opt:
            raise ValueError("Missing mandatory tokenizer option 'model'")
        import sentencepiece as spm
        tokenizer = spm.SentencePieceProcessor()
        model_path = os.path.join(self.model_root, tokenizer_opt['model'])
        tokenizer.Load(model_path)
    elif tokenizer_opt['type'] == 'pyonmttok':
        if "params" not in tokenizer_opt:
            raise ValueError("Missing mandatory tokenizer option 'params'")
        import pyonmttok
        if tokenizer_opt["mode"] is not None:
            mode = tokenizer_opt["mode"]
        else:
            mode = None
        # load can be called multiple times: modify copy
        tokenizer_params = dict(tokenizer_opt["params"])
        for key, value in tokenizer_opt["params"].items():
            if key.endswith("path"):
                tokenizer_params[key] = os.path.join(self.model_root, value)
        tokenizer = pyonmttok.Tokenizer(mode, **tokenizer_params)
    else:
        raise ValueError("Invalid value for tokenizer type")
    return tokenizer
def __init__(self, export_dir):
    imported = tf.saved_model.load(export_dir)
    self._translate_fn = imported.signatures["serving_default"]
    sp_model_path = os.path.join(export_dir, "assets.extra", "wmtende.model")
    self._tokenizer = pyonmttok.Tokenizer("none", sp_model_path=sp_model_path)
def main():
    parser = argparse.ArgumentParser(description="Translation client example")
    parser.add_argument("--model_name", required=True, help="model name")
    parser.add_argument("--sentencepiece_model", required=True, help="path to the SentencePiece model")
    parser.add_argument("--host", default="localhost", help="model server host")
    parser.add_argument("--port", type=int, default=9000, help="model server port")
    parser.add_argument("--timeout", type=float, default=10.0, help="request timeout")
    args = parser.parse_args()

    channel = grpc.insecure_channel("%s:%d" % (args.host, args.port))
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    tokenizer = pyonmttok.Tokenizer("none", sp_model_path=args.sentencepiece_model)

    batch_input = [
        "Hello world!",
        "My name is John.",
        "I live on the West coast."
    ]
    batch_output = translate(stub, args.model_name, batch_input, tokenizer, timeout=args.timeout)
    for input_text, output_text in zip(batch_input, batch_output):
        print("{} ||| {}".format(input_text, output_text))