Example #1
def test_segment_alphabet():
    tokenizer = pyonmttok.Tokenizer(mode="aggressive",
                                    segment_alphabet=["Han"])
    tokens, _ = tokenizer.tokenize("測試 abc")
    assert tokens == ["測", "試", "abc"]

    tokenizer = pyonmttok.Tokenizer(mode="aggressive", segment_alphabet=None)
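    # Without segment_alphabet, the Han characters stay grouped in one token.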
    tokens, _ = tokenizer.tokenize("測試 abc")
    assert tokens == ["測試", "abc"]
Example #2
def test_invalid_annotation():
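    # joiner_annotate and spacer_annotate are mutually exclusive, and the
    # *_new options require their corresponding *_annotate option.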
    with pytest.raises(ValueError):
        pyonmttok.Tokenizer("conservative",
                            joiner_annotate=True,
                            spacer_annotate=True)
    with pytest.raises(ValueError):
        pyonmttok.Tokenizer("conservative", joiner_new=True)
    with pytest.raises(ValueError):
        pyonmttok.Tokenizer("conservative", spacer_new=True)
Example #3
def main():

    print("Applies an OpenNMT model to translate a TXT file")

    start_time = datetime.datetime.now()
    init_logging(True)
    (model_name, input_filename, translated_file,
     tokenizer_models, translation_models) = read_parameters()

    model_path = os.path.join(translation_models, model_name)
    openNMT = CTranslate(model_path)

    if model_name == 'eng-cat':
        src_model_path = os.path.join(tokenizer_models, "en_m.model")
        tgt_model_path = os.path.join(tokenizer_models, "ca_m.model")
    else:
        src_model_path = os.path.join(tokenizer_models, "ca_m.model")
        tgt_model_path = os.path.join(tokenizer_models, "en_m.model")

    openNMT.tokenizer_source = pyonmttok.Tokenizer(
        mode="none", sp_model_path=src_model_path)
    openNMT.tokenizer_target = pyonmttok.Tokenizer(
        mode="none", sp_model_path=tgt_model_path)

    target_filename_review = "translated-review.txt"
    with open(input_filename, encoding='utf-8', mode='r') as tf_en,\
         open(translated_file, encoding='utf-8', mode='w') as tf_ca,\
         open(target_filename_review, encoding='utf-8', mode='w') as tf_ca_review:

        en_strings = tf_en.readlines()
        translated = 0
        errors = 0

        for src in en_strings:
            src = src.rstrip('\n')

            try:
                tgt = openNMT.translate_splitted(src)
            except Exception as e:
                logging.error(str(e))
                logging.error("Processing: {0}".format(src))
                errors += 1
                tf_ca.write("{0}\n".format("Error"))
                continue

            translated += 1
            tf_ca.write("{0}\n".format(tgt))
            tf_ca_review.write("{0}\n{1}\n\n".format(src, tgt))
            logging.debug('Source: ' + str(src))
            logging.debug('Target: ' + str(tgt))

    print("Sentences translated: {0}".format(translated))
    print("Sentences that could not be translated: {0} (NMT errors)".format(errors))
    print("Time used: {0}".format(datetime.datetime.now() - start_time))
Example #4
def test_deepcopy(use_constructor):
    text = "Hello World!"
    tok1 = pyonmttok.Tokenizer("aggressive", joiner_annotate=True)
    tokens1, _ = tok1.tokenize(text)
    if use_constructor:
        tok2 = pyonmttok.Tokenizer(tok1)
    else:
        tok2 = copy.deepcopy(tok1)
    tokens2, _ = tok2.tokenize(text)
    assert tokens1 == tokens2
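    # The copy must remain usable after the original tokenizer is deleted,
    # proving the two instances share no state.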
    del tok1
    tokens2, _ = tok2.tokenize(text)
    assert tokens1 == tokens2
Example #5
def __init__(self, export_dir):
    '''Load the translation model and SentencePiece models.'''
    imported = tf.saved_model.load(export_dir)
    self._translate_fn = imported.signatures["serving_default"]
    sp_jpn_model_path = os.path.join(export_dir, "assets.extra",
                                     "sp.jpn.model")
    sp_eng_model_path = os.path.join(export_dir, "assets.extra",
                                     "sp.eng.model")
    self.jpn_tokenizer = pyonmttok.Tokenizer(
        "none", sp_model_path=sp_jpn_model_path)
    self.eng_tokenizer = pyonmttok.Tokenizer(
        "none", sp_model_path=sp_eng_model_path)
Example #6
def init_core(self, bpe_path):
    import pyonmttok
    if os.path.isdir(bpe_path):
        # Key each tokenizer by the suffix after the last '_' in the
        # BPE model file name.
        files = [(file.split('_')[-1], file)
                 for file in os.listdir(bpe_path)]
        return {
            key: pyonmttok.Tokenizer('conservative',
                                     joiner_annotate=True,
                                     bpe_model_path=os.path.join(
                                         bpe_path, file))
            for key, file in files
        }
    return pyonmttok.Tokenizer('conservative',
                               joiner_annotate=True,
                               bpe_model_path=bpe_path)
Example #7
def test_random_seed():
    pyonmttok.set_random_seed(42)
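    # Fixing the seed makes the sampled subword segmentations below
    # reproducible (sp_nbest_size/sp_alpha and bpe_dropout are stochastic).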

    tokenizer = pyonmttok.Tokenizer(
        "none",
        sp_model_path=os.path.join(_DATA_DIR, "sp-models", "wmtende.model"),
        sp_nbest_size=10,
        sp_alpha=0.1)
    assert tokenizer.tokenize("appealing")[0] == ["▁app", "e", "al", "ing"]

    tokenizer = pyonmttok.Tokenizer(
        "conservative",
        bpe_model_path=os.path.join(_DATA_DIR, "bpe-models", "testcode.v0.1"),
        bpe_dropout=0.3)
    assert tokenizer.tokenize("improvement")[0] == ["i", "m", "pr", "ove", "m", "e", "n", "t"]
Example #8
def test_file(tmpdir, tokens_delimiter):
    tokenizer = pyonmttok.Tokenizer(
        "aggressive",
        joiner_annotate=True,
        joiner_new=True,
        case_feature=True,
    )
    text = "Hello WORLD!"
    expected_tokens = ["hello│C", "world│U", "■│N", "!│N"]

    input_path = str(tmpdir.join("input.txt"))
    output_path = str(tmpdir.join("output.txt"))
    with open(input_path, "w", encoding="utf-8") as input_file:
        input_file.write(text)
        input_file.write("\n")

    tokenizer.tokenize_file(input_path,
                            output_path,
                            tokens_delimiter=tokens_delimiter)
    assert os.path.exists(output_path)
    with open(output_path, encoding="utf-8") as output_file:
        assert output_file.readline(
        ) == tokens_delimiter.join(expected_tokens) + "\n"
    os.remove(input_path)

    tokenizer.detokenize_file(output_path,
                              input_path,
                              tokens_delimiter=tokens_delimiter)
    assert os.path.exists(input_path)
    with open(input_path, encoding="utf-8") as input_file:
        assert input_file.readline() == text + "\n"
Example #9
def tokenize_mn(segment):
    tokenizer = pyonmttok.Tokenizer("aggressive", segment_numbers=True, joiner_annotate=True)
    segment = protect(segment)
    tokens, features = tokenizer.tokenize(segment)
    tokenized = " ".join(tokens)
    unprotected = unprotect(tokenized).replace("%0020", " ")
    return unprotected
Example #10
def _translate_sentence(stub, model_name, text):
    print(text)
    tokenizer = pyonmttok.Tokenizer("conservative")
    default_timeout = 10.0
    output = translate(stub, model_name, [text], tokenizer, timeout=default_timeout)
    print(output[0])
    return output[0]
Example #11
def tokenize(segment):
    tokenizer = pyonmttok.Tokenizer("aggressive", segment_numbers=False, joiner_annotate=False)
    segment = protect(segment)
    tokens, features = tokenizer.tokenize(segment)
    tokenized = " ".join(tokens)
    unprotected = unprotect(tokenized)
    return unprotected
Example #12
def build_tokenizer(args):
    """Builds a tokenizer based on user arguments."""
    args = args.copy()
    args.pop('vocabulary', None)
    args.pop('build_subword', None)
    args.pop('build_vocabulary', None)
    return pyonmttok.Tokenizer(**args)
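A minimal usage sketch for the function above (the argument values are illustrative; the three popped keys are consumed elsewhere and ignored here):

args = {
    "mode": "aggressive",
    "joiner_annotate": True,
    "vocabulary": "vocab.txt",  # dropped before constructing the tokenizer
}
tokenizer = build_tokenizer(args)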
Example #13
def __init__(self):
    self._morph = pymorphy2.MorphAnalyzer()
    with open(os.path.join(FILES_PATH, 'introduction_words.txt'),
              'r',
              encoding='utf-8') as inwords:
        self._introduction_words_lst = inwords.read().split('\n')
    self._tokenizer = pyonmttok.Tokenizer('aggressive')
Example #14
def main():
    parser = argparse.ArgumentParser(description="Translation client example")
    parser.add_argument("--model_name", required=True, help="model name")
    parser.add_argument("--sentencepiece_model",
                        required=True,
                        help="path to the sentence model")
    parser.add_argument("--host",
                        default="localhost",
                        help="model server host")
    parser.add_argument("--port",
                        type=int,
                        default=9000,
                        help="model server port")
    parser.add_argument("--timeout",
                        type=float,
                        default=10.0,
                        help="request timeout")
    args = parser.parse_args()

    channel = grpc.insecure_channel("%s:%d" % (args.host, args.port))
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    tokenizer = pyonmttok.Tokenizer("none",
                                    sp_model_path=args.sentencepiece_model)

    while True:
        text = input("Source: ")
        output = translate(stub,
                           args.model_name, [text],
                           tokenizer,
                           timeout=args.timeout)
        print("Target: %s" % output[0])
        print("")
Example #15
def __init__(self, **kwargs):
    case_feature = kwargs.get("case_feature")
    if case_feature:
        raise ValueError("case_feature is not supported with OpenNMT-tf")
    kwargs.setdefault("mode", "conservative")
    self._config = kwargs
    self._tokenizer = pyonmttok.Tokenizer(**kwargs)
Example #16
def test_bpe_case_insensitive_issue_147():
    tokenizer = pyonmttok.Tokenizer("conservative",
                                    bpe_model_path=os.path.join(
                                        _DATA_DIR, "bpe-models",
                                        "issue-147.txt"))
    tokenizer.tokenize(
        "𝘛𝘩𝘦𝘳𝘦'𝘴 𝘯𝘰𝘵𝘩𝘪𝘯𝘨 𝘮𝘰𝘳𝘦 𝘨𝘭𝘢𝘮𝘰𝘳𝘰𝘶𝘴 𝘵𝘩𝘢𝘯 𝘭𝘰𝘰𝘬𝘪𝘯𝘨 𝘵𝘰𝘸𝘢𝘳𝘥𝘴 𝘵𝘩𝘦 𝘧𝘶𝘵𝘶𝘳𝘦")
Example #17
def main():
  parser = argparse.ArgumentParser(description="Translation client example")
  parser.add_argument("--model_name", required=True,
                      help="model name")
  parser.add_argument("--sentencepiece_model", required=True,
                      help="path to the sentence model")
  parser.add_argument("--host", default="localhost",
                      help="model server host")
  parser.add_argument("--port", type=int, default=9000,
                      help="model server port")
  parser.add_argument("--timeout", type=float, default=100000.0,
                      help="request timeout")
  args = parser.parse_args()

  channel = grpc.insecure_channel("%s:%d" % (args.host, args.port))
  stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
  tokenizer = pyonmttok.Tokenizer("none", sp_model_path=args.sentencepiece_model)

  sample = ["Hello world! My name is John. I live on the West coast.",]
  batch_output = translate(stub, args.model_name, sample*16, tokenizer, timeout=args.timeout)

  for bs in (1,2,4,8,16,32,64):
    print("batch_size: "+str(bs))
    batch_input = sample * bs
    print(len(batch_input))
    start = datetime.datetime.now()
    for i in range(0,50):
      batch_output = translate(stub, args.model_name, batch_input, tokenizer, timeout=args.timeout)
      # for input_text, output_text in zip(batch_input, batch_output):
      #   print("{} ||| {}".format(input_text, output_text))
    end = datetime.datetime.now()
    elapsed = end - start
    print(elapsed.seconds,":",elapsed.microseconds)
Example #18
def test_align_perplexity_percent_threshold(lower, upper, log_probs,
                                            expected_log_probs):
    if expected_log_probs is None:
        expected_log_probs = log_probs
    tu_list = []
    tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=True)
    for log_prob in log_probs:
        single_tu = tu.TranslationUnit("a b c",
                                       "a b c",
                                       source_tokenizer=tokenizer,
                                       target_tokenizer=tokenizer)
        single_tu.set_alignment(
            _MockAligner(forward_log_prob=log_prob,
                         backward_log_prob=log_prob))
        tu_list.append(single_tu)

    config = {
        "source": "en",
        "target": "fr",
        "preprocess": [{
            "op": "align_perplexity_filter",
            "percent_threshold": {
                "lower": lower,
                "upper": upper,
            }
        }]
    }

    tu_list = _run_pipeline(config, prepoperator.ProcessType.TRAINING, tu_list)
    assert len(tu_list) == len(expected_log_probs)
    for single_tu, log_prob in zip(tu_list, expected_log_probs):
        assert single_tu.alignment_log_probs[0][0] == log_prob
Example #19
def tokenize_m(segment):
    # Note: the tokenizer below is constructed but never used; segmentation
    # is delegated to jieba and segments are joined with the "■" joiner mark.
    tokenizer = pyonmttok.Tokenizer("space",
                                    segment_numbers=False,
                                    joiner_annotate=True)
    seg_list = jieba.cut(segment, cut_all=False)
    tokenized = " ■".join(seg_list)
    return tokenized
Example #20
def tokenize_m(segment):
    tokenizer = pyonmttok.Tokenizer("aggressive",
                                    joiner_annotate=True,
                                    segment_numbers=False,
                                    segment_alphabet=["Han"],
                                    segment_alphabet_change=True)
    segment = protect(segment)
    tokens, features = tokenizer.tokenize(segment)
    tokenized = " ".join(tokens)
    unprotected = unprotect(tokenized).replace("%0020", " ")
    return unprotected
Example #21
def create_tokenizer(config):
    """Creates a new OpenNMT tokenizer.

  Args:
    config: A dictionary of tokenization options.

  Returns:
    A ``pyonmttok.Tokenizer``.
  """
    def _set(kwargs, key):
        if key in config:
            value = config[key]
            if isinstance(value, six.string_types):
                value = tf.compat.as_bytes(value)
            kwargs[key] = value

    kwargs = {}
    _set(kwargs, "bpe_model_path")
    _set(kwargs, "sp_model_path")
    _set(kwargs, "joiner")
    _set(kwargs, "joiner_annotate")
    _set(kwargs, "joiner_new")
    _set(kwargs, "spacer_annotate")
    _set(kwargs, "case_feature")
    _set(kwargs, "no_substitution")
    _set(kwargs, "segment_case")
    _set(kwargs, "segment_numbers")
    _set(kwargs, "segment_alphabet_change")
    _set(kwargs, "segment_alphabet")

    return pyonmttok.Tokenizer(config.get("mode", "conservative"), **kwargs)
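A minimal usage sketch (illustrative values; only "mode" and the keys copied by _set above are read from the config):

config = {
    "mode": "aggressive",
    "joiner_annotate": True,
    "segment_numbers": True,
}
tokenizer = create_tokenizer(config)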
Example #22
def __init__(self, export_dir):
    imported = tf.saved_model.load(export_dir)
    self._translate_fn = imported.signatures["serving_default"]
    # Options: https://github.com/OpenNMT/Tokenizer/blob/1ae0877a733268c9a3ef5fc063d4c1f0b6dfe2f7/docs/options.md
    # Python bindings: https://github.com/OpenNMT/Tokenizer/tree/master/bindings/python
    self._tokenizer = pyonmttok.Tokenizer(
        "conservative", joiner_annotate=True, segment_numbers=True
    )
Example #23
def detokenize_mn(segment):
    tokenizer = pyonmttok.Tokenizer("aggressive",
                                    segment_numbers=True,
                                    joiner_annotate=False)
    return tokenizer.detokenize(segment.split(" "))
Example #24
def __init__(self):
    with open(os.path.join(FILES_PATH, 'abbreviations.json'),
              'r',
              encoding='utf-8') as abbs:
        self._abbs = json.load(abbs)
    self._morph = pymorphy2.MorphAnalyzer()
    self._tokenizer = pyonmttok.Tokenizer('aggressive')
Example #25
def test_detok_with_ranges():
    tokenizer = pyonmttok.Tokenizer("conservative")
    text, ranges = tokenizer.detokenize_with_ranges(["a", "b"])
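    # Each range gives the inclusive (start, end) character offsets of the
    # corresponding token in the detokenized text.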
    assert text == "a b"
    assert len(ranges) == 2
    assert ranges[0] == (0, 0)
    assert ranges[1] == (2, 2)
Example #26
def __init__(self, **kwargs):
    self._config = copy.deepcopy(kwargs)
    mode = kwargs.pop("mode", "conservative")
    self._tokenizer = pyonmttok.Tokenizer(mode, **kwargs)
Example #27
def test_token_api_with_subword():
    tokenizer = pyonmttok.Tokenizer("conservative",
                                    case_markup=True,
                                    joiner_annotate=True,
                                    bpe_model_path=os.path.join(
                                        _DATA_DIR, "bpe-models",
                                        "codes_suffix_case_insensitive.fr"))

    text = "BONJOUR MONDE"

    def _check_subword(tokens):
        assert len(tokens) == 5
        assert tokens[0].type == pyonmttok.TokenType.LEADING_SUBWORD  # bon
        assert tokens[1].type == pyonmttok.TokenType.TRAILING_SUBWORD  # j
        assert tokens[2].type == pyonmttok.TokenType.TRAILING_SUBWORD  # our
        assert tokens[3].type == pyonmttok.TokenType.LEADING_SUBWORD  # mon
        assert tokens[4].type == pyonmttok.TokenType.TRAILING_SUBWORD  # de

    tokens = tokenizer.tokenize(text, as_token_objects=True)
    _check_subword(tokens)
    serialized_tokens, _ = tokenizer.serialize_tokens(tokens)

    # Deserialization should not lose subword information.
    tokens = tokenizer.deserialize_tokens(serialized_tokens)
    _check_subword(tokens)
    assert serialized_tokens == tokenizer.serialize_tokens(tokens)[0]
Example #28
    def build_tokenizer(self, tokenizer_opt):
        """Build tokenizer described by `tokenizer_opt`."""
        if "type" not in tokenizer_opt:
            raise ValueError("Missing mandatory tokenizer option 'type'")

        if tokenizer_opt['type'] == 'sentencepiece':
            if "model" not in tokenizer_opt:
                raise ValueError("Missing mandatory tokenizer option 'model'")
            import sentencepiece as spm
            tokenizer = spm.SentencePieceProcessor()
            model_path = os.path.join(self.model_root, tokenizer_opt['model'])
            tokenizer.Load(model_path)
        elif tokenizer_opt['type'] == 'pyonmttok':
            if "params" not in tokenizer_opt:
                raise ValueError("Missing mandatory tokenizer option 'params'")
            import pyonmttok
            if tokenizer_opt["mode"] is not None:
                mode = tokenizer_opt["mode"]
            else:
                mode = None
            # load can be called multiple times: modify copy
            tokenizer_params = dict(tokenizer_opt["params"])
            for key, value in tokenizer_opt["params"].items():
                if key.endswith("path"):
                    tokenizer_params[key] = os.path.join(
                        self.model_root, value)
            tokenizer = pyonmttok.Tokenizer(mode, **tokenizer_params)
        else:
            raise ValueError("Invalid value for tokenizer type")
        return tokenizer
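A minimal configuration sketch for the pyonmttok branch (illustrative values; assumes an instance named server of the enclosing class; keys ending in "path" are resolved relative to self.model_root):

tokenizer_opt = {
    "type": "pyonmttok",
    "mode": "conservative",
    "params": {"joiner_annotate": True, "bpe_model_path": "codes.bpe"},
}
tokenizer = server.build_tokenizer(tokenizer_opt)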
Example #29
def __init__(self, export_dir):
    imported = tf.saved_model.load(export_dir)
    self._translate_fn = imported.signatures["serving_default"]
    sp_model_path = os.path.join(export_dir, "assets.extra",
                                 "wmtende.model")
    self._tokenizer = pyonmttok.Tokenizer("none",
                                          sp_model_path=sp_model_path)
Example #30
def main():
    parser = argparse.ArgumentParser(description="Translation client example")
    parser.add_argument("--model_name", required=True, help="model name")
    parser.add_argument("--sentencepiece_model",
                        required=True,
                        help="path to the sentence model")
    parser.add_argument("--host",
                        default="localhost",
                        help="model server host")
    parser.add_argument("--port",
                        type=int,
                        default=9000,
                        help="model server port")
    parser.add_argument("--timeout",
                        type=float,
                        default=10.0,
                        help="request timeout")
    args = parser.parse_args()

    channel = grpc.insecure_channel("%s:%d" % (args.host, args.port))
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    tokenizer = pyonmttok.Tokenizer("none",
                                    sp_model_path=args.sentencepiece_model)

    batch_input = [
        "Hello world!", "My name is John.", "I live on the West coast."
    ]
    batch_output = translate(stub,
                             args.model_name,
                             batch_input,
                             tokenizer,
                             timeout=args.timeout)
    for input_text, output_text in zip(batch_input, batch_output):
        print("{} ||| {}".format(input_text, output_text))