Exemplo n.º 1
0
def test_compute_type():
    """Validate the Translator ``compute_type`` argument handling.

    Unsupported type names raise ValueError, non-string/non-dict values raise
    TypeError, and both the string and per-device dict forms are accepted.
    """
    model_path = _get_model_path()
    # "float64" is not a supported compute type name.
    with pytest.raises(ValueError):
        ctranslate2.Translator(model_path, compute_type="float64")
    # A list is not an accepted argument type (string or dict expected).
    with pytest.raises(TypeError):
        ctranslate2.Translator(model_path, compute_type=["int8", "int16"])
    ctranslate2.Translator(model_path, compute_type="int8")
    # The dict form maps a device name to a compute type.
    ctranslate2.Translator(model_path, compute_type={"cuda": "float16", "cpu": "int8"})
Exemplo n.º 2
0
def test_compute_type():
    """Validate Translator ``compute_type`` handling, matching error messages.

    Same scenarios as the plain version, but additionally pins a substring of
    each expected error message via ``pytest.raises(..., match=...)``.
    """
    model_path = _get_model_path()
    # Unsupported compute type name: error message mentions "compute type".
    with pytest.raises(ValueError, match="compute type"):
        ctranslate2.Translator(model_path, compute_type="float64")
    # Wrong argument type is rejected by the pybind11 constructor dispatch.
    with pytest.raises(TypeError, match="incompatible constructor arguments"):
        ctranslate2.Translator(model_path, compute_type=["int8", "int16"])
    ctranslate2.Translator(model_path, compute_type="int8")
    # Per-device mapping form.
    ctranslate2.Translator(model_path,
                           compute_type={
                               "cuda": "float16",
                               "cpu": "int8"
                           })
Exemplo n.º 3
0
def test_raw_file_translation(tmpdir):
    """Translate a raw (untokenized) text file with user-supplied callbacks.

    ``source_tokenize_fn`` and ``target_detokenize_fn`` must be passed
    together; supplying only one raises ValueError naming the missing one.
    """
    input_path = str(tmpdir.join("input.txt"))
    output_path = str(tmpdir.join("output.txt"))
    # Two Arabic names, one per line, as the raw input.
    with open(input_path, "w", encoding="utf-8") as input_file:
        input_file.write("آتزمون")
        input_file.write("\n")
        input_file.write("آتشيسون")
        input_file.write("\n")

    translator = ctranslate2.Translator(_get_model_path())
    # Character-level tokenization for this transliteration model.
    tokenize_fn = lambda text: list(text)
    detokenize_fn = lambda tokens: "".join(tokens)

    # Tokenizer without detokenizer (and vice versa) is rejected.
    with pytest.raises(ValueError, match="target_detokenize_fn"):
        translator.translate_file(input_path,
                                  output_path,
                                  source_tokenize_fn=tokenize_fn)
    with pytest.raises(ValueError, match="source_tokenize_fn"):
        translator.translate_file(input_path,
                                  output_path,
                                  target_detokenize_fn=detokenize_fn)

    translator.translate_file(
        input_path,
        output_path,
        source_tokenize_fn=tokenize_fn,
        target_detokenize_fn=detokenize_fn,
    )

    # Expected Latin transliterations of the two input names.
    with open(output_path, encoding="utf-8") as output_file:
        lines = output_file.readlines()
        assert lines[0].strip() == "atzmon"
        assert lines[1].strip() == "achison"
Exemplo n.º 4
0
def test_raw_file_translation(tmpdir):
    """Translate a raw (untokenized) text file with user-supplied callbacks.

    Uses the older ``tokenize_fn``/``detokenize_fn`` keyword names with a
    positional ``max_batch_size``.  Passing only one of the two callbacks
    raises ValueError.

    Fix: the input/output files contain Arabic text, so they are now opened
    with an explicit ``encoding="utf-8"`` instead of the platform default,
    which fails on non-UTF-8 locales (e.g. Windows cp1252).
    """
    input_path = str(tmpdir.join("input.txt"))
    output_path = str(tmpdir.join("output.txt"))
    # Two Arabic names, one per line, as the raw input.
    with open(input_path, "w", encoding="utf-8") as input_file:
        input_file.write("آتزمون")
        input_file.write("\n")
        input_file.write("آتشيسون")
        input_file.write("\n")

    translator = ctranslate2.Translator(_get_model_path())
    # Character-level tokenization for this transliteration model.
    tokenize_fn = lambda text: list(text)
    detokenize_fn = lambda tokens: "".join(tokens)
    max_batch_size = 4

    # Tokenizer without detokenizer (and vice versa) is rejected.
    with pytest.raises(ValueError):
        translator.translate_file(input_path,
                                  output_path,
                                  max_batch_size,
                                  tokenize_fn=tokenize_fn)
    with pytest.raises(ValueError):
        translator.translate_file(input_path,
                                  output_path,
                                  max_batch_size,
                                  detokenize_fn=detokenize_fn)

    translator.translate_file(input_path,
                              output_path,
                              max_batch_size,
                              tokenize_fn=tokenize_fn,
                              detokenize_fn=detokenize_fn)

    # Expected Latin transliterations of the two input names.
    with open(output_path, encoding="utf-8") as output_file:
        lines = output_file.readlines()
        assert lines[0].strip() == "atzmon"
        assert lines[1].strip() == "achison"
Exemplo n.º 5
0
 def return_loaded_models(self, model_paths, ids):
     """Instantiate a CTranslate2 Translator for every model path.

     Returns a dict mapping the id at each position to the translator
     loaded from the model path at the same position; one log line is
     emitted per loaded model.
     """
     models_by_id = {}
     for position, current_path in enumerate(model_paths):
         model_id = ids[position]
         models_by_id[model_id] = ctranslate2.Translator(current_path, device="auto")
         log_info("Model Loaded: {}".format(model_id), MODULE_CONTEXT)
     return models_by_id
Exemplo n.º 6
0
    def __init__(self, model_path):
        """Build a CTranslate2 translator configured from environment variables.

        Reads inter/intra thread counts, beam size, and the vmap flag from
        the environment variable names stored in the class attributes
        ``INTER_THREADS``, ``INTRA_THREADS``, ``BEAM_SIZE``, ``USE_VMAP``.

        Improvement: the four repetitive ``if NAME in os.environ`` ladders
        are replaced with idiomatic ``os.environ.get`` / membership tests
        with the same defaults and identical resulting values.
        """
        self.model_path = model_path
        self.tokenizer_source = None
        self.tokenizer_target = None

        # Threading and decoding settings fall back to the same defaults
        # as before when the environment variables are unset.
        inter_threads = int(os.environ.get(self.INTER_THREADS, 1))
        intra_threads = int(os.environ.get(self.INTRA_THREADS, 4))
        self.beam_size = int(os.environ.get(self.BEAM_SIZE, 2))
        # The mere presence of the variable enables the vocabulary map.
        self.use_vmap = self.USE_VMAP in os.environ

        print(
            f"inter_threads: {inter_threads}, intra_threads: {intra_threads}, beam_size {self.beam_size}, use_vmap {self.use_vmap}"
        )
        self.translator = ctranslate2.Translator(model_path,
                                                 inter_threads=inter_threads,
                                                 intra_threads=intra_threads)
Exemplo n.º 7
0
    def hypotheses(self, input_text, num_hypotheses):
        """Translate ``input_text`` and return ``num_hypotheses`` Hypothesis objects.

        The text is split into paragraphs, each paragraph is translated
        independently, then for each hypothesis index i the i-th results of
        all paragraphs are recombined (values joined, scores summed).

        Fixes: ``== None`` replaced with the identity check ``is None``;
        append-loop replaced with a comprehension.
        """
        # Lazily load the packaged CTranslate2 model on first use.
        if self.translator is None:
            model_path = str(self.pkg.package_path / 'model')
            self.translator = ctranslate2.Translator(model_path)
        paragraphs = ITranslation.split_into_paragraphs(input_text)
        info("paragraphs", paragraphs)
        translated_paragraphs = [
            apply_packaged_translation(self.pkg, paragraph,
                                       self.translator, num_hypotheses)
            for paragraph in paragraphs
        ]
        info("translated_paragraphs", translated_paragraphs)

        # Construct new hypotheses using all paragraphs
        hypotheses_to_return = [
            Hypothesis('', 0) for _ in range(num_hypotheses)
        ]
        for i in range(num_hypotheses):
            for translated_paragraph in translated_paragraphs:
                value = ITranslation.combine_paragraphs([
                    hypotheses_to_return[i].value,
                    translated_paragraph[i].value
                ])
                score = hypotheses_to_return[i].score + translated_paragraph[i].score
                hypotheses_to_return[i] = Hypothesis(value, score)
            # Strip the leading newline introduced by combining with the
            # empty seed hypothesis.
            hypotheses_to_return[i].value = hypotheses_to_return[i].value.lstrip('\n')
        info('hypotheses_to_return', hypotheses_to_return)
        return hypotheses_to_return
Exemplo n.º 8
0
    def __init__(self, models_path, model_name, tokenizer_source = None, tokenizer_target = None, translator = None):
        """Set up source/target tokenizers and the CTranslate2 translator.

        Either accepts pre-built tokenizers/translator or constructs them
        from the SentencePiece models and "ctranslate2" directory found
        under ``models_path/model_name``.

        Bug fix: when a ``tokenizer_source`` was provided, it was previously
        assigned from ``tokenizer_target`` (copy-paste error); it is now
        assigned from ``tokenizer_source``.
        """
        inter_threads, intra_threads = self._init_read_env_vars()

        model_path = os.path.join(models_path, model_name)

        if tokenizer_source:
            # Fixed: assign the *source* tokenizer, not the target one.
            self.tokenizer_source = tokenizer_source
        else:
            src_model_path = self.get_source_tokenizer_file(model_path, model_name)
            self.tokenizer_source = pyonmttok.Tokenizer(mode="none", sp_model_path = src_model_path)

        if tokenizer_target:
            self.tokenizer_target = tokenizer_target
        else:
            tgt_model_path = self.get_target_tokenizer_file(model_path, model_name)
            self.tokenizer_target = pyonmttok.Tokenizer(mode="none", sp_model_path = tgt_model_path)

        self.tokenizer_source_language = self._get_sentence_tokenizer_source_language(model_name)

        # NOTE(review): self.beam_size / self.use_vmap are not set in this
        # method — presumably initialized elsewhere (e.g. a base class);
        # verify before relying on this print.
        print(f"inter_threads: {inter_threads}, intra_threads: {intra_threads}, beam_size {self.beam_size}, use_vmap {self.use_vmap}")

        if translator is None:
            self.model_path = model_path
            ctranslate_model_path = os.path.join(model_path, "ctranslate2")
            self.translator = ctranslate2.Translator(ctranslate_model_path, inter_threads = inter_threads, intra_threads = intra_threads)
        else:
            self.translator = translator

        self.model_name = model_name
 def __init__(self,
              model_path,
              device,
              device_index,
              batch_size,
              beam_size,
              n_best,
              preload=False):
     """Wrap a CTranslate2 Translator for serving.

     Stores batching/decoding parameters and optionally warms the model up.
     When ``preload`` is True, a dummy single-token translation forces model
     initialization, then the model is moved back to CPU memory with
     ``unload_model(to_cpu=True)`` so it can be reloaded quickly on demand.
     """
     # Imported lazily so the dependency is only needed when this backend
     # is actually instantiated.
     import ctranslate2
     self.translator = ctranslate2.Translator(model_path,
                                              device=device,
                                              device_index=device_index,
                                              inter_threads=1,
                                              intra_threads=1,
                                              compute_type="default")
     self.batch_size = batch_size
     self.beam_size = beam_size
     self.n_best = n_best
     if preload:
         # perform a first request to initialize everything
         dummy_translation = self.translate(["a"])
         print("Performed a dummy translation to initialize the model",
               dummy_translation)
         time.sleep(1)
         self.translator.unload_model(to_cpu=True)
Exemplo n.º 10
0
def test_fairseq_user_start_token(tmpdir):
    """Converted Fairseq model with user-supplied decoder start tokens.

    When ``user_decoder_start_tokens`` is set on the model spec, translation
    requires the caller to provide the start token via ``target_prefix``.
    """
    # Subclass the converter only to flip the start-token flag on the spec.
    class _CustomFairseqConverter(ctranslate2.converters.FairseqConverter):
        def _load(self):
            model_spec = super()._load()
            model_spec.user_decoder_start_tokens = True
            return model_spec

    data_dir = os.path.join(
        _TEST_DATA_DIR,
        "models",
        "transliteration-aren-all",
        "fairseq",
    )
    converter = _CustomFairseqConverter(os.path.join(data_dir, "model.pt"),
                                        data_dir)
    output_dir = str(tmpdir.join("ctranslate2_model"))
    converter.convert(output_dir)
    translator = ctranslate2.Translator(output_dir)
    tokens = [["آ", "ت", "ز", "م", "و", "ن"]]

    # Without a start token in the prefix, translation must fail.
    with pytest.raises(ValueError, match="start token"):
        translator.translate_batch(tokens)

    # Supplying the start token ("</s>" for Fairseq) makes it work.
    output = translator.translate_batch(tokens, target_prefix=[["</s>"]])
    assert output[0].hypotheses[0] == ["a", "t", "z", "m", "o", "n"]
Exemplo n.º 11
0
def test_opennmt_tf_variables_conversion(tmpdir):
    """Convert an OpenNMT-tf model directly from in-memory variables.

    Loads checkpoint variables with the test helper, feeds them to the
    converter together with explicit vocabularies and a TransformerSpec,
    then checks a round-trip transliteration.
    """
    import opennmt

    model_path = os.path.join(
        _TEST_DATA_DIR,
        "models",
        "transliteration-aren-all",
        "opennmt_tf",
        "v2",
        "checkpoint",
    )

    src_vocab = opennmt.data.Vocab.from_file(
        os.path.join(model_path, "ar.vocab"))
    tgt_vocab = opennmt.data.Vocab.from_file(
        os.path.join(model_path, "en.vocab"))
    # Only the variables are passed to the converter, not the checkpoint path.
    _, variables = opennmt_tf.load_model(model_path)
    converter = ctranslate2.converters.OpenNMTTFConverter(
        ctranslate2.specs.TransformerSpec(6, 8),
        src_vocab,
        tgt_vocab,
        variables=variables,
    )
    output_dir = str(tmpdir.join("ctranslate2_model"))
    converter.convert(output_dir)
    translator = ctranslate2.Translator(output_dir)
    output = translator.translate_batch([["آ", "ت", "ز", "م", "و", "ن"]])
    assert output[0].hypotheses[0] == ["a", "t", "z", "m", "o", "n"]
Exemplo n.º 12
0
def test_translator_properties():
    """Check the read-only Translator properties right after construction."""
    translator = ctranslate2.Translator(_get_model_path(), inter_threads=2)
    assert translator.model_is_loaded
    assert translator.device == "cpu"
    assert translator.device_index == 0
    # inter_threads controls the number of parallel translator workers.
    assert translator.num_translators == 2
    assert translator.num_queued_batches == 0
Exemplo n.º 13
0
def test_opennmt_py_model_conversion(tmpdir):
    """Convert an OpenNMT-py checkpoint and verify a transliteration."""
    model_path = os.path.join(
        _TEST_DATA_DIR, "models", "transliteration-aren-all", "opennmt_py", "aren_7000.pt")
    converter = ctranslate2.converters.OpenNMTPyConverter(model_path)
    output_dir = str(tmpdir.join("ctranslate2_model"))
    converter.convert(output_dir, ctranslate2.specs.TransformerBase())
    translator = ctranslate2.Translator(output_dir)
    # Older dict-style result API: output[batch][hypothesis]["tokens"].
    output = translator.translate_batch([["آ" ,"ت" ,"ز" ,"م" ,"و" ,"ن"]])
    assert output[0][0]["tokens"] == ["a", "t", "z", "m", "o", "n"]
Exemplo n.º 14
0
 def translate(self, input_text):
     """Translate ``input_text`` paragraph by paragraph and recombine.

     Fixes: ``== None`` replaced with the identity check ``is None``;
     append-loop replaced with a list comprehension.
     """
     # Lazily load the packaged CTranslate2 model on first use.
     if self.translator is None:
         model_path = str(self.pkg.package_path / 'model')
         self.translator = ctranslate2.Translator(model_path)
     paragraphs = self.split_into_paragraphs(input_text)
     translated_paragraphs = [
         apply_packaged_translation(self.pkg, paragraph, self.translator)
         for paragraph in paragraphs
     ]
     return self.combine_paragraphs(translated_paragraphs)
Exemplo n.º 15
0
    def load_translator(self, inter_threads, intra_threads):
        """Create a CTranslate2 translator for the multilingual model.

        Picks the device automatically and prints which device was chosen.
        """
        import ctranslate2

        translator_options = {
            'device': 'auto',
            'device_index': 0,
            'inter_threads': inter_threads,
            'intra_threads': intra_threads,
            'compute_type': "default",
        }
        translator = ctranslate2.Translator('./models/multi_lang_v3',
                                            **translator_options)

        print('Using device:', translator.device)

        return translator
Exemplo n.º 16
0
 def testCTranslate2Export(self, variant):
     """Export the transliteration model with a CTranslate2 exporter variant.

     Skips when ctranslate2 is unavailable, then checks that the export
     directory contains a model and that it translates correctly.
     """
     try:
         import ctranslate2
     except ImportError:
         self.skipTest("ctranslate2 module is not available")
     export_dir = os.path.join(self.get_temp_dir(), "export")
     runner = self._getTransliterationRunner()
     runner.export(export_dir, exporter=exporters.make_exporter(variant))
     self.assertTrue(ctranslate2.contains_model(export_dir))
     translator = ctranslate2.Translator(export_dir)
     # Older dict-style result API: output[batch][hypothesis]["tokens"].
     output = translator.translate_batch([["آ", "ت", "ز", "م", "و", "ن"]])
     self.assertListEqual(output[0][0]["tokens"], ["a", "t", "z", "m", "o", "n"])
Exemplo n.º 17
0
def test_opennmt_tf_model_quantization(tmpdir, quantization):
    """Convert an OpenNMT-tf checkpoint with quantization and verify output.

    ``quantization`` is a pytest parameter selecting the weight type.
    """
    model_path = os.path.join(
        _TEST_DATA_DIR, "models", "transliteration-aren-all", "opennmt_tf", "v2", "checkpoint")
    converter = ctranslate2.converters.OpenNMTTFConverter(
        model_path,
        src_vocab=os.path.join(model_path, "ar.vocab"),
        tgt_vocab=os.path.join(model_path, "en.vocab"))
    output_dir = str(tmpdir.join("ctranslate2_model"))
    converter.convert(output_dir, ctranslate2.specs.TransformerBase(), quantization=quantization)
    translator = ctranslate2.Translator(output_dir)
    # Quantization must not change the (greedy) transliteration result.
    output = translator.translate_batch([["آ" ,"ت" ,"ز" ,"م" ,"و" ,"ن"]])
    assert output[0][0]["tokens"] == ["a", "t", "z", "m", "o", "n"]
Exemplo n.º 18
0
def test_opennmt_py_source_features(tmpdir, filename):
    """Convert an OpenNMT-py model that uses source-side word features.

    Checks that the converter writes one vocabulary per feature stream, that
    translation fails without features, and that both the batch and file
    APIs work when tokens carry inline features.
    """
    model_path = os.path.join(
        _TEST_DATA_DIR,
        "models",
        "transliteration-aren-all",
        "opennmt_py",
        filename,
    )
    converter = ctranslate2.converters.OpenNMTPyConverter(model_path)
    output_dir = str(tmpdir.join("ctranslate2_model"))
    converter.convert(output_dir)
    # One vocabulary file per input stream (tokens + 1 feature stream).
    assert os.path.isfile(os.path.join(output_dir, "source_1_vocabulary.txt"))
    assert os.path.isfile(os.path.join(output_dir, "source_2_vocabulary.txt"))

    source = [
        ["آ", "ت", "ز", "م", "و", "ن"],
        ["آ", "ت", "ش", "ي", "س", "و", "ن"],
    ]
    # One feature value per source token (here: the token position).
    source_features = [
        ["0", "1", "2", "3", "4", "5"],
        ["0", "1", "2", "3", "4", "5", "6"],
    ]
    expected_target = [
        ["a", "t", "z", "m", "o", "n"],
        ["a", "c", "h", "i", "s", "o", "n"],
    ]

    # Join each token with its feature using "│" (U+2502) — presumably the
    # separator CTranslate2 expects for inline features; verify against the
    # library documentation.
    source_w_features = []
    for tokens, features in zip(source, source_features):
        source_w_features.append(
            ["%s│%s" % pair for pair in zip(tokens, features)])

    translator = ctranslate2.Translator(output_dir)
    # Tokens without features must be rejected for this model.
    with pytest.raises(ValueError, match="features"):
        translator.translate_batch(source)

    outputs = translator.translate_batch(source_w_features)
    for output, expected_hypothesis in zip(outputs, expected_target):
        assert output.hypotheses[0] == expected_hypothesis

    # Same checks through the file-based API.
    input_path = str(tmpdir.join("input.txt"))
    output_path = str(tmpdir.join("output.txt"))

    _write_tokens(source, input_path)
    with pytest.raises(ValueError, match="features"):
        translator.translate_file(input_path, output_path)

    _write_tokens(source_w_features, input_path)
    translator.translate_file(input_path, output_path)
    with open(output_path) as output_file:
        for line, expected_hypothesis in zip(output_file, expected_target):
            assert line.strip().split() == expected_hypothesis
Exemplo n.º 19
0
def test_fairseq_model_conversion(tmpdir):
    """Convert a Fairseq checkpoint and verify a transliteration."""
    data_dir = os.path.join(
        _TEST_DATA_DIR,
        "models",
        "transliteration-aren-all",
        "fairseq",
    )
    # Fairseq conversion needs both the checkpoint and its data directory.
    converter = ctranslate2.converters.FairseqConverter(
        os.path.join(data_dir, "model.pt"), data_dir)
    output_dir = str(tmpdir.join("ctranslate2_model"))
    converter.convert(output_dir)
    translator = ctranslate2.Translator(output_dir)
    output = translator.translate_batch([["آ", "ت", "ز", "م", "و", "ن"]])
    assert output[0].hypotheses[0] == ["a", "t", "z", "m", "o", "n"]
Exemplo n.º 20
0
def test_opennmt_tf_model_conversion(tmpdir, model_path, src_vocab, tgt_vocab, model_spec):
    """Convert an OpenNMT-tf checkpoint (parametrized) and verify output.

    ``model_path``, ``src_vocab``, ``tgt_vocab`` and ``model_spec`` are
    pytest parameters; vocab arguments may be None when the checkpoint
    carries its own vocabularies.
    """
    model_path = os.path.join(
        _TEST_DATA_DIR, "models", "transliteration-aren-all", "opennmt_tf", model_path)
    if src_vocab is not None:
        src_vocab = os.path.join(model_path, src_vocab)
    if tgt_vocab is not None:
        tgt_vocab = os.path.join(model_path, tgt_vocab)
    converter = ctranslate2.converters.OpenNMTTFConverter(
        model_path, src_vocab=src_vocab, tgt_vocab=tgt_vocab)
    output_dir = str(tmpdir.join("ctranslate2_model"))
    converter.convert(output_dir, model_spec)
    translator = ctranslate2.Translator(output_dir)
    # Older dict-style result API: output[batch][hypothesis]["tokens"].
    output = translator.translate_batch([["آ" ,"ت" ,"ز" ,"م" ,"و" ,"ن"]])
    assert output[0][0]["tokens"] == ["a", "t", "z", "m", "o", "n"]
Exemplo n.º 21
0
def test_hard_target_prefix_with_vmap(tmpdir, beam_size):
    """Target prefix must be respected even when a vocabulary map is active.

    A vmap restricting candidates for "ن" is written next to a copy of the
    model; decoding with a hard prefix plus ``use_vmap=True`` must still
    produce the full expected transliteration.
    """
    model_dir = str(tmpdir.join("model"))
    shutil.copytree(_get_model_path(), model_dir)
    # Map the source token "ن" to a restricted set of target candidates.
    with open(os.path.join(model_dir, "vmap.txt"), "w",
              encoding="utf-8") as vmap:
        vmap.write("ن\tt z m o n\n")

    translator = ctranslate2.Translator(model_dir)
    output = translator.translate_batch(
        [["آ", "ت", "ز", "م", "و", "ن"]],
        target_prefix=[["a", "t", "z"]],
        beam_size=beam_size,
        use_vmap=True,
    )
    assert output[0].hypotheses[0] == ["a", "t", "z", "m", "o", "n"]
Exemplo n.º 22
0
def quickstart_example():
	"""Minimal CTranslate2 quickstart: load a converted model and translate.

	Expects an already converted model in 'ende_ctranslate2/'; prints the
	translated tokens, their score, and the elapsed time.
	"""
	# Convert a model trained with OpenNMT-py or OpenNMT-tf.
	#	REF [site] >> https://github.com/OpenNMT/CTranslate2
	#--------------------
	translator = ctranslate2.Translator('ende_ctranslate2/')

	print('Start translating...')
	start_time = time.time()
	# Translate tokenized inputs (SentencePiece-style "▁" word markers).
	translated = translator.translate_batch([['▁H', 'ello', '▁world', '!']])
	print('End translating: {} secs.'.format(time.time() - start_time))

	# Older dict-style result API: translated[batch][hypothesis][field].
	print('Tokens: {}.'.format(translated[0][0]['tokens']))
	print('Score = {}.'.format(translated[0][0]['score']))
Exemplo n.º 23
0
def test_opennmt_py_relative_transformer(tmpdir):
    """Convert an OpenNMT-py Transformer with relative position encoding."""
    model_path = os.path.join(
        _TEST_DATA_DIR,
        "models",
        "transliteration-aren-all",
        "opennmt_py",
        "aren_relative_6000.pt",
    )
    converter = ctranslate2.converters.OpenNMTPyConverter(model_path)
    output_dir = str(tmpdir.join("ctranslate2_model"))
    converter.convert(output_dir)
    translator = ctranslate2.Translator(output_dir)
    # Two names in one batch exercises batched relative attention.
    output = translator.translate_batch([["آ", "ت", "ز", "م", "و", "ن"],
                                         ["آ", "ر", "ث", "ر"]])
    assert output[0].hypotheses[0] == ["a", "t", "z", "o", "m", "o", "n"]
    assert output[1].hypotheses[0] == ["a", "r", "t", "h", "e", "r"]
Exemplo n.º 24
0
 def multi_translate(self, input_text, nresults=4):
     """Translate ``input_text`` and return ``nresults`` alternative outputs.

     Each paragraph is translated with ``nresults`` hypotheses; the i-th
     hypotheses of all paragraphs are regrouped before recombination.

     Fixes: ``== None`` replaced with the identity check ``is None``;
     append-loop replaced with a list comprehension.
     """
     # Lazily load the packaged CTranslate2 model on first use.
     if self.translator is None:
         model_path = str(self.pkg.package_path / 'model')
         self.translator = ctranslate2.Translator(model_path)
     paragraphs = self.split_into_paragraphs(input_text)
     info("paragraphs", paragraphs)
     translated_paragraphs = [
         apply_packaged_translation(self.pkg, paragraph,
                                    self.translator, nresults)
         for paragraph in paragraphs
     ]
     info("translated_paragraphs", translated_paragraphs)
     # Transpose: group the i-th hypothesis of every paragraph together.
     pre_combine_paragraphs = [[s[i] for s in translated_paragraphs]
                               for i in range(nresults)]
     info("pre_combine_paragraphs", pre_combine_paragraphs)
     return self.combine_paragraphs(pre_combine_paragraphs, nresults)
Exemplo n.º 25
0
def test_raw_file_translation_with_prefix(tmpdir):
    """File translation constrained by a target-prefix file.

    The first output line is forced by the prefix "sta" (tokenized in
    reverse, i.e. "a t s"), while the second line has an empty prefix and
    is translated freely.  Omitting ``target_tokenize_fn`` while passing a
    target file raises ValueError.

    Fix: the files contain Arabic text, so all three ``open`` calls now use
    an explicit ``encoding="utf-8"`` instead of the platform default, which
    fails on non-UTF-8 locales (e.g. Windows cp1252).
    """
    source_path = str(tmpdir.join("input.txt"))
    target_path = str(tmpdir.join("target.txt"))
    output_path = str(tmpdir.join("output.txt"))
    with open(source_path, "w", encoding="utf-8") as source_file:
        source_file.write("آتزمون")
        source_file.write("\n")
        source_file.write("آتشيسون")
        source_file.write("\n")
    with open(target_path, "w", encoding="utf-8") as target_file:
        # Write target in reverse to use a different tokenization.
        target_file.write("sta\n")
        target_file.write("\n")

    translator = ctranslate2.Translator(_get_model_path())
    source_tokenize_fn = lambda text: list(text)
    # Target side uses reversed characters to prove its tokenizer is used.
    target_tokenize_fn = lambda text: list(reversed(list(text)))
    detokenize_fn = lambda tokens: "".join(tokens)
    max_batch_size = 4

    with pytest.raises(ValueError):
        # Target tokenization is missing.
        translator.translate_file(
            source_path,
            output_path,
            max_batch_size,
            tokenize_fn=source_tokenize_fn,
            detokenize_fn=detokenize_fn,
            target_path=target_path,
        )

    translator.translate_file(
        source_path,
        output_path,
        max_batch_size,
        tokenize_fn=source_tokenize_fn,
        detokenize_fn=detokenize_fn,
        target_path=target_path,
        target_tokenize_fn=target_tokenize_fn,
    )

    with open(output_path, encoding="utf-8") as output_file:
        lines = output_file.readlines()
        # First line is steered by the "a t s" prefix; second is free.
        assert lines[0].strip() == "atsumon"
        assert lines[1].strip() == "achison"
Exemplo n.º 26
0
def test_opennmt_tf_shared_embeddings_conversion(tmpdir):
    """Convert an OpenNMT-tf model with fully shared embeddings.

    Builds a tiny Transformer sharing all embeddings, saves a checkpoint,
    converts it, and checks that a single shared vocabulary file is written
    and that the converted model can decode.
    """
    # Issue https://github.com/OpenNMT/CTranslate2/issues/118
    import tensorflow as tf
    import opennmt

    # Tiny synthetic vocabulary: the digits 0..9.
    vocab = opennmt.data.Vocab()
    for i in range(10):
        vocab.add(str(i))
    vocab_path = str(tmpdir.join("vocab.txt"))
    vocab.serialize(vocab_path)

    num_layers = 3
    num_heads = 4
    model = opennmt.models.Transformer(
        opennmt.inputters.WordEmbedder(32),
        opennmt.inputters.WordEmbedder(32),
        num_layers,
        num_units=32,
        num_heads=num_heads,
        ffn_inner_dim=64,
        share_embeddings=opennmt.models.EmbeddingsSharingLevel.ALL,
    )
    # Same vocabulary on both sides, as required by full sharing.
    model.initialize({
        "source_vocabulary": vocab_path,
        "target_vocabulary": vocab_path
    })
    model.create_variables()

    checkpoint_prefix = str(tmpdir.join("ckpt"))
    checkpoint = tf.train.Checkpoint(model=model)
    checkpoint.write(checkpoint_prefix)

    converter = ctranslate2.converters.OpenNMTTFConverter(
        model_path=checkpoint_prefix,
        src_vocab=vocab_path,
        tgt_vocab=vocab_path)
    output_dir = str(tmpdir.join("ctranslate2_model"))
    converter.convert(output_dir,
                      ctranslate2.specs.TransformerSpec(num_layers, num_heads))

    # Shared embeddings must collapse into a single vocabulary file.
    assert os.path.isfile(os.path.join(output_dir, "shared_vocabulary.txt"))

    # Check that the translation runs.
    translator = ctranslate2.Translator(output_dir)
    translator.translate_batch([["1", "2", "3"]], max_decoding_length=10)
Exemplo n.º 27
0
 def __init__(self,
              model_path,
              ct2_translator_args,
              ct2_translate_batch_args,
              target_prefix=False,
              preload=False):
     """Wrap a CTranslate2 Translator with pass-through argument dicts.

     ``ct2_translator_args`` is expanded into the Translator constructor;
     ``ct2_translate_batch_args`` is stored for later translate calls.
     When ``preload`` is True, a dummy translation initializes the model,
     then it is moved back to CPU memory so it can be reloaded on demand.
     """
     # Imported lazily so the dependency is only needed when this backend
     # is actually instantiated.
     import ctranslate2
     self.translator = ctranslate2.Translator(model_path,
                                              **ct2_translator_args)
     self.ct2_translate_batch_args = ct2_translate_batch_args
     self.target_prefix = target_prefix
     if preload:
         # perform a first request to initialize everything
         dummy_translation = self.translate(["a"])
         print("Performed a dummy translation to initialize the model",
               dummy_translation)
         time.sleep(1)
         self.translator.unload_model(to_cpu=True)
Exemplo n.º 28
0
def translate(src_list, sp_path_src, sp_path_tgt, ct_path):
    """Translate a list of source texts through a full Moses+SentencePiece
    pipeline and a CTranslate2 model.

    Each input text is normalized, sentence-split, tokenized and
    SentencePiece-encoded; after translation the sentences are decoded,
    detokenized, and re-grouped so one output string is returned per input
    text.

    Fix (performance, identical results): the target SentencePiece model
    and the Moses detokenizer were re-created and re-loaded from disk for
    every sentence inside the decoding loop; they are now built once before
    the loop.
    """
    tokenize = MosesTokenizer('ru')
    sp_src = spm.SentencePieceProcessor()
    sp_src.load(sp_path_src)
    lengths = []
    temp = []
    # Map legacy Cyrillic letter forms to their modern Unicode codepoints.
    p_big = re.compile('Ҧ')
    g_big = re.compile('Ҕ')
    p_small = re.compile('ҧ')
    g_small = re.compile('ҕ')
    for text in src_list:
        text = p_big.sub('Ԥ', text)
        text = g_big.sub('Ӷ', text)
        text = p_small.sub('ԥ', text)
        text = g_small.sub('ӷ', text)
        if text != '':
            with MosesSentenceSplitter('ru') as splitsents:
                text = splitsents([text])
        # Remember how many sentences each original text produced so the
        # translations can be regrouped at the end.
        lengths.append(len(text))
        temp.extend(text)
    src_list = temp
    for i, text in enumerate(src_list):
        text = ' '.join(tokenize(text)).lower()
        text = sp_src.encode(text, out_type=str)
        src_list[i] = text
    translator = ctranslate2.Translator(ct_path)
    tgt_list = translator.translate_batch(src_list)
    # Hoisted out of the loop: build the detokenizer and load the target
    # SentencePiece model once instead of once per sentence.
    detokenize = MosesDetokenizer('ru')
    sp_tgt = spm.SentencePieceProcessor()
    sp_tgt.load(sp_path_tgt)
    for i, text in enumerate(tgt_list):
        text = sp_tgt.decode(text[0]['tokens'])
        text = detokenize(text.split(' '))
        tgt_list[i] = text
    # Regroup sentence translations back into one string per input text.
    temp = []
    i = 0
    for length in lengths:
        text = ''
        for jw in range(length):
            text = text + tgt_list[i + jw] + ' '
        temp.append(text.strip())
        i = i + length
    tgt_list = temp
    return tgt_list
Exemplo n.º 29
0
def create_app():
    """Build the Flask app, loading one CTranslate2 model per language.

    Improvement: the six copy-pasted model-loading stanzas are collapsed
    into a data-driven loop; the model paths, dict keys, and log messages
    are unchanged.
    """
    import ctranslate2
    import pathlib
    path = pathlib.Path(__file__).absolute().parents[2] / 'model_store'
    logger.info("model directory path = '{}'".format(path))
    # language code -> (model sub-directory, display name used in logging)
    model_dirs = {
        "ara": ("arabic", "arabic"),
        "chi": ("chinese", "chinese"),
        "heb": ("hebrew", "hebrew"),
        "jpn": ("katakana", "japanese"),
        "kor": ("korean", "korean"),
        "rus": ("russian", "russian"),
    }
    # start packing
    net_models = {}
    for code, (subdir, display_name) in model_dirs.items():
        net_models[code] = ctranslate2.Translator(
            str(path / subdir / 'ctranslate2_released')
        )
        logger.info("packed {} model".format(display_name))
    # init flask objects
    app = Flask(__name__)
    api = Api(app)
    # Return JSON with non-ASCII characters unescaped.
    api.app.config['RESTFUL_JSON'] = {'ensure_ascii': False}
    api.add_resource(TransformerNETransliterator,
                     '/predict',
                     resource_class_kwargs={"net_models": net_models})
    return app
def encode_itranslate_decode(i,
                             sp_encoder,
                             sp_decoder,
                             num_map,
                             tp_tokenizer,
                             num_hypotheses=3):
    """Encode, translate (optionally with a target prefix), and decode.

    ``i`` is a request dict with at least 'id' and 'src'; it is mutated in
    place (encoded 'src', processed 'target_prefix').  Returns up to
    ``num_hypotheses`` decoded translations.

    Fixes: ``.isspace() == False`` replaced with ``not ...isspace()``;
    removed the no-op ``i['target_prefix'] = i['target_prefix']``.
    """
    try:
        logger.info("Inside encode_itranslate_decode function")
        model_path = get_model_path(i['id'])
        translator = ctranslate2.Translator(model_path)
        i['src'] = str(sp.encode_line(sp_encoder, i['src']))
        i_final = format_converter(i['src'])

        # A non-empty, non-whitespace target prefix steers the decoder.
        if 'target_prefix' in i and len(i['target_prefix']) > 0 \
                and not i['target_prefix'].isspace():
            logger.info("target prefix: {}".format(i['target_prefix']))
            i['target_prefix'] = replace_num_target_prefix(i, num_map)
            if tp_tokenizer is not None:
                i['target_prefix'] = tp_tokenizer(i['target_prefix'])
            i['target_prefix'] = str(
                sp.encode_line(sp_decoder, i['target_prefix']))
            tp_final = format_converter(i['target_prefix'])
            # Replace the trailing ']' so the prefix stays "open" and the
            # decoder continues generating after it.
            tp_final[-1] = tp_final[-1].replace(']', ",")
            m_out = translator.translate_batch([i_final],
                                               beam_size=5,
                                               target_prefix=[tp_final],
                                               num_hypotheses=num_hypotheses)
        else:
            m_out = translator.translate_batch([i_final],
                                               beam_size=5,
                                               num_hypotheses=num_hypotheses)

        translation = multiple_hypothesis_decoding(m_out[0], sp_decoder)
        return translation

    except Exception as e:
        logger.error(
            "Unexpexcted error in encode_itranslate_decode: {} and {}".format(
                e,
                sys.exc_info()[0]))
        raise