def test_define_models_default():
    translator = Translator(training_dataset="data/small", test_dataset="data/smallTest",
                            source_lang="cs", target_lang="en", log_folder="logs",
                            model_folder="data", model_file="model.h5")

    model, encoder_model, decoder_model = translator._define_models()

    # expected layer names in the full training model (the average_* names depend
    # on Keras' global layer naming counter)
    model_layers = ["encoder_input", "input_embeddings", "decoder_input", "bidirectional_encoder_layer",
                    "target_embeddings", "average_3", "average_4", "decoder_layer_1", "output_layer"]

    encoder_input = model.get_layer(name="encoder_input")
    input_embeddings = model.get_layer(name="input_embeddings")
    bidirectional_encoder_layer = model.get_layer(name="bidirectional_encoder_layer")
    decoder_input = model.get_layer(name="decoder_input")
    target_embeddings = model.get_layer(name="target_embeddings")
    decoder_layer_1 = model.get_layer(name="decoder_layer_1")
    output_layer = model.get_layer(name="output_layer")

    assert len(model.layers) == 9
    assert encoder_input.get_output_at(0) == input_embeddings.get_input_at(0)
    assert input_embeddings.get_output_at(0) == bidirectional_encoder_layer.get_input_at(0)
    assert decoder_input.get_output_at(0) == target_embeddings.get_input_at(0)
    assert decoder_layer_1.get_output_at(0)[0] == output_layer.get_input_at(0)

    assert len(decoder_model.layers) == 6
    assert decoder_model.get_layer("decoder_layer_1").get_input_at(0)[0] == decoder_model.get_layer(
        "target_embeddings").get_output_at(0)

    assert len(encoder_model.layers) == 5
def test_get_gen_steps():
    class TestDataset(object):
        # simple stand-in exposing only a num_samples attribute
        pass

    dataset = TestDataset()

    dataset.num_samples = 64
    batch_size = 64
    result = 1
    assert Translator.get_gen_steps(dataset, batch_size) == result

    dataset.num_samples = 63
    batch_size = 64
    result = 1
    assert Translator.get_gen_steps(dataset, batch_size) == result

    dataset.num_samples = 64
    batch_size = 63
    result = 2
    assert Translator.get_gen_steps(dataset, batch_size) == result

    dataset.num_samples = 127
    batch_size = 63
    result = 3
    assert Translator.get_gen_steps(dataset, batch_size) == result

    dataset.num_samples = 128
    batch_size = 64
    result = 2
    assert Translator.get_gen_steps(dataset, batch_size) == result
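
# A minimal sketch of the behaviour the cases above encode: get_gen_steps
# presumably performs a ceiling division of the sample count by the batch size.
# This helper is an illustrative assumption, not the package's implementation.
def _expected_gen_steps(num_samples, batch_size):
    import math
    return math.ceil(num_samples / batch_size)  # e.g. ceil(127 / 63) == 3
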
def test_encode_sequences():
    x_word_seq = [
        ["jedna", "dva", "tři"],
        ["čtyři", "pět", "šest", "sedm", "osm"],
        ["devět"],
        ["deset"]
    ]
    x_max_seq_len = 5
    x_vocab = Vocabulary(x_word_seq, 100)

    y_word_seq = [
        [SpecialSymbols.GO, "one", "two", "three", SpecialSymbols.EOS],
        [SpecialSymbols.GO, "four", "five", "six", "seven", SpecialSymbols.EOS],
        [SpecialSymbols.GO, "eight", SpecialSymbols.EOS],
        [SpecialSymbols.GO, "nine", "ten", SpecialSymbols.EOS]
    ]
    y_max_seq_len = 6
    y_vocab = Vocabulary(y_word_seq, 100)

    reverse_input = False

    encoder_input_data, decoder_input_data, decoder_target_data = Translator.encode_sequences(
        x_word_seq=x_word_seq, y_word_seq=y_word_seq,
        x_max_seq_len=x_max_seq_len, y_max_seq_len=y_max_seq_len,
        source_vocab=x_vocab, target_vocab=y_vocab, reverse_input=reverse_input
    )

    test_encoder_input_data = np.asarray([
        [4, 5, 6, 0, 0],
        [7, 8, 9, 10, 11],
        [12, 0, 0, 0, 0],
        [13, 0, 0, 0, 0]
    ])
    np.testing.assert_array_equal(encoder_input_data, test_encoder_input_data)

    test_decoder_input_data = np.asarray([
        [SpecialSymbols.GO_IX, 4, 5, 6, 0],
        [SpecialSymbols.GO_IX, 7, 8, 9, 10],
        [SpecialSymbols.GO_IX, 11, 0, 0, 0],
        [SpecialSymbols.GO_IX, 12, 13, 0, 0]
    ])

    np.testing.assert_array_equal(decoder_input_data, test_decoder_input_data)

    decoded_target_data = []
    for seq in decoder_target_data:
        decoded_target_data.append(
            Translator.decode_encoded_seq(seq, y_vocab, one_hot=True)
        )

    test_target_data = [
        ["one", "two", "three", SpecialSymbols.EOS, SpecialSymbols.PAD],
        ["four", "five", "six", "seven", SpecialSymbols.EOS],
        ["eight", SpecialSymbols.EOS, SpecialSymbols.PAD, SpecialSymbols.PAD, SpecialSymbols.PAD],
        ["nine", "ten", SpecialSymbols.EOS, SpecialSymbols.PAD, SpecialSymbols.PAD]
    ]
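
    # Note: each target row is the corresponding decoder input row shifted by one
    # position (the GO symbol dropped, EOS appended), i.e. standard teacher forcing.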

    np.testing.assert_array_equal(decoded_target_data, test_target_data)
def test_define_models_dropout():
    translator = Translator(training_dataset="data/small", test_dataset="data/smallTest",
                            source_lang="cs", target_lang="en", log_folder="logs",
                            model_folder="data", model_file="model.h5",
                            num_encoder_layers=4, num_decoder_layers=3, dropout=0.2)

    model, encoder_model, decoder_model = translator._define_models()

    # building the model with dropout and extra layers must not raise
    assert model is not None
    assert encoder_model is not None
    assert decoder_model is not None
def test_define_models_multiple_layers():
    translator = Translator(training_dataset="data/small", test_dataset="data/smallTest",
                            source_lang="cs", target_lang="en", log_folder="logs",
                            model_folder="data", model_file="model.h5",
                            num_encoder_layers=2, num_decoder_layers=4)

    model, encoder_model, decoder_model = translator._define_models()

    encoder_input = model.get_layer(name="encoder_input")
    input_embeddings = model.get_layer(name="input_embeddings")
    bidirectional_encoder_layer = model.get_layer(name="bidirectional_encoder_layer")
    decoder_input = model.get_layer(name="decoder_input")
    target_embeddings = model.get_layer(name="target_embeddings")
    decoder_layer_1 = model.get_layer(name="decoder_layer_1")
    output_layer = model.get_layer(name="output_layer")

    assert len(model.layers) == 13

    assert len(decoder_model.layers) == 11

    assert len(encoder_model.layers) == 4
def test_encode_text_seq_to_encoder_seq():
    word_seq = [
        ["jedna", "dva", "tři"],
        ["čtyři", "pět", "šest", "sedm"],
        ["osm"],
        ["devět", "deset"]
    ]
    vocab = Vocabulary(word_seq, 100)

    text = "jedna dva kočka leze tři čtyři"

    test_encoded = np.asarray([[
        vocab.word_to_ix["jedna"], vocab.word_to_ix["dva"],
        SpecialSymbols.UNK_IX, SpecialSymbols.UNK_IX,
        vocab.word_to_ix["tři"], vocab.word_to_ix["čtyři"]
    ]], dtype="float32")

    encoded = Translator.encode_text_seq_to_encoder_seq(text, vocab)

    np.testing.assert_array_equal(encoded, test_encoded)
def test_translating_small_dataset_use_generator():
    translator = Translator(training_dataset="data/small", test_dataset="data/smallTest",
                            source_lang="cs", target_lang="en", log_folder="logs",
                            model_folder="data", model_file="model.h5")

    translator.fit(epochs=100, use_fit_generator=True)
    translator.translate_test_data()

    os.remove("data/model.h5")

    with open("data/smallTest.en.reference.translated", encoding="utf-8") as test_file:
        test_data = test_file.read()
    with open("data/smallTest.en.translated", encoding="utf-8") as translated_file:
        translated_data = translated_file.read()

    os.remove("data/smallTest.en.translated")

    assert translated_data == test_data
def test_training_data_gen_shuffling():
    translator = Translator(training_dataset="data/small", test_dataset="data/smallTest",
                            source_lang="cs", target_lang="en", log_folder="logs",
                            model_folder="data", model_file="model.h5")
    random.seed(1)  # seed chosen so that the data generator shuffles the sample indices
    generator = translator._training_data_gen(batch_size=4, shuffle=True)

    # the generator yields the number of steps first; consume it before the batches
    steps = next(generator)

    training_data = next(generator)
    encoder_input_data = training_data[0][0]
    decoder_input_data = training_data[0][1]
    decoder_target_data = training_data[1]

    assert len(encoder_input_data) == 4
    assert len(decoder_input_data) == 4
    assert len(decoder_target_data) == 4

    training_data = next(generator)
    encoder_input_data = training_data[0][0]
    decoder_input_data = training_data[0][1]
    decoder_target_data = training_data[1]

    assert len(encoder_input_data) == 3
    assert len(decoder_input_data) == 3
    assert len(decoder_target_data) == 3

    decoded_data = Translator.decode_encoded_seq(encoder_input_data[0], translator.source_vocab)
    test_decoded_data = ["se", "rozzlobila", SpecialSymbols.PAD, SpecialSymbols.PAD, SpecialSymbols.PAD,
                         SpecialSymbols.PAD, SpecialSymbols.PAD, SpecialSymbols.PAD, SpecialSymbols.PAD]
    np.testing.assert_array_equal(decoded_data, test_decoded_data)

    decoded_data = Translator.decode_encoded_seq(decoder_input_data[0], translator.target_vocab)
    test_decoded_data = [SpecialSymbols.GO, "she", "got", "angry", SpecialSymbols.PAD, SpecialSymbols.PAD,
                         SpecialSymbols.PAD, SpecialSymbols.PAD, SpecialSymbols.PAD]
    np.testing.assert_array_equal(decoded_data, test_decoded_data)

    decoded_data = Translator.decode_encoded_seq(decoder_target_data[0], translator.target_vocab,
                                                 one_hot=True)
    test_decoded_data = ["she", "got", "angry", SpecialSymbols.EOS, SpecialSymbols.PAD, SpecialSymbols.PAD,
                         SpecialSymbols.PAD, SpecialSymbols.PAD, SpecialSymbols.PAD]
    np.testing.assert_array_equal(decoded_data, test_decoded_data)
def test_translating_small_dataset_multiple_layers():
    translator = Translator(training_dataset="data/small", test_dataset="data/smallTest",
                            source_lang="cs", target_lang="en", log_folder="logs",
                            model_folder="data", model_file="model.h5",
                            num_encoder_layers=4, num_decoder_layers=3)

    translator.fit(epochs=200, bucketing=True, bucket_range=2, early_stopping_patience=15)

    translator.translate_test_data()

    os.remove("data/model.h5")

    with open("data/smallTest.en.reference.translated", encoding="utf-8") as test_file:
        test_data = test_file.read()
    with open("data/smallTest.en.translated", encoding="utf-8") as translated_file:
        translated_data = translated_file.read()

    os.remove("data/smallTest.en.translated")

    assert translated_data == test_data
def test_training_data_gen_bucketing():
    translator = Translator(training_dataset="data/small", test_dataset="data/smallTest",
                            source_lang="cs", target_lang="en", log_folder="logs",
                            model_folder="data", model_file="model.h5")
    generator = translator._training_data_bucketing(batch_size=2, infinite=True,
                                                    shuffle=False, bucket_range=1)

    # the generator yields the number of steps first
    steps = next(generator)
    assert steps == 4

    training_data = next(generator)
    encoder_input_data = training_data[0][0]
    decoder_input_data = training_data[0][1]
    decoder_target_data = training_data[1]

    assert len(encoder_input_data) == 2
    assert len(decoder_input_data) == 2
    assert len(decoder_target_data) == 2

    training_data = next(generator)
    encoder_input_data = training_data[0][0]
    decoder_input_data = training_data[0][1]
    decoder_target_data = training_data[1]

    assert len(encoder_input_data) == 1
    assert len(decoder_input_data) == 1
    assert len(decoder_target_data) == 1

    decoded_data = Translator.decode_encoded_seq(encoder_input_data[0], translator.source_vocab)
    test_decoded_data = ["přátelé", "jsme", SpecialSymbols.PAD]
    np.testing.assert_array_equal(decoded_data, test_decoded_data)

    decoded_data = Translator.decode_encoded_seq(decoder_input_data[0], translator.target_vocab)
    test_decoded_data = [SpecialSymbols.GO, "we're", "friends", SpecialSymbols.PAD]
    np.testing.assert_array_equal(decoded_data, test_decoded_data)

    decoded_data = Translator.decode_encoded_seq(decoder_target_data[0], translator.target_vocab,
                                                 one_hot=True)
    test_decoded_data = ["we're", "friends", SpecialSymbols.EOS, SpecialSymbols.PAD]
    np.testing.assert_array_equal(decoded_data, test_decoded_data)
def test_get_training_data():
    translator = Translator(training_dataset="data/small", test_dataset="data/smallTest",
                            source_lang="cs", target_lang="en", log_folder="logs",
                            model_folder="data", model_file="model.h5")

    training_data = translator._get_training_data()

    decoded_data = Translator.decode_encoded_seq(training_data["encoder_input_data"][0], translator.source_vocab)
    test_decoded_data = ["se", "rozzlobila", SpecialSymbols.PAD, SpecialSymbols.PAD, SpecialSymbols.PAD,
                         SpecialSymbols.PAD, SpecialSymbols.PAD, SpecialSymbols.PAD, SpecialSymbols.PAD]
    np.testing.assert_array_equal(decoded_data, test_decoded_data)

    decoded_data = Translator.decode_encoded_seq(training_data["decoder_input_data"][0], translator.target_vocab)
    test_decoded_data = [SpecialSymbols.GO, "she", "got", "angry", SpecialSymbols.PAD, SpecialSymbols.PAD,
                         SpecialSymbols.PAD, SpecialSymbols.PAD, SpecialSymbols.PAD]
    np.testing.assert_array_equal(decoded_data, test_decoded_data)

    decoded_data = Translator.decode_encoded_seq(training_data["decoder_target_data"][0], translator.target_vocab,
                                                 one_hot=True)
    test_decoded_data = ["she", "got", "angry", SpecialSymbols.EOS, SpecialSymbols.PAD, SpecialSymbols.PAD,
                         SpecialSymbols.PAD, SpecialSymbols.PAD, SpecialSymbols.PAD]
    np.testing.assert_array_equal(decoded_data, test_decoded_data)
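# --- main.py (excerpt) -------------------------------------------------------
# The entry point below assumes module-level imports of argparse, os and sys,
# a configured `logger`, and the helpers add_arguments() and set_gpu() defined
# elsewhere in the original script.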
def main():
    parser = argparse.ArgumentParser(
        description='Arguments for the main.py that uses nmt package')
    add_arguments(parser)

    args, unparsed = parser.parse_known_args()

    if not (args.train or args.evaluate or args.livetest):
        parser.error(
            'At least one action must be requested; add -train, -evaluate or -livetest')

    if unparsed:
        logger.warning("Unexpected arguments: {}".format(unparsed))

    if args.find_gpu:
        device = "/gpu:{}".format(set_gpu())
    else:
        device = "/gpu:0"

    # defer the heavy imports until after argument parsing so that the parser
    # help stays fast; TensorFlow takes quite some time to load
    sys.path.insert(
        0,
        os.path.abspath(os.path.join(os.path.dirname(__file__), 'nmtPackage')))
    from nmt import Translator, utils
    import tensorflow as tf

    with tf.device(device):
        translator = Translator(
            source_embedding_dim=args.source_embedding_dim,
            target_embedding_dim=args.target_embedding_dim,
            source_embedding_path=args.source_embedding_path,
            target_embedding_path=args.target_embedding_path,
            max_source_embedding_num=args.max_source_embedding_num,
            max_target_embedding_num=args.max_target_embedding_num,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            source_lang=args.source_lang,
            dropout=args.dropout,
            num_units=args.num_units,
            num_threads=args.num_threads,
            optimizer=args.optimizer,
            log_folder=args.log_folder,
            max_source_vocab_size=args.max_source_vocab_size,
            max_target_vocab_size=args.max_target_vocab_size,
            model_file=args.model_file,
            model_folder=args.model_folder,
            num_training_samples=args.num_training_samples,
            num_test_samples=args.num_test_samples,
            reverse_input=args.reverse_input,
            target_lang=args.target_lang,
            test_dataset=args.test_dataset,
            training_dataset=args.training_dataset,
            tokenize=args.tokenize,
            clear=args.clear)

        if args.train:
            translator.fit(
                epochs=args.epochs,
                initial_epoch=args.initial_epoch,
                batch_size=args.batch_size,
                use_fit_generator=args.use_fit_generator,
                bucketing=args.bucketing,
                bucket_range=args.bucket_range,
                early_stopping_patience=args.early_stopping_patience)

        if args.evaluate:
            translator.translate_test_data(args.batch_size, args.beam_size)

            bleu = translator.get_bleu_for_test_data_translation()
            print("BLEU: {}".format(bleu))

        if args.livetest:
            while True:
                seq = input("Enter sequence: ")
                translator.translate(seq)
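

# Presumably the original script runs main() via the standard entry-point guard;
# it is added here as an assumption so the excerpt is executable as a script.
if __name__ == "__main__":
    main()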