Example #1
def predict():
    name = [x for x in request.form.values()]
    print('POST')
    print(name[0])
    inp_txt = name[0].strip()
    input_tokens = [[i for i in inp_txt]]
    inp_encoded = [['<START>'] + tokens + ['<END>'] for tokens in input_tokens]
    inp_max_len = 40
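    # Pad each character sequence with '<PAD>' up to inp_max_len, then map every token to its id via inp_dict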
    inp_padded = [
        tokens + ['<PAD>'] * (inp_max_len - len(tokens))
        for tokens in inp_encoded
    ]
    encode_inp = [
        list(map(lambda x: inp_dict[x], tokens)) for tokens in inp_padded
    ]
    decoded = decode(
        model,
        encode_inp,
        start_token=1,
        end_token=2,
        pad_token=0,
    )
    tar_dict_inv = {v: k for k, v in tar_dict.items()}
    answer = ''.join(map(lambda x: tar_dict_inv[x], decoded[0][1:-1]))
    print(answer)
    return render_template('home.html',
                           prediction_text='output_name {}'.format(answer))
Example #2
def predict():
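    # Load the token-to-id dictionaries pickled by train() (see Example #16 below)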
    with open('./models/target_token_dict.pkl', 'rb') as f:
        target_token_dict = pickle.load(f)
    with open('./models/source_token_dict.pkl', 'rb') as f:
        source_token_dict = pickle.load(f)

    target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

    source_tokens_list = [
        t.split() for t in '''He lost.
    I try.
    I won!
    I runs.
    I came.
    He run.
    We lost.
    We runs in the park every day.
    He calmed down.
    See you about 8.
    He get you.
    She wears a wig.'''.split('\n') if t
    ]

    encode_tokens = [['<START>'] + tokens + ['<END>']
                     for tokens in source_tokens_list]
    encode_tokens = [
        tokens + ['<PAD>'] * (source_max_len - len(tokens))
        for tokens in encode_tokens
    ]
    encode_input = [
        list(
            map(
                lambda x: source_token_dict.get(x, source_token_dict[
                    '<UNKOWN>']), tokens)) for tokens in encode_tokens
    ]

    model = get_model(
        token_num=max(len(source_token_dict), len(target_token_dict)),
        embed_dim=32,
        encoder_num=2,
        decoder_num=2,
        head_num=4,
        hidden_dim=128,
        dropout_rate=0.05,
        use_same_embed=False,  # Use different embeddings for different languages
    )
    model.load_weights('./models/model.h5', by_name=True, reshape=True)
    # Predict
    decoded = decode(model,
                     encode_input,
                     start_token=target_token_dict['<START>'],
                     end_token=target_token_dict['<END>'],
                     pad_token=target_token_dict['<PAD>'],
                     max_repeat=len(encode_input),
                     max_repeat_block=len(encode_input))
    for i, source in enumerate(source_tokens_list):
        predicted = ''.join(
            map(lambda x: target_token_dict_inv[x], decoded[i][1:-1]))
        print("{},预测结果:{}".format(source, predicted))
Example #3
def traductor3000(frase):
    tokens_frase = [tokens + ['<END>', '<PAD>'] for tokens in [frase.split(' ')]]
    tr_entrada = [list(map(lambda x: diccionario_entrada[x], tokens)) for tokens in tokens_frase][0]
    salida_decodificada = decode(
        modelo,
        tr_entrada,
        start_token=diccionario_salida['<START>'],
        end_token=diccionario_salida['<END>'],
        pad_token=diccionario_salida['<PAD>']
    )
    print('Original sentence: {}'.format(frase))
    print('Translation: {}'.format(' '.join(map(lambda x: diccionario_salida_inverso[x], salida_decodificada[1:-1]))))
Example #4
def transformer_model_predict(prediction, transformer_model, config):
    source_token_dict = config["vocab"]["source_token_dict"]
    target_token_dict = config["vocab"]["target_token_dict"]
    target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

    sos_token = config["tok"]["sos_token"]
    eos_token = config["tok"]["eos_token"]
    pad_token = config["tok"]["pad_token"]

    encode_tokens = [[sos_token] + list(pred) + [eos_token]
                     for pred in prediction]
    source_max_len = max(map(len, encode_tokens))
    encode_tokens = [
        tokens + [pad_token] * (source_max_len - len(tokens))
        for tokens in encode_tokens
    ]
    encode_input = [
        list(map(lambda x: source_token_dict[x], tokens))
        for tokens in encode_tokens
    ]

    decoded = decode(
        transformer_model,
        encode_input,
        start_token=target_token_dict[sos_token],
        end_token=target_token_dict[eos_token],
        pad_token=target_token_dict[pad_token],
        temperature=1.0,
    )

    res = []

    for pred in decoded:
        pred = [
            char for char in pred
            if not target_token_dict_inv[char] in (sos_token, eos_token,
                                                   pad_token)
        ]
        pred = "".join(map(lambda x: target_token_dict_inv[x], pred))
        res.append(pred)

    return res
Example #5
File: enc_dec.py  Project: hsha0/M-M
def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    eventSequence = convert_files_to_eventSequence(FLAGS.data_dir)

    test_sequence = eventSequence[-1]
    if len(eventSequence) > 1: eventSequence = eventSequence[:-1]

    input_feature, notes, velocity, time = build_input_feature(eventSequence)

    encoder_input, decoder_input = process_input_feature(input_feature)

    model = create_transformer()
    model.fit(
        x=[encoder_input, decoder_input],
        y=np.reshape(decoder_input,
                     (decoder_input.shape[0], decoder_input.shape[1], 1)),
        epochs=FLAGS.num_epochs,
        batch_size=FLAGS.training_batch_size)

    init = np.reshape(test_sequence[:FLAGS.interval], (FLAGS.interval * 3))
    print(init.shape)

    #init = list(init.astype(dtype=np.float64).flatten())
    init = list(init)
    print(init)
    init.insert(0, START)
    init.append(END)
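    # Generate an event sequence conditioned on the seed, sampling each step from the top 10 candidates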
    decoded = decode(model,
                     init,
                     start_token=START,
                     end_token=END,
                     pad_token=PAD,
                     max_len=FLAGS.interval * 3,
                     top_k=10,
                     temperature=1.0)

    print(decoded)
    """
Example #6
def translate(sentence):
    sentence = sentence.lower()
    sentence_tokens = [
        tokens + ['<END>', '<PAD>'] for tokens in [sentence.split(' ')]
    ]
    print(sentence_tokens)
    tr_input = [
        list(map(lambda x: source_token_dict[x], tokens))
        for tokens in sentence_tokens
    ][0]
    print(tr_input)
    decoded = decode(
        loaded_mod,
        tr_input,
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>'],
        max_len=100,
    )
    print(decoded)
    print('Original sentence: {}'.format(sentence))
    print('Translation: {}'.format(' '.join(
        map(lambda x: target_token_dict_inv[x], decoded[1:-1]))))
Example #7
File: train.py  Project: IwasakiYuuki/NLP
def prediction(
    model,
    inputs,
    max_len,
):
    predicted = keras_transformer.decode(
        model,
        inputs.tolist(),
        start_token=2,
        end_token=3,
        pad_token=0,
        max_len=max_len,
        top_k=10,
        temperature=1.0,
    )
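    # sp.decode_ids (SentencePiece) converts the id lists back into readable text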
    print('=================predict result=================')
    print('input:', sp.decode_ids(inputs.tolist()[0]))
    print('\n------------------------------------------------')
    print('output:', sp.decode_ids(list(map(int, predicted[0]))))
    print('\n================================================')
    print('input:', sp.decode_ids(inputs.tolist()[1]))
    print('\n------------------------------------------------')
    print('output:', sp.decode_ids(list(map(int, predicted[1]))))
    print('\n================================================')
Example #8
    embed_weights=np.random.random((13, 30)),
)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
)
model.summary()

# Train the model
model.fit(
    x=[np.asarray(encoder_inputs * 1000),
       np.asarray(decoder_inputs * 1000)],
    y=np.asarray(decoder_outputs * 1000),
    epochs=5,
)

# predict
from keras_transformer import decode

decoded = decode(
    model,
    encoder_inputs_no_padding,
    start_token=token_dict['<START>'],
    end_token=token_dict['<END>'],
    pad_token=token_dict['<PAD>'],
    max_len=100,
)
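# Map the predicted ids back to words with the inverted token dictionary, dropping <START>/<END>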
token_dict_rev = {v: k for k, v in token_dict.items()}
for i in range(len(decoded)):
    print(' '.join(map(lambda x: token_dict_rev[x], decoded[i][1:-1])))
Example #9
    def speaking(self):
        while True:
            print("Tony:")
            ans = ''
            target_tokens = ''
            speaker_1_words = input()
            speaker_1_words_cut = self.kerasTransformer.dataManager.cutting_sentence(
                speaker_1_words)
            source_tokens = [speaker_1_words_cut]

            try:
                encode_tokens = [['<START>'] + tokens + ['<END>']
                                 for tokens in source_tokens]
                encode_input = [
                    list(
                        map(
                            lambda x: self.kerasTransformer.source_token_dict[
                                x], tokens)) for tokens in encode_tokens
                ]
                decoded = decode(
                    self.kerasTransformer.model,
                    encode_input,
                    start_token=self.kerasTransformer.
                    target_token_dict['<START>'],
                    end_token=self.kerasTransformer.target_token_dict['<END>'],
                    pad_token=self.kerasTransformer.target_token_dict['<PAD>'],
                )
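                # Map decoded ids back to characters; the inverse dictionary is keyed by string ids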
                for index in decoded[0][1:-1]:
                    ans += self.kerasTransformer.target_token_dict_inv[str(
                        index)]
                ans_words_cut = self.kerasTransformer.dataManager.cutting_sentence(
                    ans)
                target_tokens = [ans_words_cut]
            except KeyError as err:
                row = np.random.randint(0, 3, 1)
                ans = self.kerasTransformer.dataManager.ans_sentence[row[0]]
                print(err)
            finally:
                if ans in self.kerasTransformer.dataManager.answerSentence:
                    print('Catalina:\n' + ans)
                else:
                    try:
                        encode_tokens_x, decode_tokens_x, output_tokens_y = self.kerasTransformer.dataManager.addingStartAndEndToken(
                            source_tokens, target_tokens)

                        encode_tokens_x, decode_tokens_x, output_tokens_y = self.kerasTransformer.dataManager.padding(
                            encode_tokens_x, decode_tokens_x, output_tokens_y)

                        encode_input_x, decode_input_x, decode_output_y = self.kerasTransformer.dataManager.turningToNum(
                            encode_tokens_x, decode_tokens_x, output_tokens_y)

                        encode_input_x = np.array(encode_input_x)
                        decode_input_x = np.array(decode_input_x)
                        decode_output_y = np.array(decode_output_y)

                        loss_value, metric_value = self.kerasTransformer.model.evaluate(
                            [encode_input_x, decode_input_x], decode_output_y)
                        if loss_value >= 0.1:
                            row = np.random.randint(0, 3, 1)
                            ans = self.kerasTransformer.dataManager.answerSentence[
                                row[0]]
                    except KeyError as err:
                        row = np.random.randint(0, 3, 1)
                        ans = self.kerasTransformer.dataManager.ans_sentence[
                            row[0]]
                        print(err)

                    print('Catalina:\n' + ans)
Example #10
    def test_translate(self):
        source_tokens = [
            'i need more power'.split(' '),
            'eat jujube and pill'.split(' '),
        ]
        target_tokens = [
            list('我要更多的抛瓦'),
            list('吃枣💊'),
        ]

        # Generate dictionaries
        source_token_dict = self._build_token_dict(source_tokens)
        target_token_dict = self._build_token_dict(target_tokens)
        target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

        # Add special tokens
        encode_tokens = [['<START>'] + tokens + ['<END>']
                         for tokens in source_tokens]
        decode_tokens = [['<START>'] + tokens + ['<END>']
                         for tokens in target_tokens]
        output_tokens = [
            tokens + ['<END>', '<PAD>'] for tokens in target_tokens
        ]

        # Padding
        source_max_len = max(map(len, encode_tokens))
        target_max_len = max(map(len, decode_tokens))

        encode_tokens = [
            tokens + ['<PAD>'] * (source_max_len - len(tokens))
            for tokens in encode_tokens
        ]
        decode_tokens = [
            tokens + ['<PAD>'] * (target_max_len - len(tokens))
            for tokens in decode_tokens
        ]
        output_tokens = [
            tokens + ['<PAD>'] * (target_max_len - len(tokens))
            for tokens in output_tokens
        ]

        encode_input = [
            list(map(lambda x: source_token_dict[x], tokens))
            for tokens in encode_tokens
        ]
        decode_input = [
            list(map(lambda x: target_token_dict[x], tokens))
            for tokens in decode_tokens
        ]
        decode_output = [
            list(map(lambda x: [target_token_dict[x]], tokens))
            for tokens in output_tokens
        ]

        # Build & fit model
        model = get_model(
            token_num=max(len(source_token_dict), len(target_token_dict)),
            embed_dim=32,
            encoder_num=2,
            decoder_num=2,
            head_num=4,
            hidden_dim=128,
            dropout_rate=0.05,
            use_same_embed=False,  # Use different embeddings for different languages
        )
        model.compile('adam', 'sparse_categorical_crossentropy')
        model.summary()
        model.fit(
            x=[np.array(encode_input * 1024),
               np.array(decode_input * 1024)],
            y=np.array(decode_output * 1024),
            epochs=10,
            batch_size=32,
        )

        # Predict
        decoded = decode(
            model,
            encode_input,
            start_token=target_token_dict['<START>'],
            end_token=target_token_dict['<END>'],
            pad_token=target_token_dict['<PAD>'],
        )
        for i in range(len(encode_input)):
            predicted = ''.join(
                map(lambda x: target_token_dict_inv[x], decoded[i][1:-1]))
            self.assertEqual(''.join(target_tokens[i]), predicted)
Example #11
model = multi_gpu_model(model_cpu, gpus=4)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()

model.fit(
    x=[np.array(encode_input), np.array(decode_input)],
    y=np.array(decode_output),
    epochs=10,
    batch_size=128,
)

# Predict
decoded = decode(
    model,
    encode_test_input,
    start_token=token_dict['<START>'],
    end_token=token_dict['<END>'],
    pad_token=token_dict['<PAD>'],
)
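# Convert the decoded ids back to token lists and compute corpus BLEU against test_y_tokens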

predicted_y = []

for d_x in decoded:
    os = ' '.join(map(lambda x: token_dict_inv[x], d_x[1:-1]))
    os = os.split(' ')
    predicted_y.append([os])

bleu_score = nltk.translate.bleu_score.corpus_bleu(predicted_y,
                                                   test_y_tokens,
                                                   weights=(0.5, 0.5))
print(bleu_score)
Example #12
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # Use different embeddings for different languages
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()

print(encode_input[0], " : ", decode_input[0], " : ", decode_output[0])
model.fit(
    x=[np.array(encode_input * 1024),
       np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=5,
    batch_size=32,
)
print("asds: ", np.array(encode_input * 1024)[:10])
# Predict
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
    top_k=20,
)
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[0][1:-1])))
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[1][1:-1])))
Example #13
def main(lang, input_file, output_file):
    exclude = set(string.punctuation + string.digits)

    input_token_index = config_lang_tsf[lang.lower()]['input_token_index']
    target_token_index = config_lang_tsf[lang.lower()]['target_token_index']
    max_encoder_seq_length = config_lang_tsf[
        lang.lower()]['max_encoder_seq_length']
    params = config_lang_tsf[lang.lower()]['params']
    target_max_len = 50
    token_num = max(len(target_token_index), len(input_token_index))

    model = get_model(token_num=token_num,
                      embed_dim=params['embed_dim'],
                      encoder_num=params['encoder_num'],
                      decoder_num=params['decoder_num'],
                      head_num=params['head_num'],
                      hidden_dim=params['hidden_dim'],
                      dropout_rate=params['dropout_rate'],
                      use_same_embed=False,
                      embed_weights=np.random.random(
                          (token_num, params['embed_dim'])))

    model_path = 'models_transformer/' + lang.lower(
    ) + '_clean_28042020.csv_transformer.keras'
    model.load_weights(model_path)

    input_texts = []
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()[:]

    for line in lines:
        for wd in line.strip().split():
            if wd not in input_texts:
                if all([
                        ch in input_token_index for ch in wd.lower()
                        if ch not in exclude
                ]):
                    s = ''.join(ch for ch in wd.lower() if ch not in exclude)
                    if len(s):
                        input_texts.append([x for x in s.lower().strip()])

    reverse_input_char_index = dict(
        (i, char) for char, i in input_token_index.items())
    reverse_target_char_index = dict(
        (i, char) for char, i in target_token_index.items())

    test_encode_tokens = [['<START>'] + tokens + ['<END>']
                          for tokens in input_texts]
    test_encode_tokens = [
        tokens + ['<PAD>'] * (50 - len(tokens))
        for tokens in test_encode_tokens
    ]
    test_input = [
        list(map(lambda x: input_token_index[x], tokens))
        for tokens in test_encode_tokens
    ]

    print("predicting ...")
    decoded = {}
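    # Decode one word at a time; keep predicted tokens until the first special token appears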
    for i in range(len(test_input)):
        int_decoded = []
        prediction = decode(model,
                            test_input[i],
                            start_token=target_token_index['<START>'],
                            end_token=target_token_index['<END>'],
                            pad_token=target_token_index['<PAD>'],
                            max_len=token_num + 2 + 5)

        wd = ''.join(input_texts[i])
        for j in range(1, len(prediction)):
            if reverse_target_char_index[prediction[j]] in [
                    '<PAD>', '<END>', '<START>'
            ]:
                break
            else:
                int_decoded.append(prediction[j])
        decoded[wd] = ' '.join(
            map(lambda x: reverse_target_char_index[x], int_decoded))

    print(decoded)
    with open(output_file, 'w') as fout:
        for i in range(len(lines)):
            fout.write("%s|" % lines[i].strip())
            for wd in lines[i].strip().lower().split():
                wd_strip = ''.join(ch for ch in wd.lower()
                                   if ch not in exclude)
                if wd_strip in decoded:
                    fout.write("[%s] " % decoded[wd_strip])
                else:
                    fout.write("[UNK] ")
            fout.write("\n")

    print('\n' + "*" * 20)
    print("DONE! Wrote %d lines to %s..." % (len(lines), output_file))
    print("*" * 20 + '\n')
Example #14
train_data_total = read_train_data()
print('Total pwds: ', len(train_data_total), train_data_total[0])
# inputs, targets = generate_source_target(train_data_total)


# directly use the trained model to guess passwords
train = True
if not train:
    model = load_model('my_model.h5')
    x_train, y_train, x_val, y_val, output_y_train, output_y_val = gen_large_chunk_single_thread(inputs, targets, chunk_size=BATCH_SIZE, iteration=iteration)
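    # Greedy decoding (top_k=1) turns each validation sequence into a password guess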

    decoded = decode(
        model,
        tokens=[x.tolist() for x in x_val],
        start_token=c_table.encode_char(start_token),
        end_token=c_table.encode_char(end_token),
        pad_token=c_table.encode_char(pad_char),
        top_k=1,
        max_len=ENCODING_MAX_PASSWORD_LENGTH,
    )

    for i in range(20):
        print('-' * 50)
        password = c_table.decode(decoded[-i], calc_argmax=False)
        # print('former: ', x_val[i])
        print('former-decode: ', c_table.decode(x_val[-i], False))
        print('target: ', c_table.decode(y_val[-i], False))
        print('decoded: ', decoded[-i])
        print('guess: ', password)

        rowx, rowy = x_val[-i], y_val[-i]
Example #15
    def test_decode(self):
        tokens = 'all work and no play makes jack a dull boy'.split(' ')
        token_dict = {
            '<PAD>': 0,
            '<START>': 1,
            '<END>': 2,
        }
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = len(token_dict)
        model = get_model(
            token_num=len(token_dict),
            embed_dim=32,
            encoder_num=3,
            decoder_num=2,
            head_num=4,
            hidden_dim=128,
            dropout_rate=0.05,
        )
        model.compile(
            optimizer='adam',
            loss='sparse_categorical_crossentropy',
        )
        model.summary()
        encoder_inputs_no_padding = []
        encoder_inputs, decoder_inputs, decoder_outputs = [], [], []
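        # Split the sentence at every position to create encoder/decoder training pairs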
        for i in range(1, len(tokens)):
            encode_tokens, decode_tokens = tokens[:i], tokens[i:]
            encode_tokens = ['<START>'] + encode_tokens + [
                '<END>'
            ] + ['<PAD>'] * (len(tokens) - len(encode_tokens))
            output_tokens = decode_tokens + [
                '<END>', '<PAD>'
            ] + ['<PAD>'] * (len(tokens) - len(decode_tokens))
            decode_tokens = ['<START>'] + decode_tokens + [
                '<END>'
            ] + ['<PAD>'] * (len(tokens) - len(decode_tokens))
            encode_tokens = list(map(lambda x: token_dict[x], encode_tokens))
            decode_tokens = list(map(lambda x: token_dict[x], decode_tokens))
            output_tokens = list(map(lambda x: [token_dict[x]], output_tokens))
            encoder_inputs_no_padding.append(encode_tokens[:i + 2])
            encoder_inputs.append(encode_tokens)
            decoder_inputs.append(decode_tokens)
            decoder_outputs.append(output_tokens)
        current_path = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(current_path, 'test_transformer.h5')
        if os.path.exists(model_path):
            model.load_weights(model_path, by_name=True)
        else:
            model.fit(
                x=[
                    np.asarray(encoder_inputs * 2048),
                    np.asarray(decoder_inputs * 2048)
                ],
                y=np.asarray(decoder_outputs * 2048),
                epochs=10,
                batch_size=128,
            )
            model.save(model_path)
        model = keras.models.load_model(model_path,
                                        custom_objects=get_custom_objects())
        decoded = decode(
            model,
            encoder_inputs_no_padding * 2,
            start_token=token_dict['<START>'],
            end_token=token_dict['<END>'],
            pad_token=token_dict['<PAD>'],
        )
        token_dict_rev = {v: k for k, v in token_dict.items()}
        for i in range(len(decoded)):
            print(' '.join(map(lambda x: token_dict_rev[x], decoded[i][1:-1])))
        for i in range(len(decoded)):
            for j in range(len(decoded[i])):
                self.assertEqual(decoder_inputs[i % len(decoder_inputs)][j],
                                 decoded[i][j])

        decoded = decode(
            model,
            encoder_inputs_no_padding[2] + [0] * 5,
            start_token=token_dict['<START>'],
            end_token=token_dict['<END>'],
            pad_token=token_dict['<PAD>'],
        )
        for j in range(len(decoded)):
            self.assertEqual(decoder_inputs[2][j], decoded[j], decoded)

        decoded = decode(
            model,
            encoder_inputs_no_padding,
            start_token=token_dict['<START>'],
            end_token=token_dict['<END>'],
            pad_token=token_dict['<PAD>'],
            max_len=4,
        )
        token_dict_rev = {v: k for k, v in token_dict.items()}
        for i in range(len(decoded)):
            print(' '.join(map(lambda x: token_dict_rev[x], decoded[i][1:-1])))
        for i in range(len(decoded)):
            self.assertTrue(len(decoded[i]) <= 4, decoded[i])
            for j in range(len(decoded[i])):
                self.assertEqual(decoder_inputs[i][j], decoded[i][j], decoded)

        decoded_top_5 = decode(
            model,
            encoder_inputs_no_padding,
            start_token=token_dict['<START>'],
            end_token=token_dict['<END>'],
            pad_token=token_dict['<PAD>'],
            max_len=4,
            top_k=5,
            temperature=1e-10,
        )
        has_diff = False
        for i in range(len(decoded)):
            s1 = ' '.join(map(lambda x: token_dict_rev[x], decoded[i][1:-1]))
            s5 = ' '.join(
                map(lambda x: token_dict_rev[x], decoded_top_5[i][1:-1]))
            if s1 != s5:
                has_diff = True
        self.assertFalse(has_diff)

        decoded_top_5 = decode(
            model,
            encoder_inputs_no_padding,
            start_token=token_dict['<START>'],
            end_token=token_dict['<END>'],
            pad_token=token_dict['<PAD>'],
            max_len=4,
            top_k=5,
        )
        has_diff = False
        for i in range(len(decoded)):
            s1 = ' '.join(map(lambda x: token_dict_rev[x], decoded[i][1:-1]))
            s5 = ' '.join(
                map(lambda x: token_dict_rev[x], decoded_top_5[i][1:-1]))
            if s1 != s5:
                has_diff = True
        self.assertTrue(has_diff)
Example #16
    def train(self, train_file='/home/gswyhq/data/cmn-eng/cmn.txt'):
        source_tokens = [
            'i need more power'.split(' '),
            'eat jujube and pill'.split(' '),
        ]
        target_tokens = [
            list('我要更多的抛瓦'),
            list('吃枣💊'),
        ]

        with open(train_file) as f:
            for data in f.readlines():
                if '\t' in data:
                    source, target = data.strip().split('\t', maxsplit=1)
                    source_tokens.append(source.split(' '))
                    target_tokens.append(list(target))

        # Generate dictionaries
        source_token_dict = self._build_token_dict(source_tokens)
        target_token_dict = self._build_token_dict(target_tokens)
        target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

        # Add special tokens
        encode_tokens = [['<START>'] + tokens + ['<END>']
                         for tokens in source_tokens]
        decode_tokens = [['<START>'] + tokens + ['<END>']
                         for tokens in target_tokens]
        output_tokens = [
            tokens + ['<END>', '<PAD>'] for tokens in target_tokens
        ]
        # print('output_tokens: {}'.format(output_tokens))
        # Padding
        source_max_len = max(map(len, encode_tokens))
        target_max_len = max(map(len, decode_tokens))

        print('source_max_len: {}; target_max_len: {}'.format(
            source_max_len,
            target_max_len))  # source_max_len: 34; target_max_len: 46
        print("len(source_token_dict): {}, len(target_token_dict): {}".format(
            len(source_token_dict), len(target_token_dict))
              )  # len(source_token_dict): 10814, len(target_token_dict): 3442

        with open('./models/target_token_dict.pkl', 'wb') as f:
            pickle.dump(target_token_dict, f)

        with open('./models/source_token_dict.pkl', 'wb') as f:
            pickle.dump(source_token_dict, f)

        encode_tokens = [
            tokens + ['<PAD>'] * (source_max_len - len(tokens))
            for tokens in encode_tokens
        ]
        decode_tokens = [
            tokens + ['<PAD>'] * (target_max_len - len(tokens))
            for tokens in decode_tokens
        ]
        output_tokens = [
            tokens + ['<PAD>'] * (target_max_len - len(tokens))
            for tokens in output_tokens
        ]
        # print('output_tokens: {}'.format(output_tokens))
        encode_input = [
            list(map(lambda x: source_token_dict[x], tokens))
            for tokens in encode_tokens
        ]
        decode_input = [
            list(map(lambda x: target_token_dict[x], tokens))
            for tokens in decode_tokens
        ]
        decode_output = [
            list(map(lambda x: [target_token_dict[x]], tokens))
            for tokens in output_tokens
        ]
        # print("decode_output: {}".format(decode_output))
        # Build & fit model
        model = get_model(
            token_num=max(len(source_token_dict), len(target_token_dict)),
            embed_dim=32,
            encoder_num=2,
            decoder_num=2,
            head_num=4,
            hidden_dim=128,
            dropout_rate=0.05,
            use_same_embed=False,  # Use different embeddings for different languages
        )
        model.compile('adam', 'sparse_categorical_crossentropy')
        model.summary()

        early_stopping = EarlyStopping(monitor='loss', patience=3)

        model_checkpoint = ModelCheckpoint(filepath=os.path.join(
            './models', 'translate-{epoch:02d}-{loss:.4f}.hdf5'),
                                           save_best_only=False,
                                           save_weights_only=False)

        model.fit(x=[np.array(encode_input * 1),
                     np.array(decode_input * 1)],
                  y=np.array(decode_output * 1),
                  epochs=10,
                  batch_size=32,
                  callbacks=[early_stopping, model_checkpoint])

        model.save('./models/model.h5')

        # Predict
        encode_input = encode_input[:30]
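        # Decode the first 30 training samples; max_repeat / max_repeat_block bound how often tokens or blocks may repeat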
        decoded = decode(model,
                         encode_input,
                         start_token=target_token_dict['<START>'],
                         end_token=target_token_dict['<END>'],
                         pad_token=target_token_dict['<PAD>'],
                         max_repeat=len(encode_input),
                         max_repeat_block=len(encode_input))

        right_count = 0
        error_count = 0

        for i in range(len(encode_input)):
            predicted = ''.join(
                map(lambda x: target_token_dict_inv[x], decoded[i][1:-1]))
            print("原始结果:{},预测结果:{}".format(''.join(target_tokens[i]),
                                           predicted))

            if ''.join(target_tokens[i]) == predicted:
                right_count += 1
            else:
                error_count += 1

        print("正确: {}, 错误:{}, 正确率: {}".format(
            right_count, error_count,
            right_count / (right_count + error_count + 0.001)))
Example #17
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()

model.fit(
    x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=10,
    batch_size=32,
)

# Predict
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[0][1:-1])))
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[1][1:-1])))


decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
    top_k=10,
    temperature=1.0,
)

        # Model reconstruction from JSON file
        with open(os.path.join(MODEL_NAME, 'model.json'), 'r') as fh:
            model = model_from_json(fh.read(), get_custom_objects())

        # Load weights into the new model
        model.load_weights(os.path.join(MODEL_NAME, 'model_weights.h5'))

        model.compile('adam', 'sparse_categorical_crossentropy')
        model.summary()

    # Predict with beam search
    decoded = decode(
        model,
        encode_input,  # The test set
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>'],
        top_k=TOP_K,
        temperature=BEAM_TEMP,
    )

    def predict(text):
        return ' '.join(map(lambda x: target_token_dict_inv[x], text))

    # Predict the first two examples in data
    print("este é o primeiro livro que eu fiz .")
    print(predict(decoded[0][1:-1]))
    print(
        "\nvou então muito rapidamente partilhar convosco algumas histórias de algumas coisas mágicas que aconteceram ."
    )
    print(predict(decoded[1][1:-1]))