def train():
    # input_text = ['1 2 3 4 5'
    #               , '6 7 8 9 10'
    #               , '11 12 13 14 15'
    #               , '16 17 18 19 20'
    #               , '21 22 23 24 25']
    # tar_text = ['one two three four five'
    #             , 'six seven eight nine ten'
    #             , 'eleven twelve thirteen fourteen fifteen'
    #             , 'sixteen seventeen eighteen nineteen twenty'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']


    # input_text = ['床 前 明 月 光 ,'
    #               , '举 头 望 明 月 ,'
    #               , '敝 笱 在 梁 ,'
    #               , '齐 子 归 止 ,'
    #               , '21 22 23 24 25']
    # tar_text = ['疑 是 地 上 霜。'
    #             , '低 头 思 故 乡。'
    #             , '其 鱼 鲂 鳏 。'
    #             , '其 从 如 云 。'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']


    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))

    vocab = loaddic('./corpus/dic.txt')

    print('-----------')
    # print(vocab)
    print('-----------')
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras Embedding needs len(vocab) + 1 (index 0 is reserved for masking)
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 97
    tar_maxlen = 20
    output_dim = vocab_size
    hidden_dim = 500

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # map each token to a numeric index for encoding
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # map each index back to a token for decoding

    decoder_mode = 3  # 0: simplest mode, 1: [1] feed-previous mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                        , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim
                                         , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                             , output_length=tar_maxlen, state_input=False, return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size,
                              output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f seconds!' % (time_end - time_start))

    # input_text = loadfile('./corpus/content-12147.txt')
    # tar_text = loadfile('./corpus/title-12147.txt')
    input_text = loadfile('./corpus/content-12147.txt')
    tar_text = loadfile('./corpus/title-12147.txt')
    time1 = time.time()
    for iter_num in range(1):
        for i in range(0, 58213, 20):
            if i == 58200:
                break
            input_list = []
            tar_list = []
            for tmp_input in input_text[i:i+20]:
                input_list.append(chtokenize(tmp_input))
            for tmp_tar in tar_text[i:i+20]:
                tar_list.append(chtokenize(tmp_tar))
            inputs_train, tars_train = vectorize_stories(input_list, tar_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
            en_de_model.fit(inputs_train, tars_train, batch_size=2, nb_epoch=1, show_accuracy=True)
            print('Current line: ' + str(i))
            print('Current iter_num is:%d' % iter_num)
        # out_predicts = en_de_model.predict(inputs_train)
        # for i_idx, out_predict in enumerate(out_predicts):
        #     predict_sequence = []
        #     for predict_vector in out_predict:
        #         next_index = np.argmax(predict_vector)
        #         next_token = idx_to_word[next_index]
        #         predict_sequence.append(next_token)
        #     print('Target output:', tar_text[i_idx])
        #     print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
    en_de_model.save_weights('en_de_weights.h5')
    print('Train Ended')
    time2 = time.time() - time1
    print(time2)
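These snippets assume an old Keras (0.x/1.x-era) setup with the LSTMDecoder, LSTMDecoder2 and AttentionDecoder layers imported from the external seq2seq package, and they call several helpers (loaddic, loadfile, tokenize, chtokenize, jiebacut, vectorize_stories) whose definitions are not shown. The sketch below reconstructs plausible versions from their call sites only; it is an assumption, not the original code (some later examples also pass a vocab argument to chtokenize, and on Python 3 reduce must be imported from functools).

# Hypothetical reconstruction of the helpers used above, inferred from
# their call sites; not the original implementations.
import numpy as np
from keras.preprocessing.sequence import pad_sequences


def loaddic(path):
    # One vocabulary token per line.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


def loadfile(path):
    # One sentence per line.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


def tokenize(sent):
    # Whitespace tokenizer for the toy number/word data.
    return sent.split()


def chtokenize(sent, vocab=None):
    # The Chinese corpus lines are assumed to be pre-segmented with spaces;
    # optionally drop tokens that are not in the vocabulary.
    tokens = sent.split()
    if vocab is not None:
        tokens = [w for w in tokens if w in vocab]
    return tokens


def jiebacut(sent):
    # Segment raw Chinese text with jieba and re-join with spaces
    # (used by the prediction server in the last example).
    import jieba
    return ' '.join(jieba.cut(sent))


def vectorize_stories(input_list, tar_list, word_to_idx,
                      input_maxlen, tar_maxlen, vocab_size):
    # Map tokens to indices, pad the inputs, and one-hot encode the targets.
    x = [[word_to_idx.get(w, 0) for w in sent] for sent in input_list]
    y = np.zeros((len(tar_list), tar_maxlen, vocab_size), dtype='float32')
    for i, sent in enumerate(tar_list):
        for t, w in enumerate(sent[:tar_maxlen]):
            y[i, t, word_to_idx.get(w, 0)] = 1.0
    return pad_sequences(x, maxlen=input_maxlen), y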
Example #2
def main():
    input_text = ['1 2 3 4 5'
                  , '6 7 8 9 10'
                  , '11 12 13 14 15'
                  , '16 17 18 19 20'
                  , '21 22 23 24 25']
    tar_text = ['one two three four five'
                , 'six seven eight nine ten'
                , 'eleven twelve thirteen fourteen fifteen'
                , 'sixteen seventeen eighteen nineteen twenty'
                , 'twenty_one twenty_two twenty_three twenty_four twenty_five']

    input_list = []
    tar_list = []

    for tmp_input in input_text:
        input_list.append(tokenize(tmp_input))
    for tmp_tar in tar_text:
        tar_list.append(tokenize(tmp_tar))

    vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras Embedding needs len(vocab) + 1 (index 0 is reserved for masking)
    input_maxlen = max(map(len, (x for x in input_list)))
    tar_maxlen = max(map(len, (x for x in tar_list)))
    output_dim = vocab_size
    hidden_dim = 20

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    print('Number of training stories:', len(input_list))
    print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # map each token to a numeric index for encoding
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # map each index back to a token for decoding
    inputs_train, tars_train = vectorize_stories(input_list, tar_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)

    decoder_mode = 1  # 0: simplest mode, 1: [1] feed-previous mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                        , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim
                                         , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                             , output_length=tar_maxlen, state_input=False, return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size,
                              output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f seconds!' % (time_end - time_start))
    for iter_num in range(5000):
        en_de_model.fit(inputs_train, tars_train, batch_size=3, nb_epoch=1, show_accuracy=True)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                predict_sequence.append(next_token)
            print('Target output:', tar_text[i_idx])
            print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
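For the toy data above (assuming tokenize is a plain whitespace split, as in the helper sketch earlier), the vocabulary is the 25 digit strings plus the 25 number words, so vocab_size is 51, input_maxlen and tar_maxlen are both 5, and under that vectorization inputs_train and tars_train come out with shapes (5, 5) and (5, 5, 51) respectively.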
Example #3
checkpoint = ModelCheckpoint(filepath=WEIGHT_FILE_PATH, save_best_only=True)
model.fit_generator(generator=train_gen,
                    steps_per_epoch=train_num_batches,
                    epochs=NUM_EPOCHS,
                    verbose=1,
                    validation_data=test_gen,
                    validation_steps=test_num_batches,
                    callbacks=[checkpoint])

encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.save('model/encoder-weights.h5')

new_decoder_inputs = Input(batch_shape=(1, None, num_decoder_tokens),
                           name='new_decoder_inputs')
new_decoder_lstm = LSTM(units=HIDDEN_UNITS,
                        return_state=True,
                        return_sequences=True,
                        name='new_decoder_lstm',
                        stateful=True)
new_decoder_outputs, _, _ = new_decoder_lstm(new_decoder_inputs)
new_decoder_dense = Dense(units=num_decoder_tokens,
                          activation='softmax',
                          name='new_decoder_dense')
new_decoder_outputs = new_decoder_dense(new_decoder_outputs)
new_decoder_lstm.set_weights(decoder_lstm.get_weights())
new_decoder_dense.set_weights(decoder_dense.get_weights())

new_decoder_model = Model(new_decoder_inputs, new_decoder_outputs)

new_decoder_model.save('model/decoder-weights.h5')
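A possible way to run inference with the saved encoder/decoder pair is the usual step-by-step greedy loop. The sketch below is only an illustration under assumptions: target_word2idx, target_idx2word, MAX_DECODER_SEQ_LENGTH and the '<SOS>'/'<EOS>' markers are not defined above, the input is a single sequence (batch size 1) shaped the way encoder_inputs expects, and it relies on a Keras version whose reset_states accepts explicit state arrays.

# Greedy decoding sketch for the saved encoder/decoder pair above.
import numpy as np


def decode_sequence(input_seq):
    # Encode the source sequence once to obtain the LSTM states [h, c].
    states = encoder_model.predict(input_seq)
    # Seed the stateful decoder LSTM (batch size 1) with the encoder states.
    new_decoder_lstm.reset_states(states=states)

    # Start decoding from the start-of-sequence token.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_word2idx['<SOS>']] = 1.0

    decoded_tokens = []
    for _ in range(MAX_DECODER_SEQ_LENGTH):
        output_tokens = new_decoder_model.predict(target_seq)
        sampled_idx = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = target_idx2word[sampled_idx]
        if sampled_word == '<EOS>':
            break
        decoded_tokens.append(sampled_word)
        # Feed the prediction back in; the LSTM keeps its internal state
        # between calls because it was built with stateful=True.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_idx] = 1.0
    return ' '.join(decoded_tokens)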
Example #4
def main():
    input_text = [
        '1 2 3 4 5', '6 7 8 9 10', '11 12 13 14 15', '16 17 18 19 20',
        '21 22 23 24 25'
    ]
    tar_text = [
        'one two three four five', 'six seven eight nine ten',
        'eleven twelve thirteen fourteen fifteen',
        'sixteen seventeen eighteen nineteen twenty',
        'twenty_one twenty_two twenty_three twenty_four twenty_five'
    ]

    input_list = []
    tar_list = []

    for tmp_input in input_text:
        input_list.append(tokenize(tmp_input))
    for tmp_tar in tar_text:
        tar_list.append(tokenize(tmp_tar))

    vocab = sorted(
        reduce(lambda x, y: x | y,
               (set(tmp_list) for tmp_list in input_list + tar_list)))
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras Embedding needs len(vocab) + 1 (index 0 is reserved for masking)
    input_maxlen = max(map(len, (x for x in input_list)))
    tar_maxlen = max(map(len, (x for x in tar_list)))
    output_dim = vocab_size
    hidden_dim = 20

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    print('Number of training stories:', len(input_list))
    print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict(
        (c, i + 1) for i, c in enumerate(vocab))  # map each token to a numeric index for encoding
    idx_to_word = dict(
        (i + 1, c) for i, c in enumerate(vocab))  # map each index back to a token for decoding
    inputs_train, tars_train = vectorize_stories(input_list, tar_list,
                                                 word_to_idx, input_maxlen,
                                                 tar_maxlen, vocab_size)

    decoder_mode = 1  # 0: simplest mode, 1: [1] feed-previous mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=tar_maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=tar_maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=tar_maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f seconds!' % (time_end - time_start))
    for iter_num in range(5000):
        en_de_model.fit(inputs_train,
                        tars_train,
                        batch_size=3,
                        nb_epoch=1,
                        show_accuracy=True)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                predict_sequence.append(next_token)
            print('Target output:', tar_text[i_idx])
            print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
Example #5
def train():
    # input_text = ['1 2 3 4 5'
    #               , '6 7 8 9 10'
    #               , '11 12 13 14 15'
    #               , '16 17 18 19 20'
    #               , '21 22 23 24 25']
    # tar_text = ['one two three four five'
    #             , 'six seven eight nine ten'
    #             , 'eleven twelve thirteen fourteen fifteen'
    #             , 'sixteen seventeen eighteen nineteen twenty'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']

    # input_text = ['Hello this is Tom speaking Is that John ?'
    #               , 'Would you like to go swimming with me ?'
    #               , 'Ok see you then Bye'
    #               , 'Yeah I am free What time shall we meet ?'
    #               , 'How does it taste ?']
    # tar_text = ['Yes this is What s up ?'
    #             , 'That sounds great It s good weather for swimming  I d love to .'
    #             , 'See you'
    #             , 'At 3:00PM'
    #             , 'It tastes good you should try some.']

    # Sort to build the vocabulary (a list)
    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))

    vocab = []
    with open('./corpus/dic.txt', 'r') as dicfile:
        for line in dicfile:
            vocab.append(line.strip())

    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras Embedding needs len(vocab) + 1 (index 0 is reserved for masking)
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 99
    tar_maxlen = 22
    output_dim = vocab_size
    hidden_dim = 50

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # map each token to a numeric index for encoding
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # map each index back to a token for decoding


    decoder_mode = 3  # 0: simplest mode, 1: [1] feed-previous mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                        , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim
                                         , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                             , output_length=tar_maxlen, state_input=False, return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size,
                              output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f seconds!' % (time_end - time_start))

    for iter_num in range(5000):
        # Build the data and feed it in batches
        input_text = loadfile('./corpus/content-12147.txt')
        tar_text = loadfile('./corpus/title-12147.txt')
        for i in range(0, 58213, 20):
            if i == 58200:
                break
            input_list = []
            tar_list = []
            for tmp_input in input_text[i:i+20]:
                input_list.append(chtokenize(tmp_input,vocab))
            for tmp_tar in tar_text[i:i+20]:
                tar_list.append(chtokenize(tmp_tar,vocab))
            inputs_train, tars_train = vectorize_stories(input_list, tar_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
            en_de_model.fit(inputs_train, tars_train, batch_size=4, nb_epoch=1, show_accuracy=True)

        # out_predicts = en_de_model.predict(inputs_train)
        # for i_idx, out_predict in enumerate(out_predicts):
        #     predict_sequence = []
        #     for predict_vector in out_predict:
        #         next_index = np.argmax(predict_vector)
        #         next_token = idx_to_word[next_index]
        #         predict_sequence.append(next_token)
        #     print('Target output:', tar_text[i_idx].decode('utf8'))
        #     print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
    en_de_model.save_weights('en_de_weights1-40.h5')
    print('Train Ended')
def main():
    f = open("X_train.pkl", 'r')
    X_train = pickle.load(f)
    '''
    f=open('word2index.pkl','r')
    word2index=pickle.load(f)
    f=open('index2word.pkl','r')
    index2word=pickle.load(f)

    inputs_train, tars_train = vectorize_stories(X_train, X_train, word2index, maxlen, maxlen, vocab_size)
    '''
    X_train = pad_sequences(X_train, maxlen=maxlen)

    decoder_mode = 1  # 0: simplest mode, 1: [1] feed-previous mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                        , output_length=maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim
                                         , output_length=maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                             , output_length=maxlen, state_input=False, return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size,
                              output_dim=hidden_dim,
                              input_length=maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(vocab_size))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f seconds!' % (time_end - time_start))
    for iter_num in range(5000):
        en_de_model.fit(X_train, X_train, batch_size=3, nb_epoch=1, show_accuracy=True)
        out_predicts = en_de_model.predict(X_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            '''
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = index2word[next_index]
                predict_sequence.append(next_token)
            '''
            print('Target output:', X_train[i_idx])
            print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
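The pickles this example loads (X_train.pkl, plus the commented-out word2index.pkl and index2word.pkl) are produced elsewhere. Below is a minimal preprocessing sketch that could generate them, assuming a whitespace-tokenized corpus file; the path and vocabulary cutoff are made up.

# Hypothetical preprocessing that produces the pickles loaded above;
# 'corpus.txt' and max_vocab are assumptions.
import pickle
from collections import Counter


def build_pickles(corpus_path='corpus.txt', max_vocab=20000):
    with open(corpus_path) as f:
        sentences = [line.strip().split() for line in f if line.strip()]

    counts = Counter(w for sent in sentences for w in sent)
    # Index 0 is reserved for the padding value used by pad_sequences.
    vocab = [w for w, _ in counts.most_common(max_vocab)]
    word2index = {w: i + 1 for i, w in enumerate(vocab)}
    index2word = {i + 1: w for i, w in enumerate(vocab)}
    X_train = [[word2index[w] for w in sent if w in word2index]
               for sent in sentences]

    for name, obj in [('X_train.pkl', X_train),
                      ('word2index.pkl', word2index),
                      ('index2word.pkl', index2word)]:
        with open(name, 'wb') as f:
            pickle.dump(obj, f)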
Example #7
def main():
    filep = pjoin(m_path, "cut_recovery_10", "Paging")
    file_num = len(os.listdir(pjoin(filep, "input")))

    file_names_idx = list(range(file_num))
    random.shuffle(file_names_idx)
    # train_names_idx = file_names_idx[:file_num/7]
    # test_names_idx = file_names_idx[file_num/7+1:]
    train_names_idx = file_names_idx[:5000]
    test_names_idx = file_names_idx[5001:6000]

    vocab = load_vocab(
        pjoin(m_path, "BaseLine-BigData_1kUE_20ENB_paging-Case_Group_1-Case_1",
              "dic.txt"))
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras Embedding needs len(vocab) + 1 (index 0 is reserved for masking)

    #input_maxlen = max(map(len, (x for x in input_list)))
    #output_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 200
    output_maxlen = 200
    output_dim = vocab_size
    hidden_dim = 300

    # print('-')
    # print('Vocab size:', vocab_size, 'unique words')
    # print('Input max length:', input_maxlen, 'words')
    # print('Target max length:', tar_maxlen, 'words')
    # print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    # print('-')
    # print('Vectorizing the word sequences...')
    word_to_idx = dict(
        (c, i + 1) for i, c in enumerate(vocab))  # map each token to a numeric index for encoding
    idx_to_word = dict(
        (i + 1, c) for i, c in enumerate(vocab))  # map each index back to a token for decoding

    decoder_mode = 1  # 0: simplest mode, 1: [1] feed-previous mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=output_maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=output_maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=output_maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)

    if decoder_mode == 0:
        en_de_model.add(RepeatVector(output_maxlen))

    en_de_model.add(decoder_top_layer)
    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f seconds!' % (time_end - time_start))

    for iter_num in range(10):
        input_sen, output_sen = load_data(filep, train_names_idx)
        input_list, output_list = io_list(input_sen, output_sen)
        inputs_train, outputs_train = vectorize_stories(
            input_list, output_list, vocab, word_to_idx, input_maxlen,
            output_maxlen, vocab_size)

        en_de_model.fit(inputs_train,
                        outputs_train,
                        batch_size=50,
                        nb_epoch=5,
                        show_accuracy=True)
        out_predicts = en_de_model.predict(inputs_train)

        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                predict_sequence.append(next_token)
            #print('Target output:', toutput_text[i_idx])
            #print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)

    print("test")
Example #8
def main():
    f = open("X_train.pkl", 'r')
    X_train = pickle.load(f)
    '''
    f=open('word2index.pkl','r')
    word2index=pickle.load(f)
    f=open('index2word.pkl','r')
    index2word=pickle.load(f)

    inputs_train, tars_train = vectorize_stories(X_train, X_train, word2index, maxlen, maxlen, vocab_size)
    '''
    X_train = pad_sequences(X_train, maxlen=maxlen)

    decoder_mode = 1  # 0: simplest mode, 1: [1] feed-previous mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(vocab_size))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f seconds!' % (time_end - time_start))
    for iter_num in range(5000):
        en_de_model.fit(X_train,
                        X_train,
                        batch_size=3,
                        nb_epoch=1,
                        show_accuracy=True)
        out_predicts = en_de_model.predict(X_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            '''
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = index2word[next_index]
                predict_sequence.append(next_token)
            '''
            print('Target output:', X_train[i_idx])
            print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
Example #9
def train():
    # input_text = ['1 2 3 4 5'
    #               , '6 7 8 9 10'
    #               , '11 12 13 14 15'
    #               , '16 17 18 19 20'
    #               , '21 22 23 24 25']
    # tar_text = ['one two three four five'
    #             , 'six seven eight nine ten'
    #             , 'eleven twelve thirteen fourteen fifteen'
    #             , 'sixteen seventeen eighteen nineteen twenty'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']

    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))

    vocab = loaddic('./corpus/smalldic.txt')

    print('-----------')
    # print vocab
    print('-----------')
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras Embedding needs len(vocab) + 1 (index 0 is reserved for masking)
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 70
    tar_maxlen = 17
    output_dim = vocab_size
    hidden_dim = 100

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict(
        (c, i + 1) for i, c in enumerate(vocab))  # map each token to a numeric index for encoding
    idx_to_word = dict(
        (i + 1, c) for i, c in enumerate(vocab))  # map each index back to a token for decoding

    decoder_mode = 3  # 0: simplest mode, 1: [1] feed-previous mode, 2: [2] peek mode, 3: [3] attention mode
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=tar_maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=tar_maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=tar_maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))

    en_de_model.load_weights('en_de_weights1-40.h5')

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f seconds!' % (time_end - time_start))

    # # input_text = loadfile('./corpus/content-12147.txt')
    # input_text = loadfile('./corpus/content1-500.txt')
    #
    # input_list = []
    # for tmp_input in input_text:
    #     input_list.append(chtokenize(tmp_input))
    #
    # inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
    #
    # out_predicts = en_de_model.predict(inputs_train)
    # for i_idx, out_predict in enumerate(out_predicts):
    #     predict_sequence = []
    #     tempstr = ''
    #     for predict_vector in out_predict:
    #         next_index = np.argmax(predict_vector)
    #         next_token = idx_to_word[next_index]
    #         # print next_token
    #         tempstr += next_token
    #         predict_sequence.append(next_token)
    #     print tempstr
    #     # print('Predict output:', predict_sequence)
    #
    # print ('Train Ended')

    # def predict(input_text):
    import socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    host = socket.gethostbyname(socket.gethostname())
    port = 50008
    sock.bind((host, port))
    sock.listen(5)
    while True:
        conn, addr = sock.accept()
        data = conn.recv(1024).decode('utf-8')
        sentences = []
        # input_text = '实际上,上周主管部门就和大唐打过招呼了,内部消息人士透露,国资委已经就李小琳任职问题和大唐进行沟通,但李小琳本人至今未报到。情况比较复杂,上述人士表示,目前还不敢完全确定,不排除后续还有变化。'
        tmp = 'BEG ' + data + ' END'
        tmp = jiebacut(tmp)
        sentences.append(tmp)
        result = ''
        input_list = []
        for tmp_input in sentences:
            print(tmp_input)
            print('---!--!---')
            input_list.append(chtokenize(tmp_input))
        inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            tempstr = ''
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                # print next_token
                tempstr += next_token
                predict_sequence.append(next_token)
            print(tempstr)
            result = tempstr

            print('Predict output:', predict_sequence)
        reply = result
        conn.send(reply.encode())
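A minimal client sketch for exercising this prediction server; the host, the UTF-8 encoding and the sample sentence are assumptions, while port 50008 comes from the code above.

# Hypothetical client for the socket server above.
import socket


def request_title(text, host='127.0.0.1', port=50008):
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    client.connect((host, port))
    client.sendall(text.encode('utf-8'))
    reply = client.recv(1024).decode('utf-8')
    client.close()
    return reply


if __name__ == '__main__':
    # Sample sentence (made up); the server segments it with jieba itself.
    print(request_title('今天 有 什么 新闻'))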