Example #1
 def __init__(self, output_dim, hidden_dim, output_length, depth=1, broadcast_state=True, inner_broadcast_state=True, peek=False, dropout=0.25, **kwargs):
     super(Seq2seq, self).__init__()
     layers = []
     if type(depth) not in [list, tuple]:
         depth = (depth, depth)
     broadcast = (depth[0] > 1 and inner_broadcast_state) or broadcast_state
     encoder = LSTMEncoder(output_dim=hidden_dim, state_input=broadcast, **kwargs)
     if peek:
         decoder = LSTMDecoder2(hidden_dim=hidden_dim, output_length=output_length, state_input=encoder if broadcast else False, **kwargs)
     else:
         decoder = LSTMDecoder(hidden_dim=hidden_dim, output_length=output_length, state_input=encoder if broadcast else False, **kwargs)
     lstms = []
     for i in range(1, depth[0]):
         layer = LSTMEncoder(output_dim=hidden_dim, state_input=inner_broadcast_state and (i != 1), return_sequences=True, **kwargs)
         layers.append(layer)
         lstms.append(layer)
         layers.append(Dropout(dropout))
     layers.append(encoder)
     layers.append(Dropout(dropout))
     layers.append(Dense(hidden_dim if depth[1] > 1 else output_dim))
     lstms.append(encoder)
     if inner_broadcast_state:
         for i in range(len(lstms) - 1):
             lstms[i].broadcast_state(lstms[i + 1])
     layers.append(decoder)
     if broadcast_state:
         encoder.broadcast_state(decoder)
     lstms = [decoder]
     for i in range(1, depth[1]):
         layer = LSTMEncoder(output_dim=hidden_dim, state_input=inner_broadcast_state and (i != 1), return_sequences=True, **kwargs)
         layers.append(layer)
         lstms.append(layer)
         layers.append(Dropout(dropout))
     if inner_broadcast_state:
         for i in range(len(lstms) - 1):
             lstms[i].broadcast_state(lstms[i + 1])
     if depth[1] > 1:
         layers.append(TimeDistributedDense(output_dim))
     self.encoder = encoder
     self.decoder = decoder
     for l in layers:
         self.add(l)
     if depth[0] > 1:
         self.layers[0].build()
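
The constructor above chains LSTMEncoder/LSTMDecoder layers (apparently from the seq2seq add-on library) inside a Sequential-style container, optionally broadcasting hidden state between them. A minimal usage sketch, assuming the extra keyword arguments (e.g. input_dim, input_length) are simply forwarded to the first LSTMEncoder through **kwargs as in the code above; all values are placeholders, not from the original source:

# Hedged usage sketch (not from the original source); dimensions are placeholders.
# from seq2seq.models import Seq2seq  # exact import path depends on the seq2seq package version (assumption)
model = Seq2seq(output_dim=10000,      # target vocabulary size
                hidden_dim=500,        # hidden state size of every LSTM
                output_length=20,      # number of decoding steps
                depth=(2, 2),          # 2 encoder layers, 2 decoder layers
                peek=True,             # use LSTMDecoder2 (decoder peeks at the context vector)
                dropout=0.25,
                input_dim=10000,       # assumed to be forwarded to the first LSTMEncoder via **kwargs
                input_length=97)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
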
def train():
    # input_text = ['1 2 3 4 5'
    #               , '6 7 8 9 10'
    #               , '11 12 13 14 15'
    #               , '16 17 18 19 20'
    #               , '21 22 23 24 25']
    # tar_text = ['one two three four five'
    #             , 'six seven eight nine ten'
    #             , 'eleven twelve thirteen fourteen fifteen'
    #             , 'sixteen seventeen eighteen nineteen twenty'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']


    # input_text = ['床 前 明 月 光 ,'
    #               , '举 头 望 明 月 ,'
    #               , '敝 笱 在 梁 ,'
    #               , '齐 子 归 止 ,'
    #               , '21 22 23 24 25']
    # tar_text = ['疑 是 地 上 霜。'
    #             , '低 头 思 故 乡。'
    #             , '其 鱼 鲂 鳏 。'
    #             , '其 从 如 云 。'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']


    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))

    vocab = loaddic('./corpus/dic.txt')

    print('-----------')
    # print(vocab)
    print('-----------')
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # the Keras Embedding layer needs len(vocab) + 1 (index 0 is reserved for masking)
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 97
    tar_maxlen = 20
    output_dim = vocab_size
    hidden_dim = 500

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # map each token to an integer index for encoding
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # map each integer index back to its token for decoding

    decoder_mode = 3  # 0: simplest mode, 1: feed-back mode [1], 2: peek mode [2], 3: attention mode [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                        , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim
                                         , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                             , output_length=tar_maxlen, state_input=False, return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size,
                              output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time:%fsecond!' % (time_end - time_start))

    # input_text = loadfile('./corpus/content-12147.txt')
    # tar_text = loadfile('./corpus/title-12147.txt')
    input_text = loadfile('./corpus/content-12147.txt')
    tar_text = loadfile('./corpus/title-12147.txt')
    time1 = time.time()
    for iter_num in range(1):
        for i in range(0, 58213, 20):
            if i == 58200:
                break
            input_list = []
            tar_list = []
            for tmp_input in input_text[i:i+20]:
                input_list.append(chtokenize(tmp_input))
            for tmp_tar in tar_text[i:i+20]:
                tar_list.append(chtokenize(tmp_tar))
            inputs_train, tars_train = vectorize_stories(input_list, tar_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
            en_de_model.fit(inputs_train, tars_train, batch_size=2, nb_epoch=1, show_accuracy=True)
            print('Current line: ' + str(i))
            print('Current iter_num is:%d' % iter_num)
        # out_predicts = en_de_model.predict(inputs_train)
        # for i_idx, out_predict in enumerate(out_predicts):
        #     predict_sequence = []
        #     for predict_vector in out_predict:
        #         next_index = np.argmax(predict_vector)
        #         next_token = idx_to_word[next_index]
        #         predict_sequence.append(next_token)
        #     print('Target output:', tar_text[i_idx])
        #     print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
    en_de_model.save_weights('en_de_weights.h5')
    print('Train Ended')
    time2 = time.time() - time1
    print(time2)
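
train() above depends on several helpers that are not shown (loaddic, loadfile, chtokenize, vectorize_stories, jiebacut). A minimal sketch of the loading/tokenization helpers, under the assumption that dic.txt holds one token per line and the corpus files hold one pre-segmented sentence per line; the original implementations may differ:

# -*- coding: utf-8 -*-
# Hedged sketches of the unshown helpers; assumptions noted above.

def loaddic(path):
    # vocabulary file: one token per line
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def loadfile(path):
    # corpus file: one sentence per line
    with open(path) as f:
        return [line.strip() for line in f]

def chtokenize(sentence):
    # sentences are assumed to be pre-segmented with spaces (e.g. by jieba)
    return sentence.split()
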
Example #3
def main():
    input_text = [
        '1 2 3 4 5', '6 7 8 9 10', '11 12 13 14 15', '16 17 18 19 20',
        '21 22 23 24 25'
    ]
    tar_text = [
        'one two three four five', 'six seven eight nine ten',
        'eleven twelve thirteen fourteen fifteen',
        'sixteen seventeen eighteen nineteen twenty',
        'twenty_one twenty_two twenty_three twenty_four twenty_five'
    ]

    input_list = []
    tar_list = []

    for tmp_input in input_text:
        input_list.append(tokenize(tmp_input))
    for tmp_tar in tar_text:
        tar_list.append(tokenize(tmp_tar))

    vocab = sorted(
        reduce(lambda x, y: x | y,
               (set(tmp_list) for tmp_list in input_list + tar_list)))
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # the Keras Embedding layer needs len(vocab) + 1 (index 0 is reserved for masking)
    input_maxlen = max(map(len, (x for x in input_list)))
    tar_maxlen = max(map(len, (x for x in tar_list)))
    output_dim = vocab_size
    hidden_dim = 20

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    print('Number of training stories:', len(input_list))
    print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict(
        (c, i + 1) for i, c in enumerate(vocab))  # map each token to an integer index for encoding
    idx_to_word = dict(
        (i + 1, c) for i, c in enumerate(vocab))  # map each integer index back to its token for decoding
    inputs_train, tars_train = vectorize_stories(input_list, tar_list,
                                                 word_to_idx, input_maxlen,
                                                 tar_maxlen, vocab_size)

    decoder_mode = 1  # 0: simplest mode, 1: feed-back mode [1], 2: peek mode [2], 3: attention mode [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=tar_maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=tar_maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=tar_maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time:%fsecond!' % (time_end - time_start))
    for iter_num in range(5000):
        en_de_model.fit(inputs_train,
                        tars_train,
                        batch_size=3,
                        nb_epoch=1,
                        show_accuracy=True)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                predict_sequence.append(next_token)
            print('Target output:', tar_text[i_idx])
            print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
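
main() relies on tokenize and vectorize_stories, which are not shown. Given how the model is wired above (an Embedding over padded index sequences, and a softmax over vocab_size at each of tar_maxlen steps), a plausible sketch, offered only as an assumption about their behaviour, is:

# Hedged sketch of the assumed helpers; shapes match the model built above.
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def tokenize(sent):
    # split a space-separated sentence into word tokens
    return sent.split()

def vectorize_stories(input_list, tar_list, word_to_idx,
                      input_maxlen, tar_maxlen, vocab_size):
    # inputs: padded integer index sequences, shape (n, input_maxlen)
    x = [[word_to_idx[w] for w in sent] for sent in input_list]
    x = pad_sequences(x, maxlen=input_maxlen)
    # targets: one-hot vectors per timestep, shape (n, tar_maxlen, vocab_size)
    y = np.zeros((len(tar_list), tar_maxlen, vocab_size), dtype='float32')
    for i, sent in enumerate(tar_list):
        for t, w in enumerate(sent[:tar_maxlen]):
            y[i, t, word_to_idx[w]] = 1.0
    return x, y
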
def train():
    # input_text = ['1 2 3 4 5'
    #               , '6 7 8 9 10'
    #               , '11 12 13 14 15'
    #               , '16 17 18 19 20'
    #               , '21 22 23 24 25']
    # tar_text = ['one two three four five'
    #             , 'six seven eight nine ten'
    #             , 'eleven twelve thirteen fourteen fifteen'
    #             , 'sixteen seventeen eighteen nineteen twenty'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']

    # input_text = ['Hello this is Tom speaking Is that John ?'
    #               , 'Would you like to go swimming with me ?'
    #               , 'Ok see you then Bye'
    #               , 'Yeah I am free What time shall we meet ?'
    #               , 'How does it taste ?']
    # tar_text = ['Yes this is What s up ?'
    #             , 'That sounds great It s good weather for swimming  I d love to .'
    #             , 'See you'
    #             , 'At 3:00PM'
    #             , 'It tastes good you should try some.']

    # sorting builds the vocabulary as a list (see the commented line below)
    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))

    vocab = []
    dicfile = open('./corpus/dic.txt','r')
    line = dicfile.readline()
    while line:
        vocab.append(line.strip())
        line = dicfile.readline()
    dicfile.close()

    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # the Keras Embedding layer needs len(vocab) + 1 (index 0 is reserved for masking)
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 99
    tar_maxlen = 22
    output_dim = vocab_size
    hidden_dim = 50

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # map each token to an integer index for encoding
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # map each integer index back to its token for decoding

    decoder_mode = 3  # 0: simplest mode, 1: feed-back mode [1], 2: peek mode [2], 3: attention mode [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                        , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim
                                         , output_length=tar_maxlen, state_input=False, return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim
                                             , output_length=tar_maxlen, state_input=False, return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size,
                              output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time:%fsecond!' % (time_end - time_start))

    for iter_num in range(5000):
        # build the data and feed it in batches
        input_text = loadfile('./corpus/content-12147.txt')
        tar_text = loadfile('./corpus/title-12147.txt')
        for i in range(0, 58213, 20):
            if i == 58200:
                break
            input_list = []
            tar_list = []
            for tmp_input in input_text[i:i+20]:
                input_list.append(chtokenize(tmp_input, vocab))
            for tmp_tar in tar_text[i:i+20]:
                tar_list.append(chtokenize(tmp_tar, vocab))
            inputs_train, tars_train = vectorize_stories(input_list, tar_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
            en_de_model.fit(inputs_train, tars_train, batch_size=4, nb_epoch=1, show_accuracy=True)

        # out_predicts = en_de_model.predict(inputs_train)
        # for i_idx, out_predict in enumerate(out_predicts):
        #     predict_sequence = []
        #     for predict_vector in out_predict:
        #         next_index = np.argmax(predict_vector)
        #         next_token = idx_to_word[next_index]
        #         predict_sequence.append(next_token)
        #     print('Target output:', tar_text[i_idx].decode('utf8'))
        #     print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
    en_de_model.save_weights('en_de_weights1-40.h5')
    print ('Train Ended')
Example #5
def main():
    filep = pjoin(m_path, "cut_recovery_10", "Paging")
    file_num = len(os.listdir(pjoin(filep, "input")))

    file_names_idx = list(range(file_num))  # materialise the range so random.shuffle can reorder it in place
    random.shuffle(file_names_idx)
    # train_names_idx = file_names_idx[:file_num/7]
    # test_names_idx = file_names_idx[file_num/7+1:]
    train_names_idx = file_names_idx[:5000]
    test_names_idx = file_names_idx[5001:6000]

    vocab = load_vocab(
        pjoin(m_path, "BaseLine-BigData_1kUE_20ENB_paging-Case_Group_1-Case_1",
              "dic.txt"))
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # the Keras Embedding layer needs len(vocab) + 1 (index 0 is reserved for masking)

    #input_maxlen = max(map(len, (x for x in input_list)))
    #output_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 200
    output_maxlen = 200
    output_dim = vocab_size
    hidden_dim = 300

    # print('-')
    # print('Vocab size:', vocab_size, 'unique words')
    # print('Input max length:', input_maxlen, 'words')
    # print('Target max length:', tar_maxlen, 'words')
    # print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    # print('-')
    # print('Vectorizing the word sequences...')
    word_to_idx = dict(
        (c, i + 1) for i, c in enumerate(vocab))  # map each token to an integer index for encoding
    idx_to_word = dict(
        (i + 1, c) for i, c in enumerate(vocab))  # map each integer index back to its token for decoding

    decoder_mode = 1  # 0: simplest mode, 1: feed-back mode [1], 2: peek mode [2], 3: attention mode [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=output_maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=output_maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=output_maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)

    if decoder_mode == 0:
        en_de_model.add(RepeatVector(output_maxlen))

    en_de_model.add(decoder_top_layer)
    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time:%fsecond!' % (time_end - time_start))

    for iter_num in range(10):
        input_sen, output_sen = load_data(filep, train_names_idx)
        input_list, output_list = io_list(input_sen, output_sen)
        inputs_train, outputs_train = vectorize_stories(
            input_list, output_list, vocab, word_to_idx, input_maxlen,
            output_maxlen, vocab_size)

        en_de_model.fit(inputs_train,
                        outputs_train,
                        batch_size=50,
                        nb_epoch=5,
                        show_accuracy=True)
        out_predicts = en_de_model.predict(inputs_train)

        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                predict_sequence.append(next_token)
            #print('Target output:', toutput_text[i_idx])
            #print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)

    print("test")

    # Rebuild the encoder/decoder stack for the test phase (its structure mirrors the training model above).
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=output_maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=output_maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=output_maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    print('Test 001 - jacoxu')
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=input_maxlen))
Example #7
def main():
    f = open("X_train.pkl", 'r')
    X_train = pickle.load(f)
    '''
    f=open('word2index.pkl','r')
    word2index=pickle.load(f)
    f=open('index2word.pkl','r')
    index2word=pickle.load(f)

    inputs_train, tars_train = vectorize_stories(X_train, X_train, word2index, maxlen, maxlen, vocab_size)
    '''
    X_train = pad_sequences(X_train, maxlen=maxlen)

    decoder_mode = 1  # 0: simplest mode, 1: feed-back mode [1], 2: peek mode [2], 3: attention mode [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(vocab_size))
    en_de_model.add(Activation('softmax'))
    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f second!' % (time_end - time_start))
    for iter_num in range(5000):
        en_de_model.fit(X_train,
                        X_train,
                        batch_size=3,
                        nb_epoch=1,
                        show_accuracy=True)
        out_predicts = en_de_model.predict(X_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            '''
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = index2word[next_index]
                predict_sequence.append(next_token)
            '''
            print('Target output:', X_train[i_idx])
            print('Predict output:', predict_sequence)

        print('Current iter_num is:%d' % iter_num)
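
Note that the loop above fits X_train (a padded index matrix) as both input and target, while the output layer is a per-timestep softmax trained with categorical_crossentropy, which normally expects one-hot targets of shape (n, maxlen, vocab_size). A hedged sketch of that conversion, using a hypothetical helper that is not in the source:

import numpy as np

def to_one_hot(index_matrix, vocab_size):
    # expand (n, maxlen) integer indices into (n, maxlen, vocab_size) one-hot targets
    n, maxlen = index_matrix.shape
    targets = np.zeros((n, maxlen, vocab_size), dtype='float32')
    for i in range(n):
        for t, idx in enumerate(index_matrix[i]):
            targets[i, t, idx] = 1.0
    return targets

# e.g.: en_de_model.fit(X_train, to_one_hot(X_train, vocab_size), batch_size=3, nb_epoch=1)
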
Example #8
def train():
    # input_text = ['1 2 3 4 5'
    #               , '6 7 8 9 10'
    #               , '11 12 13 14 15'
    #               , '16 17 18 19 20'
    #               , '21 22 23 24 25']
    # tar_text = ['one two three four five'
    #             , 'six seven eight nine ten'
    #             , 'eleven twelve thirteen fourteen fifteen'
    #             , 'sixteen seventeen eighteen nineteen twenty'
    #             , 'twenty_one twenty_two twenty_three twenty_four twenty_five']

    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))

    vocab = loaddic('./corpus/smalldic.txt')

    print('-----------')
    # print vocab
    print('-----------')
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # the Keras Embedding layer needs len(vocab) + 1 (index 0 is reserved for masking)
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 70
    tar_maxlen = 17
    output_dim = vocab_size
    hidden_dim = 100

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict(
        (c, i + 1) for i, c in enumerate(vocab))  # map each token to an integer index for encoding
    idx_to_word = dict(
        (i + 1, c) for i, c in enumerate(vocab))  # map each integer index back to its token for decoding

    decoder_mode = 3  # 0: simplest mode, 1: feed-back mode [1], 2: peek mode [2], 3: attention mode [3]
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim,
                                        output_dim=hidden_dim,
                                        output_length=tar_maxlen,
                                        state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim,
                                         output_dim=hidden_dim,
                                         output_length=tar_maxlen,
                                         state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim,
                                             output_dim=hidden_dim,
                                             output_length=tar_maxlen,
                                             state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=hidden_dim,
                  input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))

    en_de_model.load_weights('en_de_weights1-40.h5')

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time:%fsecond!' % (time_end - time_start))

    # # input_text = loadfile('./corpus/content-12147.txt')
    # input_text = loadfile('./corpus/content1-500.txt')
    #
    # input_list = []
    # for tmp_input in input_text:
    #     input_list.append(chtokenize(tmp_input))
    #
    # inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
    #
    # out_predicts = en_de_model.predict(inputs_train)
    # for i_idx, out_predict in enumerate(out_predicts):
    #     predict_sequence = []
    #     tempstr = ''
    #     for predict_vector in out_predict:
    #         next_index = np.argmax(predict_vector)
    #         next_token = idx_to_word[next_index]
    #         # print next_token
    #         tempstr += next_token
    #         predict_sequence.append(next_token)
    #     print tempstr
    #     # print('Predict output:', predict_sequence)
    #
    # print ('Train Ended')

    # def predict(input_text):
    import socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    host = socket.gethostbyname(socket.gethostname())
    port = 50008
    sock.bind((host, port))
    sock.listen(5)
    while True:
        conn, addr = sock.accept()
        data = conn.recv(1024)
        input_texts = []  # avoid shadowing the built-in name `list`
        # input_text = '实际上,上周主管部门就和大唐打过招呼了,内部消息人士透露,国资委已经就李小琳任职问题和大唐进行沟通,但李小琳本人至今未报到。情况比较复杂,上述人士表示,目前还不敢完全确定,不排除后续还有变化。'
        tmp = 'BEG ' + data + ' END'
        tmp = jiebacut(tmp)
        input_texts.append(tmp)
        result = ''
        input_list = []
        for tmp_input in input_texts:
            print(tmp_input)
            print('---!--!---')
            input_list.append(chtokenize(tmp_input))
        inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            tempstr = ''
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                # print next_token
                tempstr += next_token
                predict_sequence.append(next_token)
            print(tempstr)
            result = tempstr

            print('Predict output:', predict_sequence)
        reply = result
        conn.send(reply.encode())
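
The server loop above accepts raw text on a socket, wraps it in BEG/END markers, runs the model, and returns the predicted title over the same connection. A hedged sketch of a matching client (hypothetical, not part of the source):

# -*- coding: utf-8 -*-
import socket

def request_title(text, host='127.0.0.1', port=50008):
    # connect to the prediction server, send the raw text, read back the decoded sequence
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    client.connect((host, port))  # host must match the address the server bound to
    client.send(text.encode('utf-8'))
    reply = client.recv(4096)
    client.close()
    return reply.decode('utf-8')

# e.g.: print(request_title(u'some news article text'))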