import os

import numpy as np
from keras.models import load_model
from sklearn.model_selection import train_test_split

# Project-level helpers (CGEDReader, create_model, callback, evaluate,
# data_generator, save_word_dict, load_word_dict, logger) are assumed to be
# provided by the surrounding package.


def train(train_path=None, save_model_path=None, batch_size=64, epochs=10, rnn_hidden_dim=200):
    print('Training model...')
    data_reader = CGEDReader(train_path)
    input_texts, target_texts = data_reader.build_dataset(train_path)
    print('input_texts:', input_texts[0])
    print('target_texts:', target_texts[0])

    input_characters = data_reader.read_vocab(input_texts)
    target_characters = data_reader.read_vocab(target_texts)
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    max_encoder_seq_len = max([len(text) for text in input_texts])
    max_decoder_seq_len = max([len(text) for text in target_texts])

    print('num of samples:', len(input_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('num of unique output tokens:', num_decoder_tokens)
    print('max sequence length for inputs:', max_encoder_seq_len)
    print('max sequence length for outputs:', max_decoder_seq_len)

    input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
    target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

    encoder_input_data = np.zeros(
        (len(input_texts), max_encoder_seq_len, num_encoder_tokens), dtype='float32')
    decoder_input_data = np.zeros(
        (len(input_texts), max_decoder_seq_len, num_decoder_tokens), dtype='float32')
    decoder_target_data = np.zeros(
        (len(input_texts), max_decoder_seq_len, num_decoder_tokens), dtype='float32')

    # one-hot representation
    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.0
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    logger.info("Data loaded.")

    # model
    logger.info("Training seq2seq model...")
    model = create_model(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim)
    # callbacks for saving the model during training
    callbacks_list = callback(save_model_path, logger)
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks_list)
    logger.info("Training has finished.")

    # NOTE: `eval` here is the project's evaluation helper, which shadows the
    # Python builtin of the same name.
    eval(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim, input_token_index,
         target_token_index, max_decoder_seq_len, encoder_input_data, input_texts)
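# A minimal, self-contained sketch (illustrative only, not project code) of the
# teacher-forcing shift built above: decoder_target_data at timestep t - 1 holds
# the character that decoder_input_data holds at timestep t, so the target
# sequence is the input sequence shifted left by one step. The '\t'/'\n'
# start/end markers below are an assumption borrowed from the Keras seq2seq
# tutorial convention; the real markers come from CGEDReader.
def demo_teacher_forcing_shift():
    toy_target = '\t你好\n'
    toy_index = {char: i for i, char in enumerate(sorted(set(toy_target)))}
    dec_in = np.zeros((1, len(toy_target), len(toy_index)), dtype='float32')
    dec_out = np.zeros((1, len(toy_target), len(toy_index)), dtype='float32')
    for t, char in enumerate(toy_target):
        dec_in[0, t, toy_index[char]] = 1.0
        if t > 0:
            dec_out[0, t - 1, toy_index[char]] = 1.0
    # Every target timestep equals the next input timestep.
    assert np.allclose(dec_out[:, :-1], dec_in[:, 1:])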
def train(train_path=None, save_model_path=None, encoder_model_path=None, decoder_model_path=None,
          save_input_token_path=None, save_target_token_path=None,
          batch_size=64, epochs=10, rnn_hidden_dim=200):
    print('Training model...')
    data_reader = CGEDReader(train_path)
    input_texts, target_texts = data_reader.build_dataset(train_path)
    print('input_texts:', input_texts[0])
    print('target_texts:', target_texts[0])

    input_characters = data_reader.read_vocab(input_texts)
    target_characters = data_reader.read_vocab(target_texts)
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    max_input_texts_len = max([len(text) for text in input_texts])
    max_target_texts_len = max([len(text) for text in target_texts])

    print('num of samples:', len(input_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('num of unique output tokens:', num_decoder_tokens)
    print('max sequence length for inputs:', max_input_texts_len)
    print('max sequence length for outputs:', max_target_texts_len)

    input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
    target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])
    # save word dict
    save_word_dict(input_token_index, save_input_token_path)
    save_word_dict(target_token_index, save_target_token_path)

    encoder_input_data = np.zeros(
        (len(input_texts), max_input_texts_len, num_encoder_tokens), dtype='float32')
    decoder_input_data = np.zeros(
        (len(input_texts), max_target_texts_len, num_decoder_tokens), dtype='float32')
    decoder_target_data = np.zeros(
        (len(input_texts), max_target_texts_len, num_decoder_tokens), dtype='float32')

    # one-hot representation
    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.0
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    logger.info("Data loaded.")

    # split into train and val
    encoder_input_data_train, encoder_input_data_val, \
    decoder_input_data_train, decoder_input_data_val, \
    decoder_target_data_train, decoder_target_data_val = train_test_split(
        encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.1)

    # model
    logger.info("Training seq2seq model...")
    model, encoder_model, decoder_model = create_model(
        num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim)
    # Run training
    callbacks_list = callback(save_model_path, logger)
    model.fit_generator(
        generator=data_generator(encoder_input_data_train, decoder_input_data_train,
                                 decoder_target_data_train, batch_size),
        steps_per_epoch=(len(encoder_input_data_train) + batch_size - 1) // batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=([encoder_input_data_val, decoder_input_data_val], decoder_target_data_val),
        callbacks=callbacks_list)
    encoder_model.save(encoder_model_path)
    decoder_model.save(decoder_model_path)
    logger.info("Model saved to " + save_model_path)
    logger.info("Training has finished.")

    evaluate(encoder_model, decoder_model, num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim,
             target_token_index, max_target_texts_len, encoder_input_data_val, input_texts)
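# `data_generator` is called above but not defined in this section. Below is a
# minimal sketch of a generator with a compatible interface, assuming it yields
# ([encoder_input, decoder_input], decoder_target) batches forever, which is
# what fit_generator expects together with the ceil-division steps_per_epoch
# used above. The project's actual implementation may shuffle or otherwise differ.
def data_generator(encoder_input_data, decoder_input_data, decoder_target_data, batch_size):
    num_samples = len(encoder_input_data)
    while True:  # Keras consumes steps_per_epoch batches per epoch
        for start in range(0, num_samples, batch_size):
            end = start + batch_size
            yield ([encoder_input_data[start:end], decoder_input_data[start:end]],
                   decoder_target_data[start:end])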
def train(train_path=None, save_model_path=None, encoder_model_path=None, decoder_model_path=None,
          save_input_token_path=None, save_target_token_path=None,
          batch_size=64, epochs=10, rnn_hidden_dim=200):
    print('Training model...')
    data_reader = CGEDReader(train_path)
    input_texts, target_texts = data_reader.build_dataset(train_path)
    print('input_texts:', input_texts[0])
    print('target_texts:', target_texts[0])
    max_input_texts_len = max([len(text) for text in input_texts])
    max_target_texts_len = max([len(text) for text in target_texts])

    print('num of samples:', len(input_texts))
    print('max sequence length for inputs:', max_input_texts_len)
    print('max sequence length for outputs:', max_target_texts_len)

    # load or save word dict
    if os.path.exists(save_input_token_path) and os.path.exists(save_target_token_path):
        input_token_index = load_word_dict(save_input_token_path)
        target_token_index = load_word_dict(save_target_token_path)
    else:
        input_characters = data_reader.read_vocab(input_texts)
        target_characters = data_reader.read_vocab(target_texts)
        input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
        target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])
        save_word_dict(input_token_index, save_input_token_path)
        save_word_dict(target_token_index, save_target_token_path)

    encoder_input_data = np.zeros(
        (len(input_texts), max_input_texts_len, len(input_token_index)), dtype='float32')
    decoder_input_data = np.zeros(
        (len(input_texts), max_target_texts_len, len(target_token_index)), dtype='float32')
    decoder_target_data = np.zeros(
        (len(input_texts), max_target_texts_len, len(target_token_index)), dtype='float32')

    # one-hot representation; characters missing from a loaded vocabulary are skipped
    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            if char in input_token_index:
                encoder_input_data[i, t, input_token_index[char]] = 1.0
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            if char in target_token_index:
                decoder_input_data[i, t, target_token_index[char]] = 1.0
                if t > 0:
                    decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    logger.info("Data loaded.")

    # model: resume from saved weights if they exist
    logger.info("Training seq2seq model...")
    if os.path.exists(save_model_path) and os.path.exists(encoder_model_path):
        model = load_model(save_model_path)
        encoder_model = load_model(encoder_model_path)
        decoder_model = load_model(decoder_model_path)
    else:
        model, encoder_model, decoder_model = create_model(
            len(input_token_index), len(target_token_index), rnn_hidden_dim)
    # Run training
    callbacks_list = callback(save_model_path, logger)
    model.fit(x=[encoder_input_data, decoder_input_data],
              y=decoder_target_data,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks_list)
    encoder_model.save(encoder_model_path)
    decoder_model.save(decoder_model_path)
    logger.info("Model saved to " + save_model_path)
    logger.info("Training has finished.")

    evaluate(encoder_model, decoder_model, len(input_token_index), len(target_token_index),
             rnn_hidden_dim, target_token_index, max_target_texts_len,
             encoder_input_data, input_texts)
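# `save_word_dict` / `load_word_dict` are assumed helpers that persist the
# char-to-index vocabularies between runs, so a resumed model keeps the same
# indices. A minimal sketch using JSON (which safely round-trips control
# characters such as '\t' and '\n' markers); the project's actual on-disk
# format may differ.
import json


def save_word_dict(dict_data, save_path):
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(dict_data, f, ensure_ascii=False)


def load_word_dict(save_path):
    with open(save_path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Hypothetical usage; every path below is a placeholder, not a project default.
if __name__ == '__main__':
    train(train_path='data/cged_train.txt',
          save_model_path='output/seq2seq_model.h5',
          encoder_model_path='output/encoder_model.h5',
          decoder_model_path='output/decoder_model.h5',
          save_input_token_path='output/input_vocab.json',
          save_target_token_path='output/target_vocab.json')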