def _predict_batch_by_batch(predict_fn, batched_inputs, non_batched_inputs=None, batch_size=BATCH_SIZE, num_outputs=1):
    """
    Splits prediction for a big dataset into batches in order to save GPU memory.
    Equivalent to predict_fn(*(batched_inputs + non_batched_inputs)).

    predict_fn: Compiled theano function.
    batched_inputs: Inputs that we split into batches. On each iteration, we only pass
        one batch of this data into the theano function.
    non_batched_inputs: Inputs that we do not split into batches. These inputs are
        the same for each call of predict_fn.
    batch_size: int
        Size of each batch that we split our batched_inputs into.
    num_outputs: int, default=1
        Number of items returned by the theano function.
    """
    if non_batched_inputs is None:
        non_batched_inputs = []

    results = [[] for _ in range(num_outputs)]

    for inputs_batch in get_training_batch(batched_inputs, batch_size):
        args = list(inputs_batch) + non_batched_inputs
        cur_result = predict_fn(*args)

        if num_outputs > 1:
            for i in range(num_outputs):
                results[i].append(cur_result[i])
        else:
            results[0].append(cur_result)

    if num_outputs > 1:
        return tuple(np.concatenate(results[i]) for i in range(num_outputs))
    else:
        return np.concatenate(results[0])
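# Illustrative sketch only (not part of the original module): a tiny smoke test for the
# helper above, using a plain Python callable in place of a compiled theano function.
# It relies on get_training_batch yielding aligned slices of each batched input, as the
# helper itself assumes.
def _example_predict_batch_by_batch():
    fake_data = np.arange(10, dtype=np.float32).reshape(10, 1)
    scale = np.float32(2.0)
    # one batched input, one non-batched input shared by every call
    doubled = _predict_batch_by_batch(lambda batch, factor: batch * factor,
                                      batched_inputs=[fake_data],
                                      non_batched_inputs=[scale],
                                      batch_size=4)
    assert doubled.shape == fake_data.shape  # batch results are concatenated back together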
def train_model(nn_model):
    _logger.info('\nDefault model save path:\n{}\n'.format(nn_model.model_save_path))

    datasets_collection = _get_datasets(nn_model, nn_model.is_reverse_model)
    _logger.info('Finished preprocessing! Start training')

    batch_id = 0
    best_val_perplexities = (float('inf'), float('inf'))
    cur_val_metrics = None
    # Add (BATCH_SIZE - 1) before dividing so that the last batch, which may be smaller
    # than BATCH_SIZE, is also counted (ceiling division)
    batches_num = (datasets_collection.train.x.shape[0] + BATCH_SIZE - 1) // BATCH_SIZE

    cur_loss = 0
    total_training_time = 0
    start_time = time.time()

    for epoch_id in range(EPOCHS_NUM):
        _logger.info('Starting epoch #{}'.format(epoch_id))

        for train_batch in get_training_batch(
                datasets_collection.train, BATCH_SIZE, random_permute=SHUFFLE_TRAINING_BATCHES):
            train_stats = TrainStats(
                cur_batch_id=batch_id,
                batches_num=batches_num,
                start_time=start_time,
                total_training_time=total_training_time,
                cur_loss=cur_loss,
                best_val_perplexities=best_val_perplexities,
                cur_val_metrics=cur_val_metrics)

            best_val_perplexities, cur_val_metrics = \
                _analyse_model_performance_and_dump_results(nn_model, datasets_collection, train_stats)

            best_val_perplexities = \
                _update_saved_nn_model(nn_model, cur_val_metrics, best_val_perplexities, train_stats)

            prev_time = time.time()

            loss = nn_model.train(*train_batch)
            cur_loss = _get_decayed_avg_loss(cur_loss, loss) if batch_id else loss

            total_training_time += time.time() - prev_time
            batch_id += 1
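# _get_decayed_avg_loss is not shown in this listing. A minimal sketch, assuming it matches
# the exponential moving average used in the older train_model further below; the default
# decay of 0.99 is an assumed stand-in for the module's actual constant (LOG_LOSS_DECAY).
def _get_decayed_avg_loss(avg_loss, new_loss, decay=0.99):
    # keep `decay` of the running average and mix in (1 - decay) of the new batch loss
    return decay * avg_loss + (1 - decay) * new_loss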
def _get_training_batch_generator(self):
    # set a unique random seed for each worker to correctly process batches in multi-GPU training
    horovod_seed = self._horovod.rank() if self._horovod else 0
    epoch_id = 0

    while True:  # infinite batches generator
        epoch_id += 1

        for train_batch in get_training_batch(
                self._training_data,
                self._params.train_batch_size,
                random_permute=SHUFFLE_TRAINING_BATCHES,
                random_seed=RANDOM_SEED * epoch_id + horovod_seed):
            context_tokens_ids, response_tokens_ids, condition_id = train_batch
            # response tokens are wrapped with _start_ and _end_ tokens
            # output shape == (batch_size, seq_len)

            # get input response ids by removing the last sequence token (_end_)
            input_response_tokens_ids = response_tokens_ids[:, :-1]
            # output shape == (batch_size, seq_len - 1)

            # get target response ids by removing the first (_start_) token of the sequence
            target_response_tokens_ids = response_tokens_ids[:, 1:]
            # output shape == (batch_size, seq_len - 1)

            # workaround for using sparse_categorical_crossentropy loss
            # see https://github.com/tensorflow/tensorflow/issues/17150#issuecomment-399776510
            target_response_tokens_ids = np.expand_dims(target_response_tokens_ids, axis=-1)
            # output shape == (batch_size, seq_len - 1, 1)

            init_dec_hs = np.zeros(
                shape=(context_tokens_ids.shape[0], self._decoder_depth, self._params.hidden_layer_dim),
                dtype=K.floatx())

            yield [context_tokens_ids, input_response_tokens_ids, condition_id, init_dec_hs], \
                target_response_tokens_ids
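# Toy illustration (not part of the original code) of the teacher-forcing shift performed
# above: the decoder input drops the trailing _end_ token, the target drops the leading
# _start_ token, so position t of the input is trained to predict position t of the target.
#
#   response_tokens_ids        = [[_start_, i, am, fine, _end_]]
#   input_response_tokens_ids  = [[_start_, i, am, fine]]
#   target_response_tokens_ids = [[i, am, fine, _end_]]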
def _get_batch_generator(self, input_data, batch_size):
    RANDOM_SEED = 7
    DECODER_DEPTH = 2
    UTT_HIDDEN_DIM = 600
    epoch_id = 0

    while True:  # infinite batches generator
        epoch_id += 1

        for train_batch in get_training_batch(
                input_data, batch_size, random_permute=False, random_seed=RANDOM_SEED * epoch_id):
            context_tokens_ids, response_tokens_ids = train_batch
            # response tokens are wrapped with _start_ and _end_ tokens
            # output shape == (batch_size, seq_len)

            # get input response ids by removing the last sequence token (_end_)
            input_response_tokens_ids = response_tokens_ids[:, :-1]
            # output shape == (batch_size, seq_len - 1)

            # get target response ids by removing the first (_start_) token of the sequence
            target_response_tokens_ids = response_tokens_ids[:, 1:]
            # output shape == (batch_size, seq_len - 1)

            # workaround for using sparse_categorical_crossentropy loss
            # see https://github.com/tensorflow/tensorflow/issues/17150#issuecomment-399776510
            target_response_tokens_ids = np.expand_dims(target_response_tokens_ids, axis=-1)
            # output shape == (batch_size, seq_len - 1, 1)

            init_dec_hs = np.zeros(
                shape=(context_tokens_ids.shape[0], DECODER_DEPTH, UTT_HIDDEN_DIM),
                dtype=K.floatx())

            yield [context_tokens_ids, input_response_tokens_ids, init_dec_hs], target_response_tokens_ids
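# A minimal usage sketch, assuming `self._model` is a compiled Keras model whose inputs
# match the three arrays yielded above and `num_samples` is the dataset size; the actual
# call site is not shown in this listing, so the attribute and variable names are
# illustrative only.
#
#   batch_generator = self._get_batch_generator(train_data, batch_size=128)
#   self._model.fit_generator(batch_generator,
#                             steps_per_epoch=num_samples // 128,
#                             epochs=num_epochs)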
def train_model(nn_model, is_reverse_model=False):
    """
    Main function for training. Refactoring anticipated.
    """
    validation_prediction_mode = PREDICTION_MODES.sampling if is_reverse_model else PREDICTION_MODE_FOR_TESTS

    train = load_conditioned_train_set(nn_model.token_to_index, nn_model.condition_to_index)
    context_free_val = load_context_free_val(nn_model.token_to_index)
    context_sensitive_val = load_context_sensitive_val(nn_model.token_to_index, nn_model.condition_to_index)

    if is_reverse_model:
        service_tokens = ServiceTokensIDs(nn_model.token_to_index)
        train = reverse_nn_input(train, service_tokens)
        context_free_val = reverse_nn_input(context_free_val, service_tokens)
        context_sensitive_val = reverse_nn_input(context_sensitive_val, service_tokens)

    # Train subset of the same size as the context-free val set, for metrics calculation
    train_subset = generate_subset(train, VAL_SUBSET_SIZE)

    # Context-sensitive val subset of the same size as the context-free val set, for metrics calculation
    context_sensitive_val_subset = generate_subset(context_sensitive_val, VAL_SUBSET_SIZE)

    _logger.info('Finished preprocessing! Start training')

    batch_id = 0
    avg_loss = 0
    total_training_time = 0
    best_val_perplexities = (float('inf'), float('inf'))
    batches_num = (train.x.shape[0] - 1) // BATCH_SIZE + 1  # ceiling division to count the last, smaller batch
    start_time = time.time()
    cur_val_metrics = None

    try:
        for epoches_counter in range(1, EPOCHES_NUM + 1):
            _logger.info('Starting epoch #%d; time = %0.2f s (training time = %0.2f s)' %
                         (epoches_counter, time.time() - start_time, total_training_time))

            for train_batch in get_training_batch(
                    [train.x, train.y, train.condition_ids],
                    BATCH_SIZE,
                    random_permute=SHUFFLE_TRAINING_BATCHES):
                x_train_batch, y_train_batch, condition_ids_train_batch = train_batch

                batch_id += 1
                prev_time = time.time()

                loss = nn_model.train(x_train_batch, y_train_batch, condition_ids_train_batch)

                cur_time = time.time()
                total_training_time += cur_time - prev_time
                total_time = cur_time - start_time

                avg_loss = LOG_LOSS_DECAY * avg_loss + (1 - LOG_LOSS_DECAY) * loss if batch_id > 1 else loss

                progress = 100 * float(batch_id) / batches_num
                avr_time_per_sample = total_time / batch_id
                expected_time_per_epoch = avr_time_per_sample * batches_num

                _logger.info('batch %s / %s (%d%%) \t'
                             'loss: %.2f \t '
                             'time: epoch %.1f h | '
                             'total %0.1f h | '
                             'train %0.1f h (%.1f%%)' %
                             (batch_id, batches_num, progress, avg_loss, expected_time_per_epoch / 3600,
                              total_time / 3600, total_training_time / 3600,
                              100 * total_training_time / total_time))

                if batch_id % SCREEN_LOG_FREQUENCY_PER_BATCHES == 0:
                    _log_sample_answers(context_free_val.x[:SCREEN_LOG_NUM_TEST_LINES], nn_model,
                                        validation_prediction_mode, is_reverse_model)

                if batch_id % LOG_FREQUENCY_PER_BATCHES == 0:
                    _calc_and_save_train_metrics(nn_model, train_subset, avg_loss)

                    val_metrics = _calc_and_save_val_metrics(
                        nn_model,
                        context_sensitive_val_subset,
                        context_free_val,
                        prediction_mode=validation_prediction_mode)

                    _save_val_results(
                        nn_model,
                        context_free_val.x,
                        context_sensitive_val_subset.x,
                        val_metrics,
                        train_info=(start_time, batch_id, batches_num),
                        prediction_mode=validation_prediction_mode)

                    cur_val_metrics = val_metrics

                    best_val_perplexities = _update_saved_nn_model(
                        nn_model,
                        (val_metrics['context_free_perplexity'], val_metrics['context_sensitive_perplexity']),
                        best_val_perplexities,
                        is_reverse_model=is_reverse_model)
    except (KeyboardInterrupt, SystemExit):
        _logger.info('Training cycle is stopped manually')

    _save_model(nn_model, get_model_full_path(is_reverse_model) + '_final')

    _save_val_results(
        nn_model,
        context_free_val.x,
        context_sensitive_val_subset.x,
        cur_val_metrics,
        train_info=(start_time, batch_id, batches_num),
        suffix='_final',
        prediction_mode=validation_prediction_mode)
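# Usage sketch (illustrative only; the actual entry point is not shown in this listing):
# the same routine trains both the forward and the reverse model, with the reverse model
# getting its inputs flipped via reverse_nn_input and validated in sampling mode. The
# variable names below are hypothetical.
#
#   train_model(forward_nn_model)                         # forward seq2seq model
#   train_model(reverse_nn_model, is_reverse_model=True)  # reverse model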