def evaluate(self, sess):
    global_step = sess.run(tf.train.get_or_create_global_step())
    test_results = self.run_epoch_generator(sess, self.model,
                                            self._data['eval_loader'].get_iterator(),
                                            return_output=True, training=False)

    # y_preds: a list of (batch_size, horizon, num_nodes, output_dim)
    test_loss, y_preds = test_results['loss'], test_results['outputs']
    utils.add_simple_summary(self._writer, ['loss/test_loss'], [test_loss],
                             global_step=global_step)

    y_preds = np.concatenate(y_preds, axis=0)
    scaler = self._data['scaler']
    predictions = []
    y_truths = []
    for horizon_i in range(self._data['y_eval'].shape[1]):
        y_truth = scaler.inverse_transform(self._data['y_eval'][:, horizon_i, :, 0])
        y_truths.append(y_truth)

        y_pred = scaler.inverse_transform(y_preds[:, horizon_i, :, 0])
        predictions.append(y_pred)

        mse = metrics.masked_mse_np(preds=y_pred, labels=y_truth, null_val=0)
        mae = metrics.masked_mae_np(preds=y_pred, labels=y_truth, null_val=0)
        mape = metrics.masked_mape_np(preds=y_pred, labels=y_truth, null_val=0)
        rmse = metrics.masked_rmse_np(preds=y_pred, labels=y_truth, null_val=0)
        self._logger.info(
            "Horizon {:02d}, MSE: {:.2f}, MAE: {:.2f}, RMSE: {:.2f}, MAPE: {:.4f}".format(
                horizon_i + 1, mse, mae, rmse, mape))
        utils.add_simple_summary(
            self._writer,
            ['%s_%d' % (item, horizon_i + 1)
             for item in ['metric/rmse', 'metric/mae', 'metric/mse']],
            [rmse, mae, mse],
            global_step=global_step)

    outputs = {'predictions': predictions, 'groundtruth': y_truths}
    return outputs
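# The masked metrics used above (and in _test below) come from the project's metrics module,
# which is not shown in this section. A minimal sketch of what masked_mae_np with null_val=0
# is assumed to do -- drop entries whose label equals null_val, then average the absolute
# error over what remains; the repository's actual implementation may differ:
import numpy as np

def masked_mae_np(preds, labels, null_val=np.nan):
    if np.isnan(null_val):
        mask = ~np.isnan(labels)
    else:
        mask = (labels != null_val)
    mask = mask.astype('float32')
    mask /= mask.mean()                  # re-normalize so masking does not bias the mean
    mae = np.abs(preds - labels) * mask
    mae = np.nan_to_num(mae)             # guard against 0 * inf
    return mae.mean()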
def evaluate(self, sess, **kwargs):
    global_step = sess.run(tf.train.get_or_create_global_step())
    test_results = self.run_epoch_generator(sess, 'test', self._test_model,
                                            self._data['test_loader'].get_iterator(),
                                            self._data['scaler'],
                                            return_output=True, training=False)

    # y_preds: a list of (batch_size, horizon, num_nodes, output_dim)
    test_loss, y_preds = test_results['loss'], test_results['outputs']
    self._logger.info('test_mae: %f', np.asscalar(test_loss))
    utils.add_simple_summary(self._writer, ['loss/test_loss'], [test_loss],
                             global_step=global_step)
    return
def evaluate(self, sess, data_test, **kwargs):
    global_step = sess.run(tf.train.get_or_create_global_step())
    test_results = self.run_epoch_generator(sess, self._test_model,
                                            data_test['test_loader'].get_iterator(),
                                            return_output=True, training=False)

    # y_preds: a list of (batch_size, horizon, num_nodes, output_dim)
    test_loss, y_preds = test_results['loss'], test_results['outputs']
    utils.add_simple_summary(self._writer, ['loss/test_loss'], [test_loss],
                             global_step=global_step)

    y_preds = np.concatenate(y_preds, axis=0)
    return y_preds
def evaluate(self, sess, **kwargs):
    y_preds_all = []
    half_length = int(len(self.clusters) / 2)
    sclusters = self.clusters[0:32]
    for cluster in sclusters:
        node_count, adj_mx = self.cluster_data(cluster)
        adj_mx = utils.calculate_random_walk_matrix(adj_mx).T
        adj_mx = self._build_sparse_matrix(adj_mx)
        global_step = sess.run(tf.train.get_or_create_global_step())

        scaler_path = self._kwargs['data'].get('dataset_dir') + '/scaler.npy'
        scaler_data_ = np.load(scaler_path)
        mean, var = scaler_data_[0], scaler_data_[1]
        scaler = StandardScaler(mean=mean, std=var)

        # change val to test before run
        test_data_path = self._kwargs['data'].get('dataset_dir') + '/test_' + str(cluster) + '.tfrecords'
        test_dataset = tf.data.TFRecordDataset([test_data_path])
        test_dataset = test_dataset.map(self._parse_record_fn)
        test_dataset = test_dataset.make_one_shot_iterator()
        test_next_element = test_dataset.get_next()

        test_results = self.run_epoch_generator(sess, self._test_model, test_next_element,
                                                adj_mx, return_output=True, training=False)
        test_loss, y_preds = test_results['loss'], test_results['outputs']
        utils.add_simple_summary(self._writer, ['loss/test_loss'], [test_loss],
                                 global_step=global_step)

        y_preds = np.concatenate(y_preds, axis=0)
        y_preds = scaler.inverse_transform(y_preds[:, self.horizon - 1, :, 0])
        y_preds = y_preds[:, 0:node_count]
        y_preds_all.append(y_preds)

    y_preds_all = np.concatenate(y_preds_all, axis=1)
    return y_preds_all
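# The evaluate() above rebuilds a StandardScaler from a saved (mean, std) pair, and the other
# methods in this section call scaler.inverse_transform() on model outputs. A minimal sketch
# of the scaler interface these snippets assume (the class actually shipped with the
# repository may differ in details):
class StandardScaler:
    """Standardize data by removing the mean and scaling to unit variance."""

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def transform(self, data):
        return (data - self.mean) / self.std

    def inverse_transform(self, data):
        return data * self.std + self.mean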
def _train(self, sess, base_lr, epoch, steps, patience=50, epochs=100,
           min_learning_rate=2e-6, lr_decay_ratio=0.1, save_model=1,
           test_every_n_epochs=10, save_epoch_interval=5, **train_kwargs):
    history = []
    min_val_loss = float('inf')
    wait = 0

    max_to_keep = train_kwargs.get('max_to_keep', 100)
    model_metaname = train_kwargs.get('model_metaname')
    if model_metaname is not None:
        pass
        # saver = tf.train.import_meta_graph(os.path.join('data/model/dcrnn_DR_2_h_12_64-64_lr_0.005_bs_32_0131205604_test', model_metaname))
    else:
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_to_keep)
    model_filename = train_kwargs.get('model_filename')
    if model_filename is not None:
        saver.restore(sess, os.path.join(self._kwargs['base_dir'], model_filename))
        self._epoch = epoch + 1
    else:
        sess.run(tf.global_variables_initializer())
    self._logger.info('Start training ...')

    while self._epoch <= epochs:
        # Learning rate schedule.
        new_lr = max(min_learning_rate,
                     base_lr * (lr_decay_ratio ** np.sum(self._epoch >= np.array(steps))))
        self.set_lr(sess=sess, lr=new_lr)

        start_time = time.time()
        train_results = self.run_epoch_generator(sess, 'train', self._train_model,
                                                 self._data['train_loader'].get_iterator(),
                                                 None, training=True, return_output=True,
                                                 writer=self._writer)
        train_loss, train_mae = train_results['loss'], train_results['mae']
        if train_loss > 1e5:
            self._logger.warning('Gradient explosion detected. Ending...')
            break

        global_step = sess.run(tf.train.get_or_create_global_step())
        # Compute validation error.
        print('--------------------------------------------------------------------------------')
        val_results = self.run_epoch_generator(sess, 'val', self._test_model,
                                               self._data['val_loader'].get_iterator(),
                                               None, return_output=True, training=False)
        val_loss, val_mae = np.asscalar(val_results['loss']), np.asscalar(val_results['mae'])

        y_preds = val_results['outputs']
        scaler = self._data['scaler']
        y_preds = np.concatenate(y_preds, axis=0)
        for horizon_i in range(self._data['y_val'].shape[1]):
            y_truth = scaler.inverse_transform(self._data['y_val'][:, horizon_i, :, 0])
            print('truth', y_truth[0, :5])
            y_pred = scaler.inverse_transform(y_preds[0:5, horizon_i, :, 0])
            print('pred', y_pred[0, :5])

        utils.add_simple_summary(self._writer,
                                 ['loss/train_loss', 'metric/train_mae',
                                  'loss/val_loss', 'metric/val_mae'],
                                 [train_loss, train_mae, val_loss, val_mae],
                                 global_step=global_step)
        end_time = time.time()
        message = 'Epoch [{}/{}] ({}) train_mae: {:.4f}, val_mae: {:.4f} lr:{:.6f} {:.1f}s'.format(
            self._epoch, epochs, global_step, train_mae, val_mae, new_lr, (end_time - start_time))
        self._logger.info(message)
        if self._epoch % test_every_n_epochs == test_every_n_epochs - 1:
            self.evaluate(sess)
        if val_loss < min_val_loss:
            self._logger.info('Val loss decrease from %.4f to %.4f' % (min_val_loss, val_loss))
            min_val_loss = val_loss
        else:
            wait += 1
            if wait > patience:
                self._logger.warning('Early stopping at epoch: %d' % self._epoch)
                break
        if self._epoch % save_epoch_interval == 0:
            wait = 0
            if save_model > 0:
                model_filename = self.save(sess, val_loss)
            self._logger.info('min Val loss %.4f, Val loss %.4f, saving to %s' %
                              (min_val_loss, val_loss, model_filename))  # model_filename

        history.append(val_mae)
        # Increases epoch.
        self._epoch += 1
        sys.stdout.flush()
    return np.min(history)
def _train(self, sess, base_lr, epoch, steps, patience=50, epochs=100,
           min_learning_rate=2e-6, lr_decay_ratio=0.1, save_model=1,
           test_every_n_epochs=10, **train_kwargs):
    history = []
    min_val_loss = float('inf')
    wait = 0

    max_to_keep = train_kwargs.get('max_to_keep', 100)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_to_keep)
    model_filename = train_kwargs.get('model_filename')
    if model_filename is not None:
        saver.restore(sess, model_filename)
        self._epoch = epoch + 1
    else:
        sess.run(tf.global_variables_initializer())
    self._logger.info('Start training ...')

    while self._epoch <= epochs:
        # Learning rate schedule.
        new_lr = max(min_learning_rate,
                     base_lr * (lr_decay_ratio ** np.sum(self._epoch >= np.array(steps))))
        self.set_lr(sess=sess, lr=new_lr)

        start_time = time.time()
        train_results = self.run_epoch_generator(sess, self._train_model,
                                                 self._data['train_loader'].get_iterator(),
                                                 training=True, writer=self._writer)
        train_loss, train_mae = train_results['loss'], train_results['mae']
        if train_loss > 1e5:
            self._logger.warning('Gradient explosion detected. Ending...')
            break

        global_step = sess.run(tf.train.get_or_create_global_step())
        # Compute validation error.
        val_results = self.run_epoch_generator(sess, self._test_model,
                                               self._data['val_loader'].get_iterator(),
                                               training=False)
        val_loss, val_mae = np.asscalar(val_results['loss']), np.asscalar(val_results['mae'])

        utils.add_simple_summary(self._writer,
                                 ['loss/train_loss', 'metric/train_mae',
                                  'loss/val_loss', 'metric/val_mae'],
                                 [train_loss, train_mae, val_loss, val_mae],
                                 global_step=global_step)
        end_time = time.time()
        message = 'Epoch [{}/{}] ({}) train_mae: {:.4f}, val_mae: {:.4f} lr:{:.6f} {:.1f}s'.format(
            self._epoch, epochs, global_step, train_mae, val_mae, new_lr, (end_time - start_time))
        self._logger.info(message)
        test_every_n_epochs = 1
        if self._epoch % test_every_n_epochs == test_every_n_epochs - 1:
            self.evaluate(sess)
        if val_loss <= min_val_loss:
            wait = 0
            if save_model > 0:
                model_filename = self.save(sess, val_loss)
            self._logger.info('Val loss decrease from %.4f to %.4f, saving to %s' %
                              (min_val_loss, val_loss, model_filename))
            min_val_loss = val_loss
        else:
            wait += 1
            if wait > patience:
                self._logger.warning('Early stopping at epoch: %d' % self._epoch)
                break

        history.append(val_mae)
        # Increases epoch.
        self._epoch += 1
        sys.stdout.flush()
    return np.min(history)
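# The learning-rate schedule shared by the _train() variants is a staircase decay:
# np.sum(self._epoch >= np.array(steps)) counts how many milestone epochs have already
# passed, and base_lr is multiplied by lr_decay_ratio once per milestone, floored at
# min_learning_rate. A small, purely illustrative example (the milestone values below
# are made up, not taken from any of the configs in this repository):
import numpy as np

base_lr, lr_decay_ratio, min_learning_rate = 0.01, 0.1, 2e-6
steps = [20, 30, 40]  # hypothetical decay milestones
for epoch in (10, 20, 35, 50):
    new_lr = max(min_learning_rate,
                 base_lr * (lr_decay_ratio ** np.sum(epoch >= np.array(steps))))
    print(epoch, new_lr)  # 10 -> 0.01, 20 -> 0.001, 35 -> 0.0001, 50 -> 1e-05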
def _test(self, sess, **kwargs):
    global_step = sess.run(tf.train.get_or_create_global_step())

    results_summary = pd.DataFrame(index=range(self._run_times))
    results_summary['No.'] = range(self._run_times)

    n_metrics = 4
    # Metrics: MSE, MAE, RMSE, MAPE, ER
    metrics_summary = np.zeros(shape=(self._run_times, self._horizon * n_metrics + 1))

    for i in range(self._run_times):
        self._logger.info('|--- Run time: {}'.format(i))
        # y_test = self._prepare_test_set()
        test_results = self._run_tm_prediction(sess, model=self._test_model)

        # y_preds: a list of (batch_size, horizon, num_nodes, output_dim)
        test_loss, y_preds = test_results['loss'], test_results['y_preds']
        utils.add_simple_summary(self._writer, ['loss/test_loss'], [test_loss],
                                 global_step=global_step)

        y_preds = test_results['y_preds']
        y_preds = np.concatenate(y_preds, axis=0)

        y_truths = test_results['y_truths']
        y_truths = np.concatenate(y_truths, axis=0)

        scaler = self._data['scaler']
        predictions = []

        for horizon_i in range(self._horizon):
            y_truth = scaler.inverse_transform(y_truths[:, horizon_i, :, 0])
            y_pred = scaler.inverse_transform(y_preds[:, horizon_i, :, 0])
            predictions.append(y_pred)

            mse = metrics.masked_mse_np(preds=y_pred, labels=y_truth, null_val=0)
            mae = metrics.masked_mae_np(preds=y_pred, labels=y_truth, null_val=0)
            mape = metrics.masked_mape_np(preds=y_pred, labels=y_truth, null_val=0)
            rmse = metrics.masked_rmse_np(preds=y_pred, labels=y_truth, null_val=0)
            self._logger.info(
                "Horizon {:02d}, MSE: {:.2f}, MAE: {:.2f}, RMSE: {:.2f}, MAPE: {:.4f}".format(
                    horizon_i + 1, mse, mae, rmse, mape))

            metrics_summary[i, horizon_i * n_metrics + 0] = mse
            metrics_summary[i, horizon_i * n_metrics + 1] = mae
            metrics_summary[i, horizon_i * n_metrics + 2] = rmse
            metrics_summary[i, horizon_i * n_metrics + 3] = mape

        tm_pred = scaler.inverse_transform(test_results['tm_pred'])
        g_truth = scaler.inverse_transform(self._data['test_data_norm'][self._seq_len:-self._horizon])
        m_indicator = test_results['m_indicator']

        er = error_ratio(y_pred=tm_pred, y_true=g_truth, measured_matrix=m_indicator)
        metrics_summary[i, -1] = er

        self._save_results(g_truth=g_truth, pred_tm=tm_pred, m_indicator=m_indicator, tag=str(i))
        print('ER: {}'.format(er))

    for horizon_i in range(self._horizon):
        results_summary['mse_{}'.format(horizon_i)] = metrics_summary[:, horizon_i * n_metrics + 0]
        results_summary['mae_{}'.format(horizon_i)] = metrics_summary[:, horizon_i * n_metrics + 1]
        results_summary['rmse_{}'.format(horizon_i)] = metrics_summary[:, horizon_i * n_metrics + 2]
        results_summary['mape_{}'.format(horizon_i)] = metrics_summary[:, horizon_i * n_metrics + 3]

    results_summary['er'] = metrics_summary[:, -1]
    results_summary.to_csv(self._log_dir + 'results_summary.csv', index=False)
    return
def _train(self, sess, base_lr, epoch, steps, patience=50, epochs=100,
           min_learning_rate=2e-6, lr_decay_ratio=0.1, save_model=1,
           test_every_n_epochs=10, **train_kwargs):
    history = []
    min_val_loss = float('inf')
    wait = 0
    training_history = pd.DataFrame()
    losses, val_losses = [], []

    max_to_keep = train_kwargs.get('max_to_keep', 100)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_to_keep)
    model_filename = train_kwargs.get('model_filename')
    continue_train = train_kwargs.get('continue_train')
    if continue_train is True and model_filename is not None:
        saver.restore(sess, model_filename)
        self._epoch = epoch + 1
    else:
        sess.run(tf.global_variables_initializer())
    self._logger.info('Start training ...')

    while self._epoch <= epochs:
        self._logger.info('Training epoch: {}/{}'.format(self._epoch, epochs))
        # Learning rate schedule.
        new_lr = max(min_learning_rate,
                     base_lr * (lr_decay_ratio ** np.sum(self._epoch >= np.array(steps))))
        self.set_lr(sess=sess, lr=new_lr)

        start_time = time.time()
        train_results = self.run_epoch_generator(sess, self.model,
                                                 self._data['train_loader'].get_iterator(),
                                                 training=True, writer=self._writer)
        train_loss, train_mse = train_results['loss'], train_results['mse']
        # if train_loss > 1e5:
        #     self._logger.warning('Gradient explosion detected. Ending...')
        #     break

        global_step = sess.run(tf.train.get_or_create_global_step())
        # Compute validation error.
        val_results = self.run_epoch_generator(sess, self.model,
                                               self._data['val_loader'].get_iterator(),
                                               training=False)
        val_loss, val_mse = val_results['loss'].item(), val_results['mse'].item()

        utils.add_simple_summary(self._writer,
                                 ['loss/train_loss', 'metric/train_mse',
                                  'loss/val_loss', 'metric/val_mse'],
                                 [train_loss, train_mse, val_loss, val_mse],
                                 global_step=global_step)
        end_time = time.time()
        message = 'Epoch [{}/{}] ({}) train_mse: {:f}, val_mse: {:f} lr:{:f} {:.1f}s'.format(
            self._epoch, epochs, global_step, train_mse, val_mse, new_lr, (end_time - start_time))
        self._logger.info(message)

        if val_loss <= min_val_loss:
            wait = 0
            if save_model > 0:
                model_filename = self.save(sess, val_loss)
            self._logger.info('Val loss decrease from %f to %f, saving to %s' %
                              (min_val_loss, val_loss, model_filename))
            min_val_loss = val_loss
        else:
            wait += 1
            if wait > patience:
                self._logger.warning('Early stopping at epoch: %d' % self._epoch)
                break

        history.append(val_mse)
        # Increases epoch.
        self._epoch += 1
        losses.append(train_loss)
        val_losses.append(val_loss)
        sys.stdout.flush()

    training_history['epoch'] = np.arange(self._epoch)
    training_history['loss'] = losses
    training_history['val_loss'] = val_losses
    training_history.to_csv(self._log_dir + 'training_history.csv', index=False)
    return np.min(history)
def _train(self, sess, base_lr, epoch, steps, patience=50, epochs=100,
           min_learning_rate=2e-6, lr_decay_ratio=0.1, save_model=1,
           test_every_n_epochs=10, **train_kwargs):
    history = []
    min_val_loss = float('inf')
    wait = 0

    max_to_keep = train_kwargs.get('max_to_keep', 100)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_to_keep)
    model_filename = train_kwargs.get('model_filename')
    output_file = train_kwargs.get('preds_file')
    gt_file = train_kwargs.get('groundtruth_file')
    if model_filename is not None:
        saver.restore(sess, model_filename)
        self._epoch = epoch + 1
    else:
        sess.run(tf.global_variables_initializer())
    self._logger.info('Start training ...')

    while self._epoch <= epochs:
        # Learning rate schedule.
        new_lr = max(min_learning_rate,
                     base_lr * (lr_decay_ratio ** np.sum(self._epoch >= np.array(steps))))
        self.set_lr(sess=sess, lr=new_lr)

        start_time = time.time()
        train_results = self.run_epoch_generator(sess, self._train_model,
                                                 self._data['train_loader'].get_iterator(),
                                                 training=True, writer=self._writer)
        train_loss, train_mae, train_reg = (train_results['loss'], train_results['mae'],
                                            train_results['reg'])
        # print('reg loss is:', train_reg)
        if train_loss > 1e5:
            self._logger.warning('Gradient explosion detected. Ending...')
            break

        global_step = sess.run(tf.train.get_or_create_global_step())
        # Compute validation error.
        val_results = self.run_epoch_generator(sess, self._test_model,
                                               self._data['val_loader'].get_iterator(),
                                               training=False)
        val_loss, val_mae = np.asscalar(val_results['loss']), np.asscalar(val_results['mae'])

        utils.add_simple_summary(self._writer,
                                 ['loss/train_loss', 'metric/train_mae',
                                  'loss/val_loss', 'metric/val_mae'],
                                 [train_loss, train_mae, val_loss, val_mae],
                                 global_step=global_step)
        end_time = time.time()
        message = 'Epoch [{}/{}] ({}) train_mae: {:.4f}, val_mae: {:.4f} lr:{:.6f} {:.1f}s'.format(
            self._epoch, epochs, global_step, train_mae, val_mae, new_lr, (end_time - start_time))
        self._logger.info(message)

        stt = time.time()
        outputs = self.evaluate(sess)
        # print(outputs['groundtruth'].shape)
        test_gdt = outputs['groundtruth'][:, :, :, 0]
        test_y = outputs['observed'][:, :, :, 0]
        best_pred = outputs['predictions'][:test_gdt.shape[0], :, :, 0]
        # Record the corresponding steps for each node first, then node by node.
        test_gdt = test_gdt.reshape(test_gdt.shape[0], -1, order='F')
        test_y = test_y.reshape(test_gdt.shape[0], -1, order='F')
        best_pred = best_pred.reshape(test_gdt.shape[0], -1, order='F')
        scaler = self._data['scaler']
        print('Test running time: %fs' % (time.time() - stt))

        # --------
        # best_pred = scaler.inverse_transform(best_pred)
        # # np.save('best_pred.npy', best_pred)
        # mape = metrics.masked_mape_np(best_pred, test_y, test_gdt, null_val=0)
        # rmse = metrics.masked_rmse_np(best_pred, test_y, test_gdt, null_val=0)
        # self._logger.info('Overall Test MAPE %.4f, RMSE %.4f' % (mape, rmse))
        # --------

        if val_loss <= min_val_loss:
            wait = 0
            if save_model > 0:
                model_filename = self.save(sess, val_loss)
            self._logger.info('Val loss decrease from %.4f to %.4f, saving to %s' %
                              (min_val_loss, val_loss, model_filename))
            min_val_loss = val_loss

            test_y = scaler.inverse_transform(test_y)
            best_pred = scaler.inverse_transform(best_pred)
            # np.save('best_pred.npy', best_pred)
            mape = metrics.masked_mape_np(best_pred, test_y, test_gdt, null_val=0)
            rmse = metrics.masked_rmse_np(best_pred, test_y, test_gdt, null_val=0)
            self._logger.info('Overall Test MAPE %.4f, RMSE %.4f' % (mape, rmse))
            print(best_pred.shape)
            # np.savetxt(output_file, best_pred, delimiter=',')
            # np.savetxt(gt_file, test_gdt, delimiter=',')
        else:
            wait += 1
            if wait > patience:
                self._logger.warning('Early stopping at epoch: %d' % self._epoch)
                break

        # test_y = scaler.inverse_transform(test_y)
        # best_pred = scaler.inverse_transform(best_pred)
        # mape = metrics.masked_mape_np(best_pred, test_y, test_gdt, null_val=0)
        # rmse = metrics.masked_rmse_np(best_pred, test_y, test_gdt, null_val=0)
        # self._logger.info('Overall Test MAPE %.4f, RMSE %.4f' % (mape, rmse))

        history.append(val_mae)
        # Increases epoch.
        self._epoch += 1
        sys.stdout.flush()
    return np.min(history)
def _train(self, sess, base_lr, epoch, steps, patience=50, epochs=100,
           min_learning_rate=2e-6, lr_decay_ratio=0.1, save_model=1,
           test_every_n_epochs=10, **train_kwargs):
    history = []
    min_val_loss = float('inf')
    wait = 0

    max_to_keep = train_kwargs.get('max_to_keep', 100)
    # The first argument, var_list, specifies which variables to save and restore.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_to_keep)
    model_filename = train_kwargs.get('model_filename')
    if model_filename is not None:
        saver.restore(sess, model_filename)
        self._epoch = epoch + 1
    else:
        sess.run(tf.global_variables_initializer())
    self._logger.info('Start training ...')

    while self._epoch <= epochs:
        # Learning rate schedule.
        new_lr = max(min_learning_rate,
                     base_lr * (lr_decay_ratio ** np.sum(self._epoch >= np.array(steps))))
        self.set_lr(sess=sess, lr=new_lr)

        start_time = time.time()
        train_results = self.run_epoch_generator(sess, self._train_model,
                                                 self._data['train_loader'].get_iterator(),
                                                 training=True, writer=self._writer)
        train_loss, train_mae = train_results['loss'], train_results['mae']
        if train_loss > 1e5:
            self._logger.warning('Gradient explosion detected. Ending...')
            break

        global_step = sess.run(tf.train.get_or_create_global_step())
        '''
        global_step refers to the number of batches seen by the graph. Every time a batch
        is provided, the weights are updated in the direction that minimizes the loss, and
        global_step keeps track of how many batches have been seen so far. When it is passed
        in the minimize() argument list, the variable is increased by one per step; see
        optimizer.minimize(). Its current value can be read with tf.train.global_step(),
        and 0 is the initial value of the global step in this context.
        A helpful post on global_step: https://blog.csdn.net/leviopku/article/details/78508951
        '''
        # Compute validation error.
        val_results = self.run_epoch_generator(sess, self._test_model,
                                               self._data['val_loader'].get_iterator(),
                                               training=False)
        val_loss, val_mae = np.asscalar(val_results['loss']), np.asscalar(val_results['mae'])

        utils.add_simple_summary(self._writer,
                                 ['loss/train_loss', 'metric/train_mae',
                                  'loss/val_loss', 'metric/val_mae'],
                                 [train_loss, train_mae, val_loss, val_mae],
                                 global_step=global_step)
        end_time = time.time()
        message = 'Epoch [{}/{}] ({}) train_mae: {:.4f}, val_mae: {:.4f} lr:{:.6f} {:.1f}s'.format(
            self._epoch, epochs, global_step, train_mae, val_mae, new_lr, (end_time - start_time))
        self._logger.info(message)
        if self._epoch % test_every_n_epochs == test_every_n_epochs - 1:
            self.evaluate(sess)
        if val_loss <= min_val_loss:
            wait = 0
            if save_model > 0:
                # save() also records the config_x.yaml configuration file (x = epoch).
                model_filename = self.save(sess, val_loss)
            self._logger.info('Val loss decrease from %.4f to %.4f, saving to %s' %
                              (min_val_loss, val_loss, model_filename))
            min_val_loss = val_loss
        else:
            wait += 1
            if wait > patience:
                self._logger.warning('Early stopping at epoch: %d' % self._epoch)
                break

        history.append(val_mae)
        # Increases epoch.
        self._epoch += 1
        # Flush so buffered output is written to the log right away instead of waiting
        # for the output buffer to fill.
        sys.stdout.flush()
    return np.min(history)
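# The comment block above describes global_step. A minimal TF1-style sketch (not part of
# the original code, just a toy loss) showing how the counter is created and incremented
# through minimize():
import tensorflow as tf

w = tf.Variable(3.0)
loss = tf.square(w)  # toy loss just to drive the optimizer
global_step = tf.train.get_or_create_global_step()
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step=global_step)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)                              # each run increments global_step by one
    print(tf.train.global_step(sess, global_step))  # -> 1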
def _train(self, sess, base_lr, epoch, steps, patience=50, epochs=100,
           min_learning_rate=2e-6, lr_decay_ratio=0.1, save_model=1,
           test_every_n_epochs=10, **train_kwargs):
    val_history, train_history = [], []
    min_val_loss = float('inf')
    wait = 0
    steps.append(float('inf'))  # to keep the final learning rate up to the end

    max_to_keep = train_kwargs.get('max_to_keep', 100)
    cl_decay_steps = self._model_kwargs.get('cl_decay_steps')
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_to_keep)
    model_filename = train_kwargs.get('model_filename')
    if model_filename is not None:
        saver.restore(sess, model_filename)
        self._epoch = epoch + 1
        min_val_loss = float(os.path.basename(model_filename).split('-')[1])
    else:
        sess.run(tf.global_variables_initializer())
    self._logger.info('Start training ...')

    while self._epoch < epochs:
        # Learning rate schedule.
        new_lr = max(min_learning_rate,
                     base_lr * (lr_decay_ratio ** np.sum(self._epoch >= np.array(steps))))
        self.set_lr(sess=sess, lr=new_lr)

        start_time = time.time()
        train_results = self.run_epoch_generator(sess, self._train_model,
                                                 self._data['train_loader'].get_iterator(),
                                                 training=True, writer=self._writer)
        train_loss, train_mae = train_results['loss'], train_results['mae']
        if train_loss > 1e5:
            self._logger.warning('Gradient explosion detected. Ending...')
            break

        global_step = sess.run(tf.train.get_or_create_global_step())
        # Compute validation error.
        val_results = self.run_epoch_generator(sess, self._test_model,
                                               self._data['val_loader'].get_iterator(),
                                               training=False)
        val_loss, val_mae = np.asscalar(val_results['loss']), np.asscalar(val_results['mae'])

        utils.add_simple_summary(self._writer,
                                 ['loss/train_loss', 'metric/train_mae',
                                  'loss/val_loss', 'metric/val_mae'],
                                 [train_loss, train_mae, val_loss, val_mae],
                                 global_step=global_step)
        end_time = time.time()
        # Current sampling probability of curriculum learning (scheduled sampling).
        cl_threshold = self._train_model._compute_sampling_threshold(global_step, cl_decay_steps).eval()
        message = 'Epoch [{}/{}] ({}) train_mae: {:.5f}, val_mae: {:.5f}, lr: {:.5f}, cl_thres: {:.3f}, t: {:.1f}min'.format(
            self._epoch, epochs, global_step, train_mae, val_mae, new_lr, cl_threshold,
            (end_time - start_time) / 60)
        self._logger.info(message)
        if self._epoch % test_every_n_epochs == test_every_n_epochs - 1:
            self.evaluate(sess)
        if val_loss <= min_val_loss:
            wait = 0
            if save_model > 0:
                model_filename = self.save(sess, val_loss)
            self._logger.info('Val loss decrease from %.4f to %.4f, saving to %s' %
                              (min_val_loss, val_loss, model_filename))
            min_val_loss = val_loss
        else:
            wait += 1
            if wait >= patience:
                self._logger.warning('Lowering learning rate prematurely.')

        val_history.append(val_mae)
        train_history.append(train_mae)

        # Restore best weights before lowering the learning rate in the next epoch.
        if steps[np.sum(self._epoch >= np.array(steps))] == self._epoch + 1 or wait >= patience:
            self._logger.info('Restore model from epoch {}: {}'.format(
                (self._epoch - wait), os.path.basename(model_filename)))
            saver.restore(sess, model_filename)
            self._epoch = self._epoch - wait  # Go back to epoch...
            steps[np.sum(self._epoch >= np.array(steps))] = self._epoch + 1
            wait = 0  # Reset patience iterator.

        # Increases epoch.
        self._epoch += 1
        sys.stdout.flush()
    return val_history, train_history
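# _compute_sampling_threshold(global_step, cl_decay_steps) above is logged as cl_thres.
# In DCRNN-style scheduled sampling this threshold is typically an inverse-sigmoid decay of
# the probability of feeding the decoder the ground truth instead of its own prediction.
# A NumPy sketch of that assumed formula (the model's actual method may differ):
import numpy as np

def compute_sampling_threshold(global_step, k):
    # Starts near 1 (mostly ground truth) and decays toward 0 (mostly model predictions).
    return k / (k + np.exp(global_step / k))

print(compute_sampling_threshold(0, 2000))      # ~0.9995
print(compute_sampling_threshold(20000, 2000))  # ~0.083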