def main(argv: Sequence[str]):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Saving checkpoints at %s', FLAGS.output_dir)
  tf.random.set_seed(FLAGS.seed)

  if not FLAGS.use_gpu:
    logging.info('Using TPU for training.')
    strategy = utils.get_tpu_strategy(FLAGS.tpu)
  else:
    logging.info('Using GPU for training.')
    strategy = tf.distribute.MirroredStrategy()

  train_dataset, steps_per_epoch = utils.load_dataset(
      FLAGS.data_dir, tfds.Split.TRAIN, FLAGS.batch_size)
  eval_identifiers = ['tune', 'test1', 'test2']
  splits = [tfds.Split.VALIDATION, tfds.Split.TEST, tfds.Split('test2')]
  eval_datasets, steps_per_eval = utils.load_eval_datasets(
      eval_identifiers, splits, FLAGS.data_dir, FLAGS.batch_size)
  logging.info('Steps for eval datasets: %s', steps_per_eval)

  graph_augmenter = None
  if FLAGS.augmentations:
    graph_augmenter = augmentation_utils.GraphAugment(
        FLAGS.augmentations, FLAGS.aug_ratio, FLAGS.aug_prob,
        FLAGS.perturb_node_features, FLAGS.drop_edges_only,
        FLAGS.perturb_edge_features, FLAGS.initialize_edge_features_randomly,
        FLAGS.mask_mean, FLAGS.mask_stddev)

  params = utils.ModelParameters(
      num_heads=FLAGS.num_heads,
      num_layers=FLAGS.num_layers,
      message_layer_size=FLAGS.message_layer_size,
      readout_layer_size=FLAGS.readout_layer_size,
      use_gp_layer=False,
      learning_rate=FLAGS.learning_rate,
      augmentations=FLAGS.augmentations,
      num_epochs=FLAGS.num_epochs,
      steps_per_epoch=steps_per_epoch)

  model_dir = FLAGS.output_dir
  utils.write_params(
      dataclasses.asdict(params), os.path.join(model_dir, 'params.json'))

  summary_writer = tf.summary.create_file_writer(
      os.path.join(model_dir, 'summaries'))
  run(train_dataset=train_dataset,
      eval_datasets=eval_datasets,
      steps_per_eval=steps_per_eval,
      params=params,
      model_dir=model_dir,
      strategy=strategy,
      summary_writer=summary_writer,
      loss_type=FLAGS.loss_type,
      graph_augmenter=graph_augmenter)
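# A minimal sketch of the absl flag definitions main() above assumes. The
# actual names, defaults, and help strings live elsewhere in the repo, so the
# values below are illustrative placeholders only; the remaining model-size and
# augmentation flags (num_heads, aug_ratio, mask_stddev, ...) would follow the
# same pattern.
from absl import flags

flags.DEFINE_string('output_dir', '/tmp/mpnn', 'Directory for checkpoints and summaries.')
flags.DEFINE_string('data_dir', None, 'Directory containing the TFDS dataset.')
flags.DEFINE_string('tpu', None, 'Name of the TPU to use, if any.')
flags.DEFINE_bool('use_gpu', False, 'Whether to train on GPU instead of TPU.')
flags.DEFINE_integer('seed', 42, 'Random seed.')
flags.DEFINE_integer('batch_size', 128, 'Global batch size.')
flags.DEFINE_integer('num_epochs', 100, 'Number of training epochs.')
flags.DEFINE_float('learning_rate', 1e-3, 'RMSprop learning rate.')
flags.DEFINE_list('augmentations', [], 'Graph augmentation names to apply.')
flags.DEFINE_string('loss_type', 'xent', 'Loss type: cross-entropy or focal.')
FLAGS = flags.FLAGS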
                    help='RNN input image step')
parser.add_argument('--max_dist', type=float, default=25., help='max distance')
parser.add_argument('--max_speed', type=float, default=10., help='max speed')
parser.add_argument('--max_t', type=float, default=3., help='max time')
opt = parser.parse_args()
if opt.test_mode:
    opt.batch_size = 1

description = 'dropout'
log_path = 'result/log/' + opt.dataset_name + '/'
os.makedirs('result/saved_models/%s' % opt.dataset_name, exist_ok=True)
os.makedirs('result/output/%s' % opt.dataset_name, exist_ok=True)
os.makedirs('result/output2/%s' % opt.dataset_name, exist_ok=True)
os.makedirs('result/output3/%s' % opt.dataset_name, exist_ok=True)
if not opt.test_mode:
    logger = SummaryWriter(log_dir=log_path)
    write_params(log_path, parser, description)

# generator = Generator(input_dim=128+32+1+1, output=2).to(device)
# discriminator = Discriminator(opt.points_num*2+32+1).to(device)
generator = Generator(input_dim=2 + 2 + 1 + 1, output=2).to(device)
discriminator = Discriminator(opt.points_num * 2 + 2 + 1).to(device)
# encoder = CNN(input_dim=1, out_dim=32).to(device)
encoder = CNNNorm(input_dim=1, out_dim=2).to(device)

encoder.load_state_dict(
    torch.load('result/saved_models/il-uncertainty-02/encoder_119000.pth'))
# DO NOT TRAIN ENCODER
encoder.eval()

# discriminator.load_state_dict(torch.load('result/saved_models/train-gan-costmap-01/discriminator_120000.pth'))
generator.load_state_dict(
    torch.load(
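# A minimal sketch of the preamble the fragment above assumes: a CUDA-aware
# device handle and the start of the argparse parser that defines the options
# referenced later (dataset_name, test_mode, batch_size, points_num). All names,
# defaults, and help strings here are assumptions, not the original definitions.
import argparse

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

parser = argparse.ArgumentParser()
parser.add_argument('--dataset_name', type=str, default='demo', help='run name')
parser.add_argument('--test_mode', action='store_true', help='evaluate instead of train')
parser.add_argument('--batch_size', type=int, default=32, help='batch size')
parser.add_argument('--points_num', type=int, default=16, help='number of trajectory points')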
def run(train_dataset: tf.data.Dataset,
        eval_datasets: Dict[str, tf.data.Dataset],
        steps_per_eval: Dict[str, int],
        params: utils.ModelParameters,
        model_dir: str,
        strategy: tf.distribute.Strategy,
        summary_writer: tf.summary.SummaryWriter,
        loss_type: str,
        graph_augmenter: augmentation_utils.GraphAugment):
  """Trains and evaluates the model."""
  with strategy.scope():
    model = ub.models.mpnn(
        nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
        edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
        num_heads=params.num_heads,
        num_layers=params.num_layers,
        message_layer_size=params.message_layer_size,
        readout_layer_size=params.readout_layer_size,
        use_gp_layer=params.use_gp_layer)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=params.learning_rate)

    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/roc_auc': tf.keras.metrics.AUC(),
    }

    for dataset_name in eval_datasets:
      metrics[f'{dataset_name}/accuracy'] = tf.keras.metrics.CategoricalAccuracy()
      metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
      metrics[f'{dataset_name}/negative_log_likelihood'] = tf.keras.metrics.Mean()
      if dataset_name == 'test2':
        ece_num_bins = 5
      else:
        ece_num_bins = 10
      metrics[f'{dataset_name}/ece'] = rm.metrics.ExpectedCalibrationError(
          num_bins=ece_num_bins)
      metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

  @tf.function
  def train_step(iterator):
    """Training StepFn."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      if len(inputs) == 3:
        features, labels, sample_weights = inputs
      else:
        features, labels = inputs
        sample_weights = 1

      if params.augmentations:
        # TODO(jihyeonlee): For now, choose 1 augmentation function from all
        # possible with equal probability. Allow user to specify number of
        # augmentations to apply per graph.
        features = graph_augmenter.augment(features)

      with tf.GradientTape() as tape:
        probs = model(features, training=True)
        negative_log_likelihood = tf.reduce_mean(
            tf.keras.losses.categorical_crossentropy(labels, probs) *
            sample_weights)
        l2_loss = sum(model.losses)
        if loss_type == 'focal':
          focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
          focal_loss = tf.reduce_mean(
              focal_loss_fn(labels, probs) * sample_weights)
          loss = focal_loss + l2_loss
        else:
          loss = negative_log_likelihood + l2_loss
        # Scale the loss given the tf.distribute.Strategy will reduce sum all
        # gradients. See details in
        # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))

      metrics['train/loss'].update_state(loss)
      metrics['train/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      metrics['train/accuracy'].update_state(labels, probs)
      metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

    for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
      strategy.run(step_fn, args=(next(iterator),))

  @tf.function
  def eval_step(iterator, dataset_name, num_steps):
    """Evaluation StepFn."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      if len(inputs) == 3:
        features, labels, _ = inputs
      else:
        features, labels = inputs
      probs = model(features, training=False)
      negative_log_likelihood = tf.reduce_mean(
          tf.keras.losses.categorical_crossentropy(labels, probs))

      metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
      metrics[f'{dataset_name}/roc_auc'].update_state(labels[:, 1], probs[:, 1])
      metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1], label=labels[:, 1])
      metrics[f'{dataset_name}/brier'].add_batch(probs, label=labels[:, 1])

    for _ in tf.range(tf.cast(num_steps, tf.int32)):
      strategy.run(step_fn, args=(next(iterator),))

  # Makes datasets into distributed version.
  train_dataset = strategy.experimental_distribute_dataset(train_dataset)
  eval_datasets = {
      ds_name: strategy.experimental_distribute_dataset(ds)
      for ds_name, ds in eval_datasets.items()
  }
  logging.info('Number of replicas in sync: %s', strategy.num_replicas_in_sync)

  train_iterator = iter(train_dataset)
  start_time = time.time()
  metrics_history = collections.defaultdict(list)
  for epoch in range(params.num_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    train_step(train_iterator)

    current_step = (epoch + 1) * params.steps_per_epoch
    max_steps = params.steps_per_epoch * params.num_epochs
    time_elapsed = time.time() - start_time
    steps_per_sec = float(current_step) / time_elapsed
    eta_seconds = (max_steps - current_step) / steps_per_sec
    message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
               'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                   current_step / max_steps, epoch + 1, params.num_epochs,
                   steps_per_sec, eta_seconds / 60, time_elapsed / 60))
    logging.info(message)

    # Start evaluation.
    logging.info('Starting to run eval at epoch: %s', epoch)
    for dataset_name, eval_dataset in eval_datasets.items():
      eval_iterator = iter(eval_dataset)
      eval_step(eval_iterator, dataset_name, steps_per_eval[dataset_name])

    metrics_history['epoch'].append(epoch + 1)
    with summary_writer.as_default():
      for name, metric in metrics.items():
        result = utils.get_metric_result_value(metric)
        tf.summary.scalar(name, result, step=epoch + 1)
        metrics_history[name].append(str(result))

    for metric in metrics.values():
      metric.reset_states()

    model.save(os.path.join(model_dir, f'model_{epoch + 1}'), overwrite=True)

  utils.write_params(metrics_history,
                     os.path.join(model_dir, 'metrics_history.json'))
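# A possible shape for utils.get_metric_result_value as used above, written as
# a sketch of the assumed behavior rather than the repo's actual helper: Keras
# metrics expose .result() as a scalar tensor, while the robustness_metrics
# wrappers (ECE, Brier) return a dict from .result(), so the helper normalizes
# both to a plain float before logging.
def get_metric_result_value(metric):
  result = metric.result()
  if isinstance(result, dict):
    # robustness_metrics metrics return {'metric_name': value}.
    return float(list(result.values())[0])
  return float(result)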
def main():
    # build parser and check arguments
    args = _build_parser()
    _check_args(args)

    # Setup Estimator
    '''Estimator name:
        xgb: XGBoost Classifier
        lgb: LightGBM Classifier
        log: Logistic Regression
        knn: KNeighbors Classifier
        rfo: RandomForest Classifier
        ada: AdaBoost Classifier
        ext: ExtraTrees Classifier
        svc: Support Vector Classifier
        keras: Keras Neural Networks
    '''
    if args.estimator != 'all':
        estimators = [args.estimator]
    else:
        estimators = ['xgb', 'lgb', 'log', 'rfo', 'ext', 'ada', 'knn', 'svc']

    # Training neural nets with keras
    if args.train_nn:
        estimator_name = 'keras'
        print('Training %s...' % estimator_name)

        params = {
            'n_features': n_features,
            'n_classes': n_classes,
            'dropout': args.dropout,
            'hidden_unit': args.hidden_unit,
            'n_layers': args.layers,
            'optimizer': args.optimizer,
            'init': args.init,
            'batch_size': args.batch_size,
            'epochs': args.epochs,
        }
        estimator = keras_model(**params)

        train_kwargs = {
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val,
            'score_name': args.score,
            'num': args.num
        }
        _ = estimator.train(**train_kwargs)
        print('params: \n', params)

    # Training random search CV with scikit-learn models
    if args.train_random:
        for estimator_name in estimators:
            print('Training %s...' % estimator_name)

            if estimator_name != 'keras':
                seed = args.seed if args.seed is not None else np.random.randint(100)
                estimator, params = select_model(estimator_name, n_features,
                                                 n_classes, seed)

                # kwargs dict for train and predict
                train_kwargs = {
                    'estimator': estimator,
                    'params': params,
                    'X_train': X_train,
                    'y_train': y_train,
                    'X_val': X_val,
                    'y_val': y_val,
                    'n_iter': args.n_iter,
                    'score_name': args.score,
                }

                # Train model and Predict results
                best_params, best_score, val_score = random_model(**train_kwargs)
                timestamp = get_timestamp()

                # Write params to file
                write_params(estimator_name, best_params, best_score, val_score,
                             timestamp, args.num)

            else:  # estimator_name == 'keras'
                space_params = {
                    'n_features': n_features,
                    'n_classes': n_classes,
                    'dropout': hp.uniform('dropout', .20, .80),
                    'hidden_unit': hp.quniform('hidden_unit', 10, 50, q=1),
                    'n_layers': hp.choice('n_layers', [1, 2, 3, 4]),
                    'optimizer': hp.choice('optimizer', ['adam', 'adadelta', 'sgd']),
                    'init': hp.choice('init', ['glorot_uniform', 'normal', 'uniform']),
                    'batch_size': hp.choice('batch_size', [16, 32, 64, 128]),
                    'epochs': hp.quniform('epochs', 100, 1000, q=1),
                    'score_name': args.score,
                    'num': args.num,
                }

                trials = Trials()
                best_params = fmin(random_nn,
                                   space_params,
                                   algo=tpe.suggest,
                                   max_evals=args.n_iter,
                                   trials=trials)
                print('best_params \n', best_params)

    # Evaluate with ensemble method and predict result
    if args.predict:
        eva_kwargs = {
            'estimators': estimators,
            'threshold': args.threshold,
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val,
            'X_test': X_test,
            'score_name': args.score,
            'n_classes': n_classes,
        }

        # Predict with ensemble voting and write result
        prediction = ensemble(**eva_kwargs)
        if args.ensemble == 'vote':
            result = prediction.vote()
        elif args.ensemble == 'stack':
            result = prediction.stack(args.num_imp)

        timestamp = get_timestamp()
        write_result(result, label_list, timestamp)
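# A minimal sketch of the argument parser main() above assumes. The option
# names mirror the args.* attributes referenced in main(), but the defaults,
# choices, and help strings are guesses, not the original _build_parser().
import argparse

def _build_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--estimator', default='all',
                        help="'all' or one of xgb/lgb/log/rfo/ext/ada/knn/svc")
    parser.add_argument('--train_nn', action='store_true', help='train the Keras network')
    parser.add_argument('--train_random', action='store_true', help='run randomized search CV')
    parser.add_argument('--predict', action='store_true', help='run the ensemble prediction step')
    parser.add_argument('--ensemble', default='vote', choices=['vote', 'stack'])
    parser.add_argument('--score', default='f1', help='metric name used for model selection')
    parser.add_argument('--n_iter', type=int, default=20, help='search iterations')
    parser.add_argument('--seed', type=int, default=None)
    # ... the remaining options (dropout, hidden_unit, layers, optimizer, init,
    # batch_size, epochs, num, threshold, num_imp) follow the same pattern.
    return parser.parse_args()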
def train(self, X_train, y_train, X_val, y_val, score_name, num):
    from keras.models import Sequential
    from keras.layers import Dense, Dropout

    params = {
        'n_features': self.n_features,
        'n_classes': self.n_classes,
        'dropout': self.dropout,
        'hidden_unit': self.hidden_unit,
        'n_layers': self.n_layers,
        'optimizer': self.optimizer,
        'init': self.init,
        'batch_size': self.batch_size,
        'epochs': self.epochs,
    }

    # set last activation function and loss function
    if self.n_classes == 2:
        last_activation = 'sigmoid'
        loss_fn = 'binary_crossentropy'
        n_output = 1
    else:
        last_activation = 'softmax'
        loss_fn = 'categorical_crossentropy'
        n_output = self.n_classes

    lb = LabelBinarizer()
    y_train_onehot = lb.fit_transform(y_train)
    # Reuse the binarizer fitted on the training labels so the validation
    # labels get the same class ordering.
    y_val_onehot = lb.transform(y_val)

    # create model
    model = Sequential()
    model.add(
        Dense(self.hidden_unit,
              input_dim=self.n_features,
              kernel_initializer=self.init,
              activation='relu'))
    model.add(Dropout(rate=self.dropout))
    for i in range(self.n_layers):
        model.add(
            Dense(self.hidden_unit,
                  kernel_initializer=self.init,
                  activation='relu'))
        model.add(Dropout(rate=self.dropout))
    model.add(
        Dense(n_output,
              kernel_initializer=self.init,
              activation=last_activation))

    # Compile model
    model.compile(loss=loss_fn,
                  optimizer=self.optimizer,
                  metrics=['accuracy'])
    model.fit(X_train,
              y_train_onehot,
              batch_size=self.batch_size,
              epochs=self.epochs)

    best_score = model.evaluate(X_train, y_train_onehot)
    best_score = best_score[1]
    print("\n %s: %.2f%%" % (model.metrics_names[1], best_score * 100))

    y_pred = model.predict(X_val)
    if self.n_classes == 2:
        y_pred = np.hstack([np.ones_like(y_pred) - y_pred, y_pred])
    y_pred = np.argmax(y_pred, axis=1)

    # Prediction evaluate score
    val_score = matric_score(y_val, y_pred, score_name)
    print("\nValidation Test %s score: %.2f%%" % (score_name, val_score * 100.0))
    conf_matrix = confusion_matrix(y_val, y_pred)
    print("\nConfusion Matrix: \n", conf_matrix)
    cl_report = classification_report(y_val, y_pred)
    print("\nClassification Report: \n", cl_report)

    # Save model and weights
    timestamp = f"{datetime.datetime.now():%Y%m%d%H%M}"
    save_path = 'saves/'
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    model_json = model.to_json()
    model_path = os.path.join(save_path, timestamp + '.json')
    with open(model_path, "w") as json_file:
        json_file.write(model_json)
    print('Saved model to %s' % model_path)

    # serialize weights to HDF5
    weights_path = os.path.join(save_path, timestamp + '.h5')
    model.save_weights(weights_path)
    print("Saved weight to %s" % weights_path)

    # Write params to file
    estimator_name = 'keras'
    write_params(estimator_name, params, best_score, val_score, timestamp, num)

    return val_score
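# Companion sketch: reloading a model saved by train() above from its JSON
# architecture file and HDF5 weights, using the standard Keras
# model_from_json/load_weights pair. The helper name is hypothetical; the
# timestamp is whatever train() printed when it saved the files.
import os

from keras.models import model_from_json

def load_keras_model(save_path, timestamp):
    with open(os.path.join(save_path, timestamp + '.json')) as json_file:
        model = model_from_json(json_file.read())
    model.load_weights(os.path.join(save_path, timestamp + '.h5'))
    return model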
def main(argv: Sequence[str]):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Saving checkpoints at %s', FLAGS.output_dir)
  tf.random.set_seed(FLAGS.seed)

  if not FLAGS.use_gpu:
    logging.info('Using TPU for training.')
    strategy = utils.get_tpu_strategy(FLAGS.tpu)
  else:
    logging.info('Using GPU for training.')
    strategy = tf.distribute.MirroredStrategy()

  train_dataset, steps_per_epoch = utils.load_dataset(FLAGS.data_dir,
                                                      tfds.Split.TRAIN,
                                                      FLAGS.batch_size)
  eval_identifiers = ['tune', 'test1', 'test2']
  splits = [tfds.Split.VALIDATION, tfds.Split.TEST, tfds.Split('test2')]
  eval_datasets, steps_per_eval = utils.load_eval_datasets(
      eval_identifiers, splits, FLAGS.data_dir, FLAGS.batch_size)
  logging.info('Steps for eval datasets: %s', steps_per_eval)

  params = utils.ModelParameters(
      num_heads=FLAGS.num_heads,
      num_layers=FLAGS.num_layers,
      message_layer_size=FLAGS.message_layer_size,
      readout_layer_size=FLAGS.readout_layer_size,
      use_gp_layer=FLAGS.use_gp_layer,
      learning_rate=FLAGS.learning_rate,
      num_epochs=FLAGS.num_epochs,
      steps_per_epoch=steps_per_epoch)

  gp_layer_kwargs = dict(
      num_inducing=FLAGS.gp_num_inducing,
      gp_kernel_scale=FLAGS.gp_kernel_scale,
      gp_output_bias=FLAGS.gp_output_bias,
      normalize_input=FLAGS.gp_normalize_input,
      gp_cov_momentum=FLAGS.gp_cov_momentum,
      gp_cov_ridge_penalty=FLAGS.gp_cov_ridge_penalty)

  model_dir = FLAGS.output_dir
  utils.write_params(
      dataclasses.asdict(params), os.path.join(model_dir, 'params.json'))
  utils.write_params(gp_layer_kwargs,
                     os.path.join(model_dir, 'gp_layer_kwargs.json'))

  summary_writer = tf.summary.create_file_writer(
      os.path.join(model_dir, 'summaries'))
  run(train_dataset=train_dataset,
      eval_datasets=eval_datasets,
      steps_per_eval=steps_per_eval,
      params=params,
      model_dir=model_dir,
      gp_layer_kwargs=gp_layer_kwargs,
      strategy=strategy,
      summary_writer=summary_writer,
      loss_type=FLAGS.loss_type,
      use_spec_norm=FLAGS.use_spec_norm,
      spec_norm_multiplier=FLAGS.spec_norm_multiplier,
      use_spec_norm_mp=FLAGS.use_spec_norm_mp,
      spec_norm_multiplier_mp=FLAGS.spec_norm_multiplier_mp)
def run(train_dataset: tf.data.Dataset,
        eval_datasets: Dict[str, tf.data.Dataset],
        steps_per_eval: Dict[str, int],
        params: utils.ModelParameters,
        model_dir: str,
        gp_layer_kwargs: Dict[str, Any],
        strategy: tf.distribute.Strategy,
        summary_writer: tf.summary.SummaryWriter,
        loss_type: str,
        use_spec_norm: bool,
        spec_norm_multiplier: float,
        use_spec_norm_mp: bool,
        spec_norm_multiplier_mp: float):
  """Trains and evaluates the model.

  Args:
    train_dataset: tf dataset that provides training data.
    eval_datasets: A dictionary of tf datasets that provides data for model
      evaluation.
    steps_per_eval: A dictionary of steps needed for each evaluation dataset.
    params: ModelParameters object containing MPNN model parameters.
    model_dir: Directory for files generated during training and evaluation.
    gp_layer_kwargs: A dictionary of parameters used for GP layer.
    strategy: tf Distributed training strategy object.
    summary_writer: tf summary writer to log training and evaluation metrics.
    loss_type: str, loss type to use during training. Currently only supports
      focal loss and cross-entropy loss.
    use_spec_norm: Whether to use Spectral normalization for the dense layer.
    spec_norm_multiplier: Multiplier used to control the magnitude of
      eigenvalue of the dense layer weight matrix.
    use_spec_norm_mp: Whether to use Spectral normalization for the MP layer.
    spec_norm_multiplier_mp: Multiplier used to control the magnitude of
      eigenvalue of the MP layer weight matrix.
  """
  with strategy.scope():
    model = ub.models.mpnn(
        nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
        edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
        num_heads=params.num_heads,
        num_layers=params.num_layers,
        message_layer_size=params.message_layer_size,
        readout_layer_size=params.readout_layer_size,
        use_gp_layer=params.use_gp_layer,
        gp_layer_kwargs=gp_layer_kwargs,
        use_spec_norm=use_spec_norm,
        spec_norm_multiplier=spec_norm_multiplier,
        use_spec_norm_mp=use_spec_norm_mp,
        spec_norm_multiplier_mp=spec_norm_multiplier_mp)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=params.learning_rate)

    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/roc_auc': tf.keras.metrics.AUC(),
    }

    for dataset_name in eval_datasets:
      metrics[f'{dataset_name}/accuracy'] = tf.keras.metrics.CategoricalAccuracy()
      metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
      metrics[f'{dataset_name}/negative_log_likelihood'] = tf.keras.metrics.Mean()
      if dataset_name == 'test2':
        ece_num_bins = 5
      else:
        ece_num_bins = 10
      metrics[f'{dataset_name}/ece'] = rm.metrics.ExpectedCalibrationError(
          num_bins=ece_num_bins)
      metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

  def per_replica_train_step_fn(inputs):
    """Per-Replica StepFn."""
    if len(inputs) == 3:
      features, labels, sample_weights = inputs
    else:
      features, labels = inputs
      sample_weights = 1

    with tf.GradientTape() as tape:
      probs = model(features, training=True)
      negative_log_likelihood = tf.reduce_mean(
          tf.keras.losses.categorical_crossentropy(labels, probs) *
          sample_weights)
      l2_loss = sum(model.losses)
      if loss_type == 'focal':
        focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
        focal_loss = tf.reduce_mean(
            focal_loss_fn(labels, probs) * sample_weights)
        loss = focal_loss + l2_loss
      else:
        loss = negative_log_likelihood + l2_loss
      # Scale the loss given the tf.distribute.Strategy will reduce sum all
      # gradients. See details in
      # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
      scaled_loss = loss / strategy.num_replicas_in_sync

    grads = tape.gradient(scaled_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    metrics['train/loss'].update_state(loss)
    metrics['train/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics['train/accuracy'].update_state(labels, probs)
    metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

  def per_replica_eval_step_fn(inputs, dataset_name):
    """Per-Replica StepFn."""
    if len(inputs) == 3:
      features, labels, _ = inputs
    else:
      features, labels = inputs
    probs = model(features, training=False)
    negative_log_likelihood = tf.reduce_mean(
        tf.keras.losses.categorical_crossentropy(labels, probs))

    metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
    metrics[f'{dataset_name}/roc_auc'].update_state(labels[:, 1], probs[:, 1])
    metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1], label=labels[:, 1])
    metrics[f'{dataset_name}/brier'].add_batch(probs, label=labels[:, 1])

  @tf.function
  def distributed_train_step(iterator):
    """Training StepFn."""
    for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
      strategy.run(per_replica_train_step_fn, args=(next(iterator),))

  @tf.function
  def distributed_eval_step(iterator, dataset_name, num_steps):
    """Evaluation StepFn."""
    for _ in tf.range(tf.cast(num_steps, tf.int32)):
      strategy.run(per_replica_eval_step_fn, args=(next(iterator), dataset_name))

  # Makes datasets into distributed version.
  train_dataset = strategy.experimental_distribute_dataset(train_dataset)
  eval_datasets = {
      ds_name: strategy.experimental_distribute_dataset(ds)
      for ds_name, ds in eval_datasets.items()
  }
  logging.info('Number of replicas in sync: %s', strategy.num_replicas_in_sync)

  train_iterator = iter(train_dataset)
  start_time = time.time()
  metrics_history = collections.defaultdict(list)
  for epoch in range(params.num_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    distributed_train_step(train_iterator)

    current_step = (epoch + 1) * params.steps_per_epoch
    max_steps = params.steps_per_epoch * params.num_epochs
    time_elapsed = time.time() - start_time
    steps_per_sec = float(current_step) / time_elapsed
    eta_seconds = (max_steps - current_step) / steps_per_sec
    message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
               'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                   current_step / max_steps, epoch + 1, params.num_epochs,
                   steps_per_sec, eta_seconds / 60, time_elapsed / 60))
    logging.info(message)

    # Start evaluation.
    logging.info('Starting to run eval at epoch: %s', epoch)
    for dataset_name, eval_dataset in eval_datasets.items():
      eval_iterator = iter(eval_dataset)
      distributed_eval_step(eval_iterator, dataset_name,
                            steps_per_eval[dataset_name])

    metrics_history['epoch'].append(epoch + 1)
    with summary_writer.as_default():
      for name, metric in metrics.items():
        result = utils.get_metric_result_value(metric)
        tf.summary.scalar(name, result, step=epoch + 1)
        metrics_history[name].append(str(result))

    for metric in metrics.values():
      metric.reset_states()

    model.save(os.path.join(model_dir, f'model_{epoch + 1}'), overwrite=True)

  utils.write_params(metrics_history,
                     os.path.join(model_dir, 'metrics_history.json'))
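# Optional follow-up sketch: metrics_history is written above with every value
# stringified, so a reader has to cast back to float. This assumes
# utils.write_params serializes to JSON (which the .json filenames suggest);
# the helper name here is hypothetical.
import json
import os

import tensorflow as tf

def load_metrics_history(model_dir):
  with tf.io.gfile.GFile(os.path.join(model_dir, 'metrics_history.json')) as f:
    history = json.load(f)
  # Keep the epoch indices as-is, cast every metric series back to floats.
  return {k: vs if k == 'epoch' else [float(v) for v in vs]
          for k, vs in history.items()}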
def test_write_params(self):
    test_output_dir = self.create_tempdir().full_path
    filename = os.path.join(test_output_dir, 'test_params.json')
    utils.write_params({'a': 1.0}, filename)
    self.assertTrue(tf.io.gfile.exists(filename))
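# A possible companion test, assuming utils.write_params serializes its dict to
# JSON: read the file back and check the payload round-trips. This is a sketch,
# not part of the original test suite, and it assumes json is imported in the
# test module.
def test_write_params_roundtrip(self):
    test_output_dir = self.create_tempdir().full_path
    filename = os.path.join(test_output_dir, 'test_params.json')
    utils.write_params({'a': 1.0}, filename)
    with tf.io.gfile.GFile(filename) as f:
        self.assertEqual(json.load(f), {'a': 1.0})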