Exemplo n.º 1
0
def main(argv: Sequence[str]):
    """Configure a training job from FLAGS and hand it to `run`.

    Creates the output directory, seeds TensorFlow, selects a distribution
    strategy, loads the training and evaluation datasets, optionally builds a
    graph augmenter, writes the model parameters to `params.json` and then
    starts training/evaluation through `run`.

    Args:
      argv: Positional command-line arguments; only the program name is
        accepted.

    Raises:
      app.UsageError: If more than one positional argument is present.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    model_dir = FLAGS.output_dir
    tf.io.gfile.makedirs(model_dir)
    logging.info('Saving checkpoints at %s', model_dir)
    tf.random.set_seed(FLAGS.seed)

    # GPU runs use a mirrored strategy; everything else goes to the TPU.
    if FLAGS.use_gpu:
        logging.info('Using GPU for training.')
        strategy = tf.distribute.MirroredStrategy()
    else:
        logging.info('Using TPU for training.')
        strategy = utils.get_tpu_strategy(FLAGS.tpu)

    train_dataset, steps_per_epoch = utils.load_dataset(
        FLAGS.data_dir, tfds.Split.TRAIN, FLAGS.batch_size)

    # Evaluation dataset name -> split it is loaded from (order matters).
    eval_splits = {
        'tune': tfds.Split.VALIDATION,
        'test1': tfds.Split.TEST,
        'test2': tfds.Split('test2'),
    }
    eval_datasets, steps_per_eval = utils.load_eval_datasets(
        list(eval_splits.keys()), list(eval_splits.values()),
        FLAGS.data_dir, FLAGS.batch_size)
    logging.info('Steps for eval datasets: %s', steps_per_eval)

    # The augmenter only exists when augmentations were requested.
    if FLAGS.augmentations:
        graph_augmenter = augmentation_utils.GraphAugment(
            FLAGS.augmentations,
            FLAGS.aug_ratio,
            FLAGS.aug_prob,
            FLAGS.perturb_node_features,
            FLAGS.drop_edges_only,
            FLAGS.perturb_edge_features,
            FLAGS.initialize_edge_features_randomly,
            FLAGS.mask_mean,
            FLAGS.mask_stddev)
    else:
        graph_augmenter = None

    params = utils.ModelParameters(
        num_heads=FLAGS.num_heads,
        num_layers=FLAGS.num_layers,
        message_layer_size=FLAGS.message_layer_size,
        readout_layer_size=FLAGS.readout_layer_size,
        use_gp_layer=False,
        learning_rate=FLAGS.learning_rate,
        augmentations=FLAGS.augmentations,
        num_epochs=FLAGS.num_epochs,
        steps_per_epoch=steps_per_epoch)

    # Record the configuration next to the checkpoints.
    params_path = os.path.join(model_dir, 'params.json')
    utils.write_params(dataclasses.asdict(params), params_path)

    summaries_dir = os.path.join(model_dir, 'summaries')
    summary_writer = tf.summary.create_file_writer(summaries_dir)

    run(train_dataset=train_dataset,
        eval_datasets=eval_datasets,
        steps_per_eval=steps_per_eval,
        params=params,
        model_dir=model_dir,
        strategy=strategy,
        summary_writer=summary_writer,
        loss_type=FLAGS.loss_type,
        graph_augmenter=graph_augmenter)
Exemplo n.º 2
0
                    help='RNN input image step')
# Remaining command-line options; all arguments are parsed into `opt`.
parser.add_argument('--max_dist', type=float, default=25., help='max distance')
parser.add_argument('--max_speed', type=float, default=10., help='max speed')
parser.add_argument('--max_t', type=float, default=3., help='max time')
opt = parser.parse_args()
# Test mode always runs with a batch size of 1.
if opt.test_mode: opt.batch_size = 1

description = 'dropout'
log_path = 'result/log/' + opt.dataset_name + '/'
# Create the per-dataset output directories; existing ones are left alone.
os.makedirs('result/saved_models/%s' % opt.dataset_name, exist_ok=True)
os.makedirs('result/output/%s' % opt.dataset_name, exist_ok=True)
os.makedirs('result/output2/%s' % opt.dataset_name, exist_ok=True)
os.makedirs('result/output3/%s' % opt.dataset_name, exist_ok=True)
# The summary writer and the parameter dump are only set up outside test
# mode, so `logger` is undefined when `opt.test_mode` is set.
if not opt.test_mode:
    logger = SummaryWriter(log_dir=log_path)
    write_params(log_path, parser, description)

# generator = Generator(input_dim=128+32+1+1, output=2).to(device)
# discriminator = Discriminator(opt.points_num*2+32+1).to(device)

# Build the networks on `device` (defined outside this fragment).
generator = Generator(input_dim=2 + 2 + 1 + 1, output=2).to(device)
discriminator = Discriminator(opt.points_num * 2 + 2 + 1).to(device)
# encoder = CNN(input_dim=1, out_dim=32).to(device)
encoder = CNNNorm(input_dim=1, out_dim=2).to(device)
# Restore the encoder weights from a fixed checkpoint path.
encoder.load_state_dict(
    torch.load('result/saved_models/il-uncertainty-02/encoder_119000.pth'))
# DO NOT TRAIN ENCODER
encoder.eval()
# discriminator.load_state_dict(torch.load('result/saved_models/train-gan-costmap-01/discriminator_120000.pth'))
generator.load_state_dict(
    torch.load(
Exemplo n.º 3
0
def run(train_dataset: tf.data.Dataset, eval_datasets: Dict[str,
                                                            tf.data.Dataset],
        steps_per_eval: Dict[str, int], params: utils.ModelParameters,
        model_dir: str, strategy: tf.distribute.Strategy,
        summary_writer: tf.summary.SummaryWriter, loss_type: str,
        graph_augmenter: augmentation_utils.GraphAugment):
    """Trains and evaluates the model.

    Builds the MPNN model, the optimizer and the metrics inside
    `strategy.scope()`. Then, for each of `params.num_epochs` epochs, it runs
    `params.steps_per_epoch` training steps, evaluates on every dataset in
    `eval_datasets`, writes all metric values as scalar summaries, resets the
    metrics and saves the model under `model_dir/model_<epoch>`. After the
    last epoch the collected metric history is written to
    `model_dir/metrics_history.json`.

    Args:
      train_dataset: tf dataset that provides training data.
      eval_datasets: Dictionary mapping a dataset name to the tf dataset
        evaluated under that name.
      steps_per_eval: Dictionary mapping each evaluation dataset name to the
        number of evaluation steps to run on it.
      params: ModelParameters object; the fields read here are num_heads,
        num_layers, message_layer_size, readout_layer_size, use_gp_layer,
        learning_rate, augmentations, num_epochs and steps_per_epoch.
      model_dir: Directory where the per-epoch models and the metrics history
        are written.
      strategy: tf distribution strategy under which the model is built and
        the train/eval steps are run.
      summary_writer: tf summary writer used to log the metrics every epoch.
      loss_type: 'focal' selects sigmoid focal cross-entropy; any other value
        selects categorical cross-entropy.
      graph_augmenter: Object whose `augment(features)` is applied to the
        training features when `params.augmentations` is truthy; it is not
        used otherwise.
    """
    with strategy.scope():
        # Input shapes are taken from the dataset spec with the leading
        # (batch) dimension dropped.
        model = ub.models.mpnn(
            nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
            edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
            num_heads=params.num_heads,
            num_layers=params.num_layers,
            message_layer_size=params.message_layer_size,
            readout_layer_size=params.readout_layer_size,
            use_gp_layer=params.use_gp_layer)
        optimizer = tf.keras.optimizers.RMSprop(
            learning_rate=params.learning_rate)
        metrics = {
            'train/negative_log_likelihood': tf.keras.metrics.Mean(),
            'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
            'train/loss': tf.keras.metrics.Mean(),
            'train/roc_auc': tf.keras.metrics.AUC(),
        }

        # One set of evaluation metrics per evaluation dataset, keyed as
        # '<dataset_name>/<metric>'.
        for dataset_name in eval_datasets:
            metrics[
                f'{dataset_name}/accuracy'] = tf.keras.metrics.CategoricalAccuracy(
                )
            metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
            metrics[
                f'{dataset_name}/negative_log_likelihood'] = tf.keras.metrics.Mean(
                )
            # 'test2' uses fewer calibration bins than the other datasets.
            if dataset_name == 'test2':
                ece_num_bins = 5
            else:
                ece_num_bins = 10
            metrics[
                f'{dataset_name}/ece'] = rm.metrics.ExpectedCalibrationError(
                    num_bins=ece_num_bins)
            metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

    @tf.function
    def train_step(iterator):
        """Training StepFn."""
        def step_fn(inputs):
            """Per-Replica StepFn."""
            # Batches are (features, labels) or
            # (features, labels, sample_weights).
            if len(inputs) == 3:
                features, labels, sample_weights = inputs
            else:
                features, labels = inputs
                # No weights supplied: multiplying by 1 leaves the
                # per-example losses unchanged.
                sample_weights = 1

            if params.augmentations:
                # TODO(jihyeonlee): For now, choose 1 augmentation function from all
                # possible with equal probability. Allow user to specify number of
                # augmentations to apply per graph.
                features = graph_augmenter.augment(features)

            with tf.GradientTape() as tape:
                probs = model(features, training=True)
                # Always computed, because it is reported as a metric even
                # when the focal loss is the training objective.
                negative_log_likelihood = tf.reduce_mean(
                    tf.keras.losses.categorical_crossentropy(labels, probs) *
                    sample_weights)

                # Sum of the losses registered on the model (named l2_loss
                # here; presumably weight regularization -- confirm).
                l2_loss = sum(model.losses)
                if loss_type == 'focal':
                    focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
                    focal_loss = tf.reduce_mean(
                        focal_loss_fn(labels, probs) * sample_weights)
                    loss = focal_loss + l2_loss
                else:
                    loss = negative_log_likelihood + l2_loss
                # Scale the loss given the tf.distribute.Strategy will reduce sum all
                # gradients. See details in
                # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
                scaled_loss = loss / strategy.num_replicas_in_sync

            grads = tape.gradient(scaled_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Metrics record the unscaled loss.
            metrics['train/loss'].update_state(loss)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/accuracy'].update_state(labels, probs)
            # AUC is computed from column 1 of labels and probabilities
            # (presumably the positive class of two-class one-hot labels --
            # confirm).
            metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

        # One call runs a whole epoch worth of steps.
        for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
            strategy.run(step_fn, args=(next(iterator), ))

    @tf.function
    def eval_step(iterator, dataset_name, num_steps):
        """Evaluation StepFn."""
        def step_fn(inputs):
            """Per-Replica StepFn."""
            # Sample weights, when present, are ignored during evaluation.
            if len(inputs) == 3:
                features, labels, _ = inputs
            else:
                features, labels = inputs

            probs = model(features, training=False)
            negative_log_likelihood = tf.reduce_mean(
                tf.keras.losses.categorical_crossentropy(labels, probs))

            metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
            metrics[f'{dataset_name}/roc_auc'].update_state(
                labels[:, 1], probs[:, 1])
            # ECE receives only column 1 of the probabilities, while Brier
            # receives the full probability tensor.
            metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1],
                                                     label=labels[:, 1])
            metrics[f'{dataset_name}/brier'].add_batch(probs,
                                                       label=labels[:, 1])

        for _ in tf.range(tf.cast(num_steps, tf.int32)):
            strategy.run(step_fn, args=(next(iterator), ))

    # Makes datasets into distributed version.
    train_dataset = strategy.experimental_distribute_dataset(train_dataset)
    eval_datasets = {
        ds_name: strategy.experimental_distribute_dataset(ds)
        for ds_name, ds in eval_datasets.items()
    }
    logging.info('Number of replicas in sync: %s',
                 strategy.num_replicas_in_sync)

    # The training iterator is created once and shared by all epochs, so each
    # epoch continues where the previous one stopped (assumes the dataset
    # yields enough batches for all epochs -- confirm against the loader).
    train_iterator = iter(train_dataset)
    start_time = time.time()
    # Maps 'epoch' and every metric name to a list with one entry per epoch.
    metrics_history = collections.defaultdict(list)
    for epoch in range(params.num_epochs):
        logging.info('Starting to run epoch: %s', epoch)
        train_step(train_iterator)

        # Progress / ETA estimate based on the average speed so far.
        current_step = (epoch + 1) * params.steps_per_epoch
        max_steps = params.steps_per_epoch * params.num_epochs
        time_elapsed = time.time() - start_time
        steps_per_sec = float(current_step) / time_elapsed
        eta_seconds = (max_steps - current_step) / steps_per_sec
        message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                   'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                       current_step / max_steps, epoch + 1, params.num_epochs,
                       steps_per_sec, eta_seconds / 60, time_elapsed / 60))
        logging.info(message)

        # Start evaluation.
        logging.info('Starting to run eval at epoch: %s', epoch)
        for dataset_name, eval_dataset in eval_datasets.items():
            # A fresh iterator per epoch restarts each evaluation dataset.
            eval_iterator = iter(eval_dataset)
            eval_step(eval_iterator, dataset_name,
                      steps_per_eval[dataset_name])

        # Epochs are reported 1-based in the history and the summaries.
        metrics_history['epoch'].append(epoch + 1)
        with summary_writer.as_default():
            for name, metric in metrics.items():
                result = utils.get_metric_result_value(metric)
                tf.summary.scalar(name, result, step=epoch + 1)
                # Stored as a string in the history that is written out below.
                metrics_history[name].append(str(result))

        # Metrics are per-epoch: clear them before the next epoch.
        for metric in metrics.values():
            metric.reset_states()

        model.save(os.path.join(model_dir, f'model_{epoch + 1}'),
                   overwrite=True)

    utils.write_params(metrics_history,
                       os.path.join(model_dir, 'metrics_history.json'))
def main():
    """Run the requested training, search and prediction stages.

    Parses and validates the command-line arguments, then performs each stage
    that was switched on:

    * ``args.train_nn``: train one Keras network with the hyper-parameters
      given on the command line.
    * ``args.train_random``: for every selected estimator, run a random
      search (``random_model``) and write the best parameters; for the
      ``'keras'`` estimator run an ``fmin`` search with ``tpe.suggest``
      over ``random_nn`` instead.
    * ``args.predict``: combine the selected estimators by voting or
      stacking and write the result.

    The data (``X_train``, ``y_train``, ``X_val``, ``y_val``, ``X_test``) as
    well as ``n_features``, ``n_classes`` and ``label_list`` are not defined
    in this function; they must be available from the enclosing module.

    Raises:
        ValueError: If prediction is requested and ``args.ensemble`` is
            neither ``'vote'`` nor ``'stack'``.
    """
    # build parser and check arguments
    args = _build_parser()
    _check_args(args)

    # Setup Estimator
    # Estimator name:
    #   xgb: XGBoost Classifier
    #   lgb: (presumably a LightGBM classifier -- confirm in select_model)
    #   log: Logistic Regression
    #   knn: KNeighbors Classifier
    #   rfo: RandomForest Classifier
    #   ada: AdaBoost Classifier
    #   ext: ExtraTrees Classifier
    #   svc: Support Vector Classifier
    #   keras: Keras Neural Networks
    if args.estimator == 'all':
        # 'all' covers the scikit-learn style models only, not 'keras'.
        estimators = ['xgb', 'lgb', 'log', 'rfo', 'ext', 'ada', 'knn', 'svc']
    else:
        estimators = [args.estimator]

    # Training neural nets with keras
    if args.train_nn:
        estimator_name = 'keras'
        print('Training %s...' % estimator_name)

        params = {
            'n_features': n_features,
            'n_classes': n_classes,
            'dropout': args.dropout,
            'hidden_unit': args.hidden_unit,
            'n_layers': args.layers,
            'optimizer': args.optimizer,
            'init': args.init,
            'batch_size': args.batch_size,
            'epochs': args.epochs,
        }
        estimator = keras_model(**params)

        train_kwargs = {
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val,
            'score_name': args.score,
            'num': args.num,
        }
        # The returned validation score is not needed here.
        estimator.train(**train_kwargs)
        print('params: \n', params)

    # Training random search CV with scikit-learn models
    if args.train_random:
        for estimator_name in estimators:
            print('Training %s...' % estimator_name)

            if estimator_name != 'keras':
                # Identity comparison: a seed of 0 is a valid, explicit seed.
                if args.seed is not None:
                    seed = args.seed
                else:
                    seed = np.random.randint(100)
                estimator, params = select_model(estimator_name, n_features,
                                                 n_classes, seed)

                # kwargs dict for train and predict
                train_kwargs = {
                    'estimator': estimator,
                    'params': params,
                    'X_train': X_train,
                    'y_train': y_train,
                    'X_val': X_val,
                    'y_val': y_val,
                    'n_iter': args.n_iter,
                    'score_name': args.score,
                }

                # Train model and Predict results
                best_params, best_score, val_score = random_model(
                    **train_kwargs)
                timestamp = get_timestamp()

                # Write params to file
                write_params(estimator_name, best_params, best_score,
                             val_score, timestamp, args.num)

            else:
                # Search space for the Keras network; the fixed entries are
                # passed through to `random_nn` unchanged.
                space_params = {
                    'n_features': n_features,
                    'n_classes': n_classes,
                    'dropout': hp.uniform('dropout', .20, .80),
                    'hidden_unit': hp.quniform('hidden_unit', 10, 50, q=1),
                    'n_layers': hp.choice('n_layers', [1, 2, 3, 4]),
                    'optimizer': hp.choice('optimizer',
                                           ['adam', 'adadelta', 'sgd']),
                    'init': hp.choice('init',
                                      ['glorot_uniform', 'normal', 'uniform']),
                    'batch_size': hp.choice('batch_size', [16, 32, 64, 128]),
                    'epochs': hp.quniform('epochs', 100, 1000, q=1),
                    'score_name': args.score,
                    'num': args.num,
                }
                trials = Trials()
                best_params = fmin(random_nn,
                                   space_params,
                                   algo=tpe.suggest,
                                   max_evals=args.n_iter,
                                   trials=trials)
                print('best_params \n', best_params)

    # Evaluate with ensemble method and predict result
    if args.predict:

        eva_kwargs = {
            'estimators': estimators,
            'threshold': args.threshold,
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val,
            'X_test': X_test,
            'score_name': args.score,
            'n_classes': n_classes,
        }

        # Predict with ensemble voting and write result
        prediction = ensemble(**eva_kwargs)
        if args.ensemble == 'vote':
            result = prediction.vote()
        elif args.ensemble == 'stack':
            result = prediction.stack(args.num_imp)
        else:
            # Without this branch `result` would be unbound below and the
            # failure would surface as an unrelated UnboundLocalError.
            raise ValueError(
                "Unknown ensemble method %r; expected 'vote' or 'stack'." %
                (args.ensemble, ))

        timestamp = get_timestamp()
        write_result(result, label_list, timestamp)
Exemplo n.º 5
0
    def train(self, X_train, y_train, X_val, y_val, score_name, num):
        """Train the Keras network, evaluate it, save it and log its params.

        Builds a fully connected network from the instance hyper-parameters,
        fits it on the training data, reports the training accuracy and the
        validation score, saves the architecture (JSON) and weights (HDF5)
        under ``saves/`` and records the parameters with ``write_params``.

        Args:
            X_train: Training features passed to ``model.fit``.
            y_train: Training labels; binarized with ``LabelBinarizer``
                before fitting.
            X_val: Validation features passed to ``model.predict``.
            y_val: Validation labels the predictions are scored against.
            score_name: Name of the score forwarded to ``matric_score``.
            num: Value forwarded unchanged to ``write_params``.

        Returns:
            The validation score computed by ``matric_score``.
        """
        from keras.models import Sequential
        from keras.layers import Dense, Dropout

        params = {
            'n_features': self.n_features,
            'n_classes': self.n_classes,
            'dropout': self.dropout,
            'hidden_unit': self.hidden_unit,
            'n_layers': self.n_layers,
            'optimizer': self.optimizer,
            'init': self.init,
            'batch_size': self.batch_size,
            'epochs': self.epochs,
        }

        # set last activation function and loss function
        if self.n_classes == 2:
            last_activation = 'sigmoid'
            loss_fn = 'binary_crossentropy'
            n_output = 1
        else:
            last_activation = 'softmax'
            loss_fn = 'categorical_crossentropy'
            n_output = self.n_classes

        # The binarizer is fitted on the training labels only; its column
        # order (`classes_`) defines which output unit means which label.
        lb = LabelBinarizer()
        y_train_onehot = lb.fit_transform(y_train)

        # create model: an input layer followed by `n_layers` further hidden
        # layers, each with dropout, and the output layer.
        model = Sequential()
        model.add(
            Dense(self.hidden_unit,
                  input_dim=self.n_features,
                  kernel_initializer=self.init,
                  activation='relu'))
        model.add(Dropout(rate=self.dropout))
        for _ in range(self.n_layers):
            model.add(
                Dense(self.hidden_unit,
                      kernel_initializer=self.init,
                      activation='relu'))
            model.add(Dropout(rate=self.dropout))
        model.add(
            Dense(n_output,
                  kernel_initializer=self.init,
                  activation=last_activation))

        # Compile model
        model.compile(loss=loss_fn,
                      optimizer=self.optimizer,
                      metrics=['accuracy'])

        model.fit(X_train,
                  y_train_onehot,
                  batch_size=self.batch_size,
                  epochs=self.epochs)

        # `evaluate` returns [loss, accuracy]; keep the training accuracy.
        best_score = model.evaluate(X_train, y_train_onehot)
        best_score = best_score[1]
        print("\n %s: %.2f%%" % (model.metrics_names[1], best_score * 100))

        y_pred = model.predict(X_val)

        if self.n_classes == 2:
            # The single sigmoid output is expanded to two columns,
            # [1 - p, p], so argmax works as in the multi-class case.
            y_pred = np.hstack([np.ones_like(y_pred) - y_pred, y_pred])

        # argmax yields column indices; map them back to the original labels
        # so they are comparable with `y_val`. For labels that are already
        # encoded as 0..k-1 this mapping is the identity.
        y_pred = lb.classes_[np.argmax(y_pred, axis=1)]

        # Prediction evaluate score
        val_score = matric_score(y_val, y_pred, score_name)
        print("\nValidation Test %s score: %.2f%%" %
              (score_name, val_score * 100.0))

        conf_matrix = confusion_matrix(y_val, y_pred)
        print("\nConfusion Matrix: \n", conf_matrix)

        cl_report = classification_report(y_val, y_pred)
        print("\nClassification Report: \n", cl_report)

        # Save model and weights
        timestamp = f"{datetime.datetime.now():%Y%m%d%H%M}"
        save_path = 'saves/'
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(save_path, exist_ok=True)

        model_json = model.to_json()
        model_path = os.path.join(save_path, timestamp + '.json')
        with open(model_path, "w") as json_file:
            json_file.write(model_json)
        print('Saved model to %s' % model_path)

        # serialize weights to HDF5
        weights_path = os.path.join(save_path, timestamp + '.h5')
        model.save_weights(weights_path)
        print("Saved weight to %s" % weights_path)

        # Write params to file
        estimator_name = 'keras'
        write_params(estimator_name, params, best_score, val_score, timestamp,
                     num)

        return val_score
Exemplo n.º 6
0
def main(argv: Sequence[str]):
  """Configure a GP / spectral-norm training job from FLAGS and start `run`.

  Creates the output directory, seeds TensorFlow, selects a distribution
  strategy, loads the training and evaluation datasets, writes the model
  parameters and the GP layer arguments as JSON files and then starts
  training/evaluation through `run`.

  Args:
    argv: Positional command-line arguments; only the program name is
      accepted.

  Raises:
    app.UsageError: If more than one positional argument is present.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  model_dir = FLAGS.output_dir
  tf.io.gfile.makedirs(model_dir)
  logging.info('Saving checkpoints at %s', model_dir)
  tf.random.set_seed(FLAGS.seed)

  # GPU runs use a mirrored strategy; everything else goes to the TPU.
  if FLAGS.use_gpu:
    logging.info('Using GPU for training.')
    strategy = tf.distribute.MirroredStrategy()
  else:
    logging.info('Using TPU for training.')
    strategy = utils.get_tpu_strategy(FLAGS.tpu)

  train_dataset, steps_per_epoch = utils.load_dataset(
      FLAGS.data_dir, tfds.Split.TRAIN, FLAGS.batch_size)

  # Evaluation dataset name -> split it is loaded from (order matters).
  eval_splits = {
      'tune': tfds.Split.VALIDATION,
      'test1': tfds.Split.TEST,
      'test2': tfds.Split('test2'),
  }
  eval_datasets, steps_per_eval = utils.load_eval_datasets(
      list(eval_splits.keys()), list(eval_splits.values()),
      FLAGS.data_dir, FLAGS.batch_size)
  logging.info('Steps for eval datasets: %s', steps_per_eval)

  params = utils.ModelParameters(
      num_heads=FLAGS.num_heads,
      num_layers=FLAGS.num_layers,
      message_layer_size=FLAGS.message_layer_size,
      readout_layer_size=FLAGS.readout_layer_size,
      use_gp_layer=FLAGS.use_gp_layer,
      learning_rate=FLAGS.learning_rate,
      num_epochs=FLAGS.num_epochs,
      steps_per_epoch=steps_per_epoch)

  # Keyword arguments for the GP layer, gathered from the flags.
  gp_layer_kwargs = {
      'num_inducing': FLAGS.gp_num_inducing,
      'gp_kernel_scale': FLAGS.gp_kernel_scale,
      'gp_output_bias': FLAGS.gp_output_bias,
      'normalize_input': FLAGS.gp_normalize_input,
      'gp_cov_momentum': FLAGS.gp_cov_momentum,
      'gp_cov_ridge_penalty': FLAGS.gp_cov_ridge_penalty,
  }

  # Record both configurations next to the checkpoints.
  params_path = os.path.join(model_dir, 'params.json')
  utils.write_params(dataclasses.asdict(params), params_path)
  gp_kwargs_path = os.path.join(model_dir, 'gp_layer_kwargs.json')
  utils.write_params(gp_layer_kwargs, gp_kwargs_path)

  summaries_dir = os.path.join(model_dir, 'summaries')
  summary_writer = tf.summary.create_file_writer(summaries_dir)

  run(train_dataset=train_dataset,
      eval_datasets=eval_datasets,
      steps_per_eval=steps_per_eval,
      params=params,
      model_dir=model_dir,
      gp_layer_kwargs=gp_layer_kwargs,
      strategy=strategy,
      summary_writer=summary_writer,
      loss_type=FLAGS.loss_type,
      use_spec_norm=FLAGS.use_spec_norm,
      spec_norm_multiplier=FLAGS.spec_norm_multiplier,
      use_spec_norm_mp=FLAGS.use_spec_norm_mp,
      spec_norm_multiplier_mp=FLAGS.spec_norm_multiplier_mp)
Exemplo n.º 7
0
def run(
    train_dataset: tf.data.Dataset,
    eval_datasets: Dict[str, tf.data.Dataset],
    steps_per_eval: Dict[str, int],
    params: utils.ModelParameters,
    model_dir: str,
    gp_layer_kwargs: Dict[str, Any],
    strategy: tf.distribute.Strategy,
    summary_writer: tf.summary.SummaryWriter,
    loss_type: str,
    use_spec_norm: bool,
    spec_norm_multiplier: float,
    use_spec_norm_mp: bool,
    spec_norm_multiplier_mp: float):
  """Trains and evaluates the model.

  For each of `params.num_epochs` epochs this runs `params.steps_per_epoch`
  training steps, evaluates on every dataset in `eval_datasets`, writes all
  metric values as scalar summaries, resets the metrics and saves the model
  under `model_dir/model_<epoch>`. After the last epoch the collected metric
  history is written to `model_dir/metrics_history.json`.

  Args:
    train_dataset: tf dataset that provides training data.
    eval_datasets: A dictionary of tf datasets that provides data for model
      evaluation.
    steps_per_eval: A dictionary of steps needed for each evaluation dataset.
    params: ModelParameters object containing MPNN model parameters.
    model_dir: Directory for files generated during training and evaluation.
    gp_layer_kwargs: A dictionary of parameters used for GP layer.
    strategy: tf Distributed training strategy object.
    summary_writer: tf summary writer to log training and evaluation metrics.
    loss_type: str, loss type to use during training. Currently only
      supports focal loss and cross-entropy loss; any value other than
      'focal' selects cross-entropy.
    use_spec_norm: Whether to use Spectral normalization for the dense layer.
    spec_norm_multiplier: Multiplier used to control the magnitude of
      eigenvalue of the dense layer weight matrix.
    use_spec_norm_mp: Whether to use Spectral normalization for the MP layer.
    spec_norm_multiplier_mp: Multiplier used to control the magnitude of
      eigenvalue of the MP layer weight matrix.

  """
  with strategy.scope():
    # Input shapes are taken from the dataset spec with the leading (batch)
    # dimension dropped.
    model = ub.models.mpnn(
        nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
        edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
        num_heads=params.num_heads,
        num_layers=params.num_layers,
        message_layer_size=params.message_layer_size,
        readout_layer_size=params.readout_layer_size,
        use_gp_layer=params.use_gp_layer,
        gp_layer_kwargs=gp_layer_kwargs,
        use_spec_norm=use_spec_norm,
        spec_norm_multiplier=spec_norm_multiplier,
        use_spec_norm_mp=use_spec_norm_mp,
        spec_norm_multiplier_mp=spec_norm_multiplier_mp)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=params.learning_rate)
    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/roc_auc': tf.keras.metrics.AUC(),
    }

    # One set of evaluation metrics per evaluation dataset, keyed as
    # '<dataset_name>/<metric>'.
    for dataset_name in eval_datasets:
      metrics[
          f'{dataset_name}/accuracy'] = tf.keras.metrics.CategoricalAccuracy()
      metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
      metrics[
          f'{dataset_name}/negative_log_likelihood'] = tf.keras.metrics.Mean()
      # 'test2' uses fewer calibration bins than the other datasets.
      if dataset_name == 'test2':
        ece_num_bins = 5
      else:
        ece_num_bins = 10
      metrics[f'{dataset_name}/ece'] = rm.metrics.ExpectedCalibrationError(
          num_bins=ece_num_bins)
      metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

  def per_replica_train_step_fn(inputs):
    """Per-Replica StepFn."""
    # Batches are (features, labels) or (features, labels, sample_weights).
    if len(inputs) == 3:
      features, labels, sample_weights = inputs
    else:
      features, labels = inputs
      # No weights supplied: multiplying by 1 leaves the per-example losses
      # unchanged.
      sample_weights = 1

    with tf.GradientTape() as tape:
      probs = model(features, training=True)
      # Always computed, because it is reported as a metric even when the
      # focal loss is the training objective.
      negative_log_likelihood = tf.reduce_mean(
          tf.keras.losses.categorical_crossentropy(labels, probs) *
          sample_weights)

      # Sum of the losses registered on the model (named l2_loss here;
      # presumably weight regularization -- confirm).
      l2_loss = sum(model.losses)
      if loss_type == 'focal':
        focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
        focal_loss = tf.reduce_mean(
            focal_loss_fn(labels, probs) * sample_weights)
        loss = focal_loss + l2_loss
      else:
        loss = negative_log_likelihood + l2_loss
      # Scale the loss given the tf.distribute.Strategy will reduce sum all
      # gradients. See details in
      # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
      scaled_loss = loss / strategy.num_replicas_in_sync

    grads = tape.gradient(scaled_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Metrics record the unscaled loss.
    metrics['train/loss'].update_state(loss)
    metrics['train/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics['train/accuracy'].update_state(labels, probs)
    # AUC is computed from column 1 of labels and probabilities (presumably
    # the positive class of two-class one-hot labels -- confirm).
    metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

  def per_replica_eval_step_fn(inputs, dataset_name):
    """Per-Replica StepFn."""
    # Sample weights, when present, are ignored during evaluation.
    if len(inputs) == 3:
      features, labels, _ = inputs
    else:
      features, labels = inputs

    probs = model(features, training=False)
    negative_log_likelihood = tf.reduce_mean(
        tf.keras.losses.categorical_crossentropy(labels, probs))

    metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
    metrics[f'{dataset_name}/roc_auc'].update_state(labels[:, 1], probs[:, 1])
    # ECE receives only column 1 of the probabilities, while Brier receives
    # the full probability tensor.
    metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1], label=labels[:, 1])
    metrics[f'{dataset_name}/brier'].add_batch(probs, label=labels[:, 1])

  @tf.function
  def distributed_train_step(iterator):
    """Training StepFn."""
    # One call runs a whole epoch worth of steps.
    for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
      strategy.run(per_replica_train_step_fn, args=(next(iterator),))

  @tf.function
  def distributed_eval_step(iterator, dataset_name, num_steps):
    """Evaluation StepFn."""
    for _ in tf.range(tf.cast(num_steps, tf.int32)):
      strategy.run(
          per_replica_eval_step_fn, args=(next(iterator), dataset_name))

  # Makes datasets into distributed version.
  train_dataset = strategy.experimental_distribute_dataset(train_dataset)
  eval_datasets = {
      ds_name: strategy.experimental_distribute_dataset(ds)
      for ds_name, ds in eval_datasets.items()
  }
  logging.info('Number of replicas in sync: %s', strategy.num_replicas_in_sync)

  # The training iterator is created once and shared by all epochs, so each
  # epoch continues where the previous one stopped (assumes the dataset yields
  # enough batches for all epochs -- confirm against the loader).
  train_iterator = iter(train_dataset)
  start_time = time.time()
  # Maps 'epoch' and every metric name to a list with one entry per epoch.
  metrics_history = collections.defaultdict(list)
  for epoch in range(params.num_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    distributed_train_step(train_iterator)

    # Progress / ETA estimate based on the average speed so far.
    current_step = (epoch + 1) * params.steps_per_epoch
    max_steps = params.steps_per_epoch * params.num_epochs
    time_elapsed = time.time() - start_time
    steps_per_sec = float(current_step) / time_elapsed
    eta_seconds = (max_steps - current_step) / steps_per_sec
    message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
               'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                   current_step / max_steps, epoch + 1, params.num_epochs,
                   steps_per_sec, eta_seconds / 60, time_elapsed / 60))
    logging.info(message)

    # Start evaluation.
    logging.info('Starting to run eval at epoch: %s', epoch)
    for dataset_name, eval_dataset in eval_datasets.items():
      # A fresh iterator per epoch restarts each evaluation dataset.
      eval_iterator = iter(eval_dataset)
      distributed_eval_step(eval_iterator, dataset_name,
                            steps_per_eval[dataset_name])

    # Epochs are reported 1-based in the history and the summaries.
    metrics_history['epoch'].append(epoch + 1)
    with summary_writer.as_default():
      for name, metric in metrics.items():
        result = utils.get_metric_result_value(metric)
        tf.summary.scalar(name, result, step=epoch + 1)
        # Stored as a string in the history that is written out below.
        metrics_history[name].append(str(result))


    # Metrics are per-epoch: clear them before the next epoch.
    for metric in metrics.values():
      metric.reset_states()

    model.save(os.path.join(model_dir, f'model_{epoch + 1}'), overwrite=True)

  utils.write_params(metrics_history,
                     os.path.join(model_dir, 'metrics_history.json'))
Exemplo n.º 8
0
 def test_write_params(self):
     """`utils.write_params` creates the requested file in a temp directory."""
     params_path = os.path.join(self.create_tempdir().full_path,
                                'test_params.json')
     utils.write_params({'a': 1.0}, params_path)
     file_was_written = tf.io.gfile.exists(params_path)
     self.assertTrue(file_was_written)