Example #1
def perform_evaluation(model, builder, eval_steps, ckpt, strategy, topology):
  """Perform evaluation."""
  if FLAGS.train_mode == 'pretrain' and not FLAGS.lineareval_while_pretraining:
    logging.info('Skipping eval during pretraining without linear eval.')
    return
  # Build input pipeline.
  ds = data_lib.build_distributed_dataset(builder, FLAGS.eval_batch_size, False,
                                          strategy, topology)
  summary_writer = tf.summary.create_file_writer(FLAGS.model_dir)

  # Build metrics.
  with strategy.scope():
    regularization_loss = tf.keras.metrics.Mean('eval/regularization_loss')
    label_top_1_accuracy = tf.keras.metrics.Accuracy(
        'eval/label_top_1_accuracy')
    label_top_5_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(
        5, 'eval/label_top_5_accuracy')
    all_metrics = [
        regularization_loss, label_top_1_accuracy, label_top_5_accuracy
    ]

    # Restore checkpoint.
    logging.info('Restoring from %s', ckpt)
    checkpoint = tf.train.Checkpoint(
        model=model, global_step=tf.Variable(0, dtype=tf.int64))
    checkpoint.restore(ckpt).expect_partial()
    global_step = checkpoint.global_step
    logging.info('Performing eval at step %d', global_step.numpy())

  def single_step(features, labels):
    _, supervised_head_outputs = model(features, training=False)
    assert supervised_head_outputs is not None
    outputs = supervised_head_outputs
    l = labels['labels']
    metrics.update_finetune_metrics_eval(label_top_1_accuracy,
                                         label_top_5_accuracy, outputs, l)
    reg_loss = model_lib.add_weight_decay(model, adjust_per_optimizer=True)
    regularization_loss.update_state(reg_loss)

  with strategy.scope():

    @tf.function
    def run_single_step(iterator):
      images, labels = next(iterator)
      features, labels = images, {'labels': labels}
      strategy.run(single_step, (features, labels))

    iterator = iter(ds)
    for i in range(eval_steps):
      run_single_step(iterator)
      logging.info('Completed eval for %d / %d steps', i + 1, eval_steps)
    logging.info('Finished eval for %s', ckpt)

  # Write summaries
  cur_step = global_step.numpy()
  logging.info('Writing summaries for step %d', cur_step)
  with summary_writer.as_default():
    metrics.log_and_write_metrics_to_summary(all_metrics, cur_step)
    summary_writer.flush()

  # Record results as JSON.
  result_json_path = os.path.join(FLAGS.model_dir, 'result.json')
  result = {metric.name: metric.result().numpy() for metric in all_metrics}
  result['global_step'] = global_step.numpy()
  logging.info(result)
  with tf.io.gfile.GFile(result_json_path, 'w') as f:
    json.dump({k: float(v) for k, v in result.items()}, f)
  result_json_path = os.path.join(
      FLAGS.model_dir, 'result_%d.json'%result['global_step'])
  with tf.io.gfile.GFile(result_json_path, 'w') as f:
    json.dump({k: float(v) for k, v in result.items()}, f)
  flag_json_path = os.path.join(FLAGS.model_dir, 'flags.json')
  with tf.io.gfile.GFile(flag_json_path, 'w') as f:
    serializable_flags = {}
    for key, val in FLAGS.flag_values_dict().items():
      # Some flag value types e.g. datetime.timedelta are not json serializable,
      # filter those out.
      if json_serializable(val):
        serializable_flags[key] = val
    json.dump(serializable_flags, f)

  # Export as SavedModel for finetuning and inference.
  if FLAGS.train_mode == 'finetune':
    save(model, global_step=result['global_step'])

  return result
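
The flag-dump step above calls a json_serializable helper that is not defined in this snippet. A minimal sketch of such a helper, assuming it only needs to test whether a value survives json.dumps, could look like this:

import json

def json_serializable(val):
  # A flag value is usable in the JSON dump if json.dumps accepts it.
  try:
    json.dumps(val)
    return True
  except TypeError:
    return False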
Example #2
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  # Original TFDS-based dataset setup, kept for reference; it is replaced by
  # the CSV-based pipeline below.
  # builder = tfds.builder(FLAGS.dataset, data_dir=FLAGS.data_dir)
  # builder.download_and_prepare()
  # num_train_examples = builder.info.splits[FLAGS.train_split].num_examples
  # num_eval_examples = builder.info.splits[FLAGS.eval_split].num_examples
  # num_classes = builder.info.features['label'].num_classes
  if tf.executing_eagerly():
    print('Running in eager mode.')
  else:
    print('Not running in eager mode.')

  set_seed(FLAGS.seed)
  # Rework build_distributed_dataset in data.py so that it handles this
  # builder (here a pandas DataFrame) cleanly.
  # build_distributed_dataset only consumes the data object internally (probably).
  builder = pd.read_csv(FLAGS.data_path + 'train.csv')

  #builder_s = builder.sample(frac=1, random_state=FLAGS.seed).reset_index(drop=True)
  #num_fold = len(builder) // 5
  #test_builder = builder[num_fold * FLAGS.fold: num_fold * (FLAGS.fold + 1)]
  #train_builder = builder.drop(builder.index[num_fold * FLAGS.fold: num_fold * (FLAGS.fold + 1)])

  kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=FLAGS.seed)
  for i, (train_idx, test_idx) in enumerate(
      kf.split(builder['image_id'], builder['label'])):
    if i == FLAGS.fold:
      train_builder = builder.iloc[train_idx]
      test_builder = builder.iloc[test_idx]
      break

  #train_builder, test_builder = train_test_split(builder, test_size=FLAGS.test_ratio, stratify=builder['label'], random_state=1)
  num_train_examples = len(train_builder)
  num_eval_examples = len(test_builder)
  num_classes = 5
  train_steps = model_lib.get_train_steps(num_train_examples)
  eval_steps = FLAGS.eval_steps or int(
      math.ceil(num_eval_examples / FLAGS.eval_batch_size))
  if FLAGS.train_mode == 'finetune':
    # Subsample the portion of the training data used as supervised examples.
    if FLAGS.stratify:
      train_builder, _ = train_test_split(
          train_builder,
          train_size=FLAGS.supervised_ratio,
          random_state=FLAGS.seed,
          stratify=train_builder['label'])
      print(train_builder['label'].value_counts())
    else:
      train_builder, _ = train_test_split(
          train_builder,
          train_size=FLAGS.supervised_ratio,
          random_state=FLAGS.seed)
    num_train_examples = len(train_builder)
    train_steps = (
        FLAGS.pretrain_steps +
        num_train_examples * FLAGS.train_epochs // FLAGS.train_batch_size)

  # From here on, builder only appears as an argument to build_distributed_dataset.
  epoch_steps = int(round(num_train_examples / FLAGS.train_batch_size))

  logging.info('# train examples: %d', num_train_examples)
  logging.info('# train_steps: %d', train_steps)
  logging.info('# eval examples: %d', num_eval_examples)
  logging.info('# eval steps: %d', eval_steps)

  checkpoint_steps = (
      FLAGS.checkpoint_steps or (FLAGS.checkpoint_epochs * epoch_steps))

  topology = None
  if FLAGS.use_tpu:
    if FLAGS.tpu_name:
      cluster = tf.distribute.cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
      cluster = tf.distribute.cluster_resolver.TPUClusterResolver(FLAGS.master)
    tf.config.experimental_connect_to_cluster(cluster)
    topology = tf.tpu.experimental.initialize_tpu_system(cluster)
    logging.info('Topology:')
    logging.info('num_tasks: %d', topology.num_tasks)
    logging.info('num_tpus_per_task: %d', topology.num_tpus_per_task)
    strategy = tf.distribute.experimental.TPUStrategy(cluster)

  else:
    # For (multiple) GPUs.
    strategy = tf.distribute.MirroredStrategy()
    logging.info('Running using MirroredStrategy on %d replicas',
                 strategy.num_replicas_in_sync)

  with strategy.scope():
    model = model_lib.Model(num_classes)

  if FLAGS.mode == 'check':
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir, min_interval_secs=15):
      result = check(model, test_builder, eval_steps, ckpt, strategy,
                                  topology)
      if result['global_step'] >= train_steps:
        logging.info('Eval complete. Exiting...')
        return
  elif FLAGS.mode == 'eval':
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir, min_interval_secs=15):
      result = perform_evaluation(model, test_builder, eval_steps, ckpt, strategy,
                                  topology)
      if result['global_step'] >= train_steps:
        logging.info('Eval complete. Exiting...')
        return
  else:
    summary_writer = tf.summary.create_file_writer(FLAGS.model_dir)
    with strategy.scope():
      # Build input pipeline.
      ds = data_lib.build_distributed_dataset(train_builder, FLAGS.train_batch_size,
                                              True, strategy, topology)

      # Build LR schedule and optimizer.
      learning_rate = model_lib.WarmUpAndCosineDecay(FLAGS.learning_rate,
                                                     num_train_examples)
      optimizer = model_lib.build_optimizer(learning_rate)

      # Build metrics.
      all_metrics = []  # For summaries.
      weight_decay_metric = tf.keras.metrics.Mean('train/weight_decay')
      total_loss_metric = tf.keras.metrics.Mean('train/total_loss')
      all_metrics.extend([weight_decay_metric, total_loss_metric])
      if FLAGS.train_mode == 'pretrain':
        contrast_loss_metric = tf.keras.metrics.Mean('train/contrast_loss')
        contrast_acc_metric = tf.keras.metrics.Mean('train/contrast_acc')
        contrast_entropy_metric = tf.keras.metrics.Mean(
            'train/contrast_entropy')
        all_metrics.extend([
            contrast_loss_metric, contrast_acc_metric, contrast_entropy_metric
        ])
      if FLAGS.train_mode == 'finetune' or FLAGS.lineareval_while_pretraining:
        supervised_loss_metric = tf.keras.metrics.Mean('train/supervised_loss')
        supervised_acc_metric = tf.keras.metrics.Mean('train/supervised_acc')
        all_metrics.extend([supervised_loss_metric, supervised_acc_metric])

      # Restore checkpoint if available.
      checkpoint_manager = try_restore_from_checkpoint(
          model, optimizer.iterations, optimizer)

    steps_per_loop = checkpoint_steps

    def single_step(features, labels):
      with tf.GradientTape() as tape:
        # Log summaries on the last step of the training loop to match
        # logging frequency of other scalar summaries.
        #
        # Notes:
        # 1. Summary ops on TPUs get outside compiled so they do not affect
        #    performance.
        # 2. Summaries are recorded only on replica 0. So effectively this
        #    summary would be written once per host when should_record == True.
        # 3. optimizer.iterations is incremented in the call to apply_gradients.
        #    So we use  `iterations + 1` here so that the step number matches
        #    those of scalar summaries.
        # 4. We intentionally run the summary op before the actual model
        #    training so that it can run in parallel.
        should_record = tf.equal((optimizer.iterations + 1) % steps_per_loop, 0)
        with tf.summary.record_if(should_record):
          # Only log augmented images for the first tower.
          tf.summary.image(
              'image', features[:, :, :, :3], step=optimizer.iterations + 1)
        projection_head_outputs, supervised_head_outputs = model(
            features, training=True)
        loss = None
        if projection_head_outputs is not None:
          outputs = projection_head_outputs
          con_loss, logits_con, labels_con = obj_lib.add_contrastive_loss(
              outputs,
              hidden_norm=FLAGS.hidden_norm,
              temperature=FLAGS.temperature,
              strategy=strategy)
          if loss is None:
            loss = con_loss
          else:
            loss += con_loss
          metrics.update_pretrain_metrics_train(contrast_loss_metric,
                                                contrast_acc_metric,
                                                contrast_entropy_metric,
                                                con_loss, logits_con,
                                                labels_con)
        if supervised_head_outputs is not None:
          outputs = supervised_head_outputs
          l = labels['labels']
          if FLAGS.train_mode == 'pretrain' and FLAGS.lineareval_while_pretraining:
            l = tf.concat([l, l], 0)
          sup_loss = obj_lib.add_supervised_loss(labels=l, logits=outputs)
          if loss is None:
            loss = sup_loss
          else:
            loss += sup_loss
          metrics.update_finetune_metrics_train(supervised_loss_metric,
                                                supervised_acc_metric, sup_loss,
                                                l, outputs)
        weight_decay = model_lib.add_weight_decay(
            model, adjust_per_optimizer=True)
        weight_decay_metric.update_state(weight_decay)
        loss += weight_decay
        total_loss_metric.update_state(loss)
        # The default behavior of `apply_gradients` is to sum gradients from all
        # replicas so we divide the loss by the number of replicas so that the
        # mean gradient is applied.
        loss = loss / strategy.num_replicas_in_sync
        logging.info('Trainable variables:')
        for var in model.trainable_variables:
          logging.info(var.name)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    with strategy.scope():

      @tf.function
      def train_multiple_steps(iterator):
        # `tf.range` is needed so that this runs in a `tf.while_loop` and is
        # not unrolled.
        for _ in tf.range(steps_per_loop):
          # Drop the "while" prefix created by tf.while_loop which otherwise
          # gets prefixed to every variable name. This does not affect training
          # but does affect the checkpoint conversion script.
          # TODO(b/161712658): Remove this.
          with tf.name_scope(''):
            images, labels = next(iterator)
            features, labels = images, {'labels': labels}
            strategy.run(single_step, (features, labels))

      global_step = optimizer.iterations
      cur_step = global_step.numpy()
      iterator = iter(ds)
      while cur_step + 1 < train_steps:
        # Calls to tf.summary.xyz lookup the summary writer resource which is
        # set by the summary writer's context manager.
        with summary_writer.as_default():
          train_multiple_steps(iterator)
          cur_step = global_step.numpy()
          checkpoint_manager.save(cur_step)
          logging.info('Completed: %d / %d steps', cur_step, train_steps)
          metrics.log_and_write_metrics_to_summary(all_metrics, cur_step)
          tf.summary.scalar(
              'learning_rate',
              learning_rate(tf.cast(global_step, dtype=tf.float32)),
              global_step)
          summary_writer.flush()
        for metric in all_metrics:
          metric.reset_states()
      logging.info('Training complete...')

    if FLAGS.mode == 'train_then_eval':
      perform_evaluation(model, test_builder, eval_steps,
                         checkpoint_manager.latest_checkpoint, strategy,
                         topology)
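
The set_seed(FLAGS.seed) call near the top of main refers to a helper that is not shown in this snippet. A minimal sketch, assuming it only needs to seed the Python, NumPy, and TensorFlow RNGs so that the fold splits above are reproducible:

import random
import numpy as np
import tensorflow as tf

def set_seed(seed):
  # Seed every RNG the split/augmentation pipeline might touch.
  random.seed(seed)
  np.random.seed(seed)
  tf.random.set_seed(seed)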
Example #3
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  # hub_module = hub.load('style_transfer_content_weights_params')
  print("LOADING TF-HUB MODULE")
  hub_module = hub.load('https://tfhub.dev/google/magenta/arbitrary-image-stylization-v1-256/2')
  style_dataset = tfds.load(
      'dtd', batch_size=50, split='train',
      decoders={'image': style_preprocessing_decoder()})

  builder = tfds.builder(FLAGS.dataset, data_dir=FLAGS.data_dir)
  builder.download_and_prepare()
  num_train_examples = builder.info.splits[FLAGS.train_split].num_examples
  num_eval_examples = builder.info.splits[FLAGS.eval_split].num_examples
  num_classes = builder.info.features['label'].num_classes

  train_steps = model_lib.get_train_steps(num_train_examples)
  eval_steps = FLAGS.eval_steps or int(
      math.ceil(num_eval_examples / FLAGS.eval_batch_size))
  epoch_steps = int(round(num_train_examples / FLAGS.train_batch_size))

  logging.info('# train examples: %d', num_train_examples)
  logging.info('# train_steps: %d', train_steps)
  logging.info('# eval examples: %d', num_eval_examples)
  logging.info('# eval steps: %d', eval_steps)

  checkpoint_steps = (
      FLAGS.checkpoint_steps or (FLAGS.checkpoint_epochs * epoch_steps))

  topology = None
  if FLAGS.use_tpu:
    if FLAGS.tpu_name:
      cluster = tf.distribute.cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
      cluster = tf.distribute.cluster_resolver.TPUClusterResolver(FLAGS.master)
    tf.config.experimental_connect_to_cluster(cluster)
    topology = tf.tpu.experimental.initialize_tpu_system(cluster)
    logging.info('Topology:')
    logging.info('num_tasks: %d', topology.num_tasks)
    logging.info('num_tpus_per_task: %d', topology.num_tpus_per_task)
    strategy = tf.distribute.experimental.TPUStrategy(cluster)

  else:
    # For (multiple) GPUs.
    strategy = tf.distribute.MirroredStrategy()
    logging.info('Running using MirroredStrategy on %d replicas',
                 strategy.num_replicas_in_sync)

  with strategy.scope():
    model = model_lib.Model(num_classes)
    print("GOT MODEL")

  if FLAGS.mode == 'eval':
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir, min_interval_secs=15):
      result = perform_evaluation(model, builder, eval_steps, ckpt, strategy,
                                  topology)
      if result['global_step'] >= train_steps:
        logging.info('Eval complete. Exiting...')
        return
  else:
    summary_writer = tf.summary.create_file_writer(FLAGS.model_dir)
    with strategy.scope():
      # Build input pipeline.
      ds = data_lib.build_distributed_dataset(builder, FLAGS.train_batch_size,
                                              True, strategy, topology, hub_module, style_dataset)

      # Build LR schedule and optimizer.
      learning_rate = model_lib.WarmUpAndCosineDecay(FLAGS.learning_rate,
                                                     num_train_examples)
      optimizer = model_lib.build_optimizer(learning_rate)

      # Build metrics.
      all_metrics = []  # For summaries.
      weight_decay_metric = tf.keras.metrics.Mean('train/weight_decay')
      total_loss_metric = tf.keras.metrics.Mean('train/total_loss')
      all_metrics.extend([weight_decay_metric, total_loss_metric])
      if FLAGS.train_mode == 'pretrain':
        contrast_loss_metric = tf.keras.metrics.Mean('train/contrast_loss')
        contrast_acc_metric = tf.keras.metrics.Mean('train/contrast_acc')
        contrast_entropy_metric = tf.keras.metrics.Mean(
            'train/contrast_entropy')
        all_metrics.extend([
            contrast_loss_metric, contrast_acc_metric, contrast_entropy_metric
        ])
      if FLAGS.train_mode == 'finetune' or FLAGS.lineareval_while_pretraining:
        supervised_loss_metric = tf.keras.metrics.Mean('train/supervised_loss')
        supervised_acc_metric = tf.keras.metrics.Mean('train/supervised_acc')
        all_metrics.extend([supervised_loss_metric, supervised_acc_metric])

      # Restore checkpoint if available.
      checkpoint_manager = try_restore_from_checkpoint(
          model, optimizer.iterations, optimizer)

    steps_per_loop = checkpoint_steps

    def single_step(features, labels):
      with tf.GradientTape() as tape:
        # Log summaries on the last step of the training loop to match
        # logging frequency of other scalar summaries.
        #
        # Notes:
        # 1. Summary ops on TPUs get outside compiled so they do not affect
        #    performance.
        # 2. Summaries are recorded only on replica 0. So effectively this
        #    summary would be written once per host when should_record == True.
        # 3. optimizer.iterations is incremented in the call to apply_gradients.
        #    So we use  `iterations + 1` here so that the step number matches
        #    those of scalar summaries.
        # 4. We intentionally run the summary op before the actual model
        #    training so that it can run in parallel.
        should_record = tf.equal((optimizer.iterations + 1) % steps_per_loop, 0)
        with tf.summary.record_if(should_record):
          # Only log augmented images for the first tower.
          tf.summary.image(
              'image', features[:, :, :, :3], step=optimizer.iterations + 1)
        projection_head_outputs, supervised_head_outputs = model(
            features, training=True)
        loss = None
        if projection_head_outputs is not None:
          outputs = projection_head_outputs
          con_loss, logits_con, labels_con = obj_lib.add_contrastive_loss(
              outputs,
              hidden_norm=FLAGS.hidden_norm,
              temperature=FLAGS.temperature,
              strategy=strategy)
          if loss is None:
            loss = con_loss
          else:
            loss += con_loss
          metrics.update_pretrain_metrics_train(contrast_loss_metric,
                                                contrast_acc_metric,
                                                contrast_entropy_metric,
                                                con_loss, logits_con,
                                                labels_con)
        if supervised_head_outputs is not None:
          outputs = supervised_head_outputs
          l = labels['labels']
          if FLAGS.train_mode == 'pretrain' and FLAGS.lineareval_while_pretraining:
            l = tf.concat([l, l], 0)
          sup_loss = obj_lib.add_supervised_loss(labels=l, logits=outputs)
          if loss is None:
            loss = sup_loss
          else:
            loss += sup_loss
          metrics.update_finetune_metrics_train(supervised_loss_metric,
                                                supervised_acc_metric, sup_loss,
                                                l, outputs)
        weight_decay = model_lib.add_weight_decay(
            model, adjust_per_optimizer=True)
        weight_decay_metric.update_state(weight_decay)
        loss += weight_decay
        total_loss_metric.update_state(loss)
        # The default behavior of `apply_gradients` is to sum gradients from all
        # replicas so we divide the loss by the number of replicas so that the
        # mean gradient is applied.
        loss = loss / strategy.num_replicas_in_sync
        logging.info('Trainable variables:')
        for var in model.trainable_variables:
          logging.info(var.name)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    with strategy.scope():

      @tf.function
      def train_multiple_steps(iterator):
        # `tf.range` is needed so that this runs in a `tf.while_loop` and is
        # not unrolled.
        for _ in tf.range(steps_per_loop):
          # Drop the "while" prefix created by tf.while_loop which otherwise
          # gets prefixed to every variable name. This does not affect training
          # but does affect the checkpoint conversion script.
          # TODO(b/161712658): Remove this.
          with tf.name_scope(''):
            images, labels = next(iterator)
            features, labels = images, {'labels': labels}
            strategy.run(single_step, (features, labels))

      global_step = optimizer.iterations
      cur_step = global_step.numpy()
      iterator = iter(ds)
      while cur_step < train_steps:
        # Calls to tf.summary.xyz lookup the summary writer resource which is
        # set by the summary writer's context manager.
        with summary_writer.as_default():
          train_multiple_steps(iterator)
          cur_step = global_step.numpy()
          checkpoint_manager.save(cur_step)
          logging.info('Completed: %d / %d steps', cur_step, train_steps)
          metrics.log_and_write_metrics_to_summary(all_metrics, cur_step)
          tf.summary.scalar(
              'learning_rate',
              learning_rate(tf.cast(global_step, dtype=tf.float32)),
              global_step)
          summary_writer.flush()
        for metric in all_metrics:
          metric.reset_states()
      logging.info('Training complete...')

    if FLAGS.mode == 'train_then_eval':
      perform_evaluation(model, builder, eval_steps,
                         checkpoint_manager.latest_checkpoint, strategy,
                         topology)
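
The style_preprocessing_decoder used when loading the 'dtd' style dataset is not defined in this snippet. One plausible minimal sketch, assuming the intent is simply to defer image decoding to the downstream style-transfer input pipeline, is to return a TFDS SkipDecoding decoder:

import tensorflow_datasets as tfds

def style_preprocessing_decoder():
  # Skip TFDS's default image decoding so the raw encoded style images can be
  # decoded and resized later in the input pipeline.
  return tfds.decode.SkipDecoding()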