Example #1
def get_datasets():
    # Load dataset
    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])

    # Preprocess train dataset
    train_dataset = train_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
    )
    train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])

    train_features = {x: train_dataset[x] for x in ["input_ids", "attention_mask"]}
    tf_train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_dataset["label"]))

    # Preprocess test dataset
    test_dataset = test_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
    )
    test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])

    test_features = {x: test_dataset[x] for x in ["input_ids", "attention_mask"]}
    tf_test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_dataset["label"]))

    if SDP_ENABLED:
        tf_train_dataset = tf_train_dataset.shard(sdp.size(), sdp.rank())
        tf_test_dataset = tf_test_dataset.shard(sdp.size(), sdp.rank())
    tf_train_dataset = tf_train_dataset.batch(args.train_batch_size, drop_remainder=True)
    tf_test_dataset = tf_test_dataset.batch(args.eval_batch_size, drop_remainder=True)

    return tf_train_dataset, tf_test_dataset
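
# Supplementary sketch (not part of the original snippet): get_datasets() above
# assumes a module-level `tokenizer` plus the `sdp`/`SDP_ENABLED` names. A minimal,
# hypothetical setup is shown below -- the DistilBERT checkpoint is only an example
# choice, and in the real script these lines would sit at the top of the file.
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer

try:
    import smdistributed.dataparallel.tensorflow as sdp
    sdp.init()
    SDP_ENABLED = True
except ImportError:
    sdp = None
    SDP_ENABLED = False

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
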
def train(args):
    # Load data from S3
    #     train_dir = os.environ.get('SM_CHANNEL_TRAIN')
    train_dir = args.train
    batch_size = args.batch_size
    dataset = get_train_data(train_dir, batch_size)

    model = get_resnet50(transfer_learning=True)

    loss_fn = tf.losses.SparseCategoricalCrossentropy()
    acc = tf.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    # SMDataParallel: scale the learning rate by the number of workers, dist.size()
    # LR for 8 node run : 0.000125
    # LR for single node run : 0.001
    opt = tf.optimizers.Adam(args.learning_rate * dist.size())

    checkpoint_dir = os.environ['SM_MODEL_DIR']
    checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)

    @tf.function
    def training_step(images, labels, first_batch):
        with tf.GradientTape() as tape:
            probs = model(images, training=True)
            loss_value = loss_fn(labels, probs)
            acc_value = acc(labels, probs)

        # SMDataParallel: Wrap tf.GradientTape with SMDataParallel's DistributedGradientTape
        tape = dist.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))

        if first_batch:
            # SMDataParallel: Broadcast model and optimizer variables
            dist.broadcast_variables(model.variables, root_rank=0)
            dist.broadcast_variables(opt.variables(), root_rank=0)

        # SMDataParallel: all_reduce call
        loss_value = dist.oob_allreduce(
            loss_value)  # Average the loss across workers
        acc_value = dist.oob_allreduce(acc_value)

        return loss_value, acc_value

    for epoch in range(args.epochs):
        for batch, (images,
                    labels) in enumerate(dataset.take(10000 // dist.size())):
            loss_value, acc_value = training_step(images, labels, batch == 0)

            if batch % 100 == 0 and dist.rank() == 0:
                logger.info(
                    '*** Epoch %d   Step   #%d Accuracy: %.6f   Loss: %.6f ***'
                    % (epoch, batch, acc_value, loss_value))

    # SMDataParallel: Save checkpoints only from master node.
    if dist.rank() == 0:
        model.save(os.path.join(checkpoint_dir, '1'))
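
train(args) above also assumes that SMDataParallel has been initialized, that each process is pinned to one GPU, and that an argument parser supplies the SageMaker channel paths. Below is a minimal, hypothetical entry point following the GPU-pinning pattern used in the MNIST examples later in this file; the flag names and defaults are assumptions, get_train_data/get_resnet50 are defined elsewhere in the script, and in the real script this setup precedes the functions above.

import argparse
import logging
import os

import tensorflow as tf
import smdistributed.dataparallel.tensorflow as dist

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# SMDataParallel: initialize the process group and pin this worker to one GPU.
dist.init()
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], 'GPU')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=10)
    train(parser.parse_args())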
Example #3
    def _get_run_config(mode,
                        model_dir,
                        use_xla,
                        use_dali,
                        gpu_memory_fraction,
                        gpu_id=0,
                        seed=None):

        if mode not in ["train", 'validation', 'benchmark', 'inference']:
            raise ValueError(
                "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')"
                % mode)

        if seed is not None:
            if hvd_utils.is_using_hvd():
                tf_random_seed = 2 * (seed + hvd.rank())
            else:
                tf_random_seed = 2 * seed
        else:
            tf_random_seed = None

        config = tf.estimator.RunConfig(
            model_dir=model_dir,
            tf_random_seed=tf_random_seed,
            save_summary_steps=100 if mode in ['train', 'validation'] else
            1e9,  # disabled in benchmark mode
            save_checkpoints_steps=None,
            save_checkpoints_secs=None,
            session_config=Runner._get_session_config(
                mode=mode,
                use_xla=use_xla,
                use_dali=use_dali,
                gpu_memory_fraction=gpu_memory_fraction,
                gpu_id=gpu_id),
            keep_checkpoint_max=5,
            keep_checkpoint_every_n_hours=1e6,  # disabled
            log_step_count_steps=1e9,
            train_distribute=None,
            device_fn=None,
            protocol=None,
            eval_distribute=None,
            experimental_distribute=None)

        if mode == 'train':
            if hvd_utils.is_using_hvd():
                config = config.replace(
                    save_checkpoints_steps=1000 if hvd.rank() == 0 else None,
                    keep_checkpoint_every_n_hours=3)
            else:
                config = config.replace(save_checkpoints_steps=1000,
                                        keep_checkpoint_every_n_hours=3)

        return config
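
A hypothetical call site for _get_run_config above, showing how the resulting RunConfig would typically be handed to a tf.estimator.Estimator; the Runner class method access, model_fn, and the parameter values here are stand-ins rather than code from the original project.

# Hypothetical usage sketch; model_fn and the parameter values are stand-ins.
run_config = Runner._get_run_config(
    mode='train',
    model_dir='/opt/ml/model',
    use_xla=False,
    use_dali=False,
    gpu_memory_fraction=0.9,
    seed=1)

estimator = tf.estimator.Estimator(model_fn=model_fn,
                                   params={'batch_size': 64},
                                   config=run_config)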
Example #4
def get_tfrecords_input_fn(filenames, batch_size, height, width, training,
                           distort_color, num_threads, deterministic):

    shuffle_buffer_size = 4096

    if deterministic:
        if hvd_utils.is_using_hvd():
            seed = 13 * (1 + hvd.rank())
        else:
            seed = 13
    else:
        seed = None

    ds = tf.data.Dataset.from_tensor_slices(filenames)

    if hvd_utils.is_using_hvd() and training:
        ds = ds.shard(hvd.size(), hvd.rank())

    ds = ds.apply(
        tf.data.experimental.parallel_interleave(tf.data.TFRecordDataset,
                                                 cycle_length=10,
                                                 block_length=8,
                                                 sloppy=not deterministic,
                                                 prefetch_input_elements=16))

    counter = tf.data.Dataset.range(sys.maxsize)
    ds = tf.data.Dataset.zip((ds, counter))

    def preproc_func(record, counter_):
        return image_processing.preprocess_image_record(
            record, height, width, _NUM_CHANNELS, training)

    if training:
        ds = ds.apply(
            tf.data.experimental.shuffle_and_repeat(
                buffer_size=shuffle_buffer_size, seed=seed))
    else:
        ds = ds.repeat()

    ds = ds.apply(
        tf.data.experimental.map_and_batch(
            map_func=preproc_func,
            num_parallel_calls=num_threads,
            batch_size=batch_size,
            drop_remainder=True,
        ))

    ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return ds
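
Despite its name, get_tfrecords_input_fn returns a tf.data.Dataset rather than an input_fn, so call sites (for example, training_data_fn in Example #23) wrap it in a zero-argument callable. A hypothetical call is sketched below; the TFRecord glob pattern and the sizes are placeholders.

# Hypothetical call site; the TFRecord glob and the sizes are placeholders.
import glob

filenames = sorted(glob.glob('/data/tfrecords/train-*'))

def train_input_fn():
    return get_tfrecords_input_fn(filenames=filenames,
                                  batch_size=128,
                                  height=224,
                                  width=224,
                                  training=True,
                                  distort_color=False,
                                  num_threads=10,
                                  deterministic=False)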
Example #5
    def __init__(
        self,
        model_name,
        n_classes,
        layers_count,
        layers_depth,
        expansions,
        compute_format='NCHW',
        input_format='NHWC',
        weight_init='fan_out',
        dtype=tf.float32,
        use_dali=False,
        cardinality=1,
        use_se=False,
        se_ratio=1,
    ):

        self.model_hparams = tf.contrib.training.HParams(
            n_classes=n_classes,
            compute_format=compute_format,
            input_format=input_format,
            dtype=dtype,
            layers_count=layers_count,
            layers_depth=layers_depth,
            expansions=expansions,
            model_name=model_name,
            use_dali=use_dali,
            cardinality=cardinality,
            use_se=use_se,
            se_ratio=se_ratio)

        self.batch_norm_hparams = tf.contrib.training.HParams(
            decay=0.9,
            epsilon=1e-5,
            scale=True,
            center=True,
            param_initializers={
                'beta': tf.constant_initializer(0.0),
                'gamma': tf.constant_initializer(1.0),
                'moving_mean': tf.constant_initializer(0.0),
                'moving_variance': tf.constant_initializer(1.0)
            },
        )

        self.conv2d_hparams = tf.contrib.training.HParams(
            kernel_initializer=tf.variance_scaling_initializer(
                scale=2.0, distribution='truncated_normal', mode=weight_init),
            bias_initializer=tf.constant_initializer(0.0))

        self.dense_hparams = tf.contrib.training.HParams(
            kernel_initializer=tf.variance_scaling_initializer(
                scale=2.0, distribution='truncated_normal', mode=weight_init),
            bias_initializer=tf.constant_initializer(0.0))
        if hvd.rank() == 0:
            print("Model HParams:")
            print("Name", model_name)
            print("Number of classes", n_classes)
            print("Compute_format", compute_format)
            print("Input_format", input_format)
            print("dtype", str(dtype))
Example #6
        def training_data_fn():

            if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
                if hvd.rank() == 0:
                    print("Using DALI input... ")

                return data_utils.get_dali_input_fn(
                    filenames=filenames,
                    idx_filenames=idx_filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=False
                    if self.run_hparams.seed is None else True)

            elif self.run_hparams.data_dir is not None:

                return data_utils.get_tfrecords_input_fn(
                    filenames=filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=False
                    if self.run_hparams.seed is None else True)

            else:
                if hvd.rank() == 0:
                    print("Using Synthetic Data ...")
                return data_utils.get_synth_input_fn(
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    num_classes=self.run_hparams.n_classes,
                    dtype=self.run_hparams.dtype,
                )
Example #7
def train(mnist_epochs):
    """
    Train CNN
    :param mnist_epochs: number of training steps to run for
    :return: None
    """
    for batch, (images, labels) in enumerate(
            dataset.take(mnist_epochs // dist.size())):
        loss_value = training_step(images, labels, batch)
        # print loss every 50 steps on the master worker
        if batch % 50 == 0 and dist.rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))
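
The loop above calls a training_step helper that is not part of this snippet. Below is a minimal sketch of what it typically contains, mirroring the SMDataParallel DistributedGradientTape/broadcast pattern from Example #1; the module-level model, loss, optimizer, and dist names are assumed to come from the rest of the script.

# Sketch of the training_step assumed above (model/loss/optimizer/dist are module-level).
@tf.function
def training_step(images, labels, batch):
    with tf.GradientTape() as tape:
        probs = model(images, training=True)
        loss_value = loss(labels, probs)

    # SMDataParallel: average gradients across workers.
    tape = dist.DistributedGradientTape(tape)
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    if batch == 0:
        # Broadcast initial variables from rank 0 so every worker starts identically.
        dist.broadcast_variables(model.variables, root_rank=0)
        dist.broadcast_variables(optimizer.variables(), root_rank=0)

    return loss_value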
Example #8
def get_dataset(
    tokenizer: PreTrainedTokenizer,
    processor: SquadProcessor,
    data_dir: str,
    filename: str,
    per_gpu_batch_size: int,
    shard: bool,
    drop_remainder: bool,
    shuffle: bool = True,
    max_seq_length: int = 384,
    doc_stride: int = 128,
    max_query_length: int = 64,
    evaluate: bool = False,
    return_raw_features: bool = False,
    repeat: bool = False,
) -> tf.data.Dataset:
    # Convert the data from a JSON file into a tf.data.Dataset
    # This function should also work to fetch the val_dataset
    if evaluate:
        examples: List[SquadExample] = processor.get_dev_examples(
            data_dir, filename=filename)
    else:
        examples: List[SquadExample] = processor.get_train_examples(
            data_dir, filename=filename)
    # Returns a list of SquadFeatures when return_raw_features is True, otherwise a tf.data.Dataset
    dataset: List[SquadFeatures] = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset=None if return_raw_features else "tf",
        threads=16,
    )
    if return_raw_features:
        return dataset
    else:
        if shard:
            dataset = dataset.shard(smddp.size(), smddp.rank())
        if shuffle:
            dataset = dataset.shuffle(buffer_size=1000,
                                      reshuffle_each_iteration=True)
        if repeat:
            dataset = dataset.repeat()
        dataset = dataset.batch(per_gpu_batch_size,
                                drop_remainder=drop_remainder)
        if shuffle:
            dataset = dataset.shuffle(buffer_size=1000,
                                      reshuffle_each_iteration=True)
        return dataset
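
A hypothetical call into get_dataset above, assuming a transformers tokenizer and the SQuAD v2 processor; the checkpoint name, data paths, and batch size are placeholders, and shard=True additionally relies on the module-level smddp import (smdistributed.dataparallel.tensorflow).

# Hypothetical usage sketch; checkpoint, paths, and batch size are placeholders.
from transformers import AutoTokenizer
from transformers.data.processors.squad import SquadV2Processor

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
processor = SquadV2Processor()

train_dataset = get_dataset(
    tokenizer=tokenizer,
    processor=processor,
    data_dir="/opt/ml/input/data/squad",
    filename="train-v2.0.json",
    per_gpu_batch_size=8,
    shard=True,
    drop_remainder=True,
    shuffle=True,
    repeat=True,
)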
Example #9
def get_dataset(mnist_batch_size):
    """
    get the dataset slice that corresponds to the worker
    :param mnist_batch_size: size of the batch. a hyperparameter
    :return: a tf.data.Dataset object
    """
    data_slice_name = 'mnist-%d.npz' % dist.rank()
    keras_path = os.path.join(str(
        Path.home()), ".keras/datasets/")  # keras expects data to be here
    os.makedirs(keras_path, exist_ok=True)
    copyfile("/opt/ml/input/data/training/" + data_slice_name, keras_path +
             data_slice_name)  # copy file from magic S3 location to keras path
    (mnist_images,
     mnist_labels), _ = tf.keras.datasets.mnist.load_data(path=data_slice_name)
    # need to convert from 0-255 int pixel format to float, labels to ints
    data_slice = tf.data.Dataset.from_tensor_slices((
        tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
        tf.cast(mnist_labels, tf.int64))).repeat().shuffle(100000).batch(
            mnist_batch_size)  # repeat infinitely, shuffle, and set batch size
    return data_slice
    def __init__(self,
                 filenames,
                 idx_filenames,
                 height,
                 width,
                 batch_size,
                 num_threads,
                 dtype=tf.uint8,
                 dali_cpu=True,
                 deterministic=False,
                 training=False):
        device_id = hvd.local_rank()
        shard_id = hvd.rank()
        num_gpus = hvd.size()
        pipe = HybridPipe(tfrec_filenames=filenames,
                          tfrec_idx_filenames=idx_filenames,
                          height=height,
                          width=width,
                          batch_size=batch_size,
                          num_threads=num_threads,
                          device_id=device_id,
                          shard_id=shard_id,
                          num_gpus=num_gpus,
                          deterministic=deterministic,
                          dali_cpu=dali_cpu,
                          training=training)

        daliop = dali_tf.DALIIterator()

        with tf.device("/gpu:0"):
            self.images, self.labels = daliop(pipeline=pipe,
                                              shapes=[(batch_size, height,
                                                       width, 3),
                                                      (batch_size, 1)],
                                              dtypes=[tf.float32, tf.int64],
                                              device_id=device_id)
Example #11
    parser.add_argument('--rank', type=int, default=0)

    # SageMaker Container environment
    parser.add_argument('--model_dir', type=str, default='../model')
    parser.add_argument('--data_dir', type=str, default='../data')

    args = parser.parse_args()

    try:
        args.model_dir = os.environ['SM_MODEL_DIR']
        args.data_dir = os.environ['SM_CHANNEL_TRAINING']
    except KeyError as e:
        print(
            "SageMaker environment variables not found; training on the local host outside of a SageMaker TrainingJob."
        )
        if not os.path.exists(args.model_dir):
            os.makedirs(args.model_dir)

    ########################################################
    ####### 2. SageMaker Distributed Data Parallel   #######
    #######  - Get the total GPU count and the ranks #######
    ########################################################

    args.size = smdp.size()  # total number of GPUs across all hosts
    args.rank = smdp.rank()  # global rank across all hosts
    args.local_rank = smdp.local_rank()  # local rank within this host

    ########################################################

    train(args)
def main(unused_argv):
  tf.logging.set_verbosity(tf.logging.INFO)
  if FLAGS.amp:
      os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
  else:
      os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "0"

  # Set seed to reduce randomness
  np.random.seed(FLAGS.seed)
  tf.set_random_seed(FLAGS.seed)

  hvd.init()

  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  session_config = tf.ConfigProto()
  session_config.gpu_options.per_process_gpu_memory_fraction=0.9
  session_config.gpu_options.visible_device_list = str(hvd.local_rank())
  if FLAGS.allow_xla:
      session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
  model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
  config = tf.estimator.RunConfig(tf_random_seed=(FLAGS.seed + hvd.rank()),
                                  model_dir=model_dir, session_config=session_config)

  train_and_eval_dict = model_lib.create_estimator_and_inputs(
      run_config=config,
      eval_count=FLAGS.eval_count,
      hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
      pipeline_config_path=FLAGS.pipeline_config_path,
      train_steps=FLAGS.num_train_steps,
      sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
      sample_1_of_n_eval_on_train_examples=(
          FLAGS.sample_1_of_n_eval_on_train_examples))
  estimator = train_and_eval_dict['estimator']
  train_input_fn = train_and_eval_dict['train_input_fn']
  eval_input_fns = train_and_eval_dict['eval_input_fns']
  eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
  predict_input_fn = train_and_eval_dict['predict_input_fn']
  train_steps = train_and_eval_dict['train_steps']

  if FLAGS.checkpoint_dir:
    if FLAGS.eval_training_data:
      name = 'training_data'
      input_fn = eval_on_train_input_fn
    else:
      name = 'validation_data'
      # The first eval input will be evaluated.
      input_fn = eval_input_fns[0]
    if FLAGS.run_once:
      estimator.evaluate(input_fn,
                         steps=None,
                         checkpoint_path=tf.train.latest_checkpoint(
                             FLAGS.checkpoint_dir))
    else:
      model_lib.continuous_eval(estimator, FLAGS.checkpoint_dir, input_fn,
                                train_steps, name)
  else:
    train_spec, eval_specs = model_lib.create_train_and_eval_specs(
        train_input_fn,
        eval_input_fns,
        eval_on_train_input_fn,
        predict_input_fn,
        train_steps,
        eval_on_train_data=False)

    train_hooks = [
        hvd.BroadcastGlobalVariablesHook(0),
        DLLoggerHook(hvd.size() * train_and_eval_dict['train_batch_size'],
                     hvd.rank())
    ]
    eval_hooks = []

    for x in range(FLAGS.eval_count):
        estimator.train(train_input_fn,
                        hooks=train_hooks,
                        steps=train_steps // FLAGS.eval_count)


        if hvd.rank() == 0 and not FLAGS.train_only:
            eval_input_fn = eval_input_fns[0]
            results = estimator.evaluate(eval_input_fn,
                                         steps=None,
                                         hooks=eval_hooks)
Example #13
def MPI_rank():
    return hr.rank()
def main(_):
  os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false" #causes memory fragmentation for bert leading to OOM

  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  # Set seed to reduce randomness
  random.seed(FLAGS.seed)
  np.random.seed(FLAGS.seed)
  tf.set_random_seed(FLAGS.seed)

  if FLAGS.herring:
    import smdistributed.dataparallel.tensorflow as hvd
    hvd.init()

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  tf.io.gfile.makedirs(FLAGS.output_dir)

  input_files = []
  for input_file_dir in FLAGS.input_files_dir.split(","):
    input_files.extend(tf.io.gfile.glob(os.path.join(input_file_dir, "*")))

  if FLAGS.herring and len(input_files) < hvd.size():
      raise ValueError("Input Files must be sharded")
  if FLAGS.amp and FLAGS.manual_fp16:
      raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  config = tf.compat.v1.ConfigProto()
  if FLAGS.herring:
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if hvd.rank() == 0:
      tf.compat.v1.logging.info("***** Configuaration *****")
      for key in FLAGS.__flags.keys():
          tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
      tf.compat.v1.logging.info("**************************")

#    config.gpu_options.per_process_gpu_memory_fraction = 0.7
  if FLAGS.use_xla: 
      config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
      config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT
      if FLAGS.amp:
        tf.enable_resource_variables()

  run_config = tf.estimator.RunConfig(
      tf_random_seed=(FLAGS.seed if not FLAGS.herring else (FLAGS.seed + hvd.rank())),
      model_dir=FLAGS.output_dir,
      session_config=config,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.herring or hvd.rank() == 0 else None,
      save_summary_steps=FLAGS.save_checkpoints_steps if not FLAGS.herring or hvd.rank() == 0 else None,
      # This variable controls how often estimator reports examples/sec.
      # Default value is every 100 steps.
      # When --report_loss is True, we set to very large value to prevent
      # default info reporting from estimator.
      # Ideally we should set it to None, but that does not work.
      log_step_count_steps=10000 if FLAGS.report_loss else 100)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate if not FLAGS.herring else FLAGS.learning_rate*hvd.size(),
      num_train_steps=FLAGS.num_train_steps,
      num_warmup_steps=FLAGS.num_warmup_steps,
      use_one_hot_embeddings=False,
      hvd=None if not FLAGS.herring else hvd)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config)

  if FLAGS.do_train:

    training_hooks = []
    if FLAGS.herring and hvd.size() > 1:
      training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if (not FLAGS.herring or hvd.rank() == 0):
      global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.herring else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
      training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, dllogging, FLAGS.display_loss_steps, FLAGS.save_checkpoints_steps, FLAGS.report_loss))

    tf.compat.v1.logging.info("***** Running training *****")
    tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(
        input_files=input_files,
        batch_size=FLAGS.train_batch_size,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=True,
        hvd=None if not FLAGS.herring else hvd)

    train_start_time = time.time()
    estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)
    train_time_elapsed = time.time() - train_start_time

    if (not FLAGS.herring or hvd.rank() == 0):
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = FLAGS.num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (FLAGS.num_train_steps - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed,
                        FLAGS.num_train_steps * global_batch_size)
        tf.compat.v1.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead,
                        (FLAGS.num_train_steps - training_hooks[-1].skipped) * global_batch_size)
        tf.compat.v1.logging.info("Training Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second)
        tf.compat.v1.logging.info("Training Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
        dllogging.logger.log(step=(), data={"throughput_train": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

  if FLAGS.do_eval and (not FLAGS.herring or hvd.rank() == 0):
    tf.compat.v1.logging.info("***** Running evaluation *****")
    tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    eval_files = []
    for eval_file_dir in FLAGS.eval_files_dir.split(","):
        eval_files.extend(tf.io.gfile.glob(os.path.join(eval_file_dir, "*")))

    eval_input_fn = input_fn_builder(
        input_files=eval_files,
        batch_size=FLAGS.eval_batch_size,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=False,
        hvd=None if not FLAGS.herring else hvd)

    eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
    eval_start_time = time.time()
    result = estimator.evaluate(
        input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)

    eval_time_elapsed = time.time() - eval_start_time
    time_list = eval_hooks[-1].time_list
    time_list.sort()
    # Removing outliers (init/warmup) in throughput computation.
    eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
    num_sentences = (int(len(time_list) * 0.99)) * FLAGS.eval_batch_size

    ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

    tf.compat.v1.logging.info("-----------------------------")
    tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
                    eval_hooks[-1].count * FLAGS.eval_batch_size)
    tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
                    num_sentences)
    tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
    tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
    tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
    tf.compat.v1.logging.info("Inference Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
    dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT)
    tf.compat.v1.logging.info("-----------------------------")

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.io.gfile.GFile(output_eval_file, "w") as writer:
      tf.compat.v1.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))
Example #15
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # Training
    if args.do_train:

        # train_results = model.fit(tf_train_dataset, epochs=args.epochs, batch_size=args.train_batch_size)
        train_results = fit(
            model, loss, optimizer, tf_train_dataset, args.epochs, args.train_batch_size, max_steps=None
        )
        logger.info("*** Train ***")

        output_eval_file = os.path.join(args.output_data_dir, "train_results.txt")

        if not SDP_ENABLED or sdp.rank() == 0:
            with open(output_eval_file, "w") as writer:
                logger.info("***** Train results *****")
                logger.info(train_results)
                for key, value in train_results.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

    # Evaluation
    if args.do_eval and (not SDP_ENABLED or sdp.rank() == 0):

        result = model.evaluate(tf_test_dataset, batch_size=args.eval_batch_size, return_dict=True)
        logger.info("*** Evaluate ***")

        output_eval_file = os.path.join(args.output_data_dir, "eval_results.txt")
Example #16
def get_dataset_from_tfrecords(
    *,
    model_type: str,
    filenames: List[str],
    per_gpu_batch_size: int,
    max_seq_length: int,
    max_predictions_per_seq: int = None,
    buffer_size: int = 1000,
    shard: bool = True,
) -> "tf.data.Dataset":
    """ Reads the dataset from TFRecords and returns it.
    Returns a dataset that includes batching, but not gradient accumulation.
    """
    def _parse_function(example_proto):
        # Parse the input `tf.Example` proto using the dictionary above.
        return tf.io.parse_single_example(example_proto, name_to_features)

    if model_type in ["albert", "bert"]:
        assert max_predictions_per_seq is not None, "Pass --max_predictions_per_seq"
        name_to_features = {
            "input_ids":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to input_ids
            "input_mask":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to attention_mask
            "segment_ids":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to token_type_ids
            "masked_lm_positions":
            tf.io.FixedLenFeature(
                [max_predictions_per_seq], tf.int64
            ),  # The position in the sequence that is masked, in range [0, max_seq_length). 0 signifies a pad.
            "masked_lm_ids":
            tf.io.FixedLenFeature(
                [max_predictions_per_seq], tf.int64
            ),  # The token id that is masked, in range [0, vocab_size]. 0 signifies a pad.
            "masked_lm_weights":
            tf.io.FixedLenFeature(
                [max_predictions_per_seq],
                tf.float32),  # 1 if useful, 0 signifies a pad token
            "next_sentence_labels":
            tf.io.FixedLenFeature([1], tf.int64),
        }
    elif model_type in ["electra"]:
        name_to_features = {
            "input_ids":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to input_ids
            "token_type_ids":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to token_type_ids
            "attention_mask":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to attention_mask
        }
    else:
        raise ValueError(
            f"model_type={model_type} must be one of ['albert', 'bert', 'electra']"
        )

    # Example input pipeline here: https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/run_pretraining.py#L443
    assert len(filenames) > 0, "filenames is an empty list"
    # Shard and shuffle the filenames
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    if shard:
        import smdistributed.dataparallel.tensorflow as smddp

        dataset = dataset.shard(smddp.size(), smddp.rank())
    dataset = dataset.shuffle(buffer_size=len(filenames),
                              reshuffle_each_iteration=True)
    dataset = dataset.repeat()

    # `cycle_length` is the number of parallel files that get read
    num_cpu_threads = 2 * 96
    cycle_length = min(num_cpu_threads, len(filenames))
    # file_to_dataset_func = lambda file: tf.data.TFRecordDataset(file).map(_parse_function)
    file_to_dataset_func = lambda file: tf.data.TFRecordDataset(file)
    dataset = dataset.interleave(
        file_to_dataset_func,
        cycle_length=cycle_length,
        block_length=1,
        num_parallel_calls=cycle_length,
    )
    # Map and batch will be automatically fused together, see https://www.tensorflow.org/api_docs/python/tf/data/experimental/map_and_batch
    dataset = dataset.map(_parse_function, num_parallel_calls=num_cpu_threads)
    dataset = dataset.shuffle(buffer_size=buffer_size,
                              reshuffle_each_iteration=True)
    dataset = dataset.batch(per_gpu_batch_size, drop_remainder=True)
    # Shuffle the batches and prefetch some batches
    dataset = dataset.shuffle(buffer_size=buffer_size,
                              reshuffle_each_iteration=True)

    return dataset
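
A hypothetical call site for get_dataset_from_tfrecords above; the TFRecord glob and sizes are placeholders, and shard=True requires the smdistributed.dataparallel package to be importable.

# Hypothetical usage sketch; the TFRecord glob and sizes are placeholders.
import glob

pretrain_files = sorted(glob.glob("/opt/ml/input/data/train/*.tfrecord"))
dataset = get_dataset_from_tfrecords(
    model_type="bert",
    filenames=pretrain_files,
    per_gpu_batch_size=8,
    max_seq_length=128,
    max_predictions_per_seq=20,
    shard=True,
)
for batch in dataset.take(1):
    print({name: tensor.shape for name, tensor in batch.items()})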
    def __init__(self,
                 tfrec_filenames,
                 tfrec_idx_filenames,
                 height,
                 width,
                 batch_size,
                 num_threads,
                 device_id,
                 shard_id,
                 num_gpus,
                 deterministic=False,
                 dali_cpu=True,
                 training=True):

        kwargs = dict()
        if deterministic:
            kwargs['seed'] = 7 * (1 + hvd.rank())
        super(HybridPipe, self).__init__(batch_size, num_threads, device_id,
                                         **kwargs)

        self.training = training
        self.input = dali.ops.TFRecordReader(
            path=tfrec_filenames,
            index_path=tfrec_idx_filenames,
            random_shuffle=True,
            shard_id=shard_id,
            num_shards=num_gpus,
            initial_fill=10000,
            features={
                'image/encoded':
                dali.tfrecord.FixedLenFeature((), dali.tfrecord.string, ""),
                'image/class/label':
                dali.tfrecord.FixedLenFeature([1], dali.tfrecord.int64, -1),
                'image/class/text':
                dali.tfrecord.FixedLenFeature([], dali.tfrecord.string, ''),
                'image/object/bbox/xmin':
                dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
                'image/object/bbox/ymin':
                dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
                'image/object/bbox/xmax':
                dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
                'image/object/bbox/ymax':
                dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0)
            })

        if self.training:
            self.decode = dali.ops.ImageDecoderRandomCrop(
                device="cpu" if dali_cpu else "mixed",
                output_type=dali.types.RGB,
                random_aspect_ratio=[0.75, 1.33],
                random_area=[0.05, 1.0],
                num_attempts=100)
            self.resize = dali.ops.Resize(device="cpu" if dali_cpu else "gpu",
                                          resize_x=width,
                                          resize_y=height)
        else:
            self.decode = dali.ops.ImageDecoder(
                device="cpu" if dali_cpu else "mixed",
                output_type=dali.types.RGB)
            # Make sure that every image > 224 for CropMirrorNormalize
            self.resize = dali.ops.Resize(device="cpu" if dali_cpu else "gpu",
                                          resize_shorter=256)

        self.normalize = dali.ops.CropMirrorNormalize(
            device="gpu",
            output_dtype=dali.types.FLOAT,
            crop=(height, width),
            image_type=dali.types.RGB,
            mean=[123.68, 116.28, 103.53],
            std=[58.395, 57.120, 57.385],
            output_layout=dali.types.NHWC)
        self.cast_float = dali.ops.Cast(device="gpu", dtype=dali.types.FLOAT)
        self.mirror = dali.ops.CoinFlip()
        self.iter = 0

import tensorflow as tf
tf.random.set_seed(42)
import smdistributed.dataparallel.tensorflow as dist

dist.init()

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], 'GPU')

(mnist_images, mnist_labels), _ = \
    tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % dist.rank())

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
             tf.float32), tf.cast(mnist_labels, tf.int64)))
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
Example #19
import dllogger

from utils import hvd_utils
from runtime import Runner
from model.resnet import model_architectures

from utils.cmdline_helper import parse_cmdline

if __name__ == "__main__":

    tf.logging.set_verbosity(tf.logging.ERROR)

    FLAGS = parse_cmdline(model_architectures.keys())
    hvd.init()

    if hvd.rank() == 0:
        log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
        os.makedirs(FLAGS.results_dir, exist_ok=True)

        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])
    dllogger.log(data=vars(FLAGS), step='PARAMETER')

    runner = Runner(
        # ========= Model HParams ========= #
        n_classes=1001,
Example #20
# Third Party
import smdistributed.dataparallel.tensorflow as smdataparallel
import tensorflow as tf

# Register smdataparallel shutdown hook
smdataparallel.init()

gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[smdataparallel.local_rank()], "GPU")

(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(
    path="mnist-%d.npz" % smdataparallel.rank()
)

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64))
)
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential(
    [
        tf.keras.layers.Conv2D(32, [3, 3], activation="relu"),
        tf.keras.layers.Conv2D(64, [3, 3], activation="relu"),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation="relu"),
Example #21
tf.random.set_seed(42)

# SMDataParallel: Initialize
dist.init()

gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    # SMDataParallel: Pin GPUs to a single SMDataParallel process [use SMDataParallel local_rank() API]
    tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], "GPU")

(mnist_images,
 mnist_labels), _ = tf.keras.datasets.mnist.load_data(path="mnist-%d.npz" %
                                                      dist.rank())

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
             tf.float32), tf.cast(mnist_labels, tf.int64)))
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation="relu"),
    tf.keras.layers.Conv2D(64, [3, 3], activation="relu"),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation="softmax"),
Example #22

def train(mnist_epochs):
    """
    Train CNN
    :param mnist_epochs: number of training steps to run for
    :return: None
    """
    for batch, (images, labels) in enumerate(
            dataset.take(mnist_epochs // dist.size())):
        loss_value = training_step(images, labels, batch)
        # print loss every 50 steps on the master worker
        if batch % 50 == 0 and dist.rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))


if __name__ == "__main__":
    # runs training, but distributed
    dist.init()
    config_gpus()
    print("Worker number:", dist.rank())
    epochs, batch_size, learning_rate = get_hyperparameters()
    model, loss, optimizer = create_model(learning_rate)
    dataset = get_dataset(batch_size)
    train(epochs)

    # save model as master
    if dist.rank() == 0:
        checkpoint_dir = "/opt/ml/model"
        model.save(os.path.join(checkpoint_dir, '1'))
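
config_gpus() and get_hyperparameters() are called above but defined elsewhere in the script. Below is a minimal sketch of both, reusing the GPU-pinning pattern from the earlier MNIST examples; the hyperparameter names and defaults are assumptions.

# Sketch of the helpers assumed above; defaults are assumptions, dist/tf are module-level.
import argparse


def config_gpus():
    # Pin this worker process to a single GPU, as in the earlier MNIST examples.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], 'GPU')


def get_hyperparameters():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    args, _ = parser.parse_known_args()
    return args.epochs, args.batch_size, args.learning_rate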
Example #23
    def train(self,
              iter_unit,
              num_iter,
              run_iter,
              batch_size,
              warmup_steps=50,
              weight_decay=1e-4,
              lr_init=0.1,
              lr_warmup_epochs=5,
              momentum=0.9,
              log_every_n_steps=1,
              loss_scale=256,
              label_smoothing=0.0,
              mixup=0.0,
              use_cosine_lr=False,
              use_static_loss_scaling=False,
              is_benchmark=False,
              quantize=False,
              symmetric=False,
              quant_delay=0,
              finetune_checkpoint=None,
              use_final_conv=False,
              use_qdq=False):

        if iter_unit not in ["epoch", "batch"]:
            raise ValueError(
                '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])'
                % iter_unit)

        if self.run_hparams.data_dir is None and not is_benchmark:
            raise ValueError('`data_dir` must be specified for training!')

        if self.run_hparams.use_tf_amp or self.run_hparams.dtype == tf.float16:
            if use_static_loss_scaling:
                os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
            else:
                os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
        else:
            use_static_loss_scaling = False  # Make sure it hasn't been set to True on FP32 training

        num_gpus = 1 if not hvd_utils.is_using_hvd() else hvd.size()
        global_batch_size = batch_size * num_gpus

        if self.run_hparams.data_dir is not None:
            filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
                data_dir=self.run_hparams.data_dir,
                mode="train",
                iter_unit=iter_unit,
                num_iter=num_iter,
                global_batch_size=global_batch_size,
            )

            steps_per_epoch = num_steps / num_epochs

        else:
            num_epochs = 1
            num_steps = num_iter
            steps_per_epoch = num_steps
            num_decay_steps = num_steps
            num_samples = num_steps * batch_size

        if run_iter == -1:
            run_iter = num_steps
        else:
            run_iter = steps_per_epoch * run_iter if iter_unit == "epoch" else run_iter

        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            idx_filenames = runner_utils.parse_dali_idx_dataset(
                data_idx_dir=self.run_hparams.data_idx_dir, mode="train")

        training_hooks = []

        if hvd.rank() == 0:
            print('Starting Model Training...')
            print("Training Epochs", num_epochs)
            print("Total Steps", num_steps)
            print("Steps per Epoch", steps_per_epoch)
            print("Decay Steps", num_decay_steps)
            print("Weight Decay Factor", weight_decay)
            print("Init Learning Rate", lr_init)
            print("Momentum", momentum)
            print("Num GPUs", num_gpus)
            print("Per-GPU Batch Size", batch_size)

            if is_benchmark:
                self.training_logging_hook = hooks.BenchmarkLoggingHook(
                    global_batch_size=global_batch_size,
                    warmup_steps=warmup_steps)
            else:
                self.training_logging_hook = hooks.TrainingLoggingHook(
                    global_batch_size=global_batch_size,
                    num_steps=num_steps,
                    num_samples=num_samples,
                    num_epochs=num_epochs,
                    steps_per_epoch=steps_per_epoch)
            training_hooks.append(self.training_logging_hook)

        if hvd_utils.is_using_hvd():
            bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
            training_hooks.append(bcast_hook)

        training_hooks.append(hooks.PrefillStagingAreasHook())
        training_hooks.append(hooks.TrainingPartitionHook())

        estimator_params = {
            'batch_size': batch_size,
            'steps_per_epoch': steps_per_epoch,
            'num_gpus': num_gpus,
            'momentum': momentum,
            'lr_init': lr_init,
            'lr_warmup_epochs': lr_warmup_epochs,
            'weight_decay': weight_decay,
            'loss_scale': loss_scale,
            'apply_loss_scaling': use_static_loss_scaling,
            'label_smoothing': label_smoothing,
            'mixup': mixup,
            'num_decay_steps': num_decay_steps,
            'use_cosine_lr': use_cosine_lr,
            'use_final_conv': use_final_conv,
            'quantize': quantize,
            'use_qdq': use_qdq,
            'symmetric': symmetric,
            'quant_delay': quant_delay
        }

        if finetune_checkpoint:
            estimator_params['finetune_checkpoint'] = finetune_checkpoint

        image_classifier = self._get_estimator(
            mode='train',
            run_params=estimator_params,
            use_xla=self.run_hparams.use_xla,
            use_dali=self.run_hparams.use_dali,
            gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
            gpu_id=self.run_hparams.gpu_id)

        def training_data_fn():

            if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
                if hvd.rank() == 0:
                    print("Using DALI input... ")

                return data_utils.get_dali_input_fn(
                    filenames=filenames,
                    idx_filenames=idx_filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=False
                    if self.run_hparams.seed is None else True)

            elif self.run_hparams.data_dir is not None:

                return data_utils.get_tfrecords_input_fn(
                    filenames=filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=False
                    if self.run_hparams.seed is None else True)

            else:
                if hvd.rank() == 0:
                    print("Using Synthetic Data ...")
                return data_utils.get_synth_input_fn(
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    num_classes=self.run_hparams.n_classes,
                    dtype=self.run_hparams.dtype,
                )

        try:
            current_step = image_classifier.get_variable_value("global_step")
        except ValueError:
            current_step = 0

        run_iter = max(0, min(run_iter, num_steps - current_step))
        print("Current step:", current_step)

        if run_iter > 0:
            try:
                image_classifier.train(
                    input_fn=training_data_fn,
                    steps=run_iter,
                    hooks=training_hooks,
                )
            except KeyboardInterrupt:
                print("Keyboard interrupt")

        if hvd.rank() == 0:
            if run_iter > 0:
                print('Ending Model Training ...')
                train_throughput = self.training_logging_hook.mean_throughput.value(
                )
                train_time = self.training_logging_hook.train_time
                dllogger.log(data={'train_throughput': train_throughput},
                             step=tuple())
                dllogger.log(data={'Total Training time': train_time},
                             step=tuple())
            else:
                print(
                    'Model already trained required number of steps. Skipped')
Example #24
def main(args):
    # Hyper-parameters
    epochs = args.epochs
    lr = args.learning_rate
    batch_size = args.batch_size
    momentum = args.momentum
    weight_decay = args.weight_decay
    optimizer = args.optimizer
    model_type = args.model_type

    # SageMaker options
    training_dir = args.train
    validation_dir = args.validation
    eval_dir = args.eval

    # Change: Initialize SMDataParallel and get the size of the cluster
    smdp.init()
    size = smdp.size()

    # Change: Pin GPU to local process (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        # SMDataParallel: Pin GPUs to a single SMDataParallel process [use SMDataParallel local_rank() API]
        tf.config.experimental.set_visible_devices(gpus[smdp.local_rank()],
                                                   'GPU')

    # Get dataset
    train_dataset = get_dataset(training_dir + '/train.tfrecords', batch_size)
    train_dataset = train_dataset.take(NUM_TRAIN_IMAGES // size).shuffle(10000)

    val_dataset = get_dataset(validation_dir + '/validation.tfrecords',
                              batch_size)
    eval_dataset = get_dataset(eval_dir + '/eval.tfrecords', batch_size)

    # Load model
    model = get_model(model_type)

    # Optimizer
    if optimizer.lower() == 'adam':
        opt = Adam(lr=lr * size, decay=weight_decay)
    elif optimizer.lower() == 'rmsprop':
        opt = RMSprop(lr=lr * size, decay=weight_decay)
    else:
        opt = SGD(lr=lr * size, decay=weight_decay, momentum=momentum)

    # Loss function
    loss = tf.keras.losses.CategoricalCrossentropy()

    # Metrics to track
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.CategoricalAccuracy(
        name='train_accuracy')

    val_loss = tf.keras.metrics.Mean(name='val_loss')
    val_accuracy = tf.keras.metrics.CategoricalAccuracy(name='val_accuracy')

    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')

    # Training step
    @tf.function
    def training_step(images, labels, first_batch):
        with tf.GradientTape() as tape:
            train_pred = model(images, training=True)
            loss_value = loss(labels, train_pred)
        # Change: Wrap tf.GradientTape with SMDataParallel's DistributedGradientTape
        tape = smdp.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))

        if first_batch:
            # Change: Broadcast model and optimizer variables
            smdp.broadcast_variables(model.variables, root_rank=0)
            smdp.broadcast_variables(opt.variables(), root_rank=0)

        # Change: all_reduce call
        train_loss_value = smdp.oob_allreduce(
            loss_value)  # Average the loss across workers

        train_loss(train_loss_value)
        train_accuracy(labels, train_pred)
        return

    # Test step
    @tf.function
    def test_step(images, labels):
        val_pred = model(images, training=False)
        val_loss_value = loss(labels, val_pred)

        val_loss(val_loss_value)
        val_accuracy(labels, val_pred)
        return

    if smdp.rank() == 0:
        tb_log_dir = '/opt/ml/output/tensorboard/'
        train_summary_writer = tf.summary.create_file_writer(tb_log_dir)
        test_summary_writer = tf.summary.create_file_writer(tb_log_dir)

    # Training loop
    for epoch in range(epochs):
        train_loss.reset_states()
        train_accuracy.reset_states()
        val_loss.reset_states()
        val_accuracy.reset_states()

        for batch, (images, labels) in enumerate(train_dataset):
            start_time = time.time()
            training_step(images, labels, batch == 0)
            epoch_time = time.time() - start_time

        for images, labels in val_dataset:
            test_step(images, labels)

        if smdp.rank() == 0:
            with train_summary_writer.as_default():
                tf.summary.scalar('train_loss',
                                  train_loss.result(),
                                  step=epoch)
                tf.summary.scalar('train_accuracy',
                                  train_accuracy.result(),
                                  step=epoch)

            with test_summary_writer.as_default():
                tf.summary.scalar('val_loss', val_loss.result(), step=epoch)
                tf.summary.scalar('val_accuracy',
                                  val_accuracy.result(),
                                  step=epoch)

            print(
                f'Epoch: {epoch + 1}, '
                f'Epoch duration: {epoch_time} sec, '
                f'Training loss: {train_loss.result()}, '
                f'Training accuracy: {train_accuracy.result() * 100}, '
                f'Validation Loss: {val_loss.result()}, '
                f'Validation Accuracy: {val_accuracy.result() * 100}')

    for images, labels in eval_dataset:
        test_pred = model(images, training=False)
        test_loss_value = loss(labels, test_pred)

        test_loss(test_loss_value)
        test_accuracy(labels, test_pred)

    print('====== Test Results ======')
    print(f'Test loss: {test_loss.result()}, '
          f'Test accuracy: {test_accuracy.result() * 100}')
    print('====== End of training ======')

    # Change: Save checkpoints only from master node.
    if smdp.rank() == 0:
        model.save(os.path.join(os.environ["SM_MODEL_DIR"], '1'))
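
The get_dataset() and get_model() helpers used above live elsewhere in the training script. Below is a minimal sketch of a TFRecord-based get_dataset for 32x32 RGB images with one-hot labels; the feature keys, image shape, and class count are assumptions rather than the actual record schema.

# Sketch of a possible get_dataset; feature keys, 32x32x3 shape, and 10 classes are assumptions.
def get_dataset(filename, batch_size):
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }

    def _parse(example_proto):
        parsed = tf.io.parse_single_example(example_proto, feature_spec)
        image = tf.io.decode_raw(parsed['image'], tf.uint8)
        image = tf.cast(tf.reshape(image, [32, 32, 3]), tf.float32) / 255.0
        label = tf.one_hot(parsed['label'], depth=10)
        return image, label

    dataset = tf.data.TFRecordDataset(filename)
    dataset = dataset.map(_parse, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)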
Example #25
    def __init__(
            self,
            # ========= Model HParams ========= #
            n_classes=1001,
            architecture='resnet50',
            input_format='NHWC',  # NCHW or NHWC
            compute_format='NCHW',  # NCHW or NHWC
            dtype=tf.float32,  # tf.float32 or tf.float16
            n_channels=3,
            height=224,
            width=224,
            distort_colors=False,
            model_dir=None,
            log_dir=None,
            data_dir=None,
            data_idx_dir=None,
            weight_init="fan_out",

            # ======= Optimization HParams ======== #
            use_xla=False,
            use_tf_amp=False,
            use_dali=False,
            gpu_memory_fraction=1.0,
            gpu_id=0,

            # ======== Debug Flags ======== #
            debug_verbosity=0,
            seed=None):

        if dtype not in [tf.float32, tf.float16]:
            raise ValueError(
                "Unknown dtype received: %s (allowed: `tf.float32` and `tf.float16`)"
                % dtype)

        if compute_format not in ["NHWC", 'NCHW']:
            raise ValueError(
                "Unknown `compute_format` received: %s (allowed: ['NHWC', 'NCHW'])"
                % compute_format)

        if input_format not in ["NHWC", 'NCHW']:
            raise ValueError(
                "Unknown `input_format` received: %s (allowed: ['NHWC', 'NCHW'])"
                % input_format)

        if n_channels not in [1, 3]:
            raise ValueError(
                "Unsupported number of channels: %d (allowed: 1 (grayscale) and 3 (color))"
                % n_channels)

        tf_seed = 2 * (seed + hvd.rank()) if seed is not None else None

        # ============================================
        # Optimisation Flags - Do not remove
        # ============================================

        os.environ['CUDA_CACHE_DISABLE'] = '0'

        os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'

        #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

        os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
        os.environ['TF_GPU_THREAD_COUNT'] = (
            '1' if not hvd_utils.is_using_hvd() else str(hvd.size()))

        os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'

        os.environ['TF_ADJUST_HUE_FUSED'] = '1'
        os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
        os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

        os.environ['TF_SYNC_ON_FINISH'] = '0'
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
        os.environ['TF_DISABLE_NVTX_RANGES'] = '1'
        os.environ["TF_XLA_FLAGS"] = (
            os.environ.get("TF_XLA_FLAGS", "") +
            " --tf_xla_enable_lazy_compilation=false")

        # ============================================
        # TF-AMP Setup - Do not remove
        # ============================================

        if dtype == tf.float16:
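            # A model built directly in FP16 and the automatic mixed-precision graph
            # rewrite are mutually exclusive; the rewrite only applies to FP32 graphs.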
            if use_tf_amp:
                raise RuntimeError(
                    "TF AMP can not be activated for FP16 precision")

        elif use_tf_amp:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
        else:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "0"

        # =================================================

        model_hparams = tf.contrib.training.HParams(
            width=width,
            height=height,
            n_channels=n_channels,
            n_classes=n_classes,
            dtype=dtype,
            input_format=input_format,
            compute_format=compute_format,
            distort_colors=distort_colors,
            seed=tf_seed)

        num_preprocessing_threads = 10 if not use_dali else 4
        run_config_performance = tf.contrib.training.HParams(
            num_preprocessing_threads=num_preprocessing_threads,
            use_tf_amp=use_tf_amp,
            use_xla=use_xla,
            use_dali=use_dali,
            gpu_memory_fraction=gpu_memory_fraction,
            gpu_id=gpu_id)

        run_config_additional = tf.contrib.training.HParams(
            model_dir=model_dir
            if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
            log_dir=log_dir
            if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
            data_dir=data_dir,
            data_idx_dir=data_idx_dir,
            num_preprocessing_threads=num_preprocessing_threads)

        self.run_hparams = Runner._build_hparams(model_hparams,
                                                 run_config_additional,
                                                 run_config_performance)

        model_name = architecture
        architecture = resnet.model_architectures[architecture]

        self._model = resnet.ResnetModel(
            model_name=model_name,
            n_classes=model_hparams.n_classes,
            layers_count=architecture["layers"],
            layers_depth=architecture["widths"],
            expansions=architecture["expansions"],
            input_format=model_hparams.input_format,
            compute_format=model_hparams.compute_format,
            dtype=model_hparams.dtype,
            weight_init=weight_init,
            use_dali=use_dali,
            cardinality=architecture.get('cardinality', 1),
            use_se=architecture.get('use_se', False),
            se_ratio=architecture.get('se_ratio', 1))

        if self.run_hparams.seed is not None:
            np.random.seed(self.run_hparams.seed)
            tf.set_random_seed(self.run_hparams.seed)

        self.training_logging_hook = None
        self.eval_logging_hook = None
Example #26
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, LoggingArguments, PathArguments)
    )
    (
        model_args,
        data_args,
        train_args,
        log_args,
        path_args,
        remaining_strings,
    ) = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    # SageMaker may have some extra strings. TODO: Test this on SM.
    assert len(remaining_strings) == 0, f"The args {remaining_strings} could not be parsed."

    tf.random.set_seed(train_args.seed)
    tf.autograph.set_verbosity(0)

    # Settings init
    parse_bool = lambda arg: arg == "true"
    do_gradient_accumulation = train_args.gradient_accumulation_steps > 1
    do_xla = not parse_bool(train_args.skip_xla)
    do_eager = parse_bool(train_args.eager)
    skip_sop = parse_bool(train_args.skip_sop)
    skip_mlm = parse_bool(train_args.skip_mlm)
    pre_layer_norm = parse_bool(model_args.pre_layer_norm)
    fast_squad = parse_bool(log_args.fast_squad)
    dummy_eval = parse_bool(log_args.dummy_eval)
    is_sagemaker = path_args.filesystem_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker
    global max_grad_norm
    max_grad_norm = train_args.max_grad_norm

    # TODO : Change to obfuscate smddpcommon. This code does not use GradientTape, so need to pass it like this.
    if train_args.bucket_cap_mb:
        bucket_cap_bytes = int(train_args.bucket_cap_mb * 1024 * 1024)
    else:
        bucket_cap_bytes = int(64 * 1024 * 1024)
    hc.setBucketSize(bucket_cap_bytes)

    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.set_visible_devices(gpus[smddp.local_rank()], "GPU")
    # XLA, AutoGraph
    tf.config.optimizer.set_jit(do_xla)
    tf.config.experimental_run_functions_eagerly(do_eager)

    if smddp.rank() == 0:
        # Run name should only be used on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "sm" if is_sagemaker else "eks"
        if skip_sop:
            loss_str = "-skipsop"
        elif skip_mlm:
            loss_str = "-skipmlm"
        else:
            loss_str = ""

        if log_args.run_name is None:
            metadata = (
                f"{model_args.model_type}"
                f"-{model_args.model_size}"
                f"-{model_args.load_from}"
                f"-{smddp.size()}gpus"
                f"-{train_args.per_gpu_batch_size * smddp.size() * train_args.gradient_accumulation_steps}globalbatch"
                f"-{train_args.learning_rate}maxlr"
                f"-{train_args.learning_rate_decay_power}power"
                f"-{train_args.optimizer}opt"
                f"-{train_args.total_steps}steps"
                f"-{'preln' if pre_layer_norm else 'postln'}"
                f"{loss_str}"
                f"-{model_args.hidden_dropout_prob}dropout"
            )
            run_name = f"{current_time}-{platform}-{metadata}-{train_args.name if train_args.name else 'unnamed'}"
        else:
            run_name = log_args.run_name

        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        if not os.path.exists(path_args.log_dir):
            os.makedirs(path_args.log_dir)
        handlers = [
            logging.FileHandler(
                os.path.join(path_args.filesystem_prefix, path_args.log_dir, f"{run_name}.log")
            ),
            TqdmLoggingHandler(),
        ]
        logging.basicConfig(level=level, format=format, handlers=handlers)

        # Check that arguments passed in properly, only after registering the alert_func and logging
        assert not (skip_sop and skip_mlm), "Cannot use --skip_sop and --skip_mlm"

    wrap_global_functions(do_gradient_accumulation)

    # Create optimizer and enable AMP loss scaling.
    if train_args.optimizer == "lamb":
        optimizer = get_lamb_optimizer(train_args)
    elif train_args.optimizer == "adamw":
        optimizer = get_adamw_optimizer(train_args)
    else:
        raise ValueError(f"Unknown optimizer: {train_args.optimizer} (allowed: 'lamb', 'adamw')")

    if _PRE_TF_2_4_0:
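        # TF < 2.4 enables dynamic loss scaling through the graph-rewrite API; newer
        # releases wrap the optimizer in keras.mixed_precision.LossScaleOptimizer,
        # which uses dynamic loss scaling by default.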
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale="dynamic"
        )
    else:
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

    gradient_accumulator = GradientAccumulator()

    loaded_optimizer_weights = None

    model = create_model(model_class=TFAutoModelForPreTraining, model_args=model_args)
    tokenizer = create_tokenizer(model_args.model_type)
    if model_args.load_from == "checkpoint":
        checkpoint_path = os.path.join(path_args.filesystem_prefix, model_args.checkpoint_path)
        model_ckpt, optimizer_ckpt = get_checkpoint_paths_from_prefix(checkpoint_path)
        if smddp.rank() == 0:
            model.load_weights(model_ckpt)
            if model_args.load_optimizer_state == "true":
                loaded_optimizer_weights = np.load(optimizer_ckpt, allow_pickle=True)
            # We do not set the weights yet, we have to do a first step to initialize the optimizer.

    # Train filenames are [1, 2047], Val filenames are [0]. Note the different subdirectories
    # Move to same folder structure and remove if/else
    train_glob = os.path.join(path_args.filesystem_prefix, path_args.train_dir, "*.tfrecord")
    validation_glob = os.path.join(path_args.filesystem_prefix, path_args.val_dir, "*.tfrecord")

    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)

    train_dataset = get_dataset_from_tfrecords(
        model_type=model_args.model_type,
        filenames=train_filenames,
        max_seq_length=data_args.max_seq_length,
        max_predictions_per_seq=data_args.max_predictions_per_seq,
        per_gpu_batch_size=train_args.per_gpu_batch_size,
    )  # Of shape [per_gpu_batch_size, ...]
    # Batch of batches, helpful for gradient accumulation. Shape [grad_steps, per_gpu_batch_size, ...]
    train_dataset = train_dataset.batch(train_args.gradient_accumulation_steps)
    # One iteration with 10 dupes, 8 nodes seems to be 60-70k steps.
    train_dataset = train_dataset.prefetch(buffer_size=8)

    # Validation should only be done on one node, since Horovod doesn't allow allreduce on a subset of ranks
    if smddp.rank() == 0:
        validation_dataset = get_dataset_from_tfrecords(
            model_type=model_args.model_type,
            filenames=validation_filenames,
            max_seq_length=data_args.max_seq_length,
            max_predictions_per_seq=data_args.max_predictions_per_seq,
            per_gpu_batch_size=train_args.per_gpu_batch_size,
        )
        # validation_dataset = validation_dataset.batch(1)
        validation_dataset = validation_dataset.prefetch(buffer_size=8)

        pbar = tqdm.tqdm(total=train_args.total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        logger.info(f"Starting training, job name {run_name}")

    i = 1
    start_time = time.perf_counter()
    train_start_time = time.perf_counter()
    for batch in train_dataset:
        learning_rate = optimizer.learning_rate(step=tf.constant(i, dtype=tf.float32))
        # weight_decay = wd_schedule(step=tf.constant(i, dtype=tf.float32))
        loss_scale = optimizer.loss_scale() if _PRE_TF_2_4_0 else optimizer.loss_scale
        loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm, weight_norm = train_step(
            model=model,
            optimizer=optimizer,
            gradient_accumulator=gradient_accumulator,
            batch=batch,
            gradient_accumulation_steps=train_args.gradient_accumulation_steps,
            skip_sop=skip_sop,
            skip_mlm=skip_mlm,
        )

        # Don't want to wrap broadcast_variables() in a tf.function, can lead to asynchronous errors
        if i == 1:
            if smddp.rank() == 0 and loaded_optimizer_weights is not None:
                optimizer.set_weights(loaded_optimizer_weights)
            print(" RANK {} is broadcasting".format(smddp.rank()))
            #smddp.broadcast_variables(model.variables + optimizer.variables(), root_rank=0)
            smddp.broadcast_variables(model.variables, root_rank=0)
            smddp.broadcast_variables(optimizer.variables(), root_rank=0)
            print(" RANK {} is done broadcasting".format(smddp.rank()))
            # smddp.broadcast_variables(optimizer.variables(), root_rank=0)
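            # Resume the global step counter from the optimizer's iteration count
            # (the first entry returned by get_weights()).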
            i = optimizer.get_weights()[0]

        is_final_step = i >= train_args.total_steps
        do_squad = (log_args.squad_frequency != 0) and (
            (i % log_args.squad_frequency == 0) or is_final_step
        )
        # Squad requires all the ranks to train, but results are only returned on rank 0
        if do_squad:
            from albert.run_squad import get_squad_results_while_pretraining
            squad_results = get_squad_results_while_pretraining(
                model=model,
                tokenizer=tokenizer,
                model_size=model_args.model_size,
                filesystem_prefix=path_args.filesystem_prefix,
                step=i,
                dataset=data_args.squad_version,
                fast=log_args.fast_squad,
                dummy_eval=log_args.dummy_eval,
            )
            if smddp.rank() == 0:
                squad_exact, squad_f1 = squad_results["exact"], squad_results["f1"]
                logger.info(f"SQuAD step {i} -- F1: {squad_f1:.3f}, Exact: {squad_exact:.3f}")
            # Re-wrap autograph so it doesn't get arg mismatches
            wrap_global_functions(do_gradient_accumulation)
            gc.collect()

        if smddp.rank() == 0:
            do_log = i % log_args.log_frequency == 0
            do_checkpoint = (log_args.checkpoint_frequency != 0) and (
                (i % log_args.checkpoint_frequency == 0) or is_final_step
            )
            do_validation = (log_args.validation_frequency != 0) and (
                (i % log_args.validation_frequency == 0) or is_final_step
            )

            pbar.update(1)
            description = f"Loss: {loss:.3f}, MLM: {mlm_loss:.3f}, SOP: {sop_loss:.3f}, MLM_acc: {mlm_acc:.3f}, SOP_acc: {sop_acc:.3f}"
            pbar.set_description(description)
            if do_log:
                elapsed_time = time.perf_counter() - start_time
                if i == 1:
                    logger.info(f"First step: {elapsed_time:.3f} secs")
                elif is_final_step:
                    total_time = time.perf_counter() - train_start_time
                    seq_per_sec = i * train_args.per_gpu_batch_size * smddp.size() * train_args.gradient_accumulation_steps / total_time
                    logger.info(f"Final step {i}: {description} -- Average seq_per_sec: {seq_per_sec:.2f} -- Total Time: {total_time}")
                else:
                    it_per_sec = log_args.log_frequency / elapsed_time
                    logger.info(f"Train step {i} -- {description} -- It/s: {it_per_sec:.2f}")
                    start_time = time.perf_counter()

            if do_checkpoint:
                checkpoint_prefix = os.path.join(
                    path_args.filesystem_prefix, path_args.checkpoint_dir, f"{run_name}-step{i}"
                )
                model_ckpt = f"{checkpoint_prefix}.ckpt"
                optimizer_ckpt = f"{checkpoint_prefix}-optimizer.npy"
                logger.info(f"Saving model at {model_ckpt}, optimizer at {optimizer_ckpt}")
                model.save_weights(model_ckpt)
                # model.load_weights(model_ckpt)

                optimizer_weights = optimizer.get_weights()
                np.save(optimizer_ckpt, optimizer_weights)
                # optimizer.set_weights(optimizer_weights)

            if do_validation:
                val_loss, val_mlm_loss, val_mlm_acc, val_sop_loss, val_sop_acc = run_validation(
                    model=model,
                    validation_dataset=validation_dataset,
                    skip_sop=skip_sop,
                    skip_mlm=skip_mlm,
                )
                description = f"Loss: {val_loss:.3f}, MLM: {val_mlm_loss:.3f}, SOP: {val_sop_loss:.3f}, MLM_acc: {val_mlm_acc:.3f}, SOP_acc: {val_sop_acc:.3f}"
                logger.info(f"Validation step {i} -- {description}")

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(path_args.filesystem_prefix, path_args.log_dir, run_name)
                )
                config = {
                    **asdict(model_args),
                    **asdict(data_args),
                    **asdict(train_args),
                    **asdict(log_args),
                    "global_batch_size": train_args.per_gpu_batch_size * smddp.size(),
                }
                if is_wandb_available():
                    wandb.init(config=config, project=model_args.model_type)
                    wandb.run.save()
                    wandb_run_name = wandb.run.name

            train_metrics = {
                "weight_norm": weight_norm,
                "grad_norm": grad_norm,
                "loss_scale": loss_scale,
                "learning_rate": learning_rate,
                "train/loss": loss,
                "train/mlm_loss": mlm_loss,
                "train/mlm_acc": mlm_acc,
                "train/sop_loss": sop_loss,
                "train/sop_acc": sop_acc,
            }
            all_metrics = {**train_metrics}
            if do_validation:
                val_metrics = {
                    "val/loss": val_loss,
                    "val/mlm_loss": val_mlm_loss,
                    "val/mlm_acc": val_mlm_acc,
                    "val/sop_loss": val_sop_loss,
                    "val/sop_acc": val_sop_acc,
                }
                all_metrics = {**all_metrics, **val_metrics}
            if do_squad:
                squad_metrics = {
                    "squad/f1": squad_f1,
                    "squad/exact": squad_exact,
                }
                all_metrics = {**all_metrics, **squad_metrics}

            # Log to TensorBoard
            with summary_writer.as_default():
                for name, val in all_metrics.items():
                    tf.summary.scalar(name, val, step=i)
            # Log to Weights & Biases
            if is_wandb_available():
                wandb.log({"step": i, **all_metrics})

        i += 1
        if is_final_step:
            break

    if smddp.rank() == 0:
        pbar.close()
        logger.info(f"Finished pretraining, job name {run_name}")
Example #27
    def evaluate(
        self,
        iter_unit,
        num_iter,
        batch_size,
        warmup_steps=50,
        log_every_n_steps=1,
        is_benchmark=False,
        export_dir=None,
        quantize=False,
        symmetric=False,
        use_qdq=False,
        use_final_conv=False,
    ):

        if iter_unit not in ["epoch", "batch"]:
            raise ValueError(
                '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])'
                % iter_unit)

        if self.run_hparams.data_dir is None and not is_benchmark:
            raise ValueError('`data_dir` must be specified for evaluation!')

        if hvd_utils.is_using_hvd() and hvd.rank() != 0:
            raise RuntimeError('Multi-GPU inference is not supported')

        estimator_params = {
            'quantize': quantize,
            'symmetric': symmetric,
            'use_qdq': use_qdq,
            'use_final_conv': use_final_conv
        }

        image_classifier = self._get_estimator(
            mode='validation',
            run_params=estimator_params,
            use_xla=self.run_hparams.use_xla,
            use_dali=self.run_hparams.use_dali,
            gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
            gpu_id=self.run_hparams.gpu_id)

        if self.run_hparams.data_dir is not None:
            filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
                data_dir=self.run_hparams.data_dir,
                mode="validation",
                iter_unit=iter_unit,
                num_iter=num_iter,
                global_batch_size=batch_size,
            )

        else:
            num_epochs = 1
            num_decay_steps = -1
            num_steps = num_iter

        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            idx_filenames = runner_utils.parse_dali_idx_dataset(
                data_idx_dir=self.run_hparams.data_idx_dir, mode="validation")

        eval_hooks = []

        if hvd.rank() == 0:
            self.eval_logging_hook = hooks.BenchmarkLoggingHook(
                global_batch_size=batch_size, warmup_steps=warmup_steps)
            eval_hooks.append(self.eval_logging_hook)

            print('Starting Model Evaluation...')
            print("Evaluation Epochs", num_epochs)
            print("Evaluation Steps", num_steps)
            print("Decay Steps", num_decay_steps)
            print("Global Batch Size", batch_size)

        def evaluation_data_fn():
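            # Input pipeline selection: DALI when an index directory is configured,
            # plain TFRecords when only `data_dir` is set, and synthetic data
            # otherwise (benchmark mode).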

            if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
                if hvd.rank() == 0:
                    print("Using DALI input... ")

                return data_utils.get_dali_input_fn(
                    filenames=filenames,
                    idx_filenames=idx_filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=False,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=self.run_hparams.seed is not None)

            elif self.run_hparams.data_dir is not None:
                return data_utils.get_tfrecords_input_fn(
                    filenames=filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=False,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=self.run_hparams.seed is not None)

            else:
                print("Using Synthetic Data ...\n")
                return data_utils.get_synth_input_fn(
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    num_classes=self.run_hparams.n_classes,
                    dtype=self.run_hparams.dtype,
                )

        try:
            eval_results = image_classifier.evaluate(
                input_fn=evaluation_data_fn,
                steps=num_steps,
                hooks=eval_hooks,
            )

            eval_throughput = self.eval_logging_hook.mean_throughput.value()
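            # Per-batch latencies from the logging hook, converted to milliseconds and
            # summarized as the mean and p90/p95/p99 tail quantiles.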
            eval_latencies = np.array(self.eval_logging_hook.latencies) * 1000
            eval_latencies_q = np.quantile(eval_latencies, q=[0.9, 0.95, 0.99])
            eval_latencies_mean = np.mean(eval_latencies)

            dllogger.log(
                data={
                    'top1_accuracy': float(eval_results['top1_accuracy']),
                    'top5_accuracy': float(eval_results['top5_accuracy']),
                    'eval_throughput': eval_throughput,
                    'eval_latency_avg': eval_latencies_mean,
                    'eval_latency_p90': eval_latencies_q[0],
                    'eval_latency_p95': eval_latencies_q[1],
                    'eval_latency_p99': eval_latencies_q[2],
                },
                step=tuple())

            if export_dir is not None:
                dllogger.log(data={'export_dir': export_dir}, step=tuple())
                input_receiver_fn = data_utils.get_serving_input_receiver_fn(
                    batch_size=None,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    dtype=self.run_hparams.dtype)

                image_classifier.export_savedmodel(export_dir,
                                                   input_receiver_fn)

        except KeyboardInterrupt:
            print("Keyboard interrupt")

        print('Model evaluation finished')
Example #28
def build(input_reader_config, batch_size=None, transform_input_data_fn=None, multi_gpu=True):
  """Builds a tf.data.Dataset.

  Builds a tf.data.Dataset by applying the `transform_input_data_fn` on all
  records, then batches the result, dropping any partial final batch.

  Args:
    input_reader_config: A input_reader_pb2.InputReader object.
    batch_size: Batch size. If batch size is None, no batching is performed.
    transform_input_data_fn: Function to apply transformation to all records,
      or None if no extra decoding is required.
    multi_gpu: Whether to shard the dataset across Horovod workers
      (`hvd.size()` shards, one per rank).

  Returns:
    A tf.data.Dataset based on the input_reader_config.

  Raises:
    ValueError: On invalid input reader proto.
    ValueError: If no input paths are specified.
  """
  if not isinstance(input_reader_config, input_reader_pb2.InputReader):
    raise ValueError('input_reader_config not of type '
                     'input_reader_pb2.InputReader.')

  if input_reader_config.WhichOneof('input_reader') == 'tf_record_input_reader':
    config = input_reader_config.tf_record_input_reader
    if not config.input_path:
      raise ValueError('At least one input path must be specified in '
                       '`input_reader_config`.')

    label_map_proto_file = None
    if input_reader_config.HasField('label_map_path'):
      label_map_proto_file = input_reader_config.label_map_path
    decoder = tf_example_decoder.TfExampleDecoder(
        load_instance_masks=input_reader_config.load_instance_masks,
        instance_mask_type=input_reader_config.mask_type,
        label_map_proto_file=label_map_proto_file,
        use_display_name=input_reader_config.use_display_name,
        num_additional_channels=input_reader_config.num_additional_channels)

    def process_fn(value):
      """Sets up tf graph that decodes, transforms and pads input data."""
      processed_tensors = decoder.decode(value)
      if transform_input_data_fn is not None:
        processed_tensors = transform_input_data_fn(processed_tensors)
      return processed_tensors

    dataset = read_dataset(
        functools.partial(tf.data.TFRecordDataset, buffer_size=8 * 1000 * 1000),
        config.input_path[:], input_reader_config)
    if multi_gpu:
      dataset = dataset.shard(hvd.size(), hvd.rank())
    # TODO(rathodv): make batch size a required argument once the old binaries
    # are deleted.
    if batch_size:
      num_parallel_calls = batch_size * input_reader_config.num_parallel_batches
    else:
      num_parallel_calls = input_reader_config.num_parallel_map_calls
    dataset = dataset.map(
        process_fn,
        num_parallel_calls=num_parallel_calls)
    if batch_size:
      dataset = dataset.apply(
          tf.contrib.data.batch_and_drop_remainder(batch_size))
    dataset = dataset.prefetch(input_reader_config.num_prefetch_batches)
    return dataset

  raise ValueError('Unsupported input_reader_config.')
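For reference, a minimal hypothetical call to `build` might look like the following; the record and label-map paths are placeholders, and the proto import assumes the TensorFlow Object Detection API package layout:
from object_detection.protos import input_reader_pb2  # assumed package layout

reader_config = input_reader_pb2.InputReader()
reader_config.tf_record_input_reader.input_path.append(
    '/data/train-00000-of-00100.tfrecord')  # placeholder path
reader_config.label_map_path = '/data/label_map.pbtxt'  # placeholder path

# Shard across Horovod workers and batch to 24 examples per GPU (remainder dropped).
dataset = build(reader_config, batch_size=24, multi_gpu=True)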