Example #1
def get_datasets():
    # Load dataset
    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])

    # Preprocess train dataset
    train_dataset = train_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
    )
    train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])

    train_features = {x: train_dataset[x] for x in ["input_ids", "attention_mask"]}
    tf_train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_dataset["label"]))

    # Preprocess test dataset
    test_dataset = test_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
    )
    test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])

    test_features = {x: test_dataset[x] for x in ["input_ids", "attention_mask"]}
    tf_test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_dataset["label"]))

    if SDP_ENABLED:
        tf_train_dataset = tf_train_dataset.shard(sdp.size(), sdp.rank())
        tf_test_dataset = tf_test_dataset.shard(sdp.size(), sdp.rank())
    tf_train_dataset = tf_train_dataset.batch(args.train_batch_size, drop_remainder=True)
    tf_test_dataset = tf_test_dataset.batch(args.eval_batch_size, drop_remainder=True)

    return tf_train_dataset, tf_test_dataset
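
Example #1 references module-level names the snippet does not show (`tokenizer`, `args`, `SDP_ENABLED`, `sdp`). A minimal setup sketch, assuming the Hugging Face tokenizer and SageMaker's smdistributed.dataparallel package; the model name is an assumption:

import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer

try:
    # SageMaker Distributed Data Parallel is only available inside SageMaker jobs
    import smdistributed.dataparallel.tensorflow as sdp
    sdp.init()
    SDP_ENABLED = True
except ImportError:
    SDP_ENABLED = False

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # assumed model
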
Example #2
def train(args):
    # Load data from S3
    # train_dir = os.environ.get('SM_CHANNEL_TRAIN')
    train_dir = args.train
    batch_size = args.batch_size
    dataset = get_train_data(train_dir, batch_size)

    model = get_resnet50(transfer_learning=True)

    loss_fn = tf.losses.SparseCategoricalCrossentropy()
    acc = tf.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    # SMDataParallel: dist.size()
    # LR for an 8-node run: 0.000125
    # LR for a single-node run: 0.001
    opt = tf.optimizers.Adam(args.learning_rate * dist.size())

    checkpoint_dir = os.environ['SM_MODEL_DIR']
    checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)

    @tf.function
    def training_step(images, labels, first_batch):
        with tf.GradientTape() as tape:
            probs = model(images, training=True)
            loss_value = loss_fn(labels, probs)
            acc_value = acc(labels, probs)

        # SMDataParallel: Wrap tf.GradientTape with SMDataParallel's DistributedGradientTape
        tape = dist.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))

        if first_batch:
            # SMDataParallel: Broadcast model and optimizer variables
            dist.broadcast_variables(model.variables, root_rank=0)
            dist.broadcast_variables(opt.variables(), root_rank=0)

        # SMDataParallel: all_reduce call
        loss_value = dist.oob_allreduce(loss_value)  # average the loss across workers
        acc_value = dist.oob_allreduce(acc_value)

        return loss_value, acc_value

    for epoch in range(args.epochs):
        for batch, (images,
                    labels) in enumerate(dataset.take(10000 // dist.size())):
            loss_value, acc_value = training_step(images, labels, batch == 0)

            if batch % 100 == 0 and dist.rank() == 0:
                logger.info(
                    '*** Epoch %d   Step   #%d Accuracy: %.6f   Loss: %.6f ***'
                    % (epoch, batch, acc_value, loss_value))

    # SMDataParallel: Save checkpoints only from master node.
    if dist.rank() == 0:
        model.save(os.path.join(checkpoint_dir, '1'))
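
The learning-rate comments in train() above follow the linear scaling rule: each worker keeps a small per-worker rate, and multiplying by the worker count recovers the tuned single-node rate. A worked check of that arithmetic:

# Linear LR scaling: per-worker LR times worker count gives the effective LR.
base_lr = 0.000125              # per-worker LR tuned for an 8-worker run
workers = 8
effective_lr = base_lr * workers
assert effective_lr == 0.001    # matches the single-node LR in the comment
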
Example #3
def create_train_and_eval_specs(train_input_fn,
                                eval_input_fns,
                                eval_on_train_input_fn,
                                predict_input_fn,
                                train_steps,
                                eval_on_train_data=False,
                                final_exporter_name='Servo',
                                eval_spec_names=None):
  """Creates a `TrainSpec` and `EvalSpec`s.

  Args:
    train_input_fn: Function that produces features and labels on train data.
    eval_input_fns: A list of functions that produce features and labels on eval
      data.
    eval_on_train_input_fn: Function that produces features and labels for
      evaluation on train data.
    predict_input_fn: Function that produces features for inference.
    train_steps: Number of training steps.
    eval_on_train_data: Whether to evaluate model on training data. Default is
      False.
    final_exporter_name: String name given to `FinalExporter`.
    eval_spec_names: A list of string names for each `EvalSpec`.

  Returns:
    Tuple of `TrainSpec` and list of `EvalSpec`s. If `eval_on_train_data` is
    True, the last `EvalSpec` in the list corresponds to training data; the
    remaining `EvalSpec`s correspond to the evaluation datasets.
  """
  train_spec = tf.estimator.TrainSpec(
      input_fn=train_input_fn,
      max_steps=train_steps // hvd.size(),  # `TrainSpec` has no `steps` argument, only `max_steps`
      hooks=[hvd.BroadcastGlobalVariablesHook(0)])

  if eval_spec_names is None:
    eval_spec_names = [str(i) for i in range(len(eval_input_fns))]

  eval_specs = []
  for index, (eval_spec_name, eval_input_fn) in enumerate(
      zip(eval_spec_names, eval_input_fns)):
    # Uses final_exporter_name as exporter_name for the first eval spec for
    # backward compatibility.
    if index == 0:
      exporter_name = final_exporter_name
    else:
      exporter_name = '{}_{}'.format(final_exporter_name, eval_spec_name)
    exporter = tf.estimator.FinalExporter(
        name=exporter_name, serving_input_receiver_fn=predict_input_fn)
    eval_specs.append(
        tf.estimator.EvalSpec(
            name=eval_spec_name,
            input_fn=eval_input_fn,
            steps=None,
            exporters=exporter))

  if eval_on_train_data:
    eval_specs.append(
        tf.estimator.EvalSpec(
            name='eval_on_train', input_fn=eval_on_train_input_fn, steps=None))

  return train_spec, eval_specs
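
A hypothetical call site for create_train_and_eval_specs under Horovod. `estimator` and the input functions are assumed to exist elsewhere; `tf.estimator.train_and_evaluate` takes a single `EvalSpec`, so only the first one is passed:

import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()
train_spec, eval_specs = create_train_and_eval_specs(
    train_input_fn, eval_input_fns, eval_on_train_input_fn,
    predict_input_fn, train_steps=100000)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
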
Example #4
    def _get_distribution_strategy(self) -> TFDistributionStrategy:
        try:
            import horovod.tensorflow as hvd

            if hvd.size():
                return TFDistributionStrategy.HOROVOD
        except (ModuleNotFoundError, ValueError, ImportError):
            pass

        # smdistributed.dataparallel should be invoked via `mpirun`.
        # It supports EC2 machines with 8 GPUs per machine.
        if check_smdataparallel_env():
            try:
                import smdistributed.dataparallel.tensorflow as smdataparallel

                # The total number of GPUs across all the nodes in the cluster
                if smdataparallel.size():
                    return TFDistributionStrategy.SMDATAPARALLEL
            except (ModuleNotFoundError, ValueError, ImportError):
                pass

        strat = tf.distribute.get_strategy()
        if is_mirrored_strategy(strat):
            return TFDistributionStrategy.MIRRORED

        if isinstance(strat, _DefaultDistributionStrategy):
            # single device
            return TFDistributionStrategy.NONE

        # Disable PS till we verify proper support of PS on SM
        # if self.tf_config_json and is_parameter_server_strategy(self.tf_config):
        #     return TFDistributionStrategy.PARAMETER_SERVER

        return TFDistributionStrategy.UNSUPPORTED
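
A minimal sketch of the `is_mirrored_strategy` helper assumed above, treating both the single-worker and multi-worker mirrored strategies as MIRRORED (the exact check in the original module may differ):

def is_mirrored_strategy(strategy) -> bool:
    # Both variants replicate the model across devices and synchronize gradients.
    return isinstance(
        strategy,
        (tf.distribute.MirroredStrategy,
         tf.distribute.experimental.MultiWorkerMirroredStrategy),
    )
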
Example #5
def train(mnist_epochs):
    """
    Train CNN
    :param mnist_epochs: total number of training steps (batches) to run, split across workers
    :return: None
    """
    for batch, (images, labels) in enumerate(
            dataset.take(mnist_epochs // dist.size())):
        loss_value = training_step(images, labels, batch)
        # print the loss every 50 steps on the master worker (rank 0)
        if batch % 50 == 0 and dist.rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))
Example #6
def get_dataset(
    tokenizer: PreTrainedTokenizer,
    processor: SquadProcessor,
    data_dir: str,
    filename: str,
    per_gpu_batch_size: int,
    shard: bool,
    drop_remainder: bool,
    shuffle: bool = True,
    max_seq_length: int = 384,
    doc_stride: int = 128,
    max_query_length: int = 64,
    evaluate: bool = False,
    return_raw_features: bool = False,
    repeat: bool = False,
) -> tf.data.Dataset:
    # Convert the data from a JSON file into a tf.data.Dataset
    # This function should also work to fetch the val_dataset
    if evaluate:
        examples: List[SquadExample] = processor.get_dev_examples(
            data_dir, filename=filename)
    else:
        examples: List[SquadExample] = processor.get_train_examples(
            data_dir, filename=filename)
    # Returns a list of SquadFeatures when return_raw_features is True,
    # or a tf.data.Dataset when return_dataset="tf"
    dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset=None if return_raw_features else "tf",
        threads=16,
    )
    if return_raw_features:
        return dataset
    else:
        if shard:
            dataset = dataset.shard(smddp.size(), smddp.rank())
        if shuffle:
            dataset = dataset.shuffle(buffer_size=1000,
                                      reshuffle_each_iteration=True)
        if repeat:
            dataset = dataset.repeat()
        dataset = dataset.batch(per_gpu_batch_size,
                                drop_remainder=drop_remainder)
        if shuffle:
            # also shuffle the order of the batches
            dataset = dataset.shuffle(buffer_size=1000,
                                      reshuffle_each_iteration=True)
        return dataset
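
A hypothetical call for SQuAD v2 training data; the tokenizer name and paths are placeholders, and `shard=True` assumes smddp has been initialized:

from transformers import AutoTokenizer
from transformers.data.processors.squad import SquadV2Processor

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed model
train_dataset = get_dataset(
    tokenizer=tokenizer,
    processor=SquadV2Processor(),
    data_dir="squad",                 # placeholder path
    filename="train-v2.0.json",
    per_gpu_batch_size=8,
    shard=True,
    drop_remainder=True,
)
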
Example #7
def get_tfrecords_input_fn(filenames, batch_size, height, width, training,
                           distort_color, num_threads, deterministic):

    shuffle_buffer_size = 4096

    if deterministic:
        if hvd_utils.is_using_hvd():
            seed = 13 * (1 + hvd.rank())
        else:
            seed = 13
    else:
        seed = None

    ds = tf.data.Dataset.from_tensor_slices(filenames)

    if hvd_utils.is_using_hvd() and training:
        ds = ds.shard(hvd.size(), hvd.rank())

    ds = ds.apply(
        tf.data.experimental.parallel_interleave(tf.data.TFRecordDataset,
                                                 cycle_length=10,
                                                 block_length=8,
                                                 sloppy=not deterministic,
                                                 prefetch_input_elements=16))

    counter = tf.data.Dataset.range(sys.maxsize)
    ds = tf.data.Dataset.zip((ds, counter))

    def preproc_func(record, counter_):
        return image_processing.preprocess_image_record(
            record, height, width, _NUM_CHANNELS, training)

    if training:
        ds = ds.apply(
            tf.data.experimental.shuffle_and_repeat(
                buffer_size=shuffle_buffer_size, seed=seed))
    else:
        ds = ds.repeat()

    ds = ds.apply(
        tf.data.experimental.map_and_batch(
            map_func=preproc_func,
            num_parallel_calls=num_threads,
            batch_size=batch_size,
            drop_remainder=True,
        ))

    ds = ds.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    return ds
Example #8
  def __init__(self, runtime_config, model_fn):
    super(EstimatorExecuter, self).__init__(runtime_config, model_fn)

    if MPI_is_distributed():
      os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
      os.environ['HOROVOD_NUM_NCCL_STREAMS'] = '1'
      # os.environ['HOROVOD_AUTOTUNE'] = '2'

      logging.info("SageMaker Distributed Data Parallel successfully initialized ...")

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = '1' if not MPI_is_distributed() else str(hvd.size())

    os.environ['TF_SYNC_ON_FINISH'] = '0'
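
A minimal sketch of the `MPI_is_distributed` helper assumed above. Open MPI launchers (mpirun, and SageMaker's distributed launcher) export these variables into every ranked process, so their presence signals a distributed run; the original helper may check different variables:

import os

def MPI_is_distributed() -> bool:
    # Open MPI sets these per rank when the process was launched via mpirun.
    return all(v in os.environ
               for v in ("OMPI_COMM_WORLD_SIZE", "OMPI_COMM_WORLD_RANK"))
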
Example #9
    def _get_session_config(mode,
                            use_xla,
                            use_dali,
                            gpu_memory_fraction,
                            gpu_id=0):

        if mode not in ["train", "validation", "benchmark", "inference"]:
            raise ValueError(
                "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')"
                % mode)

        # Limit available GPU memory (tune the size)
        if use_dali:
            gpu_options = tf.GPUOptions(
                per_process_gpu_memory_fraction=gpu_memory_fraction)
            config = tf.ConfigProto(gpu_options=gpu_options)
            config.gpu_options.allow_growth = False
        else:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True

        config.allow_soft_placement = True
        config.log_device_placement = False

        config.gpu_options.visible_device_list = str(gpu_id)

        if hvd_utils.is_using_hvd():
            config.gpu_options.visible_device_list = str(hvd.local_rank())

        if use_xla:
            config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

        config.gpu_options.force_gpu_compatible = True  # Force pinned memory

        # Workaround for a TF fusion bug: disable batch-norm + ReLU remapping
        from tensorflow.core.protobuf import rewriter_config_pb2
        config.graph_options.rewrite_options.remapping = (
            rewriter_config_pb2.RewriterConfig.OFF)

        if mode == 'train':
            config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads

            if hvd_utils.is_using_hvd():
                config.inter_op_parallelism_threads = max(
                    2, (multiprocessing.cpu_count() // hvd.size()) - 2)
            else:
                config.inter_op_parallelism_threads = 4

        return config
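
A hypothetical TF1-style call site, assuming the method is exposed as a @staticmethod (the decorator is not shown in the snippet):

config = _get_session_config(mode="train",
                             use_xla=False,
                             use_dali=False,
                             gpu_memory_fraction=0.9)
sess = tf.Session(config=config)  # tf.ConfigProto/tf.Session are TF1 graph-mode APIs
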
Example #10
    def _get_num_workers(self):
        self._assert_distribution_strategy()
        if self.distribution_strategy == TFDistributionStrategy.HOROVOD:
            import horovod.tensorflow as hvd

            return hvd.size()
        elif self.distribution_strategy == TFDistributionStrategy.SMDATAPARALLEL:
            import smdistributed.dataparallel.tensorflow as smdataparallel

            return smdataparallel.size()
        elif self.distribution_strategy == TFDistributionStrategy.MIRRORED:
            strategy = tf.distribute.get_strategy()
            return strategy.num_replicas_in_sync
        elif self.distribution_strategy == TFDistributionStrategy.PARAMETER_SERVER:
            return get_num_workers_from_tf_config(self.tf_config_json)
        elif self.distribution_strategy == TFDistributionStrategy.NONE:
            return 1
        elif self.distribution_strategy == TFDistributionStrategy.UNSUPPORTED:
            return 1
Example #11
def create_model(mnist_learning_rate):
    """
    Creates a new Keras model for training
    :param mnist_learning_rate: learning rate for the Adam optimizer
    :return: model, loss function, and optimizer
    """
    # neural net
    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    mnist_loss = tf.losses.SparseCategoricalCrossentropy()
    # learning rate is proportional to the number of workers
    mnist_optimizer = tf.optimizers.Adam(mnist_learning_rate * dist.size())
    return mnist_model, mnist_loss, mnist_optimizer
Example #12
    def _get_num_workers(self):
        self._assert_distribution_strategy()
        if self.distribution_strategy == TFDistributionStrategy.HOROVOD:
            if _smp_imported and smp.core.initialized:
                # when model parallel is being used, there will be multiple hvd process groups,
                # hence use smp.size
                return smp.size()

            import horovod.tensorflow as hvd

            return hvd.size()
        elif self.distribution_strategy == TFDistributionStrategy.SMDATAPARALLEL:
            import smdistributed.dataparallel.tensorflow as smdataparallel

            return smdataparallel.size()
        elif self.distribution_strategy == TFDistributionStrategy.MIRRORED:
            strategy = tf.distribute.get_strategy()
            return strategy.num_replicas_in_sync
        elif self.distribution_strategy == TFDistributionStrategy.PARAMETER_SERVER:
            return get_num_workers_from_tf_config(self.tf_config_json)
        elif self.distribution_strategy == TFDistributionStrategy.NONE:
            return 1
        elif self.distribution_strategy == TFDistributionStrategy.UNSUPPORTED:
            return 1
Example #13
    def __init__(self,
                 filenames,
                 idx_filenames,
                 height,
                 width,
                 batch_size,
                 num_threads,
                 dtype=tf.uint8,
                 dali_cpu=True,
                 deterministic=False,
                 training=False):
        device_id = hvd.local_rank()
        shard_id = hvd.rank()
        num_gpus = hvd.size()
        pipe = HybridPipe(tfrec_filenames=filenames,
                          tfrec_idx_filenames=idx_filenames,
                          height=height,
                          width=width,
                          batch_size=batch_size,
                          num_threads=num_threads,
                          device_id=device_id,
                          shard_id=shard_id,
                          num_gpus=num_gpus,
                          deterministic=deterministic,
                          dali_cpu=dali_cpu,
                          training=training)

        daliop = dali_tf.DALIIterator()

        with tf.device("/gpu:0"):
            self.images, self.labels = daliop(pipeline=pipe,
                                              shapes=[(batch_size, height,
                                                       width, 3),
                                                      (batch_size, 1)],
                                              dtypes=[tf.float32, tf.int64],
                                              device_id=device_id)
Example #14
    def _get_global_batch_size(worker_batch_size):

        if hvd_utils.is_using_hvd():
            return worker_batch_size * hvd.size()
        else:
            return worker_batch_size
Example #15
    parser.add_argument('--rank', type=int, default=0)

    # SageMaker Container environment
    parser.add_argument('--model_dir', type=str, default='../model')
    parser.add_argument('--data_dir', type=str, default='../data')

    args = parser.parse_args()

    try:
        args.model_dir = os.environ['SM_MODEL_DIR']
        args.data_dir = os.environ['SM_CHANNEL_TRAINING']
    except KeyError:
        print(
            "SageMaker environment variables not found; "
            "training runs on the local host outside a SageMaker TrainingJob."
        )
        if not os.path.exists(args.model_dir):
            os.makedirs(args.model_dir)

    ########################################################
    ####### 2. SageMaker Distributed Data Parallel    ######
    #######    - Query the world size and rank values ######
    ########################################################

    args.size = smdp.size()              # total number of GPUs across all hosts
    args.rank = smdp.rank()              # global rank across all hosts
    args.local_rank = smdp.local_rank()  # local rank within this host

    ########################################################

    train(args)
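
The snippet assumes `smdp` and `parser` were set up earlier in the script; a minimal sketch of that setup:

import os
import argparse

import smdistributed.dataparallel.tensorflow as smdp

smdp.init()  # must run before size()/rank()/local_rank() are queried
parser = argparse.ArgumentParser()
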
Example #16
def MPI_size():
    # `hr` is the distributed backend imported elsewhere in this module,
    # presumably an alias such as `import smdistributed.dataparallel.tensorflow as hr`
    return hr.size()
Example #17
def main(unused_argv):
  tf.logging.set_verbosity(tf.logging.INFO)
  if FLAGS.amp:
      os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
  else:
      os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "0"

  # Seed RNGs for reproducibility
  np.random.seed(FLAGS.seed)
  tf.set_random_seed(FLAGS.seed)

  hvd.init()

  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  session_config = tf.ConfigProto()
  session_config.gpu_options.per_process_gpu_memory_fraction=0.9
  session_config.gpu_options.visible_device_list = str(hvd.local_rank())
  if FLAGS.allow_xla:
      session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
  model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
  config = tf.estimator.RunConfig(tf_random_seed=(FLAGS.seed + hvd.rank()),
                                  model_dir=model_dir, session_config=session_config)

  train_and_eval_dict = model_lib.create_estimator_and_inputs(
      run_config=config,
      eval_count=FLAGS.eval_count,
      hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
      pipeline_config_path=FLAGS.pipeline_config_path,
      train_steps=FLAGS.num_train_steps,
      sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
      sample_1_of_n_eval_on_train_examples=(
          FLAGS.sample_1_of_n_eval_on_train_examples))
  estimator = train_and_eval_dict['estimator']
  train_input_fn = train_and_eval_dict['train_input_fn']
  eval_input_fns = train_and_eval_dict['eval_input_fns']
  eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
  predict_input_fn = train_and_eval_dict['predict_input_fn']
  train_steps = train_and_eval_dict['train_steps']

  if FLAGS.checkpoint_dir:
    if FLAGS.eval_training_data:
      name = 'training_data'
      input_fn = eval_on_train_input_fn
    else:
      name = 'validation_data'
      # The first eval input will be evaluated.
      input_fn = eval_input_fns[0]
    if FLAGS.run_once:
      estimator.evaluate(input_fn,
                         steps=None,
                         checkpoint_path=tf.train.latest_checkpoint(
                             FLAGS.checkpoint_dir))
    else:
      model_lib.continuous_eval(estimator, FLAGS.checkpoint_dir, input_fn,
                                train_steps, name)
  else:
    train_spec, eval_specs = model_lib.create_train_and_eval_specs(
        train_input_fn,
        eval_input_fns,
        eval_on_train_input_fn,
        predict_input_fn,
        train_steps,
        eval_on_train_data=False)

    train_hooks = [hvd.BroadcastGlobalVariablesHook(0), DLLoggerHook(hvd.size()*train_and_eval_dict['train_batch_size'], hvd.rank())]
    eval_hooks = []

    for x in range(FLAGS.eval_count):
        estimator.train(train_input_fn,
                        hooks=train_hooks,
                        steps=train_steps // FLAGS.eval_count)


        if hvd.rank() == 0 and not FLAGS.train_only:
            eval_input_fn = eval_input_fns[0]
            results = estimator.evaluate(eval_input_fn,
                               steps=None,
                               hooks=eval_hooks)
Example #18
def main(_):
  # Lazy XLA compilation causes memory fragmentation for BERT, leading to OOM
  os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  # Seed RNGs for reproducibility
  random.seed(FLAGS.seed)
  np.random.seed(FLAGS.seed)
  tf.set_random_seed(FLAGS.seed)

  if FLAGS.herring:
    import smdistributed.dataparallel.tensorflow as hvd
    hvd.init()

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  tf.io.gfile.makedirs(FLAGS.output_dir)

  input_files = []
  for input_file_dir in FLAGS.input_files_dir.split(","):
    input_files.extend(tf.io.gfile.glob(os.path.join(input_file_dir, "*")))

  if FLAGS.herring and len(input_files) < hvd.size():
      raise ValueError("There must be at least one input file per worker so the files can be sharded across ranks")
  if FLAGS.amp and FLAGS.manual_fp16:
      raise ValueError("AMP and manual mixed-precision training cannot both be enabled")

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  config = tf.compat.v1.ConfigProto()
  if FLAGS.herring:
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if hvd.rank() == 0:
      tf.compat.v1.logging.info("***** Configuaration *****")
      for key in FLAGS.__flags.keys():
          tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
      tf.compat.v1.logging.info("**************************")

#    config.gpu_options.per_process_gpu_memory_fraction = 0.7
  if FLAGS.use_xla: 
      config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
      config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT
      if FLAGS.amp:
        tf.enable_resource_variables()

  run_config = tf.estimator.RunConfig(
      tf_random_seed=(FLAGS.seed if not FLAGS.herring else (FLAGS.seed + hvd.rank())),
      model_dir=FLAGS.output_dir,
      session_config=config,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.herring or hvd.rank() == 0 else None,
      save_summary_steps=FLAGS.save_checkpoints_steps if not FLAGS.herring or hvd.rank() == 0 else None,
      # This variable controls how often estimator reports examples/sec.
      # Default value is every 100 steps.
      # When --report_loss is True, we set to very large value to prevent
      # default info reporting from estimator.
      # Ideally we should set it to None, but that does not work.
      log_step_count_steps=10000 if FLAGS.report_loss else 100)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate if not FLAGS.herring else FLAGS.learning_rate*hvd.size(),
      num_train_steps=FLAGS.num_train_steps,
      num_warmup_steps=FLAGS.num_warmup_steps,
      use_one_hot_embeddings=False,
      hvd=None if not FLAGS.herring else hvd)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config)

  if FLAGS.do_train:

    training_hooks = []
    if FLAGS.herring and hvd.size() > 1:
      training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if (not FLAGS.herring or hvd.rank() == 0):
      global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.herring else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
      training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, dllogging, FLAGS.display_loss_steps, FLAGS.save_checkpoints_steps, FLAGS.report_loss))

    tf.compat.v1.logging.info("***** Running training *****")
    tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(
        input_files=input_files,
        batch_size=FLAGS.train_batch_size,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=True,
        hvd=None if not FLAGS.herring else hvd)

    train_start_time = time.time()
    estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)
    train_time_elapsed = time.time() - train_start_time

    if (not FLAGS.herring or hvd.rank() == 0):
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = FLAGS.num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (FLAGS.num_train_steps - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed,
                        FLAGS.num_train_steps * global_batch_size)
        tf.compat.v1.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead,
                        (FLAGS.num_train_steps - training_hooks[-1].skipped) * global_batch_size)
        tf.compat.v1.logging.info("Training Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second)
        tf.compat.v1.logging.info("Training Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
        dllogging.logger.log(step=(), data={"throughput_train": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

  if FLAGS.do_eval and (not FLAGS.herring or hvd.rank() == 0):
    tf.compat.v1.logging.info("***** Running evaluation *****")
    tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    eval_files = []
    for eval_file_dir in FLAGS.eval_files_dir.split(","):
        eval_files.extend(tf.io.gfile.glob(os.path.join(eval_file_dir, "*")))

    eval_input_fn = input_fn_builder(
        input_files=eval_files,
        batch_size=FLAGS.eval_batch_size,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=False,
        hvd=None if not FLAGS.herring else hvd)

    eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
    eval_start_time = time.time()
    result = estimator.evaluate(
        input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)

    eval_time_elapsed = time.time() - eval_start_time
    time_list = eval_hooks[-1].time_list
    time_list.sort()
    # Removing outliers (init/warmup) in throughput computation.
    eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
    num_sentences = (int(len(time_list) * 0.99)) * FLAGS.eval_batch_size

    ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

    tf.compat.v1.logging.info("-----------------------------")
    tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
                    eval_hooks[-1].count * FLAGS.eval_batch_size)
    tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
                    num_sentences)
    tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
    tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
    tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
    tf.compat.v1.logging.info("Inference Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
    dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT)
    tf.compat.v1.logging.info("-----------------------------")

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.io.gfile.GFile(output_eval_file, "w") as writer:
      tf.compat.v1.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))
Example #19
def get_dataset_from_tfrecords(
    *,
    model_type: str,
    filenames: List[str],
    per_gpu_batch_size: int,
    max_seq_length: int,
    max_predictions_per_seq: int = None,
    buffer_size: int = 1000,
    shard: bool = True,
) -> "tf.data.Dataset":
    """ Reads the dataset from TFRecords and returns it.
    Returns a dataset that includes batching, but not gradient accumulation.
    """
    def _parse_function(example_proto):
        # Parse the input `tf.Example` proto using the dictionary above.
        return tf.io.parse_single_example(example_proto, name_to_features)

    if model_type in ["albert", "bert"]:
        assert max_predictions_per_seq is not None, "Pass --max_predictions_per_seq"
        name_to_features = {
            "input_ids":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to input_ids
            "input_mask":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to attention_mask
            "segment_ids":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to token_type_ids
            "masked_lm_positions":
            tf.io.FixedLenFeature(
                [max_predictions_per_seq], tf.int64
            ),  # The number in the sequence that is masked, in range [0, max_seq_length]. 0 signifies a pad.
            "masked_lm_ids":
            tf.io.FixedLenFeature(
                [max_predictions_per_seq], tf.int64
            ),  # The token id that is masked, in range [0, vocab_size]. 0 signifies a pad.
            "masked_lm_weights":
            tf.io.FixedLenFeature(
                [max_predictions_per_seq],
                tf.float32),  # 1 if useful, 0 signifies a pad token
            "next_sentence_labels":
            tf.io.FixedLenFeature([1], tf.int64),
        }
    elif model_type in ["electra"]:
        name_to_features = {
            "input_ids":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to input_ids
            "token_type_ids":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to token_type_ids
            "attention_mask":
            tf.io.FixedLenFeature([max_seq_length],
                                  tf.int64),  # corresponds to attention_mask
        }
    else:
        raise ValueError(
            f"model_type={model_type} must be one of ['albert', 'bert', 'electra']"
        )

    # Example input pipeline here: https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/run_pretraining.py#L443
    assert len(filenames) > 0, "filenames is an empty list"
    # Shard and shuffle the filenames
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    if shard:
        import smdistributed.dataparallel.tensorflow as smddp

        dataset = dataset.shard(smddp.size(), smddp.rank())
    dataset = dataset.shuffle(buffer_size=len(filenames),
                              reshuffle_each_iteration=True)
    dataset = dataset.repeat()

    # `cycle_length` is the number of parallel files that get read
    num_cpu_threads = 2 * 96
    cycle_length = min(num_cpu_threads, len(filenames))
    # file_to_dataset_func = lambda file: tf.data.TFRecordDataset(file).map(_parse_function)
    file_to_dataset_func = lambda file: tf.data.TFRecordDataset(file)
    dataset = dataset.interleave(
        file_to_dataset_func,
        cycle_length=cycle_length,
        block_length=1,
        num_parallel_calls=cycle_length,
    )
    # Map and batch will be automatically fused together, see https://www.tensorflow.org/api_docs/python/tf/data/experimental/map_and_batch
    dataset = dataset.map(_parse_function, num_parallel_calls=num_cpu_threads)
    dataset = dataset.shuffle(buffer_size=buffer_size,
                              reshuffle_each_iteration=True)
    dataset = dataset.batch(per_gpu_batch_size, drop_remainder=True)
    # Shuffle the order of the batches (callers prefetch downstream)
    dataset = dataset.shuffle(buffer_size=buffer_size,
                              reshuffle_each_iteration=True)

    return dataset
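
A hypothetical call for BERT pre-training shards; the glob path is a placeholder, and the default `shard=True` requires smdistributed.dataparallel to be initialized first:

import glob

train_dataset = get_dataset_from_tfrecords(
    model_type="bert",
    filenames=glob.glob("data/train/*.tfrecord"),  # placeholder path
    per_gpu_batch_size=8,
    max_seq_length=128,
    max_predictions_per_seq=20,
)
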
Example #20
def main(args):
    # Hyper-parameters
    epochs = args.epochs
    lr = args.learning_rate
    batch_size = args.batch_size
    momentum = args.momentum
    weight_decay = args.weight_decay
    optimizer = args.optimizer
    model_type = args.model_type

    # SageMaker options
    training_dir = args.train
    validation_dir = args.validation
    eval_dir = args.eval

    # Change: Initialize SMDataParallel and get the size of the cluster
    smdp.init()
    size = smdp.size()

    # Change: Pin GPU to local process (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        # SMDataParallel: Pin GPUs to a single SMDataParallel process [use SMDataParallel local_rank() API]
        tf.config.experimental.set_visible_devices(gpus[smdp.local_rank()],
                                                   'GPU')

    # Get dataset
    train_dataset = get_dataset(training_dir + '/train.tfrecords', batch_size)
    train_dataset = train_dataset.take(NUM_TRAIN_IMAGES // size).shuffle(10000)

    val_dataset = get_dataset(validation_dir + '/validation.tfrecords',
                              batch_size)
    eval_dataset = get_dataset(eval_dir + '/eval.tfrecords', batch_size)

    # Load model
    model = get_model(model_type)

    # Optimizer
    if optimizer.lower() == 'adam':
        opt = Adam(lr=lr * size, decay=weight_decay)
    elif optimizer.lower() == 'rmsprop':
        opt = RMSprop(lr=lr * size, decay=weight_decay)
    else:
        opt = SGD(lr=lr * size, decay=weight_decay, momentum=momentum)

    # Loss function
    loss = tf.keras.losses.CategoricalCrossentropy()

    # Metrics to track
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.CategoricalAccuracy(
        name='train_accuracy')

    val_loss = tf.keras.metrics.Mean(name='val_loss')
    val_accuracy = tf.keras.metrics.CategoricalAccuracy(name='val_accuracy')

    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')

    # Training step
    @tf.function
    def training_step(images, labels, first_batch):
        with tf.GradientTape() as tape:
            train_pred = model(images, training=True)
            loss_value = loss(labels, train_pred)
        # Change: Wrap tf.GradientTape with SMDataParallel's DistributedGradientTape
        tape = smdp.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))

        if first_batch:
            # Change: Broadcast model and optimizer variables
            smdp.broadcast_variables(model.variables, root_rank=0)
            smdp.broadcast_variables(opt.variables(), root_rank=0)

        # Change: all_reduce call
        train_loss_value = smdp.oob_allreduce(
            loss_value)  # Average the loss across workers

        train_loss(train_loss_value)
        train_accuracy(labels, train_pred)
        return

    # Test step
    @tf.function
    def test_step(images, labels):
        val_pred = model(images, training=False)
        val_loss_value = loss(labels, val_pred)

        val_loss(val_loss_value)
        val_accuracy(labels, val_pred)
        return

    if smdp.rank() == 0:
        tb_log_dir = '/opt/ml/output/tensorboard/'
        train_summary_writer = tf.summary.create_file_writer(tb_log_dir)
        test_summary_writer = tf.summary.create_file_writer(tb_log_dir)

    # Training loop
    for epoch in range(epochs):
        train_loss.reset_states()
        train_accuracy.reset_states()
        val_loss.reset_states()
        val_accuracy.reset_states()

        for batch, (images, labels) in enumerate(train_dataset):
            start_time = time.time()
            training_step(images, labels, batch == 0)
            epoch_time = time.time() - start_time

        for images, labels in val_dataset:
            test_step(images, labels)

        if smdp.rank() == 0:
            with train_summary_writer.as_default():
                tf.summary.scalar('train_loss',
                                  train_loss.result(),
                                  step=epoch)
                tf.summary.scalar('train_accuracy',
                                  train_accuracy.result(),
                                  step=epoch)

            with test_summary_writer.as_default():
                tf.summary.scalar('val_loss', val_loss.result(), step=epoch)
                tf.summary.scalar('val_accuracy',
                                  val_accuracy.result(),
                                  step=epoch)

            print(
                f'Epoch: {epoch + 1}, '
                f'Epoch duration: {epoch_time} sec, '
                f'Training loss: {train_loss.result()}, '
                f'Training accuracy: {train_accuracy.result() * 100}',
                f'Validation Loss: {val_loss.result()}, '
                f'Validation Accuracy: {val_accuracy.result() * 100}')

    for images, labels in eval_dataset:
        test_pred = model(images, training=False)
        test_loss_value = loss(labels, test_pred)

        test_loss(test_loss_value)
        test_accuracy(labels, test_pred)

    print('====== Test Results ======')
    print(f'Test loss: {test_loss.result()}, '
          f'Test accuracy: {test_accuracy.result() * 100}')
    print('====== End of training ======')

    # Change: Save checkpoints only from master node.
    if smdp.rank() == 0:
        model.save(os.path.join(os.environ["SM_MODEL_DIR"], '1'))
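
A minimal sketch of the `get_dataset` helper assumed by Example #20, reading (image, label) TFRecords and batching them; the feature names and decoding details are assumptions:

def get_dataset(filename, batch_size):
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),  # assumed feature names
        'label': tf.io.FixedLenFeature([], tf.int64),
    }

    def _parse(proto):
        parsed = tf.io.parse_single_example(proto, feature_spec)
        image = tf.io.decode_raw(parsed['image'], tf.uint8)  # reshape/augmentation omitted
        return image, parsed['label']

    return tf.data.TFRecordDataset(filename).map(_parse).batch(batch_size)
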
Example #21
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])
loss = tf.losses.SparseCategoricalCrossentropy()
# LR for an 8-node run: 0.000125
# LR for a single-node run: 0.001
opt = tf.optimizers.Adam(0.000125 * dist.size())

checkpoint_dir = './checkpoints'
checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)


@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)

    tape = dist.DistributedGradientTape(tape)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
Example #22
mnist_model = tf.keras.Sequential(
    [
        tf.keras.layers.Conv2D(32, [3, 3], activation="relu"),
        tf.keras.layers.Conv2D(64, [3, 3], activation="relu"),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation="softmax"),
    ]
)
loss = tf.losses.SparseCategoricalCrossentropy()

opt = tf.optimizers.Adam(0.001 * smdataparallel.size())

checkpoint_dir = "/tmp/checkpoints"
checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)


def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)

    # DistributedGradientTape wraps TensorFlow's GradientTape and AllReduces
    # gradient values across workers before they are applied to the model weights.
    tape = smdataparallel.DistributedGradientTape(tape)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
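    # The snippet is truncated after the gradient computation. A sketch of the
    # usual continuation, mirroring Examples #1 and #20 above (an assumption,
    # not the original file's code):
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

    if first_batch:
        # Broadcast initial variables from rank 0 so all workers start in sync.
        smdataparallel.broadcast_variables(mnist_model.variables, root_rank=0)
        smdataparallel.broadcast_variables(opt.variables(), root_rank=0)

    return loss_value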
Example #23
    def train(self,
              iter_unit,
              num_iter,
              run_iter,
              batch_size,
              warmup_steps=50,
              weight_decay=1e-4,
              lr_init=0.1,
              lr_warmup_epochs=5,
              momentum=0.9,
              log_every_n_steps=1,
              loss_scale=256,
              label_smoothing=0.0,
              mixup=0.0,
              use_cosine_lr=False,
              use_static_loss_scaling=False,
              is_benchmark=False,
              quantize=False,
              symmetric=False,
              quant_delay=0,
              finetune_checkpoint=None,
              use_final_conv=False,
              use_qdq=False):

        if iter_unit not in ["epoch", "batch"]:
            raise ValueError(
                '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])'
                % iter_unit)

        if self.run_hparams.data_dir is None and not is_benchmark:
            raise ValueError('`data_dir` must be specified for training!')

        if self.run_hparams.use_tf_amp or self.run_hparams.dtype == tf.float16:
            if use_static_loss_scaling:
                os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
            else:
                os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
        else:
            use_static_loss_scaling = False  # Make sure it hasn't been set to True on FP32 training

        num_gpus = 1 if not hvd_utils.is_using_hvd() else hvd.size()
        global_batch_size = batch_size * num_gpus

        if self.run_hparams.data_dir is not None:
            filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
                data_dir=self.run_hparams.data_dir,
                mode="train",
                iter_unit=iter_unit,
                num_iter=num_iter,
                global_batch_size=global_batch_size,
            )

            steps_per_epoch = num_steps / num_epochs

        else:
            num_epochs = 1
            num_steps = num_iter
            steps_per_epoch = num_steps
            num_decay_steps = num_steps
            num_samples = num_steps * batch_size

        if run_iter == -1:
            run_iter = num_steps
        else:
            run_iter = steps_per_epoch * run_iter if iter_unit == "epoch" else run_iter

        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            idx_filenames = runner_utils.parse_dali_idx_dataset(
                data_idx_dir=self.run_hparams.data_idx_dir, mode="train")

        training_hooks = []

        if hvd.rank() == 0:
            print('Starting Model Training...')
            print("Training Epochs", num_epochs)
            print("Total Steps", num_steps)
            print("Steps per Epoch", steps_per_epoch)
            print("Decay Steps", num_decay_steps)
            print("Weight Decay Factor", weight_decay)
            print("Init Learning Rate", lr_init)
            print("Momentum", momentum)
            print("Num GPUs", num_gpus)
            print("Per-GPU Batch Size", batch_size)

            if is_benchmark:
                self.training_logging_hook = hooks.BenchmarkLoggingHook(
                    global_batch_size=global_batch_size,
                    warmup_steps=warmup_steps)
            else:
                self.training_logging_hook = hooks.TrainingLoggingHook(
                    global_batch_size=global_batch_size,
                    num_steps=num_steps,
                    num_samples=num_samples,
                    num_epochs=num_epochs,
                    steps_per_epoch=steps_per_epoch)
            training_hooks.append(self.training_logging_hook)

        if hvd_utils.is_using_hvd():
            bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
            training_hooks.append(bcast_hook)

        training_hooks.append(hooks.PrefillStagingAreasHook())
        training_hooks.append(hooks.TrainingPartitionHook())

        estimator_params = {
            'batch_size': batch_size,
            'steps_per_epoch': steps_per_epoch,
            'num_gpus': num_gpus,
            'momentum': momentum,
            'lr_init': lr_init,
            'lr_warmup_epochs': lr_warmup_epochs,
            'weight_decay': weight_decay,
            'loss_scale': loss_scale,
            'apply_loss_scaling': use_static_loss_scaling,
            'label_smoothing': label_smoothing,
            'mixup': mixup,
            'num_decay_steps': num_decay_steps,
            'use_cosine_lr': use_cosine_lr,
            'use_final_conv': use_final_conv,
            'quantize': quantize,
            'use_qdq': use_qdq,
            'symmetric': symmetric,
            'quant_delay': quant_delay
        }

        if finetune_checkpoint:
            estimator_params['finetune_checkpoint'] = finetune_checkpoint

        image_classifier = self._get_estimator(
            mode='train',
            run_params=estimator_params,
            use_xla=self.run_hparams.use_xla,
            use_dali=self.run_hparams.use_dali,
            gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
            gpu_id=self.run_hparams.gpu_id)

        def training_data_fn():

            if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
                if hvd.rank() == 0:
                    print("Using DALI input... ")

                return data_utils.get_dali_input_fn(
                    filenames=filenames,
                    idx_filenames=idx_filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=self.run_hparams.seed is not None)

            elif self.run_hparams.data_dir is not None:

                return data_utils.get_tfrecords_input_fn(
                    filenames=filenames,
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    training=True,
                    distort_color=self.run_hparams.distort_colors,
                    num_threads=self.run_hparams.num_preprocessing_threads,
                    deterministic=self.run_hparams.seed is not None)

            else:
                if hvd.rank() == 0:
                    print("Using Synthetic Data ...")
                return data_utils.get_synth_input_fn(
                    batch_size=batch_size,
                    height=self.run_hparams.height,
                    width=self.run_hparams.width,
                    num_channels=self.run_hparams.n_channels,
                    data_format=self.run_hparams.input_format,
                    num_classes=self.run_hparams.n_classes,
                    dtype=self.run_hparams.dtype,
                )

        try:
            current_step = image_classifier.get_variable_value("global_step")
        except ValueError:
            current_step = 0

        run_iter = max(0, min(run_iter, num_steps - current_step))
        print("Current step:", current_step)

        if run_iter > 0:
            try:
                image_classifier.train(
                    input_fn=training_data_fn,
                    steps=run_iter,
                    hooks=training_hooks,
                )
            except KeyboardInterrupt:
                print("Keyboard interrupt")

        if hvd.rank() == 0:
            if run_iter > 0:
                print('Ending Model Training ...')
                train_throughput = self.training_logging_hook.mean_throughput.value()
                train_time = self.training_logging_hook.train_time
                dllogger.log(data={'train_throughput': train_throughput},
                             step=tuple())
                dllogger.log(data={'Total Training time': train_time},
                             step=tuple())
            else:
                print(
                    'Model has already been trained for the required number of steps; skipping.')
Example #24
def is_using_hvd():
    # Distributed only when more than one Horovod process is running
    return hvd.size() > 1
Example #25
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, LoggingArguments, PathArguments)
    )
    (
        model_args,
        data_args,
        train_args,
        log_args,
        path_args,
        remaining_strings,
    ) = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    # SageMaker may have some extra strings. TODO: Test this on SM.
    assert len(remaining_strings) == 0, f"The args {remaining_strings} could not be parsed."

    tf.random.set_seed(train_args.seed)
    tf.autograph.set_verbosity(0)

    # Settings init
    parse_bool = lambda arg: arg == "true"
    do_gradient_accumulation = train_args.gradient_accumulation_steps > 1
    do_xla = not parse_bool(train_args.skip_xla)
    do_eager = parse_bool(train_args.eager)
    skip_sop = parse_bool(train_args.skip_sop)
    skip_mlm = parse_bool(train_args.skip_mlm)
    pre_layer_norm = parse_bool(model_args.pre_layer_norm)
    fast_squad = parse_bool(log_args.fast_squad)
    dummy_eval = parse_bool(log_args.dummy_eval)
    is_sagemaker = path_args.filesystem_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker
    global max_grad_norm
    max_grad_norm = train_args.max_grad_norm

    # TODO: hide smddpcommon behind an abstraction. This code path does not use
    # GradientTape, so the bucket size has to be passed explicitly like this.
    if train_args.bucket_cap_mb:
        bucket_cap_bytes = int(train_args.bucket_cap_mb * 1024 * 1024)
    else:
        bucket_cap_bytes = int(64 * 1024 * 1024)
    hc.setBucketSize(bucket_cap_bytes)

    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.set_visible_devices(gpus[smddp.local_rank()], "GPU")
    # XLA, AutoGraph
    tf.config.optimizer.set_jit(do_xla)
    tf.config.experimental_run_functions_eagerly(do_eager)

    if smddp.rank() == 0:
        # Run name should only be used on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "sm" if is_sagemaker else "eks"
        if skip_sop:
            loss_str = "-skipsop"
        elif skip_mlm:
            loss_str = "-skipmlm"
        else:
            loss_str = ""

        if log_args.run_name is None:
            metadata = (
                f"{model_args.model_type}"
                f"-{model_args.model_size}"
                f"-{model_args.load_from}"
                f"-{smddp.size()}gpus"
                f"-{train_args.per_gpu_batch_size * smddp.size() * train_args.gradient_accumulation_steps}globalbatch"
                f"-{train_args.learning_rate}maxlr"
                f"-{train_args.learning_rate_decay_power}power"
                f"-{train_args.optimizer}opt"
                f"-{train_args.total_steps}steps"
                f"-{'preln' if pre_layer_norm else 'postln'}"
                f"{loss_str}"
                f"-{model_args.hidden_dropout_prob}dropout"
            )
            run_name = f"{current_time}-{platform}-{metadata}-{train_args.name if train_args.name else 'unnamed'}"
        else:
            run_name = log_args.run_name

        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        if not os.path.exists(path_args.log_dir):
            os.makedirs(path_args.log_dir)
        handlers = [
            logging.FileHandler(
                os.path.join(path_args.filesystem_prefix, path_args.log_dir, f"{run_name}.log")
            ),
            TqdmLoggingHandler(),
        ]
        logging.basicConfig(level=level, format=format, handlers=handlers)

        # Check that arguments passed in properly, only after registering the alert_func and logging
        assert not (skip_sop and skip_mlm), "Cannot use --skip_sop and --skip_mlm"

    wrap_global_functions(do_gradient_accumulation)

    # Create optimizer and enable AMP loss scaling.
    if train_args.optimizer == "lamb":
        optimizer = get_lamb_optimizer(train_args)
    elif train_args.optimizer == "adamw":
        optimizer = get_adamw_optimizer(train_args)

    if _PRE_TF_2_4_0:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale="dynamic"
        )
    else:
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

    gradient_accumulator = GradientAccumulator()

    loaded_optimizer_weights = None

    model = create_model(model_class=TFAutoModelForPreTraining, model_args=model_args)
    tokenizer = create_tokenizer(model_args.model_type)
    if model_args.load_from == "checkpoint":
        checkpoint_path = os.path.join(path_args.filesystem_prefix, model_args.checkpoint_path)
        model_ckpt, optimizer_ckpt = get_checkpoint_paths_from_prefix(checkpoint_path)
        if smddp.rank() == 0:
            model.load_weights(model_ckpt)
            if model_args.load_optimizer_state == "true":
                loaded_optimizer_weights = np.load(optimizer_ckpt, allow_pickle=True)
            # We do not set the weights yet, we have to do a first step to initialize the optimizer.

    # Train filenames are [1, 2047], val filenames are [0]. Note the different subdirectories.
    # TODO: Move train and validation data to the same folder structure.
    train_glob = os.path.join(path_args.filesystem_prefix, path_args.train_dir, "*.tfrecord")
    validation_glob = os.path.join(path_args.filesystem_prefix, path_args.val_dir, "*.tfrecord")

    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)

    train_dataset = get_dataset_from_tfrecords(
        model_type=model_args.model_type,
        filenames=train_filenames,
        max_seq_length=data_args.max_seq_length,
        max_predictions_per_seq=data_args.max_predictions_per_seq,
        per_gpu_batch_size=train_args.per_gpu_batch_size,
    )  # Of shape [per_gpu_batch_size, ...]
    # Batch of batches, helpful for gradient accumulation. Shape [grad_steps, per_gpu_batch_size, ...]
    train_dataset = train_dataset.batch(train_args.gradient_accumulation_steps)
    # One iteration with 10 dupes, 8 nodes seems to be 60-70k steps.
    train_dataset = train_dataset.prefetch(buffer_size=8)
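    # prefetch overlaps host-side input preparation with device execution;
    # buffer_size=8 keeps up to 8 accumulation "super-batches" in flight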

    # Validation should only be done on one rank, since the data-parallel backend doesn't allow allreduce on a subset of ranks
    if smddp.rank() == 0:
        validation_dataset = get_dataset_from_tfrecords(
            model_type=model_args.model_type,
            filenames=validation_filenames,
            max_seq_length=data_args.max_seq_length,
            max_predictions_per_seq=data_args.max_predictions_per_seq,
            per_gpu_batch_size=train_args.per_gpu_batch_size,
        )
        # validation_dataset = validation_dataset.batch(1)
        validation_dataset = validation_dataset.prefetch(buffer_size=8)

        pbar = tqdm.tqdm(total=train_args.total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        logger.info(f"Starting training, job name {run_name}")

    i = 1
    start_time = time.perf_counter()
    train_start_time = time.perf_counter()
    for batch in train_dataset:
        learning_rate = optimizer.learning_rate(step=tf.constant(i, dtype=tf.float32))
        # weight_decay = wd_schedule(step=tf.constant(i, dtype=tf.float32))
        loss_scale = optimizer.loss_scale() if _PRE_TF_2_4_0 else optimizer.loss_scale
        loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm, weight_norm = train_step(
            model=model,
            optimizer=optimizer,
            gradient_accumulator=gradient_accumulator,
            batch=batch,
            gradient_accumulation_steps=train_args.gradient_accumulation_steps,
            skip_sop=skip_sop,
            skip_mlm=skip_mlm,
        )

        # Don't wrap broadcast_variables() in a tf.function; that can lead to asynchronous errors
        if i == 1:
            if smddp.rank() == 0 and loaded_optimizer_weights is not None:
                optimizer.set_weights(loaded_optimizer_weights)
            print(f"Rank {smddp.rank()} is broadcasting variables")
            smddp.broadcast_variables(model.variables, root_rank=0)
            smddp.broadcast_variables(optimizer.variables(), root_rank=0)
            print(f"Rank {smddp.rank()} is done broadcasting")
            # The first optimizer weight is the iteration count, so this resumes
            # the step counter when restoring from a checkpoint
            i = optimizer.get_weights()[0]

        is_final_step = i >= train_args.total_steps
        do_squad = (log_args.squad_frequency != 0) and (
            (i % log_args.squad_frequency == 0) or is_final_step
        )
        # Squad requires all the ranks to train, but results are only returned on rank 0
        if do_squad:
            from albert.run_squad import get_squad_results_while_pretraining
            squad_results = get_squad_results_while_pretraining(
                model=model,
                tokenizer=tokenizer,
                model_size=model_args.model_size,
                filesystem_prefix=path_args.filesystem_prefix,
                step=i,
                dataset=data_args.squad_version,
                fast=log_args.fast_squad,
                dummy_eval=log_args.dummy_eval,
            )
            if smddp.rank() == 0:
                squad_exact, squad_f1 = squad_results["exact"], squad_results["f1"]
                logger.info(f"SQuAD step {i} -- F1: {squad_f1:.3f}, Exact: {squad_exact:.3f}")
            # Re-wrap autograph so it doesn't get arg mismatches
            wrap_global_functions(do_gradient_accumulation)
            gc.collect()

        if smddp.rank() == 0:
            do_log = i % log_args.log_frequency == 0
            do_checkpoint = (log_args.checkpoint_frequency != 0) and (
                (i % log_args.checkpoint_frequency == 0) or is_final_step
            )
            do_validation = (log_args.validation_frequency != 0) and (
                (i % log_args.validation_frequency == 0) or is_final_step
            )

            pbar.update(1)
            description = f"Loss: {loss:.3f}, MLM: {mlm_loss:.3f}, SOP: {sop_loss:.3f}, MLM_acc: {mlm_acc:.3f}, SOP_acc: {sop_acc:.3f}"
            pbar.set_description(description)
            if do_log:
                elapsed_time = time.perf_counter() - start_time
                if i == 1:
                    logger.info(f"First step: {elapsed_time:.3f} secs")
                elif is_final_step:
                    total_time = time.perf_counter() - train_start_time
                    seq_per_sec = i * train_args.per_gpu_batch_size * smddp.size() * train_args.gradient_accumulation_steps / total_time
                    logger.info(f"Final step {i}: {description} -- Average seq_per_sec: {seq_per_sec:.2f} -- Total Time: {total_time}")
                else:
                    it_per_sec = log_args.log_frequency / elapsed_time
                    logger.info(f"Train step {i} -- {description} -- It/s: {it_per_sec:.2f}")
                    start_time = time.perf_counter()

            if do_checkpoint:
                checkpoint_prefix = os.path.join(
                    path_args.filesystem_prefix, path_args.checkpoint_dir, f"{run_name}-step{i}"
                )
                model_ckpt = f"{checkpoint_prefix}.ckpt"
                optimizer_ckpt = f"{checkpoint_prefix}-optimizer.npy"
                logger.info(f"Saving model at {model_ckpt}, optimizer at {optimizer_ckpt}")
                model.save_weights(model_ckpt)
                # model.load_weights(model_ckpt)

                optimizer_weights = optimizer.get_weights()
                np.save(optimizer_ckpt, optimizer_weights)
                # optimizer.set_weights(optimizer_weights)

            if do_validation:
                val_loss, val_mlm_loss, val_mlm_acc, val_sop_loss, val_sop_acc = run_validation(
                    model=model,
                    validation_dataset=validation_dataset,
                    skip_sop=skip_sop,
                    skip_mlm=skip_mlm,
                )
                description = f"Loss: {val_loss:.3f}, MLM: {val_mlm_loss:.3f}, SOP: {val_sop_loss:.3f}, MLM_acc: {val_mlm_acc:.3f}, SOP_acc: {val_sop_acc:.3f}"
                logger.info(f"Validation step {i} -- {description}")

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(path_args.filesystem_prefix, path_args.log_dir, run_name)
                )
                config = {
                    **asdict(model_args),
                    **asdict(data_args),
                    **asdict(train_args),
                    **asdict(log_args),
                    "global_batch_size": train_args.per_gpu_batch_size * smddp.size(),
                }
                if is_wandb_available():
                    wandb.init(config=config, project=model_args.model_type)
                    wandb.run.save()
                    wandb_run_name = wandb.run.name

            train_metrics = {
                "weight_norm": weight_norm,
                "grad_norm": grad_norm,
                "loss_scale": loss_scale,
                "learning_rate": learning_rate,
                "train/loss": loss,
                "train/mlm_loss": mlm_loss,
                "train/mlm_acc": mlm_acc,
                "train/sop_loss": sop_loss,
                "train/sop_acc": sop_acc,
            }
            all_metrics = {**train_metrics}
            if do_validation:
                val_metrics = {
                    "val/loss": val_loss,
                    "val/mlm_loss": val_mlm_loss,
                    "val/mlm_acc": val_mlm_acc,
                    "val/sop_loss": val_sop_loss,
                    "val/sop_acc": val_sop_acc,
                }
                all_metrics = {**all_metrics, **val_metrics}
            if do_squad:
                squad_metrics = {
                    "squad/f1": squad_f1,
                    "squad/exact": squad_exact,
                }
                all_metrics = {**all_metrics, **squad_metrics}

            # Log to TensorBoard
            with summary_writer.as_default():
                for name, val in all_metrics.items():
                    tf.summary.scalar(name, val, step=i)
            # Log to Weights & Biases
            if is_wandb_available():
                wandb.log({"step": i, **all_metrics})

        i += 1
        if is_final_step:
            break

    if smddp.rank() == 0:
        pbar.close()
        logger.info(f"Finished pretraining, job name {run_name}")
Example #26
    def __init__(
            self,
            # ========= Model HParams ========= #
            n_classes=1001,
            architecture='resnet50',
            input_format='NHWC',  # NCHW or NHWC
            compute_format='NCHW',  # NCHW or NHWC
            dtype=tf.float32,  # tf.float32 or tf.float16
            n_channels=3,
            height=224,
            width=224,
            distort_colors=False,
            model_dir=None,
            log_dir=None,
            data_dir=None,
            data_idx_dir=None,
            weight_init="fan_out",

            # ======= Optimization HParams ======== #
            use_xla=False,
            use_tf_amp=False,
            use_dali=False,
            gpu_memory_fraction=1.0,
            gpu_id=0,

            # ======== Debug Flags ======== #
            debug_verbosity=0,
            seed=None):

        if dtype not in [tf.float32, tf.float16]:
            raise ValueError(
                "Unknown dtype received: %s (allowed: `tf.float32` and `tf.float16`)"
                % dtype)

        if compute_format not in ["NHWC", "NCHW"]:
            raise ValueError(
                "Unknown `compute_format` received: %s (allowed: ['NHWC', 'NCHW'])"
                % compute_format)

        if input_format not in ["NHWC", "NCHW"]:
            raise ValueError(
                "Unknown `input_format` received: %s (allowed: ['NHWC', 'NCHW'])"
                % input_format)

        if n_channels not in [1, 3]:
            raise ValueError(
                "Unsupported number of channels: %d (allowed: 1 (grayscale) and 3 (color))"
                % n_channels)

        tf_seed = 2 * (seed + hvd.rank()) if seed is not None else None
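        # Offsetting the seed by rank keeps a run reproducible for a fixed
        # (seed, rank) pair while giving each worker different shuffling and
        # augmentation randomness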

        # ============================================
        # Optimisation Flags - Do not remove
        # ============================================

        os.environ['CUDA_CACHE_DISABLE'] = '0'

        os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'

        #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

        os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
        os.environ['TF_GPU_THREAD_COUNT'] = \
            '1' if not hvd_utils.is_using_hvd() else str(hvd.size())

        os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'

        os.environ['TF_ADJUST_HUE_FUSED'] = '1'
        os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
        os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

        os.environ['TF_SYNC_ON_FINISH'] = '0'
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
        os.environ['TF_DISABLE_NVTX_RANGES'] = '1'
        os.environ["TF_XLA_FLAGS"] = (
            os.environ.get("TF_XLA_FLAGS", "") +
            " --tf_xla_enable_lazy_compilation=false")

        # ============================================
        # TF-AMP Setup - Do not remove
        # ============================================

        if dtype == tf.float16:
            if use_tf_amp:
                raise RuntimeError(
                    "TF AMP cannot be activated when dtype is already tf.float16")

        elif use_tf_amp:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
        else:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "0"

        # =================================================

        model_hparams = tf.contrib.training.HParams(
            width=width,
            height=height,
            n_channels=n_channels,
            n_classes=n_classes,
            dtype=dtype,
            input_format=input_format,
            compute_format=compute_format,
            distort_colors=distort_colors,
            seed=tf_seed)

        num_preprocessing_threads = 10 if not use_dali else 4
        run_config_performance = tf.contrib.training.HParams(
            num_preprocessing_threads=num_preprocessing_threads,
            use_tf_amp=use_tf_amp,
            use_xla=use_xla,
            use_dali=use_dali,
            gpu_memory_fraction=gpu_memory_fraction,
            gpu_id=gpu_id)

        is_chief = not hvd_utils.is_using_hvd() or hvd.rank() == 0
        run_config_additional = tf.contrib.training.HParams(
            model_dir=model_dir if is_chief else None,
            log_dir=log_dir if is_chief else None,
            data_dir=data_dir,
            data_idx_dir=data_idx_dir,
            num_preprocessing_threads=num_preprocessing_threads)

        self.run_hparams = Runner._build_hparams(model_hparams,
                                                 run_config_additional,
                                                 run_config_performance)

        model_name = architecture
        architecture = resnet.model_architectures[architecture]

        self._model = resnet.ResnetModel(
            model_name=model_name,
            n_classes=model_hparams.n_classes,
            layers_count=architecture["layers"],
            layers_depth=architecture["widths"],
            expansions=architecture["expansions"],
            input_format=model_hparams.input_format,
            compute_format=model_hparams.compute_format,
            dtype=model_hparams.dtype,
            weight_init=weight_init,
            use_dali=use_dali,
            cardinality=architecture.get('cardinality', 1),
            use_se=architecture.get('use_se', False),
            se_ratio=architecture.get('se_ratio', 1))

        if self.run_hparams.seed is not None:
            np.random.seed(self.run_hparams.seed)
            tf.set_random_seed(self.run_hparams.seed)

        self.training_logging_hook = None
        self.eval_logging_hook = None
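
The separate `input_format` and `compute_format` options exist because input
pipelines typically produce NHWC tensors, while NCHW is usually faster for
cuDNN convolutions. A minimal sketch of the layout conversion implied by those
two settings (illustrative only, not the ResnetModel internals):

import tensorflow as tf

def to_compute_format(images, input_format="NHWC", compute_format="NCHW"):
    # Transpose only when the layouts differ
    if input_format == "NHWC" and compute_format == "NCHW":
        return tf.transpose(images, [0, 3, 1, 2])  # (N, H, W, C) -> (N, C, H, W)
    if input_format == "NCHW" and compute_format == "NHWC":
        return tf.transpose(images, [0, 2, 3, 1])  # (N, C, H, W) -> (N, H, W, C)
    return images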
Example #27
def build(input_reader_config, batch_size=None, transform_input_data_fn=None, multi_gpu=True):
  """Builds a tf.data.Dataset.

  Builds a tf.data.Dataset by applying the `transform_input_data_fn` on all
  records. Applies a padded batch to the resulting dataset.

  Args:
    input_reader_config: An input_reader_pb2.InputReader object.
    batch_size: Batch size. If batch size is None, no batching is performed.
    transform_input_data_fn: Function to apply transformation to all records,
      or None if no extra decoding is required.
    multi_gpu: Whether to shard the dataset across Horovod ranks for
      multi-GPU training.

  Returns:
    A tf.data.Dataset based on the input_reader_config.

  Raises:
    ValueError: On invalid input reader proto.
    ValueError: If no input paths are specified.
  """
  if not isinstance(input_reader_config, input_reader_pb2.InputReader):
    raise ValueError('input_reader_config not of type '
                     'input_reader_pb2.InputReader.')

  if input_reader_config.WhichOneof('input_reader') == 'tf_record_input_reader':
    config = input_reader_config.tf_record_input_reader
    if not config.input_path:
      raise ValueError('At least one input path must be specified in '
                       '`input_reader_config`.')

    label_map_proto_file = None
    if input_reader_config.HasField('label_map_path'):
      label_map_proto_file = input_reader_config.label_map_path
    decoder = tf_example_decoder.TfExampleDecoder(
        load_instance_masks=input_reader_config.load_instance_masks,
        instance_mask_type=input_reader_config.mask_type,
        label_map_proto_file=label_map_proto_file,
        use_display_name=input_reader_config.use_display_name,
        num_additional_channels=input_reader_config.num_additional_channels)

    def process_fn(value):
      """Sets up tf graph that decodes, transforms and pads input data."""
      processed_tensors = decoder.decode(value)
      if transform_input_data_fn is not None:
        processed_tensors = transform_input_data_fn(processed_tensors)
      return processed_tensors

    dataset = read_dataset(
        functools.partial(tf.data.TFRecordDataset, buffer_size=8 * 1000 * 1000),
        config.input_path[:], input_reader_config)
    if multi_gpu:
      dataset = dataset.shard(hvd.size(), hvd.rank())
    # TODO(rathodv): make batch size a required argument once the old binaries
    # are deleted.
    if batch_size:
      num_parallel_calls = batch_size * input_reader_config.num_parallel_batches
    else:
      num_parallel_calls = input_reader_config.num_parallel_map_calls
    dataset = dataset.map(
        process_fn,
        num_parallel_calls=num_parallel_calls)
    if batch_size:
      dataset = dataset.apply(
          tf.contrib.data.batch_and_drop_remainder(batch_size))
    dataset = dataset.prefetch(input_reader_config.num_prefetch_batches)
    return dataset

  raise ValueError('Unsupported input_reader_config.')
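
Sharding before the map and batch stages matters: each rank then decodes only
its own 1/N of the records instead of decoding everything and discarding most
of it. A minimal sketch of the same ordering on a plain TFRecord pipeline,
assuming hvd.init() has been called and `parse_fn` is a user-supplied record
parser:

import tensorflow as tf
import horovod.tensorflow as hvd

def sharded_pipeline(filenames, parse_fn, batch_size):
    dataset = tf.data.TFRecordDataset(filenames, buffer_size=8 * 1000 * 1000)
    dataset = dataset.shard(hvd.size(), hvd.rank())  # shard first, decode less
    dataset = dataset.map(parse_fn,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # drop_remainder keeps per-rank batch counts identical, which allreduce needs
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)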