Example #1
# Assumed imports for these excerpts; `common`, `image_processing`, and
# `hvd_patch` are repo-local helpers, and `_ProfileKerasFitCallback` (used
# in example #2) is defined elsewhere in the same source file.
import os
import re
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend
import horovod.tensorflow.keras as hvd  # assumption; example #3 may need horovod.tensorflow

import common
import hvd_patch
import image_processing


def predict_ctl(params):
    image_width = params['image_width']
    image_height = params['image_height']
    batch_size = params['batch_size']
    export_dir = params['export_dir']

    assert export_dir, "--export_dir must be given."
    model_path = export_dir + "/saved_model_rn50.h5"
    assert os.path.exists(model_path)

    model = keras.models.load_model(model_path, custom_objects={
        "PiecewiseConstantDecayWithWarmup":
            common.PiecewiseConstantDecayWithWarmup})

    predict_input = image_processing.fake_image_set(batch_size, image_height,
                                                    image_width,
                                                    with_label=False)
    results = model.predict(predict_input, verbose=1, steps=3)
    print(f"The loaded model predicts {results.shape[0]} images.")
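A minimal usage sketch for `predict_ctl` (the values below are hypothetical; the function reads only these four keys, and `export_dir` must already contain `saved_model_rn50.h5`):

params = {
    'image_width': 224,
    'image_height': 224,
    'batch_size': 32,
    'export_dir': '/tmp/rn50_export',
}
predict_ctl(params)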
Example #2
def train(model_func, params):
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    distort_color = params['distort_color']
    momentum = params['momentum']
    loss_scale = params['loss_scale']
    data_dir = params['data_dir']
    data_idx_dir = params['data_idx_dir']
    batch_size = params['batch_size']
    num_iter = params['num_iter']
    iter_unit = params['iter_unit']
    log_dir = params['log_dir']
    export_dir = params['export_dir']
    tensorboard_dir = params['tensorboard_dir']
    display_every = params['display_every']
    precision = params['precision']
    dali_mode = params['dali_mode']
    use_xla = params['use_xla']

    if data_dir is not None:
        file_format = os.path.join(data_dir, '%s-*')
        train_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_files = sorted(tf.io.gfile.glob(file_format % 'validation'))
        num_train_samples = common.get_num_records(train_files)
        num_valid_samples = common.get_num_records(valid_files)
    else:
        # Fallback sample counts used when training on synthetic data.
        num_train_samples = 1281982
        num_valid_samples = 5000

    train_idx_files = None
    valid_idx_files = None
    if data_idx_dir is not None:
        file_format = os.path.join(data_idx_dir, '%s-*')
        train_idx_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_idx_files = sorted(tf.io.gfile.glob(file_format % 'validation'))

    if iter_unit.lower() == 'epoch':
        num_epochs = num_iter
        nstep_per_epoch = num_train_samples // (batch_size * hvd.size())
        nstep_per_valid = num_valid_samples // (batch_size * hvd.size())
    else:
        assert iter_unit.lower() == 'batch'
        num_epochs = 1
        nstep_per_epoch = min(num_iter,
                              num_train_samples // (batch_size * hvd.size()))
        nstep_per_valid = min(10,
                              num_valid_samples // (batch_size * hvd.size()))

    initial_epoch = 0
    if log_dir:
        # We save checkpoints only when using real data.
        assert data_dir, "--data_dir cannot be empty when using --log_dir"
        assert os.path.exists(log_dir)
        ckpt_format = log_dir + "/model-{epoch:02d}-{val_top1:.2f}.hdf5"
        # Looks for the most recent checkpoint and sets the initial epoch from it.
        for filename in os.listdir(log_dir):
            if filename.startswith('model-'):
                initial_epoch = max(int(re.findall(r'\d+', filename)[0]),
                                    initial_epoch)

    if tensorboard_dir:
        assert os.path.exists(tensorboard_dir)

    if export_dir:
        assert os.path.exists(export_dir)
        save_format = export_dir + "/saved_model_rn50.h5"

    if use_xla:
        tf.config.optimizer.set_jit(True)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    # Pre-TF 2.4 Keras mixed-precision API; train_ctl below branches on the
    # TF version to pick the non-experimental replacement.
    if precision == 'fp16':
        policy = keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale)
        keras.mixed_precision.experimental.set_policy(policy)

    lr_schedule = common.create_piecewise_constant_decay_with_warmup(
        batch_size=batch_size * hvd.size(),
        epoch_size=num_train_samples,
        warmup_epochs=common.LR_SCHEDULE[0][1],
        boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
        multipliers=list(p[0] for p in common.LR_SCHEDULE),
        compute_lr_on_cpu=True)
    opt = keras.optimizers.SGD(learning_rate=lr_schedule, momentum=momentum)
    # Horovod: add Horovod DistributedOptimizer. We use a modified version to
    # support the custom learning rate schedule.
    opt = hvd_patch.DistributedOptimizer(opt)

    backend.set_image_data_format(image_format)
    dtype = 'float16' if precision == 'fp16' else 'float32'
    backend.set_floatx(dtype)
    model = model_func(num_classes=image_processing.NUM_CLASSES)
    loss_func = 'sparse_categorical_crossentropy'

    top5 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top5')
    top1 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1, name='top1')

    # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients. However, this option
    # disables the overlap of data loading and compute, which hurts performance
    # if the model is not under a distribution strategy scope.
    model.compile(optimizer=opt,
                  loss=loss_func,
                  metrics=[top1, top5],
                  experimental_run_tf_function=False)

    training_hooks = []
    training_hooks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    training_hooks.append(_ProfileKerasFitCallback(batch_size, display_every))

    if log_dir and hvd.rank() == 0:
        ckpt_callback = keras.callbacks.ModelCheckpoint(
            ckpt_format,
            monitor='val_top1',
            verbose=1,
            save_best_only=False,
            save_weights_only=False,
            save_freq='epoch')
        training_hooks.append(ckpt_callback)

    if tensorboard_dir and hvd.rank() == 0:
        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            log_dir=tensorboard_dir)
        training_hooks.append(tensorboard_callback)

    if data_dir is not None:
        num_preproc_threads = params['dali_threads'] if dali_mode else 10
        train_input = image_processing.image_set(
            train_files,
            batch_size,
            image_height,
            image_width,
            training=True,
            distort_color=distort_color,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=train_idx_files)

        valid_input = image_processing.image_set(
            valid_files,
            batch_size,
            image_height,
            image_width,
            training=False,
            distort_color=False,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=valid_idx_files)
        if dali_mode:
            train_input = train_input.get_device_dataset()
            valid_input = valid_input.get_device_dataset()
        valid_params = {
            'validation_data': valid_input,
            'validation_steps': nstep_per_valid,
            'validation_freq': 1
        }
    else:
        train_input = image_processing.fake_image_set(batch_size, image_height,
                                                      image_width)
        valid_params = {}

    try:
        verbose = 2 if hvd.rank() == 0 else 0
        model.fit(train_input,
                  epochs=num_epochs,
                  callbacks=training_hooks,
                  steps_per_epoch=nstep_per_epoch,
                  verbose=verbose,
                  initial_epoch=initial_epoch,
                  **valid_params)
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if export_dir and hvd.rank() == 0:
        model.save(save_format)
        print(f"The model is saved to {save_format}")
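`train` expects Horovod to be initialized before it is called, typically under a launcher such as `horovodrun -np 8 python train.py`. A hypothetical driver is sketched below; the `resnet50` factory and all parameter values are illustrative stand-ins, not taken from the source.

hvd.init()  # must precede train(), which queries hvd.size() / hvd.local_rank()

def resnet50(num_classes):
    # Stand-in model factory; the repo supplies its own model_func.
    return tf.keras.applications.ResNet50(weights=None, classes=num_classes)

params = {
    'image_width': 224, 'image_height': 224, 'image_format': 'channels_last',
    'distort_color': False, 'momentum': 0.9, 'loss_scale': 128,
    'data_dir': None, 'data_idx_dir': None, 'batch_size': 64,
    'num_iter': 3, 'iter_unit': 'epoch', 'log_dir': None,
    'export_dir': None, 'tensorboard_dir': None, 'display_every': 10,
    'precision': 'fp32', 'dali_mode': False, 'use_xla': False,
    'dali_threads': 4,
}
train(resnet50, params)  # data_dir=None, so this runs on synthetic images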
Example #3
def train_ctl(model_func, params):
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    distort_color = params['distort_color']
    momentum = params['momentum']
    loss_scale = params['loss_scale']
    data_dir = params['data_dir']
    data_idx_dir = params['data_idx_dir']
    batch_size = params['batch_size']
    num_iter = params['num_iter']
    iter_unit = params['iter_unit']
    log_dir = params['log_dir']
    export_dir = params['export_dir']
    tensorboard_dir = params['tensorboard_dir']
    display_every = params['display_every']
    precision = params['precision']
    dali_mode = params['dali_mode']
    use_xla = params['use_xla']

    if data_dir is not None:
        file_format = os.path.join(data_dir, '%s-*')
        train_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_files = sorted(tf.io.gfile.glob(file_format % 'validation'))
        num_train_samples = common.get_num_records(train_files)
        num_valid_samples = common.get_num_records(valid_files)
    else:
        # Fallback sample counts used when training on synthetic data.
        num_train_samples = 1281982
        num_valid_samples = 5000

    train_idx_files = None
    valid_idx_files = None
    if data_idx_dir is not None:
        file_format = os.path.join(data_idx_dir, '%s-*')
        train_idx_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_idx_files = sorted(tf.io.gfile.glob(file_format % 'validation'))

    if iter_unit.lower() == 'epoch':
        num_epochs = num_iter
        nstep_per_epoch = num_train_samples // (batch_size * hvd.size())
        nstep_per_valid = num_valid_samples // (batch_size * hvd.size())
    else:
        assert iter_unit.lower() == 'batch'
        num_epochs = 1
        nstep_per_epoch = min(num_iter,
                              num_train_samples // (batch_size * hvd.size()))
        nstep_per_valid = min(10,
                              num_valid_samples // (batch_size * hvd.size()))

    if export_dir:
        assert os.path.exists(export_dir)
        save_format = export_dir + "/saved_model_rn50.h5"

    if use_xla:
        tf.config.optimizer.set_jit(True)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    if tensorboard_dir and hvd.rank() == 0:
        assert os.path.exists(tensorboard_dir)
        summary_writer = tf.summary.create_file_writer(tensorboard_dir)
    else:
        summary_writer = None

    if precision == 'fp16':
        # Compare version components numerically; lexicographic string
        # comparison mis-orders versions like "2.10.0" vs "2.4.0".
        if tuple(int(v) for v in tf.__version__.split('.')[:2]) >= (2, 4):
            policy = keras.mixed_precision.Policy('mixed_float16')
            keras.mixed_precision.set_global_policy(policy)
        else:
            policy = keras.mixed_precision.experimental.Policy(
                'mixed_float16', loss_scale)
            keras.mixed_precision.experimental.set_policy(policy)

    lr_schedule = common.create_piecewise_constant_decay_with_warmup(
        batch_size=batch_size * hvd.size(),
        epoch_size=num_train_samples,
        warmup_epochs=common.LR_SCHEDULE[0][1],
        boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
        multipliers=list(p[0] for p in common.LR_SCHEDULE),
        compute_lr_on_cpu=True)
    opt = keras.optimizers.SGD(learning_rate=lr_schedule, momentum=momentum)

    backend.set_image_data_format(image_format)
    dtype = 'float16' if precision == 'fp16' else 'float32'
    backend.set_floatx(dtype)
    model = model_func(num_classes=image_processing.NUM_CLASSES,
                       batch_size=batch_size)

    loss_func = keras.losses.SparseCategoricalCrossentropy()

    train_top1 = tf.keras.metrics.SparseTopKCategoricalAccuracy(
        k=1, name='train_top1')
    train_top5 = tf.keras.metrics.SparseTopKCategoricalAccuracy(
        k=5, name='train_top5')

    val_loss = tf.keras.metrics.Mean(name='val_loss', dtype=tf.float32)

    val_top1 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1,
                                                              name='val_top1')
    val_top5 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5,
                                                              name='val_top5')

    if log_dir:
        # We save checkpoints only when using real data.
        assert data_dir, "--data_dir cannot be empty when using --log_dir"
        assert os.path.exists(log_dir)
        ckpt = tf.train.Checkpoint(epoch=tf.Variable(0),
                                   optimizer=opt,
                                   net=model)
        manager = tf.train.CheckpointManager(ckpt,
                                             log_dir,
                                             max_to_keep=3,
                                             checkpoint_name="model-ckpt")

    @tf.function
    def train_step(inputs, first_batch):
        images, labels = inputs

        with tf.GradientTape() as tape:
            predictions = model(images, training=True)
            loss = loss_func(labels, predictions)
            loss += tf.reduce_sum(model.losses)
            # Keep an unscaled copy of the loss for reporting.
            loss_copy = loss
            # Scale the loss so small fp16 gradients do not underflow.
            if precision == 'fp16':
                loss = loss * tf.cast(loss_scale, loss.dtype)

        # Horovod: wrap the tape so tape.gradient() averages gradients across
        # all workers with an allreduce.
        tape = hvd.DistributedGradientTape(tape)

        old_grads = tape.gradient(loss, model.trainable_variables)

        # Unscale the grads
        if precision == 'fp16':
            loss_scale_reciprocal = 1. / loss_scale
            grads = [
                g * tf.cast(loss_scale_reciprocal, g.dtype)
                if g is not None else None for g in old_grads
            ]
        else:
            grads = old_grads

        opt.apply_gradients(zip(grads, model.trainable_variables))

        train_top1.update_state(labels, predictions)
        train_top5.update_state(labels, predictions)

        # Horovod: on the first batch, broadcast variables from rank 0 so all
        # workers start from identical model and optimizer state.
        if hvd.size() > 1 and first_batch:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)

        return loss_copy

    @tf.function
    def valid_step(inputs):
        images, labels = inputs
        predictions = model(images, training=False)
        loss = loss_func(labels, predictions)

        val_loss.update_state(loss)
        val_top1.update_state(labels, predictions)
        val_top5.update_state(labels, predictions)

    if data_dir is not None:
        num_preproc_threads = 4 if dali_mode else 10
        train_input = image_processing.image_set(
            train_files,
            batch_size,
            image_height,
            image_width,
            training=True,
            distort_color=distort_color,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=train_idx_files)

        valid_input = image_processing.image_set(
            valid_files,
            batch_size,
            image_height,
            image_width,
            training=False,
            distort_color=False,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=valid_idx_files)
    else:
        if dali_mode:
            raise ValueError("Must provide --data_dir if DALI is enabled")
        else:
            train_input = image_processing.fake_image_set(
                batch_size, image_height, image_width)

    global_steps = 0
    log_steps = display_every
    try:

        initial_epoch = 0
        if log_dir:
            ckpt.restore(manager.latest_checkpoint)
            if manager.latest_checkpoint:
                if hvd.rank() == 0:
                    print("Restored from {}".format(manager.latest_checkpoint))
                initial_epoch = max(
                    int(re.findall(r'\d+', manager.latest_checkpoint)[0]),
                    initial_epoch)
            else:
                if hvd.rank() == 0:
                    print("Initializing from scratch.")

        # Training Loop
        for epoch in range(num_epochs):
            if epoch < initial_epoch:
                continue
            # on_epoch_begin
            epoch_start = time.time()

            total_loss = 0.0
            num_batches = 0
            train_top1.reset_states()
            train_top5.reset_states()

            if not dali_mode:
                train_iter = iter(train_input)
            for _ in range(nstep_per_epoch):
                # on_batch_begin
                global_steps += 1
                if global_steps == 1:
                    start_time = time.time()

                # Trace the first step so its graph and profile can be
                # exported to TensorBoard right after the step completes.
                if global_steps == 1 and hvd.rank() == 0 and summary_writer:
                    tf.summary.trace_on(graph=True, profiler=True)

                if not dali_mode:
                    x = next(train_iter)
                else:
                    x = train_input.get_device_minibatches()
                total_loss += train_step(x, global_steps == 1)

                if global_steps == 1 and hvd.rank() == 0 and summary_writer:
                    with summary_writer.as_default():
                        tf.summary.trace_export(
                            name="train_step",
                            step=0,
                            profiler_outdir=tensorboard_dir)

                # on_batch_end
                if global_steps % log_steps == 0:
                    timestamp = time.time()
                    elapsed_time = timestamp - start_time
                    examples_per_second = \
                        (batch_size * hvd.size() * log_steps) / elapsed_time
                    if hvd.rank() == 0:
                        print("global_step: %d images_per_sec: %.1f" %
                              (global_steps, examples_per_second))
                    start_time = timestamp
                num_batches += 1

            train_loss = total_loss / num_batches

            # on_epoch_end
            epoch_run_time = time.time() - epoch_start
            if hvd.rank() == 0:
                print("epoch: %d time_taken: %.1f" % (epoch, epoch_run_time))

            if data_dir is not None:
                val_loss.reset_states()
                val_top1.reset_states()
                val_top5.reset_states()

                if not dali_mode:
                    test_iter = iter(valid_input)
                for _ in range(nstep_per_valid):
                    if not dali_mode:
                        x = next(test_iter)
                    else:
                        x = valid_input.get_device_minibatches()
                    valid_step(x)

            if log_dir:
                ckpt.epoch.assign_add(1)
                if hvd.rank() == 0:
                    save_path = manager.save()
                    print("Saved checkpoint for epoch {}: {}".format(
                        int(ckpt.epoch), save_path))

            if hvd.rank() == 0:
                output_str = (
                    "loss: {} - top1: {} - top5: {} - val_loss: {} - "
                    "val_top1: {} - val_top5: {}")
                print(
                    output_str.format(train_loss, train_top1.result(),
                                      train_top5.result(), val_loss.result(),
                                      val_top1.result(), val_top5.result()))

            if hvd.rank() == 0 and summary_writer:
                with summary_writer.as_default():
                    tf.summary.scalar('train_loss', train_loss, global_steps)
                    tf.summary.scalar('train_top1', train_top1.result(),
                                      global_steps)
                    tf.summary.scalar('train_top5', train_top5.result(),
                                      global_steps)
                    tf.summary.scalar('val_loss', val_loss.result(),
                                      global_steps)
                    tf.summary.scalar('val_top1', val_top1.result(),
                                      global_steps)
                    tf.summary.scalar('val_top5', val_top5.result(),
                                      global_steps)

        if hvd.rank() == 0 and summary_writer:
            summary_writer.close()

    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if export_dir and hvd.rank() == 0:
        model.save(save_format)
        print(f"The model is saved to {save_format}")
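`train_ctl` implements static loss scaling by hand: the loss is multiplied by `loss_scale` before `tape.gradient` so that small fp16 gradients do not flush to zero, and the gradients are multiplied by the reciprocal before the optimizer step. A self-contained sketch of that pattern with a toy variable (names are illustrative):

import tensorflow as tf

loss_scale = 128.0
w = tf.Variable(2.0)

with tf.GradientTape() as tape:
    loss = (w - 1.0) ** 2            # toy loss; d(loss)/dw = 2 * (w - 1) = 2.0
    scaled_loss = loss * loss_scale  # scale up before differentiating

scaled_grad = tape.gradient(scaled_loss, w)  # 2.0 * 128
grad = scaled_grad / loss_scale              # unscale: back to 2.0
print(float(grad))                           # 2.0, matching the unscaled gradient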