Example #1
def create_validation_estimator(infer_func, params):
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    batch_size = params['batch_size']
    data_dir = params['data_dir']
    data_idx_dir = params['data_idx_dir']
    log_dir = params['log_dir']
    precision = params['precision']
    momentum = params['momentum']
    learning_rate_init = params['learning_rate_init']
    learning_rate_power = params['learning_rate_power']
    weight_decay = params['weight_decay']
    loss_scale = params['loss_scale']
    larc_eta = params['larc_eta']
    larc_mode = params['larc_mode']
    num_iter = params['num_iter']
    checkpoint_secs = params['checkpoint_secs']
    display_every = params['display_every']
    iter_unit = params['iter_unit']
    dali_cpu = params['dali_cpu']

    # Determinism is not fully supported by all TF ops.
    deterministic = False
    if deterministic:
        tf.set_random_seed(2 * (1 + hvd.rank()))
        random.seed(3 * (1 + hvd.rank()))
        np.random.seed(2)

    log_dir = None if log_dir == "" else log_dir
    data_dir = None if data_dir == "" else data_dir
    data_idx_dir = None if data_idx_dir == "" else data_idx_dir
    if data_dir is None:
        raise ValueError("data_dir must be specified")
    if log_dir is None:
        raise ValueError("log_dir must be specified")

    filename_pattern = os.path.join(data_dir, '%s-*')
    eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    num_eval_samples = _get_num_records(eval_filenames)

    if data_idx_dir is None:
        raise ValueError("data_idx_dir must be specified")
    filename_pattern = os.path.join(data_idx_dir, '%s-*')
    eval_idx_filenames = sorted(
        tf.gfile.Glob(filename_pattern % 'validation'))

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True  # Force pinned memory
    config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)

    classifier_eval = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model': infer_func,
            'format': image_format,
            'dtype': tf.float16 if precision == 'fp16' else tf.float32,
            'momentum': momentum,
            'learning_rate_init': learning_rate_init,
            'learning_rate_power': learning_rate_power,
            'decay_steps': None,
            'weight_decay': weight_decay,
            'loss_scale': loss_scale,
            'larc_eta': larc_eta,
            'larc_mode': larc_mode,
            'deterministic': deterministic,
            'n_classes': 1000,
            'dali_cpu': dali_cpu,
        },
        config=tf.estimator.RunConfig(
            tf_random_seed=2 * (1 + hvd.rank()) if deterministic else None,
            session_config=config,
            save_checkpoints_secs=None,
            save_checkpoints_steps=None,
            keep_checkpoint_every_n_hours=3))

    if not deterministic:
        num_preproc_threads = 4
    else:
        num_preproc_threads = 1
    input_fn = lambda: nvutils.image_set(eval_filenames,
                                         batch_size,
                                         image_height,
                                         image_width,
                                         training=False,
                                         distort_color=False,
                                         deterministic=deterministic,
                                         dali_cpu=dali_cpu,
                                         idx_filenames=eval_idx_filenames,
                                         num_threads=num_preproc_threads)
    return classifier_eval, input_fn, (num_eval_samples // batch_size)
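
For orientation, a minimal usage sketch follows. It is hypothetical throughout: the params values are placeholders, my_infer_func stands in for any model-constructing function, and hvd.init() is assumed to have been called already.

# Hypothetical usage sketch -- all values below are placeholders, not from
# the source. Assumes hvd.init() has already run.
params = {
    'image_width': 224, 'image_height': 224, 'image_format': 'NCHW',
    'batch_size': 256, 'data_dir': '/data/tfrecords',
    'data_idx_dir': '/data/dali_idx', 'log_dir': '/results/checkpoints',
    'precision': 'fp16', 'momentum': 0.9, 'learning_rate_init': 0.1,
    'learning_rate_power': 2.0, 'weight_decay': 1e-4, 'loss_scale': 128.0,
    'larc_eta': 0.003, 'larc_mode': 'clip', 'num_iter': 90,
    'checkpoint_secs': None, 'display_every': 100, 'iter_unit': 'epoch',
    'dali_cpu': False,
}
classifier_eval, eval_input_fn, eval_steps = create_validation_estimator(
    my_infer_func, params)
eval_result = classifier_eval.evaluate(input_fn=eval_input_fn,
                                       steps=eval_steps)
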
Example #2
    # NOTE: fragment of a larger inference routine; `graph`, `sess`, and the
    # *_tensor_name variables are defined in code omitted on the source page.
    x = graph.get_tensor_by_name(x_tensor_name)
    class_ids = graph.get_tensor_by_name(c_tensor_name)
    probabilities = graph.get_tensor_by_name(p_tensor_name)

    class_ids_, probs_ = None, None
    total = 0
    if data_dir is not None:
        filename_pattern = os.path.join(data_dir, '%s-*')
        eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))

        num_preproc_threads = 10
        dataset = nvutils.image_set(eval_filenames,
                                    batch_size,
                                    224,
                                    224,
                                    training=False,
                                    distort_color=False,
                                    deterministic=False,
                                    num_threads=num_preproc_threads)
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()
        try:
            while True:
                value_, _ = sess.run(next_element)
                tclass_ids_, tprobs_ = sess.run([class_ids, probabilities],
                                                {x: value_})
                total += tclass_ids_.shape[0]
                if class_ids_ is None:
                    class_ids_ = tclass_ids_
                    probs_ = tprobs_
                else:
                    # Likely continuation (the source snippet truncates
                    # here): accumulate predictions across batches.
                    class_ids_ = np.concatenate([class_ids_, tclass_ids_])
                    probs_ = np.concatenate([probs_, tprobs_])
        except tf.errors.OutOfRangeError:
            pass  # one-shot iterator exhausted
Example #3
def train(infer_func, params):
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    batch_size = params['batch_size']
    distort_color = params['distort_color']
    data_dir = params['data_dir']
    data_idx_dir = params['data_idx_dir']
    log_dir = params['log_dir']
    precision = params['precision']
    momentum = params['momentum']
    learning_rate_init = params['learning_rate_init']
    learning_rate_power = params['learning_rate_power']
    weight_decay = params['weight_decay']
    loss_scale = params['loss_scale']
    larc_eta = params['larc_eta']
    larc_mode = params['larc_mode']
    num_iter = params['num_iter']
    checkpoint_secs = params['checkpoint_secs']
    display_every = params['display_every']
    iter_unit = params['iter_unit']
    dali_cpu = params['dali_cpu']
    epoch_evaluation = params['epoch_evaluation']
    use_xla = params['use_xla']

    # Determinism is not fully supported by all TF ops.
    # Disabling until remaining wrinkles can be ironed out.
    deterministic = False
    if deterministic:
        tf.set_random_seed(2 * (1 + hvd.rank()))
        random.seed(3 * (1 + hvd.rank()))
        np.random.seed(2)

    log_dir = None if log_dir == "" else log_dir
    data_dir = None if data_dir == "" else data_dir
    data_idx_dir = None if data_idx_dir == "" else data_idx_dir

    global_batch_size = batch_size * hvd.size()
    if data_dir is not None:
        filename_pattern = os.path.join(data_dir, '%s-*')
        train_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train'))
        num_training_samples = _get_num_records(train_filenames)
    else:
        num_training_samples = global_batch_size
    train_idx_filenames = None
    if data_idx_dir is not None:
        filename_pattern = os.path.join(data_idx_dir, '%s-*')
        train_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train'))

    if iter_unit.lower() == 'epoch':
        nstep = num_training_samples * num_iter // global_batch_size
        num_epochs = num_iter
        decay_steps = nstep
    else:
        nstep = num_iter
        num_epochs = max(nstep * global_batch_size // num_training_samples, 1)
        decay_steps = 90 * num_training_samples // global_batch_size

    nstep_per_epoch = num_training_samples // global_batch_size
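
    # Illustrative arithmetic (numbers are not from the source): with
    # 1,281,167 ImageNet training samples, batch_size=256 and hvd.size()=8,
    # global_batch_size = 2048, nstep_per_epoch = 625, and 90 epochs give
    # nstep = 1281167 * 90 // 2048 = 56301 training steps.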

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    config = tf.ConfigProto(gpu_options=gpu_options)
    if use_xla:
        config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True  # Force pinned memory
    config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)

    classifier = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model': infer_func,
            'format': image_format,
            'dtype': tf.float16 if precision == 'fp16' else tf.float32,
            'momentum': momentum,
            'learning_rate_init': learning_rate_init,
            'learning_rate_power': learning_rate_power,
            'decay_steps': decay_steps,
            'weight_decay': weight_decay,
            'loss_scale': loss_scale,
            'larc_eta': larc_eta,
            'larc_mode': larc_mode,
            'deterministic': deterministic,
            'n_classes': 1000,
            'dali_cpu': dali_cpu,
        },
        config=tf.estimator.RunConfig(
            tf_random_seed=2 * (1 + hvd.rank()) if deterministic else None,
            session_config=config,
            save_checkpoints_secs=checkpoint_secs if hvd.rank() == 0 else None,
            save_checkpoints_steps=nstep if hvd.rank() == 0 else None,
            keep_checkpoint_every_n_hours=3))

    print("Training")
    if not deterministic:
        num_preproc_threads = 4
    else:
        num_preproc_threads = 1

    training_hooks = [
        hvd.BroadcastGlobalVariablesHook(0),
        _PrefillStagingAreasHook()
    ]
    if hvd.rank() == 0:
        training_hooks.append(
            _LogSessionRunHook(global_batch_size, num_training_samples,
                               display_every))

    input_func = lambda: nvutils.image_set(train_filenames,
                                           batch_size,
                                           image_height,
                                           image_width,
                                           training=True,
                                           distort_color=distort_color,
                                           deterministic=deterministic,
                                           num_threads=num_preproc_threads,
                                           dali_cpu=dali_cpu,
                                           idx_filenames=train_idx_filenames)

    if epoch_evaluation:
        classifier_eval, eval_input_func, eval_steps = create_validation_estimator(
            infer_func, params)

    try:
        if epoch_evaluation:
            for i in range(num_epochs):
                classifier.train(input_fn=input_func,
                                 steps=nstep // num_epochs,
                                 hooks=training_hooks)
                if hvd.rank() == 0:
                    eval_result = classifier_eval.evaluate(
                        input_fn=eval_input_func, steps=eval_steps)
                    print('epoch {} top1: {}%'.format(
                        i, eval_result['top1_accuracy'] * 100))
                    print('epoch {} top5: {}%'.format(
                        i, eval_result['top5_accuracy'] * 100))
        else:
            classifier.train(input_fn=input_func,
                             max_steps=nstep,
                             hooks=training_hooks)

    except KeyboardInterrupt:
        print("Keyboard interrupt")
Example #4
def validate(infer_func, params):
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    batch_size = params['batch_size']
    data_dir = params['data_dir']
    log_dir = params['log_dir']
    precision = params['precision']
    momentum = params['momentum']
    learning_rate_init = params['learning_rate_init']
    learning_rate_power = params['learning_rate_power']
    weight_decay = params['weight_decay']
    loss_scale = params['loss_scale']
    larc_eta = params['larc_eta']
    larc_mode = params['larc_mode']
    num_iter = params['num_iter']
    checkpoint_secs = params['checkpoint_secs']
    display_every = params['display_every']
    iter_unit = params['iter_unit']
    use_dali = params['use_dali']

    # Determinism is not fully supported by all TF ops.
    # Disabling until remaining wrinkles can be ironed out.
    deterministic = False
    if deterministic:
        tf.set_random_seed(2 * (1 + hvd.rank()))
        random.seed(3 * (1 + hvd.rank()))
        np.random.seed(2)

    log_dir = None if log_dir == "" else log_dir
    data_dir = None if data_dir == "" else data_dir
    if data_dir is None:
        raise ValueError("data_dir must be specified")
    if log_dir is None:
        raise ValueError("log_dir must be specified")

    filename_pattern = os.path.join(data_dir, '%s-*')
    eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True  # Force pinned memory
    config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)

    classifier = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model': infer_func,
            'format': image_format,
            'dtype': tf.float16 if precision == 'fp16' else tf.float32,
            'momentum': momentum,
            'learning_rate_init': learning_rate_init,
            'learning_rate_power': learning_rate_power,
            'decay_steps': None,
            'weight_decay': weight_decay,
            'loss_scale': loss_scale,
            'larc_eta': larc_eta,
            'larc_mode': larc_mode,
            'deterministic': deterministic,
            'n_classes': 1000,
            'use_dali': False,
        },
        config=tf.estimator.RunConfig(
            tf_random_seed=2 * (1 + hvd.rank()) if deterministic else None,
            session_config=config,
            save_checkpoints_secs=None,
            save_checkpoints_steps=None,
            keep_checkpoint_every_n_hours=3))

    if not deterministic and not use_dali:
        num_preproc_threads = 10
    elif not deterministic and use_dali:
        num_preproc_threads = 2
    else:  # deterministic
        num_preproc_threads = 1

    if hvd.rank() == 0:
        print("Evaluating")
        try:
            eval_result = classifier.evaluate(
                input_fn=lambda: nvutils.image_set(
                    eval_filenames, batch_size, image_height, image_width,
                    training=False, distort_color=False,
                    deterministic=deterministic,
                    num_threads=num_preproc_threads))
            print('Top-1 accuracy:', eval_result['top1_accuracy']*100, '%')
            print('Top-5 accuracy:', eval_result['top5_accuracy']*100, '%')
        except KeyboardInterrupt:
            print("Keyboard interrupt")