Example #1
def train_and_maybe_evaluate(model_est, imagenet_train, imagenet_eval, params):
    """Trains the model and maybe run evaluation when the mode flag is set to 'train_and_eval'
    Args:
        model_est: `TPUEstimator` instance for the discovered model
        imagenet_train: Input pipeline for the training set
        imagenet_eval: Input pipeline for the validation set
        params: Dictionary of parameters; must include 'steps_per_epoch'
    """
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / params['steps_per_epoch'], current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
        hooks = []
        if FLAGS.use_async_checkpointing:
            hooks.append(
                async_checkpoint.AsyncCheckpointSaverHook(
                    checkpoint_dir=FLAGS.model_dir,
                    save_steps=max(100, FLAGS.iterations_per_loop)))
        model_est.train(input_fn=imagenet_train.input_fn,
                        max_steps=FLAGS.train_steps,
                        hooks=hooks)

    else:  # FLAGS.mode == 'train_and_eval'
        while current_step < FLAGS.train_steps:
            # Train for up to steps_per_eval number of steps.
            # At the end of training, a checkpoint will be written to --model_dir.
            next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                  FLAGS.train_steps)
            model_est.train(input_fn=imagenet_train.input_fn,
                            max_steps=int(next_checkpoint))
            current_step = next_checkpoint

            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                next_checkpoint, int(time.time() - start_timestamp))

            # Evaluate the model on the most recent model in --model_dir.
            # Since evaluation happens in batches of --eval_batch_size, some images
            # may be excluded modulo the batch size. As long as the batch size is
            # consistent, the evaluated images are also consistent.
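            # E.g. (hypothetical numbers): with 50000 eval images and
            # eval_batch_size=1024, evaluation runs 50000 // 1024 = 48 steps,
            # so 50000 - 48 * 1024 = 848 images are never evaluated.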
            tf.logging.info('Starting to evaluate.')
            eval_results = model_est.evaluate(input_fn=imagenet_eval.input_fn,
                                              steps=FLAGS.num_eval_images //
                                              FLAGS.eval_batch_size)
            tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                            eval_results)

        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        FLAGS.train_steps, elapsed_time)
        if FLAGS.export_dir:
            export(model_est, FLAGS.export_dir)
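
The train_and_eval branch above interleaves training and evaluation in fixed-size chunks. A minimal, self-contained sketch of just that stepping logic (plain Python, no TPU dependencies; the step counts below are illustrative only):

def checkpoint_schedule(current_step, train_steps, steps_per_eval):
    """Yields the step at which each train/eval cycle should stop."""
    while current_step < train_steps:
        # Train for up to steps_per_eval steps, but never past train_steps.
        current_step = min(current_step + steps_per_eval, train_steps)
        yield current_step

# Example: resume at step 250, train to 1000, evaluating every 300 steps.
print(list(checkpoint_schedule(250, 1000, 300)))  # [550, 850, 1000]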
Example #2
def create_hooks(loss, params):
    hooks = []
    async_save_hook = async_checkpoint.AsyncCheckpointSaverHook(
        checkpoint_dir=params['model_dir'], save_steps=params['save_steps'])
    hooks.append(async_save_hook)
    # Alternative hooks, kept for reference:
    # save_hook = tf.train.CheckpointSaverHook(
    #     params['model_dir'], save_steps=params['save_steps'],
    #     saver=tf.train.Saver())
    # hooks.append(save_hook)
    # logging_hook = tf.train.LoggingTensorHook(
    #     tensors={'loss': loss},
    #     every_n_iter=params['save_steps'])
    # hooks.append(logging_hook)

    return hooks
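
A hedged usage sketch for create_hooks: the estimator, input function, and loss tensor are assumed to exist elsewhere (est, train_input_fn, and loss are placeholders here, and the path and step counts are illustrative only):

# Hypothetical wiring; est, train_input_fn, and loss are assumed to be defined.
params = {'model_dir': '/tmp/my_model', 'save_steps': 1000}
hooks = create_hooks(loss, params)
est.train(input_fn=train_input_fn, max_steps=10000, hooks=hooks)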
Example #3
def main(unused_argv):

    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        if FLAGS.model_name.startswith('efficientnet-edgetpu'):
            _, _, input_image_size, _ = efficientnet_edgetpu_builder.efficientnet_edgetpu_params(
                FLAGS.model_name)
        elif FLAGS.model_name.startswith('efficientnet-tpu'):
            _, _, input_image_size, _ = efficientnet_tpu_builder.efficientnet_tpu_params(
                FLAGS.model_name)
        elif FLAGS.model_name.startswith('efficientnet'):
            _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
                FLAGS.model_name)
        else:
            raise ValueError(
                'input_image_size must be set for models other than EfficientNet')

    # For the ImageNet dataset, include the background label if the number of
    # output classes is 1001.
    include_background_label = (FLAGS.num_label_classes == 1001)

    if FLAGS.tpu or FLAGS.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    if FLAGS.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long
    # Initializes model parameters.
    params = dict(steps_per_epoch=FLAGS.num_train_images /
                  FLAGS.train_batch_size,
                  use_bfloat16=FLAGS.use_bfloat16)
    est = tf.contrib.tpu.TPUEstimator(use_tpu=FLAGS.use_tpu,
                                      model_fn=model_fn,
                                      config=config,
                                      train_batch_size=FLAGS.train_batch_size,
                                      eval_batch_size=FLAGS.eval_batch_size,
                                      export_to_tpu=FLAGS.export_to_tpu,
                                      params=params)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    def build_imagenet_input(is_training):
        """Generate ImageNetInput for training and eval."""
        if FLAGS.bigtable_instance:
            tf.logging.info('Using Bigtable dataset, table %s',
                            FLAGS.bigtable_table)
            select_train, select_eval = _select_tables_from_flags()
            return imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=FLAGS.use_bfloat16,
                transpose_input=FLAGS.transpose_input,
                selection=select_train if is_training else select_eval,
                include_background_label=include_background_label,
                autoaugment_name=FLAGS.autoaugment_name)
        else:
            if FLAGS.data_dir == FAKE_DATA_DIR:
                tf.logging.info('Using fake dataset.')
            else:
                tf.logging.info('Using dataset: %s', FLAGS.data_dir)

            return imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=FLAGS.transpose_input,
                cache=FLAGS.use_cache and is_training,
                image_size=input_image_size,
                num_parallel_calls=FLAGS.num_parallel_calls,
                use_bfloat16=FLAGS.use_bfloat16,
                include_background_label=include_background_label,
                autoaugment_name=FLAGS.autoaugment_name)

    imagenet_train = build_imagenet_input(is_training=True)
    imagenet_eval = build_imagenet_input(is_training=False)

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = est.evaluate(input_fn=imagenet_eval.input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            est.train(input_fn=imagenet_train.input_fn,
                      max_steps=FLAGS.train_steps,
                      hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                est.train(input_fn=imagenet_train.input_fn,
                          max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = est.evaluate(input_fn=imagenet_eval.input_fn,
                                            steps=FLAGS.num_eval_images //
                                            FLAGS.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)
    if FLAGS.export_dir:
        export(est, FLAGS.export_dir, input_image_size)
Example #4
def main(unused_argv):
    params = resnet_params.from_file(FLAGS.param_file)
    params = resnet_params.override(params, FLAGS.param_overrides)
    resnet_params.log_hparams_to_model_dir(params, FLAGS.model_dir)
    tf.logging.info('Model params: {}'.format(params))

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu if (FLAGS.tpu or params['use_tpu']) else '',
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)

    if params['use_async_checkpointing']:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, params['iterations_per_loop'])
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=params['iterations_per_loop'],
            num_shards=params['num_cores'],
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    if FLAGS.inference_with_all_cores:
        resnet_classifier = tf.contrib.tpu.TPUEstimator(
            use_tpu=params['use_tpu'],
            model_fn=resnet_model_fn,
            config=config,
            params=params,
            train_batch_size=params['train_batch_size'],
            eval_batch_size=params['eval_batch_size'],
            export_to_tpu=FLAGS.export_to_tpu,
            experimental_exported_model_uses_all_cores=(
                FLAGS.inference_with_all_cores))
    else:
        resnet_classifier = tf.contrib.tpu.TPUEstimator(
            use_tpu=params['use_tpu'],
            model_fn=resnet_model_fn,
            config=config,
            params=params,
            train_batch_size=params['train_batch_size'],
            eval_batch_size=params['eval_batch_size'],
            export_to_tpu=FLAGS.export_to_tpu)
    assert (params['precision'] == 'bfloat16' or params['precision']
            == 'float32'), ('Invalid value for precision parameter; '
                            'must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', params['precision'])
    use_bfloat16 = params['precision'] == 'bfloat16'

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s',
                        FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=use_bfloat16,
                transpose_input=params['transpose_input'],
                selection=selection)
            for (is_training,
                 selection) in [(True, select_train), (False, select_eval)]
        ]
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=params['transpose_input'],
                cache=params['use_cache'] and is_training,
                image_size=params['image_size'],
                num_parallel_calls=params['num_parallel_calls'],
                use_bfloat16=use_bfloat16) for is_training in [True, False]
        ]

    steps_per_epoch = params['num_train_images'] // params['train_batch_size']
    eval_steps = params['num_eval_images'] // params['eval_batch_size']

    if FLAGS.mode == 'eval':

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params['train_steps']:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
        steps_per_epoch = params['num_train_images'] // params[
            'train_batch_size']
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', params['train_steps'],
            params['train_steps'] / steps_per_epoch, current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if params['use_async_checkpointing']:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, params['iterations_per_loop'])))
            if FLAGS.profile_every_n_steps > 0:
                hooks.append(
                    tpu_profiler_hook.TPUProfilerHook(
                        save_steps=FLAGS.profile_every_n_steps,
                        output_dir=FLAGS.model_dir,
                        tpu=FLAGS.tpu))
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=params['train_steps'],
                                    hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < params['train_steps']:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      params['train_steps'])
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=params['num_eval_images'] //
                    params['eval_batch_size'])
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                params['train_steps'], elapsed_time)

        if FLAGS.export_dir is not None:
            # The guide to serving an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            export_path = resnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
            if FLAGS.add_warmup_requests:
                inference_warmup.write_warmup_requests(
                    export_path,
                    FLAGS.model_name,
                    params['image_size'],
                    batch_sizes=FLAGS.inference_batch_sizes,
                    image_format='JPEG')
Example #5
def main(unused_argv):

    # Check flag conditions:
    if FLAGS.mode == 'train':
        tf.logging.info('Mode = train, TPU = %s, Num cores = %d' %
                        (FLAGS.tpu, FLAGS.train_num_cores))

    elif FLAGS.mode == 'evaluate':
        tf.logging.info('Mode = evaluate, TPU = %s, Num cores = %d' %
                        (FLAGS.eval_tpu, FLAGS.eval_num_cores))

    elif FLAGS.mode == 'train_and_eval':
        if FLAGS.train_num_cores > 8:
            tf.logging.info('Mode = train_and_eval, Train TPU = %s, '
                            'Train num cores: %d, Eval TPU = %s, '
                            'Eval num cores: %d' %
                            (FLAGS.tpu, FLAGS.train_num_cores, FLAGS.eval_tpu,
                             FLAGS.eval_num_cores))
        else:
            tf.logging.info('Mode = train_and_eval, TPU = %s, '
                            'Num cores: %d' %
                            (FLAGS.tpu, FLAGS.train_num_cores))

    # Set up general purpose tpu_cluster_resolver based on FLAGS.mode:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu
        if FLAGS.mode in ['train', 'train_and_eval'] else FLAGS.eval_tpu,
        zone=FLAGS.tpu_zone
        if FLAGS.mode in ['train', 'train_and_eval'] else FLAGS.eval_tpu_zone,
        project=FLAGS.gcp_project)

    # For mode == 'train_and_eval' we can have 2 options:
    # 1. Use same TPU for training and evaluating (only v2-8)
    # 2. Use TPU with more cores for training (v2-32/128/256/512),
    #       and a separate v2-8 for evaluating.
    if FLAGS.mode == 'train_and_eval' and FLAGS.train_num_cores > 8:
        eval_tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.eval_tpu,
            zone=FLAGS.eval_tpu_zone,
            project=FLAGS.gcp_project)

    if FLAGS.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)

    ##### RunConfig parameters:
    '''Arguments:
        iterations_per_loop: Number of training steps to run on the TPU system
            before returning to the CPU host for each Session.run. The global
            step is increased iterations_per_loop times in one Session.run. It
            is recommended to set it to the number of global steps until the
            next checkpoint.
        per_host_input_for_training: If True, input_fn is invoked once on each host.
            If PER_HOST_V1: batch size per shard = train_batch_size // #hosts (#cpus)
            If PER_HOST_V2: batch size per shard = train_batch_size // #cores
        keep_checkpoint_max: If None, keep all checkpoint files; otherwise
            specify 'n' to keep the latest 'n' files.

    Each TPU device has 8 cores and is connected to a host (CPU). Larger slices
    have multiple hosts. For instance, a v2-256 slice communicates with
    256 // 8 = 32 hosts, so per_host_input_for_training will invoke/create the
    Dataset pipeline once per host (32 times in total), where each host serves
    8 cores. Under PER_HOST_V2, each core then consumes
    train_batch_size // 256 examples per step. This functionality is missing
    right now in tf.Keras, which makes it difficult to scale up models to
    bigger TPU slices.
    '''
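    # Worked illustration of the PER_HOST_V2 math above (hypothetical batch
    # size): on a v2-256 slice (256 cores, 32 hosts) with a global
    # train_batch_size of 4096, input_fn runs once per host and each core
    # consumes 4096 // 256 = 16 examples per step.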
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        keep_checkpoint_max=None,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.train_num_cores
                if FLAGS.mode in ['train', 'train_and_eval']
                else FLAGS.eval_num_cores,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.\
                PER_HOST_V2))

    if FLAGS.mode == 'train_and_eval' and FLAGS.train_num_cores > 8:
        config_eval = tf.contrib.tpu.RunConfig(
            cluster=eval_tpu_cluster_resolver,
            model_dir=FLAGS.model_dir,
            save_checkpoints_steps=save_checkpoints_steps,
            log_step_count_steps=FLAGS.log_step_count_steps,
            keep_checkpoint_max=None,
            session_config=tf.ConfigProto(
                graph_options=tf.GraphOptions(
                    rewrite_options=rewriter_config_pb2.RewriterConfig(
                        disable_meta_optimizer=True))),
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.eval_num_cores,
                per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.\
                    PER_HOST_V2))

    ##### Estimator story:
    '''Estimator handles running details, such as replicating inputs and models for
        core, and returning to host periodically to run hooks.
        -> TPUEstimator transforms a global batch size in params to a per-shard/core
            batch size when calling input_fn and model_fn. Users should specify the
            GLOBAL batch size in the constructor, and then read the batch size for
            EACH shard/core in input_fn and model_fn from params['batch_size'].
        -> For training, model_fn gets per_core_batch_size; input_fn may get
            per-core or per-host batch size depending on per_host_input_for_training in
            TPUConfig. For this model, we use PER_HOST_V2.
        -> For evaluation and prediction, model_fn gets per-core batch size and input_fn
            per-host batch size.

        Current limitations:
            -> TPU prediction only works on a single host (one TPU worker)
            -> input_fn must return a Dataset instance rather than features. In fact,
                train(), and evaluate() also support Dataset as return value.
    '''
    '''Arguments:
        model_fn: A model function that returns a TPUEstimatorSpec.
        use_tpu: Set to False for testing: all training, evaluation, and
            prediction will be executed on CPU, and input_fn and model_fn will
            receive train_batch_size or eval_batch_size unmodified as
            params['batch_size']. Set to True, input_fn and model_fn receive
            the per-core batch size instead; :config specifies the details
            about the TPU workers to the Estimator.
        config: A tpu_config.RunConfig configuration object. Cannot be None.
        params: An optional dict of hyper parameters that will be passed into input_fn and
            model_fn. Keys are names of parameters, values are basic python types. There are
            reserved keys for TPUEstimator, including 'batch_size'. Extra parameters can be 
            added to this dictionary and can be used in input_fn and model_fn scripts.
        train_batch_size: An int representing the global batch size. TPUEstimator transforms
            this global batch size to a per-shard/core batch size, as params['batch_size'],
            when calling input_fn and model_fn. Cannot be None if :use_tpu is True. Must be
            DIVISIBLE by total number of replicas. The per-shard batch size calculation is
            automatically done using TPUConfig details.
        export_to_tpu: If True, export_savedmodel() exports a metagraph for serving on TPU
            besides the one on CPU.
    '''
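    # Sketch of the batch-size contract described above (build_dataset is an
    # assumed helper, not defined in this file): the constructor takes the
    # GLOBAL batch size, while input_fn reads the per-core size that
    # TPUEstimator injects into params.
    #
    # def input_fn(params):
    #     per_core_batch_size = params['batch_size']  # set by TPUEstimator
    #     dataset = build_dataset()
    #     return dataset.batch(per_core_batch_size, drop_remainder=True)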

    if FLAGS.init_checkpoint != 'None':
        warm_start_vars = [x.strip() for x in FLAGS.warm_start_vars.split(',')]
        ws = tf.estimator.WarmStartSettings(
            ckpt_to_initialize_from=FLAGS.init_checkpoint,
            vars_to_warm_start=warm_start_vars)

        i3d_classifier = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=i3d_model_fn,
            config=config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size,
            export_to_tpu=FLAGS.export_to_tpu,
            warm_start_from=ws)
    else:
        i3d_classifier = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=i3d_model_fn,
            config=config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size,
            export_to_tpu=FLAGS.export_to_tpu)

    if FLAGS.mode == 'train_and_eval' and FLAGS.train_num_cores > 8:
        i3d_eval = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=i3d_model_fn,
            config=config_eval,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            export_to_tpu=FLAGS.export_to_tpu,
            # ws is only defined when FLAGS.init_checkpoint was given above.
            warm_start_from=ws if FLAGS.init_checkpoint != 'None' else None)

    assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', (
        'Invalid value for --precision flag; must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', FLAGS.precision)

    use_bfloat16 = FLAGS.precision == 'bfloat16'

    tf.logging.info('Using dataset: %s', FLAGS.data_dir)

    list_of_augmentations = [
        'random_crop', 'random_brightness', 'random_contrast'
    ]

    # dataset_train and dataset_eval are the Input pipelines
    dataset_train, dataset_eval, dataset_predict = [
        inp_pipeline.InputPipelineTFExample(
            data_dir=FLAGS.data_dir,
            is_training=is_training,
            cache=FLAGS.use_cache and is_training,
            use_bfloat16=use_bfloat16,
            target_image_size=224,
            num_frames=32,  # num_frames_change_here
            num_classes=15,
            num_parallel_calls=FLAGS.num_parallel_calls,
            list_of_augmentations=list_of_augmentations)
        for is_training in [True, False, False]
    ]

    # num_train_videos = total images in the dataset
    # train_batch_size = total batch size (across all cores)
    steps_per_epoch = FLAGS.num_train_videos // FLAGS.train_batch_size
    eval_steps = FLAGS.num_eval_videos // FLAGS.eval_batch_size
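    # Worked example (hypothetical counts): with num_train_videos=100000 and a
    # global train_batch_size of 256, steps_per_epoch = 100000 // 256 = 390,
    # so train_steps should be a multiple of ~390 to end near epoch boundaries.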

    if FLAGS.mode == 'train' or FLAGS.mode == 'evaluate':

        # Automatically get the latest checkpoint file and latest
        # train step from the model_dir.
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / steps_per_epoch, current_step)

        start_timestamp = time.time()  # Compilation time included

        if FLAGS.mode == 'train':
            hooks = []

            # AsyncCheckpointSaverHook saves checkpoints asynchronously so the
            # TPU training loop is not blocked on checkpoint writes.
            if FLAGS.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))

            # Collect profiler traces every profile_every_n_steps steps when
            # the flag is larger than 0.
            if FLAGS.profile_every_n_steps > 0:
                hooks.append(
                    tpu_profiler_hook.TPUProfilerHook(
                        save_steps=FLAGS.profile_every_n_steps,
                        output_dir=FLAGS.model_dir,
                        tpu=FLAGS.tpu))

            ##### Estimator training story:
            '''Arguments:
                input_fn: Returns mini batches for training. Function should
                    return tf.data.Dataset object: tuple (features, labels).
                    Both features and labels are consumed by model_fn. They
                    should satisfy the expectation of model_fn for inputs.
                hooks: List of tf.train.SessionRunHook subclass instance. Used
                    for callbacks inside the training loop.
                max_steps: Number of total steps for which to train the model.
            '''
            i3d_classifier.train(input_fn=dataset_train.input_fn,
                                 max_steps=FLAGS.train_steps,
                                 hooks=hooks)

        elif FLAGS.mode == 'evaluate':
            '''
            for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
                tf.logging.info(
                    'Starting to evaluate using %s',
                    ckpt)
            '''
            # Open in text-append mode so the str writes below work as-is.
            f = open(
                'evaluations/dummy_' + FLAGS.model_dir.split('/')[-1] + '.txt',
                'a')
            # ids = [i for i in range(12600, 14000, 300)]
            # ids.append(14000)
            ids = [14000]
            for i in ids:
                try:
                    ckpt = FLAGS.model_dir + '/model.ckpt-' + str(i)
                    start_timestamp = time.time()  # Compilation time included
                    eval_results = i3d_classifier.evaluate(
                        input_fn=dataset_eval.input_fn,
                        steps=eval_steps,
                        checkpoint_path=ckpt)
                    elapsed_time = int(time.time() - start_timestamp)
                    tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                    eval_results, elapsed_time)

                    f.write('step: ' + str(i) + ', stats: ' +
                            str(eval_results) + '\n')
                    f.flush()  # Persist this result immediately.

                    # Terminate eval job when final checkpoint is reached
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                    if current_step >= FLAGS.train_steps:
                        tf.logging.info(
                            'Evaluation finished after training step %d',
                            current_step)
                        break

                except tf.errors.NotFoundError:
                    tf.logging.info(
                        'Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)
            f.close()

    elif FLAGS.mode == 'predict':
        i = 1000
        ckpt = FLAGS.model_dir + '/model.ckpt-' + str(i)
        predict_iters = i3d_classifier.predict(
            input_fn=dataset_predict.input_fn,
            checkpoint_path=ckpt,
            yield_single_examples=False)
        all_gt, all_preds = [], []
        count = 0
        for predict_result in predict_iters:
            gt = predict_result['ground_truth']
            preds = predict_result['predictions']
            if count % 10 == 0:
                print('step:{}, shapes:{}'.format(count, gt.shape))
            count += 1

            for g, p in zip(gt, preds):
                all_gt.append(g)
                all_preds.append(p)

        print('Finished, {}'.format(len(all_gt)))
        with open('gt.pkl', 'wb') as handle:
            pickle.dump(all_gt, handle)
        with open('preds.pkl', 'wb') as handle:
            pickle.dump(all_preds, handle)
Example #6
def main(unused_argv):
    params = params_dict.ParamsDict(mnasnet_config.MNASNET_CFG,
                                    mnasnet_config.MNASNET_RESTRICTIONS)
    params = params_dict.override_params_dict(params,
                                              FLAGS.config_file,
                                              is_strict=True)
    params = params_dict.override_params_dict(params,
                                              FLAGS.params_override,
                                              is_strict=True)

    params = flags_to_params.override_params_from_input_flags(params, FLAGS)

    additional_params = {
        'steps_per_epoch': params.num_train_images / params.train_batch_size,
        'quantized_training': FLAGS.quantized_training,
    }

    params = params_dict.override_params_dict(params,
                                              additional_params,
                                              is_strict=False)

    params.validate()
    params.lock()

    if FLAGS.tpu or params.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    if params.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, params.iterations_per_loop)
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=params.iterations_per_loop,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    # Validates Flags.
    if params.precision == 'bfloat16' and params.use_keras:
        raise ValueError(
            'Keras layers do not fully support bfloat16 activation training.'
            ' You have set precision to %s and use_keras to %s.' %
            (params.precision, params.use_keras))

    # Initializes model parameters.
    mnasnet_est = tf.contrib.tpu.TPUEstimator(
        use_tpu=params.use_tpu,
        model_fn=mnasnet_model_fn,
        config=config,
        train_batch_size=params.train_batch_size,
        eval_batch_size=params.eval_batch_size,
        export_to_tpu=FLAGS.export_to_tpu,
        params=params.as_dict())

    if FLAGS.mode == 'export_only':
        export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
        return

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s',
                        FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=False,
                transpose_input=params.transpose_input,
                selection=selection)
            for (is_training,
                 selection) in [(True, select_train), (False, select_eval)]
        ]
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=params.transpose_input,
                cache=params.use_cache and is_training,
                image_size=params.input_image_size,
                num_parallel_calls=params.num_parallel_calls,
                use_bfloat16=(params.precision == 'bfloat16'))
            for is_training in [True, False]
        ]

    if FLAGS.mode == 'eval':
        eval_steps = params.num_eval_images // params.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = mnasnet_est.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

        if FLAGS.export_dir:
            export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(  # pylint: disable=protected-access
            FLAGS.model_dir)

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', params.train_steps,
            params.train_steps / params.steps_per_epoch, current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if params.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, params.iterations_per_loop)))
            mnasnet_est.train(input_fn=imagenet_train.input_fn,
                              max_steps=params.train_steps,
                              hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < params.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      params.train_steps)
                mnasnet_est.train(input_fn=imagenet_train.input_fn,
                                  max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = mnasnet_est.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=params.num_eval_images // params.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                params.train_steps, elapsed_time)
            if FLAGS.export_dir:
                export(mnasnet_est, FLAGS.export_dir, params,
                       FLAGS.post_quantize)
Example #7
def main(unused_argv):
    params = hyperparameters.get_hyperparameters(FLAGS.default_hparams_file,
                                                 FLAGS.hparams_file, FLAGS,
                                                 FLAGS.hparams)
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu if (FLAGS.tpu or params['use_tpu']) else '',
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)

    if params['use_async_checkpointing']:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(2500, params['iterations_per_loop'])
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=get_model_dir(params),
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=None,  # Keep all checkpoints.
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=params['iterations_per_loop'],
            num_shards=params['num_cores'],
            # copybara:strip_begin
            tpu_job_name=FLAGS.tpu_job_name,
            # copybara:strip_end
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    resnet_classifier = tf.contrib.tpu.TPUEstimator(
        use_tpu=params['use_tpu'],
        model_fn=resnet_model_fn,
        config=config,
        params=params,
        train_batch_size=params['train_batch_size'],
        eval_batch_size=params['eval_batch_size'],
        export_to_tpu=FLAGS.export_to_tpu)

    # copybara:strip_begin
    if FLAGS.xla_compile:
        resnet_classifier = tf.contrib.tpu.TPUEstimator(
            use_tpu=params['use_tpu'],
            model_fn=xla.estimator_model_fn(resnet_model_fn),
            config=config,
            params=params,
            train_batch_size=params['train_batch_size'],
            eval_batch_size=params['eval_batch_size'],
            export_to_tpu=FLAGS.export_to_tpu)
    # copybara:strip_end
    assert (params['precision'] == 'bfloat16' or params['precision']
            == 'float32'), ('Invalid value for precision parameter; '
                            'must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', params['precision'])
    use_bfloat16 = params['precision'] == 'bfloat16'

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s',
                        FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train = imagenet_input.ImageNetBigtableInput(
            is_training=True,
            use_bfloat16=use_bfloat16,
            transpose_input=params['transpose_input'],
            selection=select_train)
        imagenet_eval = imagenet_input.ImageNetBigtableInput(
            is_training=False,
            use_bfloat16=use_bfloat16,
            transpose_input=params['transpose_input'],
            selection=select_eval)
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=params['transpose_input'],
                cache=params['use_cache'] and is_training,
                image_size=params['image_size'],
                num_parallel_calls=params['num_parallel_calls'],
                use_bfloat16=use_bfloat16) for is_training in [True, False]
        ]

    steps_per_epoch = params['num_train_images'] // params['train_batch_size']
    eval_steps = params['num_eval_images'] // params['eval_batch_size']

    if FLAGS.mode == 'eval':

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                get_model_dir(params), timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params['train_steps']:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'eval_igt':
        # IGT evaluation mode. Evaluate metrics for the desired parameters
        # (true or shifted) on the desired dataset (train or eval). Note that
        # the train set is still processed with data augmentation.

        # Get checkpoint file names.
        index_files = tf.gfile.Glob(
            os.path.join(get_model_dir(params), 'model.ckpt-*.index'))
        checkpoints = [fn[:-len('.index')] for fn in index_files]
        # Need to sort them to get proper tensorboard plotting (increasing event
        # timestamps correspond to increasing steps).
        checkpoint_steps = []
        for ckpt in checkpoints:
            tf.logging.info(ckpt)
            step_match = re.match(r'.*model.ckpt-([0-9]*)', ckpt)
            checkpoint_steps.append(int(step_match.group(1)))
        checkpoints = [
            ckpt for _, ckpt in sorted(zip(checkpoint_steps, checkpoints))
        ]
        tf.logging.info('There are {} checkpoints'.format(len(checkpoints)))
        tf.logging.info(', '.join(checkpoints))

        # Keep track of the last processed checkpoint (fault tolerance).
        analysis_state_path = os.path.join(
            get_model_dir(params),
            'analysis_state_' + FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode)
        next_analysis_index = 0
        if tf.gfile.Exists(analysis_state_path):
            with tf.gfile.Open(analysis_state_path) as fd:
                next_analysis_index = int(fd.read())

        # Process each checkpoint.
        while next_analysis_index < len(checkpoints):
            tf.logging.info(
                'Next analysis index: {}'.format(next_analysis_index))
            ckpt_path = checkpoints[next_analysis_index]
            tf.logging.info('Starting to evaluate: {}.'.format(ckpt_path))
            start_timestamp = time.time()  # This time will include compilation time

            if FLAGS.igt_eval_set == 'train':
                the_input_fn = imagenet_train.input_fn
                the_steps = steps_per_epoch
            elif FLAGS.igt_eval_set == 'eval':
                the_input_fn = imagenet_eval.input_fn
                the_steps = eval_steps
            else:
                raise ValueError('Unsupported igt_eval_set')

            eval_results = resnet_classifier.evaluate(
                input_fn=the_input_fn,
                steps=the_steps,
                checkpoint_path=ckpt_path,
                name=FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode)
            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                            eval_results, elapsed_time)

            next_analysis_index += 1
            file_io.atomic_write_string_to_file(analysis_state_path,
                                                str(next_analysis_index))

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            get_model_dir(params))  # pylint:disable=protected-access,g-line-too-long
        steps_per_epoch = params['num_train_images'] // params[
            'train_batch_size']
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', params['train_steps'],
            params['train_steps'] / steps_per_epoch, current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if params['use_async_checkpointing']:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=get_model_dir(params),
                        save_steps=max(2500, params['iterations_per_loop'])))
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=params['train_steps'],
                                    hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < params['train_steps']:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      params['train_steps'])
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=params['num_eval_images'] //
                    params['eval_batch_size'])
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                params['train_steps'], elapsed_time)

        if FLAGS.export_dir is not None:
            # The guide to serving an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            unused_export_path = resnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
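
The tail of this example combines two small patterns worth isolating: ordering checkpoint paths by the global step embedded in their filenames, and persisting the index of the last processed checkpoint so an interrupted analysis job can resume. A minimal sketch of both, assuming a TF 1.x environment like the snippets here; the helper names are illustrative, not from the original:

import re

import tensorflow as tf


def sort_by_step(checkpoints):
    # 'model.ckpt-1234' encodes the global step after the dash.
    steps = [int(re.match(r'.*model.ckpt-([0-9]+)', c).group(1))
             for c in checkpoints]
    return [c for _, c in sorted(zip(steps, checkpoints))]


def load_resume_index(state_path):
    # First run: no state file yet, so start from the first checkpoint.
    if tf.gfile.Exists(state_path):
        with tf.gfile.Open(state_path) as fd:
            return int(fd.read())
    return 0
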
Example #8
def main(unused_argv):
  params = params_dict.ParamsDict(
      resnet_config.RESNET_CFG, resnet_config.RESNET_RESTRICTIONS)
  params = params_dict.override_params_dict(
      params, FLAGS.config_file, is_strict=True)
  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)

  params = flags_to_params.override_params_from_input_flags(params, FLAGS)

  params.validate()
  params.lock()

  tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu if (FLAGS.tpu or params.use_tpu) else '',
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)

  if params.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(5000, params.iterations_per_loop)
  config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.estimator.tpu.TPUConfig(
          iterations_per_loop=params.iterations_per_loop,
          num_shards=params.num_cores,
          per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
          .PER_HOST_V2))  # pylint: disable=line-too-long

  resnet_classifier = tf.estimator.tpu.TPUEstimator(
      use_tpu=params.use_tpu,
      model_fn=resnet_model_fn,
      config=config,
      params=params.as_dict(),
      train_batch_size=params.train_batch_size,
      eval_batch_size=params.eval_batch_size,
      export_to_tpu=FLAGS.export_to_tpu)

  assert (params.precision == 'bfloat16' or
          params.precision == 'float32'), (
              'Invalid value for precision parameter; '
              'must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', params.precision)
  use_bfloat16 = params.precision == 'bfloat16'

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(  # pylint: disable=g-complex-comprehension
            is_training=is_training,
            use_bfloat16=use_bfloat16,
            transpose_input=params.transpose_input,
            selection=selection,
            augment_name=FLAGS.augment_name,
            randaug_num_layers=FLAGS.randaug_num_layers,
            randaug_magnitude=FLAGS.randaug_magnitude)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(  # pylint: disable=g-complex-comprehension
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=params.transpose_input,
            cache=params.use_cache and is_training,
            image_size=params.image_size,
            num_parallel_calls=params.num_parallel_calls,
            include_background_label=(params.num_label_classes == 1001),
            use_bfloat16=use_bfloat16,
            augment_name=FLAGS.augment_name,
            randaug_num_layers=FLAGS.randaug_num_layers,
            randaug_magnitude=FLAGS.randaug_magnitude)
        for is_training in [True, False]
    ]

  steps_per_epoch = params.num_train_images // params.train_batch_size
  eval_steps = params.num_eval_images // params.eval_batch_size

  if FLAGS.mode == 'eval':

    # Run evaluation when there's a new checkpoint
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                        eval_results, elapsed_time)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= params.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', ckpt)

  else:   # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    try:
      current_step = tf.train.load_variable(FLAGS.model_dir,
                                            tf.GraphKeys.GLOBAL_STEP)
    except (TypeError, ValueError, tf.errors.NotFoundError):
      current_step = 0
    steps_per_epoch = params.num_train_images // params.train_batch_size
    tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                    ' step %d.',
                    params.train_steps,
                    params.train_steps / steps_per_epoch,
                    current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if params.use_async_checkpointing:
        try:
          from tensorflow.contrib.tpu.python.tpu import async_checkpoint  # pylint: disable=g-import-not-at-top
        except ImportError as e:
          logging.exception(
              'Async checkpointing is not supported in TensorFlow 2.x')
          raise e

        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(5000, params.iterations_per_loop)))
      if FLAGS.profile_every_n_steps > 0:
        hooks.append(
            tpu_profiler_hook.TPUProfilerHook(
                save_steps=FLAGS.profile_every_n_steps,
                output_dir=FLAGS.model_dir, tpu=FLAGS.tpu)
            )
      resnet_classifier.train(
          input_fn=imagenet_train.input_fn,
          max_steps=params.train_steps,
          hooks=hooks)

    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < params.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              params.train_steps)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=params.num_eval_images // params.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s',
                        next_checkpoint, eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      params.train_steps, elapsed_time)

    if FLAGS.export_dir is not None:
      # The guide to serving an exported TensorFlow model is at:
      #    https://www.tensorflow.org/serving/serving_basic
      tf.logging.info('Starting to export model.')
      export_path = resnet_classifier.export_saved_model(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
      if FLAGS.add_warmup_requests:
        inference_warmup.write_warmup_requests(
            export_path,
            FLAGS.model_name,
            params.image_size,
            batch_sizes=FLAGS.inference_batch_sizes,
            image_format='JPEG')
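
A convention this example shares with most of the others: the RunConfig's synchronous saver and AsyncCheckpointSaverHook are mutually exclusive, so save_checkpoints_steps is set to None whenever the async hook will write checkpoints. A minimal sketch of that wiring under TF 1.x; the flag values and model_dir are placeholders:

import tensorflow as tf
from tensorflow.contrib.tpu.python.tpu import async_checkpoint

use_async_checkpointing = True
iterations_per_loop = 1000   # placeholder
model_dir = '/tmp/model'     # placeholder

# Disable the built-in saver when the async hook will write checkpoints,
# so the two savers do not race on the same directory.
save_checkpoints_steps = (None if use_async_checkpointing
                          else max(100, iterations_per_loop))
config = tf.estimator.RunConfig(
    model_dir=model_dir, save_checkpoints_steps=save_checkpoints_steps)

hooks = []
if use_async_checkpointing:
    hooks.append(
        async_checkpoint.AsyncCheckpointSaverHook(
            checkpoint_dir=model_dir,
            save_steps=max(100, iterations_per_loop)))
# estimator.train(input_fn=..., max_steps=..., hooks=hooks)
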
Example #9
def main(unused_argv):
  # tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
  #     FLAGS.tpu if (FLAGS.tpu or FLAGS.use_tpu) else '',
  #     zone=FLAGS.tpu_zone,
  #     project=FLAGS.gcp_project)

  if FLAGS.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)

  NUM_GPUS = len(get_available_gpus())
  distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=NUM_GPUS)
  gpu_options = tf.GPUOptions(allow_growth=True)

  # config = tf.contrib.tpu.RunConfig(
  #     cluster=tpu_cluster_resolver,
  #     model_dir=FLAGS.model_dir,
  #     save_checkpoints_steps=save_checkpoints_steps,
  #     log_step_count_steps=FLAGS.log_step_count_steps,
  #     session_config=tf.ConfigProto(
  #         graph_options=tf.GraphOptions(
  #             rewrite_options=rewriter_config_pb2.RewriterConfig(
  #                 disable_meta_optimizer=True))),
  #     tpu_config=tf.contrib.tpu.TPUConfig(
  #         iterations_per_loop=FLAGS.iterations_per_loop,
  #         per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
  #         .PER_HOST_V2))  # pylint: disable=line-too-long

  config = tf.estimator.RunConfig(
      # cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(allow_soft_placement=True,
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True)), gpu_options=gpu_options),
      train_distribute=distribution,
      # tpu_config=tf.contrib.tpu.TPUConfig(
      #     iterations_per_loop=FLAGS.iterations_per_loop,
      #     per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
      #     .PER_HOST_V2)
  )
  # Initializes model parameters.
  # params = dict(steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size)
  # model_est = tf.estimator.Estimator(
  #     use_tpu=FLAGS.use_tpu,
  #     model_fn=final_model_fn,
  #     config=config,
  #     train_batch_size=FLAGS.train_batch_size,
  #     eval_batch_size=FLAGS.eval_batch_size,
  #     export_to_tpu=FLAGS.export_to_tpu,
  #     params=params)
  params = dict(
      steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size,
      batch_size=FLAGS.train_batch_size)
  model_est = tf.estimator.Estimator(
      model_fn=final_model_fn,
      config=config,
      params=params)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [imagenet_input.ImageNetBigtableInput(
        is_training=is_training,
        use_bfloat16=False,
        transpose_input=FLAGS.transpose_input,
        selection=selection) for (is_training, selection) in
                                     [(True, select_train),
                                      (False, select_eval)]]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=FLAGS.transpose_input,
            cache=FLAGS.use_cache and is_training,
            image_size=FLAGS.input_image_size,
            num_parallel_calls=FLAGS.num_parallel_calls,
            use_bfloat16=False) for is_training in [True, False]
    ]

  if FLAGS.mode == 'eval':
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = model_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                        eval_results, elapsed_time)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', ckpt)

    if FLAGS.export_dir:
      export(model_est, FLAGS.export_dir, FLAGS.post_quantize)
  else:   # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / params['steps_per_epoch'], current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if FLAGS.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      model_est.train(
          input_fn=imagenet_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=hooks)

    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        model_est.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = model_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s',
                        next_checkpoint, eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)
      if FLAGS.export_dir:
        export(model_est, FLAGS.export_dir, FLAGS.post_quantize)
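
The 'eval' branch above illustrates the standard termination pattern: the global step is parsed out of the checkpoint basename, and the loop breaks once it reaches train_steps. A minimal sketch, with tf.errors.NotFoundError handled because a checkpoint can be garbage-collected before the evaluator reaches it; all names are placeholders:

import os

import tensorflow as tf


def eval_until_final_checkpoint(est, input_fn, model_dir, eval_steps,
                                train_steps, timeout):
    for ckpt in tf.train.checkpoints_iterator(model_dir, timeout=timeout):
        try:
            results = est.evaluate(input_fn=input_fn, steps=eval_steps,
                                   checkpoint_path=ckpt)
            tf.logging.info('Eval results: %s', results)
            # 'model.ckpt-1234' -> 1234: the global step of this checkpoint.
            current_step = int(os.path.basename(ckpt).split('-')[1])
            if current_step >= train_steps:
                break
        except tf.errors.NotFoundError:
            # The checkpoint may already have been deleted by the saver.
            tf.logging.info('Checkpoint %s no longer exists, skipping.', ckpt)
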
Example #10
        num_shards=args.num_shards,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
        .PER_HOST_V2))  # pylint: disable=line-too-long

params = dict(weight_decay=args.weight_decay)
tpu_estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn,
    config=run_config,
    train_batch_size=args.batch_size,
    eval_batch_size=args.batch_size,
    params=params)

hooks = []
hooks.append(
    async_checkpoint.AsyncCheckpointSaverHook(
        checkpoint_dir=args.model_dir,
        save_steps=iterations_per_loop))

train_input_fn = make_input_fn(data, labels)
eval_input_fn = make_input_fn(test_data, test_labels)

if pid > 0:
    tpu_estimator.train(input_fn=train_input_fn,
                        steps=args.num_epochs * steps_per_epoch,
                        hooks=hooks)
    # Sleep so that eval can finish before closing.
    time.sleep(360)
else:
    for ckpt in evaluation.checkpoints_iterator(args.model_dir):
        eval_results = tpu_estimator.evaluate(
            input_fn=eval_input_fn,
Example #11
def main(unused_argv):

    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        input_image_size = model_builder_factory.get_model_input_size(
            FLAGS.model_name)

    if FLAGS.holdout_shards:
        holdout_images = int(FLAGS.num_train_images * FLAGS.holdout_shards /
                             1024.0)
        FLAGS.num_train_images -= holdout_images
        if FLAGS.eval_name and 'test' in FLAGS.eval_name:
            FLAGS.holdout_shards = None  # Do not use holdout when evaluating on the test set.
        else:
            FLAGS.num_eval_images = holdout_images

    # For the Objectron dataset, include the background label if the number of
    # output classes is 1001.
    include_background_label = (FLAGS.num_label_classes == 1001)

    if FLAGS.tpu or FLAGS.use_tpu:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    if FLAGS.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
    config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.estimator.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            tpu_job_name=FLAGS.tpu_job_name,
            per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long
    # Initializes model parameters.
    params = dict(steps_per_epoch=FLAGS.num_train_images /
                  FLAGS.train_batch_size,
                  use_bfloat16=FLAGS.use_bfloat16)
    est = tf.estimator.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        export_to_tpu=FLAGS.export_to_tpu,
        params=params)

    if FLAGS.model_name.startswith('efficientnet-lite'):
        # Lite models use bilinear resizing for easier post-quantization.
        resize_method = tf.image.ResizeMethod.BILINEAR
    else:
        resize_method = None
    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    def build_objectron_input(is_training):
        """Generate ObjectronInput for training and eval."""
        if FLAGS.bigtable_instance:
            logging.info('Using Bigtable dataset, table %s',
                         FLAGS.bigtable_table)
            select_train, select_eval = _select_tables_from_flags()
            return objectron_input.ObjectronBigtableInput(
                is_training=is_training,
                use_bfloat16=FLAGS.use_bfloat16,
                transpose_input=FLAGS.transpose_input,
                selection=select_train if is_training else select_eval,
                num_label_classes=FLAGS.num_label_classes,
                include_background_label=include_background_label,
                augment_name=FLAGS.augment_name,
                mixup_alpha=FLAGS.mixup_alpha,
                randaug_num_layers=FLAGS.randaug_num_layers,
                randaug_magnitude=FLAGS.randaug_magnitude,
                resize_method=resize_method)
        else:
            if FLAGS.data_dir == FAKE_DATA_DIR:
                logging.info('Using fake dataset.')
            else:
                logging.info('Using dataset: %s', FLAGS.data_dir)

            return objectron_input.ObjectronInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=FLAGS.transpose_input,
                cache=FLAGS.use_cache and is_training,
                image_size=input_image_size,
                num_parallel_calls=FLAGS.num_parallel_calls,
                use_bfloat16=FLAGS.use_bfloat16,
                num_label_classes=FLAGS.num_label_classes,
                include_background_label=include_background_label,
                augment_name=FLAGS.augment_name,
                mixup_alpha=FLAGS.mixup_alpha,
                randaug_num_layers=FLAGS.randaug_num_layers,
                randaug_magnitude=FLAGS.randaug_magnitude,
                resize_method=resize_method,
                holdout_shards=FLAGS.holdout_shards)

    objectron_train = build_objectron_input(is_training=True)
    objectron_eval = build_objectron_input(is_training=False)

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(FLAGS.model_dir,
                                                  timeout=FLAGS.eval_timeout):
            logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = est.evaluate(input_fn=objectron_eval.input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=ckpt,
                                            name=FLAGS.eval_name)
                elapsed_time = int(time.time() - start_timestamp)
                logging.info('Eval results: %s. Elapsed seconds: %d',
                             eval_results, elapsed_time)
                if FLAGS.archive_ckpt:
                    utils.archive_ckpt(eval_results,
                                       eval_results['top_1_accuracy'], ckpt)

                # Terminate eval job when final checkpoint is reached
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                if current_step >= FLAGS.train_steps:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

        logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                try:
                    from tensorflow.contrib.tpu.python.tpu import async_checkpoint  # pylint: disable=g-import-not-at-top
                except ImportError as e:
                    logging.exception(
                        'Async checkpointing is not supported in TensorFlow 2.x'
                    )
                    raise e

                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            est.train(input_fn=objectron_train.input_fn,
                      max_steps=FLAGS.train_steps,
                      hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                est.train(input_fn=objectron_train.input_fn,
                          max_steps=next_checkpoint)
                current_step = next_checkpoint

                logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                logging.info('Starting to evaluate.')
                eval_results = est.evaluate(input_fn=objectron_eval.input_fn,
                                            steps=FLAGS.num_eval_images //
                                            FLAGS.eval_batch_size,
                                            name=FLAGS.eval_name)
                logging.info('Eval results at step %d: %s', next_checkpoint,
                             eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                if FLAGS.archive_ckpt:
                    utils.archive_ckpt(eval_results,
                                       eval_results['top_1_accuracy'], ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)
    if FLAGS.export_dir:
        export(est, FLAGS.export_dir, input_image_size)
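
The holdout logic at the top of this example carves evaluation images out of the training set in proportion to the number of shards held out of 1024. A worked instance of that arithmetic, assuming the standard ImageNet training-set size; the shard count chosen here is hypothetical:

num_train_images = 1281167   # standard ImageNet training set
holdout_shards = 64          # hypothetical: hold out 64 of the 1024 shards

holdout_images = int(num_train_images * holdout_shards / 1024.0)  # 80073
num_train_images -= holdout_images   # 1201094 images left for training
num_eval_images = holdout_images     # the holdout becomes the eval set
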
Example #12
def main(unused_argv):
    # Mnas optimize - set the proper image data format
    tf.keras.backend.set_image_data_format(FLAGS.data_format)
    # Mnas optimize - optimization flags
    # gpu_thread_count = 2
    # os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    # os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count)
    # os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    # os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    # Enable mixed precision? -> Not much benefit seen yet.
    # os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    # Horovod: initialize Horovod.
    if FLAGS.use_horovod:
        hvd.init()
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu if (FLAGS.tpu or FLAGS.use_tpu) else '',
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)

    if FLAGS.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        if not FLAGS.use_horovod:
            save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
        else:
            save_checkpoints_steps = max(
                100, FLAGS.iterations_per_loop) if hvd.rank() == 0 else None
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    if FLAGS.use_xla:
        config.session_config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    if FLAGS.use_horovod:
        config.session_config.gpu_options.allow_growth = True
        config.session_config.gpu_options.visible_device_list = str(
            hvd.local_rank())

    # Validates Flags.
    if FLAGS.use_bfloat16 and FLAGS.use_keras:
        raise ValueError(
            'Keras layers do not have full support to bfloat16 activation training.'
            ' You have set use_bfloat as %s and use_keras as %s' %
            (FLAGS.use_bfloat16, FLAGS.use_keras))

    # Initializes model parameters.
    steps_per_epoch = FLAGS.num_train_images / FLAGS.train_batch_size
    if FLAGS.use_horovod:
        steps_per_epoch = steps_per_epoch // hvd.size()
    params = dict(steps_per_epoch=steps_per_epoch,
                  use_bfloat16=FLAGS.use_bfloat16,
                  quantized_training=FLAGS.quantized_training)
    if FLAGS.use_horovod:
        params['hvd'] = True
        params['hvd_curr_host'] = hvd.rank()
        params['hvd_num_hosts'] = hvd.size()
    mnasnet_est = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=mnasnet_model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        export_to_tpu=FLAGS.export_to_tpu,
        params=params)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    if FLAGS.use_horovod:
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s',
                        FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=False,
                transpose_input=FLAGS.transpose_input,
                selection=selection)
            for (is_training,
                 selection) in [(True, select_train), (False, select_eval)]
        ]
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=FLAGS.transpose_input,
                cache=FLAGS.use_cache and is_training,
                image_size=FLAGS.input_image_size,
                num_parallel_calls=FLAGS.num_parallel_calls,
                use_bfloat16=FLAGS.use_bfloat16)
            for is_training in [True, False]
        ]

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = mnasnet_est.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

        if FLAGS.export_dir:
            export(mnasnet_est, FLAGS.export_dir, FLAGS.post_quantize)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(  # pylint: disable=protected-access
            FLAGS.model_dir)

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            mnasnet_est.train(input_fn=imagenet_train.input_fn,
                              max_steps=FLAGS.train_steps,
                              hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            curr_rank = 0
            if FLAGS.use_horovod:
                curr_rank = hvd.rank()
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                if FLAGS.use_horovod:
                    # Try the DALI pipeline.
                    mnasnet_est.train(input_fn=imagenet_train.train_data_fn,
                                      max_steps=next_checkpoint,
                                      hooks=[bcast_hook])
                    # this uses the old tf data pipeline
                    # mnasnet_est.train(
                    #     input_fn=imagenet_train.input_fn, max_steps=next_checkpoint, hooks=[bcast_hook])
                else:
                    mnasnet_est.train(input_fn=imagenet_train.input_fn,
                                      max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d. Hvd rank %d',
                    next_checkpoint, int(time.time() - start_timestamp),
                    curr_rank)

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                eval_on_single_gpu = FLAGS.eval_on_single_gpu
                tf.logging.info('Starting to evaluate.')
                if eval_on_single_gpu:
                    if curr_rank == 0:
                        eval_results = mnasnet_est.evaluate(
                            input_fn=imagenet_eval.train_data_fn,  # was: input_fn
                            steps=FLAGS.num_eval_images //
                            FLAGS.eval_batch_size)
                        tf.logging.info(
                            'Eval results at step %d: %s. Hvd rank %d',
                            next_checkpoint, eval_results, curr_rank)
                else:
                    eval_results = mnasnet_est.evaluate(
                        input_fn=imagenet_eval.train_data_fn,  # was: input_fn
                        steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
                    tf.logging.info('Eval results at step %d: %s. Hvd rank %d',
                                    next_checkpoint, eval_results, curr_rank)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)
            if FLAGS.export_dir:
                export(mnasnet_est, FLAGS.export_dir, FLAGS.post_quantize)
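
The Horovod wiring in this example follows the usual recipe: steps per epoch divided by the world size, checkpoints written by rank 0 only, one GPU pinned per process, and a broadcast hook so every worker starts from rank 0's weights. A minimal sketch, assuming Horovod is installed; the numeric values are placeholders:

import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()

num_train_images = 1281167   # placeholder
train_batch_size = 128       # per-worker batch size, placeholder
iterations_per_loop = 1000   # placeholder

# Each worker covers 1/hvd.size() of the steps in an epoch.
steps_per_epoch = (num_train_images / train_batch_size) // hvd.size()

# Only rank 0 writes checkpoints; other ranks would clobber each other.
save_checkpoints_steps = (max(100, iterations_per_loop)
                          if hvd.rank() == 0 else None)

# Pin each process to a single GPU, identified by its local rank.
session_config = tf.ConfigProto()
session_config.gpu_options.allow_growth = True
session_config.gpu_options.visible_device_list = str(hvd.local_rank())

# Broadcast rank 0's initial (or restored) variables to all workers.
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
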
Example #13
def main(unused_argv):
  # Mnas optimize - set the proper image data format
  tf.keras.backend.set_image_data_format(FLAGS.data_format)
  # Mnas optimize - optimization flags
  # gpu_thread_count = 2
  # os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
  # os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count)
  # os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
  # os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
  # Enable mixed precision? -> Not much benefit seen yet.
  # os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
  
  node0 = "172.31.11.9:6060"
  node1 = "172.31.1.33:6060"

  strategy = tf.distribute.MirroredStrategy()
  if FLAGS.total_nodes > 1:
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
        tf.distribute.experimental.CollectiveCommunication.NCCL)
    if not FLAGS.is_evaluator:
      if FLAGS.node_num == 0:
        os.environ['TF_CONFIG'] = json.dumps({
          'cluster': {
              'worker': [node0, node1]
          },
          'task': {'type': 'worker', 'index': 0}
        })
      else:
        os.environ['TF_CONFIG'] = json.dumps({
          'cluster': {
              'worker': [node0, node1]
          },
          'task': {'type': 'worker', 'index': 1}
        })
    else:
      os.environ['TF_CONFIG'] = json.dumps({
        'cluster': {
            'evaluator': ["localhost:6060"]
        },
        'task': {'type': 'evaluator', 'index': 0}
      })
  
  
  if FLAGS.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
    
  gconfig = tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True)))

  if FLAGS.use_xla:
    # gconfig is the ConfigProto itself, so set graph_options on it directly.
    gconfig.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  
  # mnasnet opt - check if this is required!
  gconfig.gpu_options.allow_growth = True
  # gconfig.gpu_options.visible_device_list = str(hvd.local_rank())
  
  config = tf.estimator.RunConfig(
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      train_distribute=strategy,
      session_config=gconfig)  # pylint: disable=line-too-long

  print('mnasnet opt - config cluster spec', config.cluster_spec)
  
  # Initializes model parameters.
  params = dict(
      steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size,
      batch_size=FLAGS.train_batch_size,
      dtype=tf.float32,
      use_bfloat16=FLAGS.use_bfloat16,
      quantized_training=FLAGS.quantized_training)
  
  mnasnet_est = tf.estimator.Estimator(
      model_fn=mnasnet_model_fn,
      model_dir=FLAGS.model_dir,
      config=config,
      params=params)
  
  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [imagenet_input.ImageNetBigtableInput(
        is_training=is_training,
        use_bfloat16=False,
        transpose_input=FLAGS.transpose_input,
        selection=selection) for (is_training, selection) in
                                     [(True, select_train),
                                      (False, select_eval)]]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=FLAGS.transpose_input,
            cache=FLAGS.use_cache and is_training,
            image_size=FLAGS.input_image_size,
            num_parallel_calls=FLAGS.num_parallel_calls,
            use_bfloat16=FLAGS.use_bfloat16) for is_training in [True, False]
    ]

  if FLAGS.mode == 'eval':
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = mnasnet_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)

    if FLAGS.export_dir:
      export(mnasnet_est, FLAGS.export_dir, FLAGS.post_quantize)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(  # pylint: disable=protected-access
        FLAGS.model_dir)

    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / params['steps_per_epoch'], current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if FLAGS.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      mnasnet_est.train(
          input_fn=imagenet_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=hooks)

    else:
      assert FLAGS.mode == 'train_and_eval'
      train_spec = tf.estimator.TrainSpec(
          input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps)
      eval_spec = tf.estimator.EvalSpec(
          input_fn=imagenet_eval.input_fn,
          steps=FLAGS.num_eval_images // FLAGS.eval_batch_size,
          throttle_secs=600)
      tf.estimator.train_and_evaluate(mnasnet_est, train_spec, eval_spec)
      
      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)
      if FLAGS.export_dir:
        export(mnasnet_est, FLAGS.export_dir, FLAGS.post_quantize)
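
The three near-identical TF_CONFIG blocks at the top of this example differ only in the task entry, so they collapse naturally into one helper. A minimal sketch; the worker addresses are the ones hardcoded above, and the helper name is illustrative:

import json
import os


def set_tf_config(workers, node_num, is_evaluator=False):
    """Writes the TF_CONFIG env var used by MultiWorkerMirroredStrategy."""
    if is_evaluator:
        cluster = {'evaluator': ['localhost:6060']}
        task = {'type': 'evaluator', 'index': 0}
    else:
        cluster = {'worker': workers}
        task = {'type': 'worker', 'index': node_num}
    os.environ['TF_CONFIG'] = json.dumps({'cluster': cluster, 'task': task})


set_tf_config(['172.31.11.9:6060', '172.31.1.33:6060'], node_num=0)
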
Example #14
def main(unused_argv):

    tpu = 'chocoarthur'
    tpu_zone = 'us-central1-f'
    gcp_project = 'cloud-tpu-epfl'

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        tpu if (hparams.tpu or hparams.use_tpu) else '',
        zone=tpu_zone,
        project=gcp_project)

    if hparams.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, hparams.iterations_per_loop)

    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=hparams.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=hparams.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=hparams.iterations_per_loop,
            num_shards=hparams.num_cores,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
                .PER_HOST_V2))  # pylint: disable=line-too-long

    resnet_classifier = tf.contrib.tpu.TPUEstimator(
        use_tpu=hparams.use_tpu,
        model_fn=resnet_model_fn,
        config=config,
        train_batch_size=hparams.train_batch_size,
        eval_batch_size=hparams.eval_batch_size,
        export_to_tpu=hparams.export_to_tpu)
    assert hparams.precision == 'bfloat16' or hparams.precision == 'float32', (
        'Invalid value for --precision flag; must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', hparams.precision)
    use_bfloat16 = hparams.precision == 'bfloat16'

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if hparams.data_dir == FAKE_DATA_DIR:
        tf.logging.info('Using fake dataset.')
    else:
        tf.logging.info('Using dataset: %s', hparams.data_dir)
    # imagenet_train, imagenet_eval = [
    #     imagenet_input.ImagenetRecordInput(
    #         is_training=is_training,
    #         data_dir=hparams.data_dir,
    #         transpose_input=hparams.transpose_input,
    #         cache=hparams.use_cache and is_training,
    #         image_size=hparams.image_size,
    #         num_parallel_calls=hparams.num_parallel_calls,
    #         use_bfloat16=use_bfloat16) for is_training in [True, False]
    # ]

    imagenet_train = imagenet_input.InputFunction(
        is_training=True,
        noise_dim=128,
        num_classes=hparams.num_label_classes,
        data_dir=hparams.data_dir,
    )

    imagenet_eval = imagenet_input.InputFunction(
        is_training=False,
        noise_dim=128,
        num_classes=hparams.num_label_classes,
        data_dir=hparams.data_dir,
    )

    eval_steps = hparams.num_eval_images // hparams.eval_batch_size

    if hparams.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                hparams.model_dir, timeout=hparams.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= hparams.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d', current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint', ckpt)

    else:  # hparams.mode == 'train' or hparams.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            hparams.model_dir)  # pylint: disable=protected-access,line-too-long
        steps_per_epoch = hparams.num_train_images // hparams.train_batch_size

        tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                        ' step %d.',
                        hparams.train_steps,
                        hparams.train_steps / steps_per_epoch,
                        current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if hparams.mode == 'train':
            hooks = []
            if hparams.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=hparams.model_dir,
                        save_steps=max(100, hparams.iterations_per_loop)))
            if hparams.profile_every_n_steps > 0:
                hooks.append(
                    tpu_profiler_hook.TPUProfilerHook(
                        save_steps=hparams.profile_every_n_steps,
                        output_dir=hparams.model_dir, tpu=hparams.tpu)
                )
            resnet_classifier.train(
                input_fn=imagenet_train,
                max_steps=hparams.train_steps,
                hooks=hooks)

        else:
            assert hparams.mode == 'train_and_eval'
            while current_step < hparams.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + hparams.steps_per_eval,
                                      hparams.train_steps)
                resnet_classifier.train(
                    input_fn=imagenet_train, max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                                next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval,
                    steps=hparams.num_eval_images // hparams.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s',
                                next_checkpoint, eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                            hparams.train_steps, elapsed_time)

        if hparams.export_dir is not None:
            # The guide to serving an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            resnet_classifier.export_saved_model(
                export_dir_base=hparams.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
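
Unlike the other examples, this one passes imagenet_train itself to train() rather than imagenet_train.input_fn; that works because the estimator accepts any callable taking a params dict. A minimal sketch of such a callable, with a synthetic noise dataset standing in for the real pipeline:

import tensorflow as tf


class InputFunction(object):
    """A callable input pipeline: the estimator invokes it as input_fn(params)."""

    def __init__(self, is_training, noise_dim):
        self.is_training = is_training
        self.noise_dim = noise_dim

    def __call__(self, params):
        # TPUEstimator injects the per-core batch size into params.
        batch_size = params['batch_size']
        noise = tf.random.normal([batch_size, self.noise_dim])
        labels = tf.zeros([batch_size], dtype=tf.int32)
        return tf.data.Dataset.from_tensors((noise, labels)).repeat()
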
Example #15
def main(unused_argv):

    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        if FLAGS.model_name.startswith('efficientnet'):
            _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
                FLAGS.model_name)
        else:
            raise ValueError(
                'input_image_size must be set except for EfficientNet.')

    save_checkpoints_steps = max(100, FLAGS.steps_per_eval)

    config = tf.estimator.RunConfig(
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
    )

    params = dict(steps_per_epoch=FLAGS.num_train_images /
                  FLAGS.train_batch_size,
                  use_bfloat16=FLAGS.use_bfloat16,
                  batch_size=FLAGS.train_batch_size)
    est = tf.estimator.Estimator(model_fn=model_fn,
                                 config=config,
                                 params=params)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.

    if FLAGS.data_dir == FAKE_DATA_DIR:
        tf.logging.info('Using fake dataset.')
    else:
        tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    data_train, data_eval = [
        mnist_input.ImageNetInput(is_training=is_training,
                                  data_dir=FLAGS.data_dir,
                                  transpose_input=FLAGS.transpose_input,
                                  cache=FLAGS.use_cache and is_training,
                                  image_size=input_image_size,
                                  use_bfloat16=FLAGS.use_bfloat16)
        for is_training in [True, False]
    ]

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = est.evaluate(input_fn=data_eval.input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

        if FLAGS.export_dir:
            export(est, FLAGS.export_dir, input_image_size)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            est.train(input_fn=data_train.input_fn,
                      max_steps=FLAGS.train_steps,
                      hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                est.train(input_fn=data_train.input_fn,
                          max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = est.evaluate(input_fn=data_eval.input_fn,
                                            steps=FLAGS.num_eval_images //
                                            FLAGS.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)
            if FLAGS.export_dir:
                export(est, FLAGS.export_dir, input_image_size)
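
Both eval paths in this example score checkpoints with utils.archive_ckpt, whose implementation is not shown. A minimal sketch, assuming the best objective seen so far is tracked in a plain-text file next to the checkpoints (the 'best_objective.txt' file, the 'archive' directory, and the logging are illustrative):

import os
import tensorflow as tf

def archive_ckpt(ckpt_eval, ckpt_objective, ckpt_path):
    """Sketch: copy ckpt_path aside when ckpt_objective beats the best so far."""
    saved_objective_path = os.path.join(
        os.path.dirname(ckpt_path), 'best_objective.txt')
    saved_objective = float('-inf')
    if tf.gfile.Exists(saved_objective_path):
        with tf.gfile.GFile(saved_objective_path) as f:
            saved_objective = float(f.read())
    if saved_objective > ckpt_objective:
        tf.logging.info('Ckpt %s is worse than best %s; not archiving.',
                        ckpt_objective, saved_objective)
        return False

    # Copy every file that belongs to this checkpoint into the archive.
    dst_dir = os.path.join(os.path.dirname(ckpt_path), 'archive')
    if tf.gfile.Exists(dst_dir):
        tf.gfile.DeleteRecursively(dst_dir)
    tf.gfile.MakeDirs(dst_dir)
    for src in tf.gfile.Glob(ckpt_path + '.*'):
        tf.gfile.Copy(src, os.path.join(dst_dir, os.path.basename(src)),
                      overwrite=True)

    with tf.gfile.GFile(saved_objective_path, 'w') as f:
        f.write('%f' % ckpt_objective)
    tf.logging.info('Archived checkpoint %s with eval %s', ckpt_path, ckpt_eval)
    return True
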
Example #16
def main(unused_argv):
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu if (FLAGS.tpu or FLAGS.use_tpu) else '',
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)

    if FLAGS.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
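    # With async checkpointing the hook added in the train branch below owns
    # saving, so the RunConfig's built-in periodic saver is switched off above
    # (save_checkpoints_steps=None) to avoid writing every checkpoint twice.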
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_cores,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))  # pylint: disable=line-too-long
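    # PER_HOST_V2 calls the input_fn once per TPU host and lets that pipeline
    # feed all cores on the host, which is generally the fastest input mode.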

    resnet_classifier = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        export_to_tpu=FLAGS.export_to_tpu)
    assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', (
        'Invalid value for --precision flag; must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', FLAGS.precision)
    use_bfloat16 = FLAGS.precision == 'bfloat16'

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s',
                        FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()

        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=use_bfloat16,
                transpose_input=FLAGS.transpose_input,
                selection=selection)
            for (is_training,
                 selection) in [(True, select_train), (False, select_eval)]
        ]

    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                prices_dir=FLAGS.prices_dir,
                predict_dir=FLAGS.predict_dir,
                transpose_input=FLAGS.transpose_input,
                cache=FLAGS.use_cache and is_training,
                image_size=FLAGS.image_size,
                num_parallel_calls=FLAGS.num_parallel_calls,
                use_bfloat16=use_bfloat16) for is_training in [True, False]
        ]

    steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size

    if FLAGS.mode == 'eval':

        # Run evaluation when there's a new checkpoint
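        # checkpoints_iterator blocks until a new checkpoint appears in
        # model_dir and yields its path; it stops iterating after eval_timeout
        # seconds pass without a new one.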
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    else:  # FLAGS.mode is 'train', 'train_and_eval' or 'predict'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / steps_per_epoch, current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            if FLAGS.profile_every_n_steps > 0:
                hooks.append(
                    tpu_profiler_hook.TPUProfilerHook(
                        save_steps=FLAGS.profile_every_n_steps,
                        output_dir=FLAGS.model_dir,
                        tpu=FLAGS.tpu))
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=FLAGS.train_steps,
                                    hooks=hooks)

        elif FLAGS.mode == 'train_and_eval':
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)
        else:  # FLAGS.mode == 'predict'

            price_file_pattern = os.path.join(FLAGS.prices_dir, 'price-*')
            while True:
                time.sleep(10)
                price_files = glob.glob(price_file_pattern)
                if not price_files:
                    continue
                tf.logging.info('Starting to predict.')
                with open(price_files[0], "r") as fcsv:
                    csvreader = csv.reader(fcsv, delimiter=",")
                    price_batch_size = len(list(csvreader))
                predictions = resnet_classifier.predict(
                    input_fn=lambda params: imagenet_eval.predict_input_fn(
                        params, price_batch_size))
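                # predict() returns a generator; the loop below drives the
                # actual TPU inference one batch at a time.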

                # Output predictions to predict-0001.csv
                predict_filename = os.path.join(FLAGS.predict_dir,
                                                'predict-0001.csv')
                with open(predict_filename, 'w') as predict_file:
                    for pred_item in predictions:
                        predict_line = ','.join(
                            str(prob) for prob in pred_item['probabilities'])
                        predict_file.write(predict_line + '\n')
                for price_file in price_files:
                    tf.logging.info('Removing ' + price_file)
                    os.remove(price_file)

        if FLAGS.export_dir is not None and FLAGS.mode != 'predict':
            # The guide to serving an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            resnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
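
The predict branch above relies on imagenet_eval.predict_input_fn(params, batch_size), a custom extension of this fork's ImageNetInput that is not shown here. A minimal sketch of what it might do, assuming each price-* file holds one float feature vector per CSV row, that prices_dir is threaded through params, and that the caller sizes the batch to cover the whole file (the column count and parsing are illustrative):

import os
import tensorflow as tf

def predict_input_fn(params, batch_size, num_columns=5):
    """Sketch: stream rows of the price-* CSV files as float feature vectors."""
    file_pattern = os.path.join(params['prices_dir'], 'price-*')
    dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
    dataset = dataset.flat_map(tf.data.TextLineDataset)

    def _parse(line):
        # Each CSV row becomes a float vector; defaults fill missing fields.
        fields = tf.decode_csv(line, record_defaults=[[0.0]] * num_columns)
        return tf.stack(fields)

    dataset = dataset.map(_parse)
    # drop_remainder keeps the batch shape static, which TPU inference needs;
    # the caller passes the exact row count so nothing is actually dropped.
    return dataset.batch(batch_size, drop_remainder=True)
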