Example #1
def main(argv):
    del argv  # Unused.

    if FLAGS.start_profiler_server:
        # Starts the profiler server. It will perform profiling when it receives a profiling request.
        profiler.start_profiler_server(FLAGS.profiler_port_number)

    if FLAGS.use_tpu:
        if FLAGS.distribution_strategy is None:
            tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
                FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
            tpu_grpc_url = tpu_cluster_resolver.get_master()
            tf.Session.reset(tpu_grpc_url)
        else:
            raise RuntimeError(
                'Distribution strategy must be None when --use_tpu is True.')
    else:
        tpu_cluster_resolver = None

    if FLAGS.mode not in ['train', 'eval', 'train_and_eval']:
        raise ValueError('Unrecognized --mode: %s' % FLAGS.mode)

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')
    if FLAGS.mode == 'train_and_eval':
        if FLAGS.distribution_strategy is not None:
            raise RuntimeError('You must use --distribution_strategy=None for '
                               'train_and_eval.')

    # Parse hparams
    hparams = retinanet_model.default_hparams()
    config_file = FLAGS.config_file
    hparams.num_epochs = FLAGS.num_epochs
    if config_file and tf.gfile.Exists(config_file):
        # load params from file.
        with tf.gfile.Open(config_file, 'r') as f:
            values_map = json.load(f)
            hparams.override_from_dict(values_map)
    hparams.parse(FLAGS.hparams)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`; see the partition logic below.
    # In the TPUEstimator context, `shard` and `replica` mean the same thing;
    # following the API, the code below uses both terms.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be the product of the array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in hparams, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        image_size = hparams.get('image_size')
        for level in range(hparams.get('min_level'),
                           hparams.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = image_size // (2**level)
            if _can_partition(spatial_dim):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None

        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.use_xla and not FLAGS.use_tpu:
        config_proto.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)
    if FLAGS.auto_mixed_precision and FLAGS.distribution_strategy:
        config_proto.graph_options.rewrite_options.auto_mixed_precision = (
            rewriter_config_pb2.RewriterConfig.ON)

    if FLAGS.distribution_strategy is None:
        # Uses TPUEstimator.
        params = dict(
            hparams.values(),
            num_shards=num_shards,
            num_examples_per_epoch=FLAGS.num_examples_per_epoch,
            use_tpu=FLAGS.use_tpu,
            resnet_checkpoint=FLAGS.resnet_checkpoint,
            val_json_file=FLAGS.val_json_file,
            mode=FLAGS.mode,
        )
        tpu_config = contrib_tpu.TPUConfig(
            FLAGS.iterations_per_loop,
            num_shards=num_shards,
            num_cores_per_replica=num_cores_per_replica,
            input_partition_dims=input_partition_dims,
            per_host_input_for_training=contrib_tpu.InputPipelineConfig.
            PER_HOST_V2)

        run_config = contrib_tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            evaluation_master=FLAGS.eval_master,
            model_dir=FLAGS.model_dir,
            log_step_count_steps=FLAGS.iterations_per_loop,
            session_config=config_proto,
            tpu_config=tpu_config,
        )
    else:
        if FLAGS.num_gpus < 0:
            raise ValueError('`num_gpus` cannot be negative.')

        def _per_device_batch_size(batch_size, num_gpus):
            """Calculates the per-GPU batch size for Estimator.

            Args:
              batch_size: Global batch size to be divided among devices.
              num_gpus: How many GPUs are used per worker.

            Returns:
              Batch size per device.

            Raises:
              ValueError: if batch_size is not divisible by the number of devices.
            """
            if num_gpus <= 1:
                return batch_size

            remainder = batch_size % num_gpus
            if remainder:
                raise ValueError(
                    'Batch size must be a multiple of the number of GPUs per worker.'
                )
            return int(batch_size / num_gpus)

        # Uses Estimator.
        params = dict(
            hparams.values(),
            num_examples_per_epoch=FLAGS.num_examples_per_epoch,
            use_tpu=FLAGS.use_tpu,
            resnet_checkpoint=FLAGS.resnet_checkpoint,
            val_json_file=FLAGS.val_json_file,
            mode=FLAGS.mode,
            use_bfloat16=False,
            auto_mixed_precision=FLAGS.auto_mixed_precision,
            dataset_max_intra_op_parallelism=FLAGS.
            dataset_max_intra_op_parallelism,
            dataset_private_threadpool_size=FLAGS.
            dataset_private_threadpool_size,
        )

        if FLAGS.distribution_strategy == 'mirrored':
            params['batch_size'] = _per_device_batch_size(
                FLAGS.train_batch_size, FLAGS.num_gpus)

            if FLAGS.num_gpus == 0:
                devices = ['device:CPU:0']
            else:
                devices = [
                    'device:GPU:{}'.format(i) for i in range(FLAGS.num_gpus)
                ]

            if FLAGS.all_reduce_alg:
                dist_strat = tf.distribute.MirroredStrategy(
                    devices=devices,
                    cross_device_ops=contrib_distribute.
                    AllReduceCrossDeviceOps(FLAGS.all_reduce_alg, num_packs=2))
            else:
                dist_strat = tf.distribute.MirroredStrategy(devices=devices)

            run_config = tf.estimator.RunConfig(session_config=config_proto,
                                                train_distribute=dist_strat,
                                                eval_distribute=dist_strat)

        elif FLAGS.distribution_strategy == 'multi_worker_mirrored':
            local_device_protos = device_lib.list_local_devices()
            params['batch_size'] = _per_device_batch_size(
                FLAGS.train_batch_size,
                sum([1 for d in local_device_protos
                     if d.device_type == 'GPU']))

            if FLAGS.worker_hosts is None:
                tf_config_json = json.loads(os.environ.get('TF_CONFIG', '{}'))
                # Replaces master with chief.
                if tf_config_json:
                    if 'master' in tf_config_json['cluster']:
                        tf_config_json['cluster']['chief'] = tf_config_json[
                            'cluster'].pop('master')
                        if tf_config_json['task']['type'] == 'master':
                            tf_config_json['task']['type'] = 'chief'
                        os.environ['TF_CONFIG'] = json.dumps(tf_config_json)

                tf_config_json = json.loads(os.environ['TF_CONFIG'])
                worker_hosts = tf_config_json['cluster']['worker']
                worker_hosts.extend(tf_config_json['cluster'].get('chief', []))
            else:
                # Set TF_CONFIG environment variable
                worker_hosts = FLAGS.worker_hosts.split(',')
                os.environ['TF_CONFIG'] = json.dumps({
                    'cluster': {
                        'worker': worker_hosts
                    },
                    'task': {
                        'type': 'worker',
                        'index': FLAGS.task_index
                    }
                })
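                # For example (hypothetical hosts), --worker_hosts=host1:2222,host2:2222
                # with --task_index=0 yields:
                #   {"cluster": {"worker": ["host1:2222", "host2:2222"]},
                #    "task": {"type": "worker", "index": 0}}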

            dist_strat = tf.distribute.experimental.MultiWorkerMirroredStrategy(
                communication=_COLLECTIVE_COMMUNICATION_OPTIONS[
                    FLAGS.all_reduce_alg])
            run_config = tf.estimator.RunConfig(session_config=config_proto,
                                                train_distribute=dist_strat)

        else:
            raise ValueError('Unrecognized distribution strategy.')

    if FLAGS.mode == 'train':
        if FLAGS.model_dir is not None:
            if not tf.gfile.Exists(FLAGS.model_dir):
                tf.gfile.MakeDirs(FLAGS.model_dir)
            with tf.gfile.Open(os.path.join(FLAGS.model_dir, 'hparams.json'),
                               'w') as f:
                json.dump(hparams.values(), f, sort_keys=True, indent=2)
        tf.logging.info(params)
        if FLAGS.distribution_strategy is None:
            total_steps = int(
                (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                FLAGS.train_batch_size)
            train_estimator = contrib_tpu.TPUEstimator(
                model_fn=retinanet_model.tpu_retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                config=run_config,
                params=params)
            train_estimator.train(input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
                                  max_steps=total_steps)

            # Run evaluation after training finishes.
            eval_params = dict(
                params,
                input_rand_hflip=False,
                resnet_checkpoint=None,
                is_training_bn=False,
            )
            eval_estimator = contrib_tpu.TPUEstimator(
                model_fn=retinanet_model.tpu_retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            if FLAGS.eval_after_training:

                if FLAGS.val_json_file is None:
                    raise RuntimeError(
                        'You must specify --val_json_file for evaluation.')

                eval_results = evaluation.evaluate(
                    eval_estimator,
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    num_eval_samples=FLAGS.eval_samples,
                    eval_batch_size=FLAGS.eval_batch_size,
                    validation_json_file=FLAGS.val_json_file)
                tf.logging.info('Eval results: %s' % eval_results)
                output_dir = os.path.join(FLAGS.model_dir, 'train_eval')
                tf.gfile.MakeDirs(output_dir)
                summary_writer = tf.summary.FileWriter(output_dir)

                evaluation.write_summary(eval_results, summary_writer,
                                         total_steps)
        else:
            train_estimator = tf.estimator.Estimator(
                model_fn=retinanet_model.est_retinanet_model_fn,
                model_dir=FLAGS.model_dir,
                config=run_config,
                params=params)
            if FLAGS.distribution_strategy == 'mirrored':
                total_steps = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                tf.logging.info('Starting `MirroredStrategy` training...')
                train_estimator.train(input_fn=dataloader.InputReader(
                    FLAGS.training_file_pattern, is_training=True),
                                      max_steps=total_steps)
            elif FLAGS.distribution_strategy == 'multi_worker_mirrored':
                total_steps = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    (len(worker_hosts) * FLAGS.train_batch_size))
                train_spec = tf.estimator.TrainSpec(
                    input_fn=dataloader.InputReader(
                        FLAGS.training_file_pattern, is_training=True),
                    max_steps=total_steps)
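                # NOTE: the eval input_fn below is just the tf.data.Dataset class,
                # i.e. effectively a placeholder; standalone --mode=eval is rejected
                # for multi_worker_mirrored further down.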
                eval_spec = tf.estimator.EvalSpec(input_fn=tf.data.Dataset)
                tf.logging.info(
                    'Starting `MultiWorkerMirroredStrategy` training...')
                tf.estimator.train_and_evaluate(train_estimator, train_spec,
                                                eval_spec)
            else:
                raise ValueError('Unrecognized distribution strategy.')

    elif FLAGS.mode == 'eval':
        # Eval only runs on CPU or GPU host with batch_size = 1.
        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        # Also, disable use_bfloat16 for eval on CPU/GPU.
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')
        eval_params = dict(
            params,
            input_rand_hflip=False,
            resnet_checkpoint=None,
            is_training_bn=False,
        )
        if FLAGS.distribution_strategy is None:
            # Uses TPUEstimator.
            eval_estimator = contrib_tpu.TPUEstimator(
                model_fn=retinanet_model.tpu_retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
        else:
            # Uses Estimator.
            if FLAGS.distribution_strategy == 'multi_worker_mirrored':
                raise ValueError(
                    '--distribution_strategy=multi_worker_mirrored is not supported '
                    'for eval.')
            elif FLAGS.distribution_strategy == 'mirrored':
                eval_estimator = tf.estimator.Estimator(
                    model_fn=retinanet_model.est_retinanet_model_fn,
                    model_dir=FLAGS.model_dir,
                    config=run_config,
                    params=params)
            else:
                raise ValueError('Unrecognized distribution strategy.')

        def terminate_eval():
            tf.logging.info(
                'Terminating eval after %d seconds of no checkpoints' %
                FLAGS.eval_timeout)
            return True

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        summary_writer = tf.summary.FileWriter(output_dir)
        # Run evaluation when there's a new checkpoint
        for ckpt in contrib_training.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = evaluation.evaluate(
                    eval_estimator,
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    num_eval_samples=FLAGS.eval_samples,
                    eval_batch_size=FLAGS.eval_batch_size,
                    validation_json_file=FLAGS.val_json_file)
                tf.logging.info('Eval results: %s' % eval_results)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                evaluation.write_summary(eval_results, summary_writer,
                                         current_step)
                if current_step >= total_step:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        if FLAGS.distribution_strategy is not None:
            raise ValueError(
                'Distribution strategy is not implemented for --mode=train_and_eval.'
            )
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

        output_dir = os.path.join(FLAGS.model_dir, 'train_and_eval')
        tf.gfile.MakeDirs(output_dir)
        summary_writer = tf.summary.FileWriter(output_dir)
        num_cycles = int(FLAGS.num_epochs * FLAGS.num_examples_per_epoch /
                         FLAGS.num_steps_per_eval)
        for cycle in range(num_cycles):
            tf.logging.info('Starting training cycle, epoch: %d.' % cycle)
            train_estimator = contrib_tpu.TPUEstimator(
                model_fn=retinanet_model.tpu_retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                config=run_config,
                params=params)
            train_estimator.train(input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
                                  steps=FLAGS.num_steps_per_eval)

            tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle)
            # Run evaluation after every epoch.
            eval_params = dict(
                params,
                input_rand_hflip=False,
                resnet_checkpoint=None,
                is_training_bn=False,
            )

            eval_estimator = contrib_tpu.TPUEstimator(
                model_fn=retinanet_model.tpu_retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            eval_results = evaluation.evaluate(
                eval_estimator,
                input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                                is_training=False),
                num_eval_samples=FLAGS.eval_samples,
                eval_batch_size=FLAGS.eval_batch_size,
                validation_json_file=FLAGS.val_json_file)
            tf.logging.info('Evaluation results: %s' % eval_results)
            current_step = int(cycle * FLAGS.num_steps_per_eval)
            evaluation.write_summary(eval_results, summary_writer,
                                     current_step)

    else:
        tf.logging.info('Mode not found.')

    if FLAGS.model_dir:
        tf.logging.info('Exporting saved model.')
        eval_params = dict(
            params,
            use_tpu=True,
            input_rand_hflip=False,
            resnet_checkpoint=None,
            is_training_bn=False,
            use_bfloat16=False,
        )
        eval_estimator = contrib_tpu.TPUEstimator(
            model_fn=retinanet_model.tpu_retinanet_model_fn,
            use_tpu=True,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.inference_batch_size,
            config=run_config,
            params=eval_params)

        export_path = eval_estimator.export_saved_model(
            export_dir_base=FLAGS.model_dir,
            serving_input_receiver_fn=build_serving_input_fn(
                hparams.image_size, FLAGS.inference_batch_size))
        if FLAGS.add_warmup_requests:
            inference_warmup.write_warmup_requests(
                export_path,
                FLAGS.model_name,
                hparams.image_size,
                batch_sizes=[FLAGS.inference_batch_size])
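
The spatial-partitioning comments in Example #1 can be reproduced with a small standalone check. The following is a minimal sketch, not part of the original program; the concrete numbers (partition_dims = [1, 4, 2, 1], image_size = 1536, levels 3 through 9) are assumptions mirroring the worked example in the comments.

import numpy as np

# Hypothetical values taken from the worked example in the comments above.
partition_dims = [1, 4, 2, 1]   # splits along [batch, height, width, channel]
image_size = 1536
min_level, max_level = 3, 9


def can_partition(spatial_dim):
    # A level is partitionable only if its spatial dimension is divisible by
    # every entry of partition_dims.
    divisible = np.where(spatial_dim % np.array(partition_dims) == 0)
    return len(divisible[0]) == len(partition_dims)


for level in range(min_level, max_level + 1):
    spatial_dim = image_size // (2**level)
    print('level %d: spatial_dim=%d partitionable=%s' %
          (level, spatial_dim, can_partition(spatial_dim)))
# Levels 3-7 print True; level 8 (spatial_dim 6) and level 9 (spatial_dim 3)
# print False, matching the comment's 6 % 4 != 0 example.
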
Example #2
def main(argv):
    del argv  # Unused.

    if FLAGS.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

    # Parse hparams
    hparams = retinanet_model.default_hparams()
    hparams.parse(FLAGS.hparams)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`; see the partition logic below.
    # In the TPUEstimator context, `shard` and `replica` mean the same thing;
    # following the API, the code below uses both terms.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be the product of the array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in hparams, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        image_size = hparams.get('image_size')
        for level in range(hparams.get('min_level'),
                           hparams.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = image_size // (2**level)
            if _can_partition(spatial_dim):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None

        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    params = dict(
        hparams.values(),
        num_shards=num_shards,
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        use_tpu=FLAGS.use_tpu,
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
    )
    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.use_xla and not FLAGS.use_tpu:
        config_proto.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)

    tpu_config = tf.contrib.tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_shards=num_shards,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.
        PER_HOST_V2)

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        evaluation_master=FLAGS.eval_master,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
    )

    # TPU Estimator
    if FLAGS.mode == 'train':
        tf.logging.info(params)
        train_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=retinanet_model.retinanet_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
        train_estimator.train(
            input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                            is_training=True),
            max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                          FLAGS.train_batch_size))

        # Run evaluation after training finishes.
        eval_params = dict(
            params,
            use_tpu=False,
            input_rand_hflip=False,
            resnet_checkpoint=None,
            is_training_bn=False,
            use_bfloat16=False,
        )
        eval_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=retinanet_model.retinanet_model_fn,
            use_tpu=False,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)
        if FLAGS.eval_after_training:

            if FLAGS.val_json_file is None:
                raise RuntimeError(
                    'You must specify --val_json_file for evaluation.')

            eval_results = eval_estimator.evaluate(
                input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                                is_training=False),
                steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
            tf.logging.info('Eval results: %s' % eval_results)
        if FLAGS.model_dir:
            eval_estimator.export_saved_model(
                export_dir_base=FLAGS.model_dir,
                serving_input_receiver_fn=lambda: serving_input_fn(hparams.
                                                                   image_size))

    elif FLAGS.mode == 'eval':
        # Eval only runs on CPU or GPU host with batch_size = 1.
        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        # Also, disable use_bfloat16 for eval on CPU/GPU.
        eval_params = dict(
            params,
            use_tpu=False,
            input_rand_hflip=False,
            resnet_checkpoint=None,
            is_training_bn=False,
            use_bfloat16=False,
        )

        eval_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=retinanet_model.retinanet_model_fn,
            use_tpu=False,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)

        def terminate_eval():
            tf.logging.info(
                'Terminating eval after %d seconds of no checkpoints' %
                FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.contrib.training.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
                tf.logging.info('Eval results: %s' % eval_results)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break
                eval_estimator.export_saved_model(
                    export_dir_base=FLAGS.model_dir,
                    serving_input_receiver_fn=lambda: serving_input_fn(
                        hparams.image_size))

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        for cycle in range(FLAGS.num_epochs):
            tf.logging.info('Starting training cycle, epoch: %d.' % cycle)
            train_estimator = tf.contrib.tpu.TPUEstimator(
                model_fn=retinanet_model.retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                config=run_config,
                params=params)
            train_estimator.train(input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
                                  steps=int(FLAGS.num_examples_per_epoch /
                                            FLAGS.train_batch_size))

            tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle)
            # Run evaluation after every epoch.
            eval_params = dict(
                params,
                use_tpu=False,
                input_rand_hflip=False,
                resnet_checkpoint=None,
                is_training_bn=False,
            )

            eval_estimator = tf.contrib.tpu.TPUEstimator(
                model_fn=retinanet_model.retinanet_model_fn,
                use_tpu=False,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            eval_results = eval_estimator.evaluate(
                input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                                is_training=False),
                steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
            tf.logging.info('Evaluation results: %s' % eval_results)
        eval_estimator.export_saved_model(export_dir_base=FLAGS.model_dir,
                                          serving_input_receiver_fn=lambda:
                                          serving_input_fn(hparams.image_size))

    else:
        tf.logging.info('Mode not found.')
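
The train and eval step counts in Example #2 are derived from a few flags. Below is a minimal sketch of that arithmetic with hypothetical flag values; the numbers are placeholders, not values from the script.

# Hypothetical flag values; only the arithmetic mirrors the calls above.
num_epochs = 15
num_examples_per_epoch = 120000
train_batch_size = 64
eval_samples = 5000
eval_batch_size = 8

# max_steps passed to train(): total examples seen divided by the global batch.
max_steps = int((num_epochs * num_examples_per_epoch) / train_batch_size)

# steps passed to evaluate(): one pass over the eval set; integer division
# drops a partial final batch.
eval_steps = eval_samples // eval_batch_size

print(max_steps, eval_steps)  # 28125 625
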
Example #3
def main(_):
    if FLAGS.strategy == 'tpu':
        tf.disable_eager_execution()
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        # Always enable auto mixed precision graph rewrite
        os.environ[
            'TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE'] = '1'
        tpu_cluster_resolver = None

    # Check data path
    if FLAGS.mode in ('train', 'train_and_eval'):
        if FLAGS.training_file_pattern is None:
            raise RuntimeError(
                'Must specify --training_file_pattern for train.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError(
                'Must specify --validation_file_pattern for eval.')

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`; see the partition logic below.
    # In the TPUEstimator context, `shard` and `replica` mean the same thing;
    # following the API, the code below uses both terms.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be the product of the array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
            'image_masks': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        feat_sizes = utils.get_feat_sizes(config.get('image_size'),
                                          config.get('max_level'))
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = feat_sizes[level]
            if _can_partition(spatial_dim['height']) and _can_partition(
                    spatial_dim['width']):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  iterations_per_loop=FLAGS.iterations_per_loop,
                  model_dir=FLAGS.model_dir,
                  num_shards=num_shards,
                  num_examples_per_epoch=FLAGS.num_examples_per_epoch,
                  strategy=FLAGS.strategy,
                  backbone_ckpt=FLAGS.backbone_ckpt,
                  ckpt=FLAGS.ckpt,
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  profile=FLAGS.profile,
                  mode=FLAGS.mode)
    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.strategy != 'tpu':
        if FLAGS.use_xla:
            config_proto.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_1)
            config_proto.gpu_options.allow_growth = True

    model_dir = FLAGS.model_dir
    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
    max_instances_per_image = config.max_instances_per_image
    if FLAGS.eval_samples:
        eval_steps = int((FLAGS.eval_samples + FLAGS.eval_batch_size - 1) //
                         FLAGS.eval_batch_size)
    else:
        eval_steps = None
    total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
    train_steps = total_examples // FLAGS.train_batch_size
    logging.info(params)

    if not tf.io.gfile.exists(model_dir):
        tf.io.gfile.makedirs(model_dir)

    config_file = os.path.join(model_dir, 'config.yaml')
    if not tf.io.gfile.exists(config_file):
        tf.io.gfile.GFile(config_file, 'w').write(str(config))

    train_input_fn = dataloader.InputReader(
        FLAGS.training_file_pattern,
        is_training=True,
        use_fake_data=FLAGS.use_fake_data,
        max_instances_per_image=max_instances_per_image)
    eval_input_fn = dataloader.InputReader(
        FLAGS.validation_file_pattern,
        is_training=False,
        use_fake_data=FLAGS.use_fake_data,
        max_instances_per_image=max_instances_per_image)

    if FLAGS.strategy == 'tpu':
        tpu_config = tf.estimator.tpu.TPUConfig(
            FLAGS.iterations_per_loop if FLAGS.strategy == 'tpu' else 1,
            num_cores_per_replica=num_cores_per_replica,
            input_partition_dims=input_partition_dims,
            per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
            PER_HOST_V2)
        run_config = tf.estimator.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=model_dir,
            log_step_count_steps=FLAGS.iterations_per_loop,
            session_config=config_proto,
            tpu_config=tpu_config,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tf_random_seed=FLAGS.tf_random_seed,
        )
        # TPUEstimator can do both train and eval.
        train_est = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=params)
        eval_est = train_est
    else:
        strategy = None
        if FLAGS.strategy == 'gpus':
            strategy = tf.distribute.MirroredStrategy()
        run_config = tf.estimator.RunConfig(
            model_dir=model_dir,
            train_distribute=strategy,
            log_step_count_steps=FLAGS.iterations_per_loop,
            session_config=config_proto,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tf_random_seed=FLAGS.tf_random_seed,
        )

        def get_estimator(global_batch_size):
            params['num_shards'] = getattr(strategy, 'num_replicas_in_sync', 1)
            params['batch_size'] = global_batch_size // params['num_shards']
            return tf.estimator.Estimator(model_fn=model_fn_instance,
                                          config=run_config,
                                          params=params)

        # train and eval need different estimator due to different batch size.
        train_est = get_estimator(FLAGS.train_batch_size)
        eval_est = get_estimator(FLAGS.eval_batch_size)

    # start train/eval flow.
    if FLAGS.mode == 'train':
        train_est.train(input_fn=train_input_fn, max_steps=train_steps)
        if FLAGS.eval_after_training:
            eval_est.evaluate(input_fn=eval_input_fn, steps=eval_steps)

    elif FLAGS.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout):

            logging.info('Starting to evaluate.')
            try:
                eval_results = eval_est.evaluate(eval_input_fn,
                                                 steps=eval_steps)
                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                if current_step >= train_steps:
                    logging.info('Eval finished step %d/%d', current_step,
                                 train_steps)
                    break

            except tf.errors.NotFoundError:
                # The checkpoint might have been deleted by the time eval finished.
                # We simply skip such cases.
                logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

    elif FLAGS.mode == 'train_and_eval':
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        try:
            step = int(os.path.basename(ckpt).split('-')[1])
            current_epoch = (step * FLAGS.train_batch_size //
                             FLAGS.num_examples_per_epoch)
            logging.info('found ckpt at step %d (epoch %d)', step,
                         current_epoch)
        except (IndexError, TypeError):
            logging.info('Folder %s has no ckpt with valid step.',
                         FLAGS.model_dir)
            current_epoch = 0

        def run_train_and_eval(e):
            print('\n   =====> Starting training, epoch: %d.' % e)
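            # max_steps is cumulative: each call resumes from the latest
            # checkpoint in model_dir and trains up to the end of epoch `e`.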
            train_est.train(input_fn=train_input_fn,
                            max_steps=e * FLAGS.num_examples_per_epoch //
                            FLAGS.train_batch_size)
            print('\n   =====> Starting evaluation, epoch: %d.' % e)
            eval_results = eval_est.evaluate(input_fn=eval_input_fn,
                                             steps=eval_steps)
            ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
            utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

        epochs_per_cycle = 1  # A higher number reduces graph-construction overhead.
        for e in range(current_epoch + 1, config.num_epochs + 1,
                       epochs_per_cycle):
            if FLAGS.run_epoch_in_child_process:
                p = multiprocessing.Process(target=run_train_and_eval,
                                            args=(e, ))
                p.start()
                p.join()
                if p.exitcode != 0:
                    return p.exitcode
            else:
                run_train_and_eval(e)

    else:
        logging.info('Invalid mode: %s', FLAGS.mode)
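
The eval loop in Example #3 decides when to stop by parsing the global step out of the checkpoint filename. Here is a minimal standalone sketch of that parsing; the checkpoint path and train_steps value are hypothetical.

import os

# Hypothetical checkpoint path and step budget, for illustration only.
ckpt = '/tmp/model_dir/model.ckpt-12000'
train_steps = 10000

try:
    # Checkpoint basenames look like 'model.ckpt-<global_step>'.
    current_step = int(os.path.basename(ckpt).split('-')[1])
except (IndexError, ValueError):
    current_step = None  # No global-step suffix in the name.

if current_step is not None and current_step >= train_steps:
    print('Eval finished at step %d/%d' % (current_step, train_steps))
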
Example #4
def main(argv):
    del argv  # Unused.

    # Configure parameters.
    config = mask_rcnn_params.default_config()
    config = params_io.override_hparams(config, FLAGS.config)

    if FLAGS.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path
    if (FLAGS.mode in ('train', 'train_and_eval')
            and not config.training_file_pattern):
        raise RuntimeError(
            'You must specify `training_file_pattern` for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if not config.validation_file_pattern:
            raise RuntimeError('You must specify `validation_file_pattern` '
                               'for evaluation.')
        if not config.val_json_file:
            raise RuntimeError(
                'You must specify `val_json_file` for evaluation.')

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # Note: In the code below, TPUEstimator uses both `shard` and `replica` (with
    # the same meaning).
    if FLAGS.input_partition_dims:
        labels_partition_dims = {
            'gt_boxes': None,
            'gt_classes': None,
            'cropped_gt_masks': None,
        }
        # TODO(b/119617317): The Input Partition Logic. We partition only the
        # partition-able tensors. Spatial partition requires that the
        # to-be-partitioned tensors must have a dimension that is a multiple of
        # `partition_dims`. Depending on the `partition_dims` and the `image_size`
        # and the `max_level` in config, some high-level anchor labels (i.e.,
        # `cls_targets` and `box_targets`) cannot be partitioned. For example, when
        # `partition_dims` is [1, 4, 2, 1], image size is 1536, `max_level` is 9,
        # `cls_targets_8` has a shape of [batch_size, 6, 6, 9], which cannot be
        # partitioned (6 % 4 != 0). In this case, the level-8 and level-9 target
        # tensors are not partition-able, and the highest partition-able level is 7.
        image_size = config.image_size
        for level in range(config.min_level, config.max_level + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = image_size // (2**level)
            if _can_partition(spatial_dim):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['score_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['score_targets_%d' % level] = None
        num_cores_per_replica = np.prod(FLAGS.input_partition_dims)
        features_partition_dims = {
            'images': FLAGS.input_partition_dims,
            'source_ids': None,
            'image_info': None,
        }
        input_partition_dims = [features_partition_dims, labels_partition_dims]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores
    params = dict(
        config.values(),
        num_shards=num_shards,
        use_tpu=FLAGS.use_tpu,
        mode=FLAGS.mode,
        # The following are used by the host_call function.
        model_dir=FLAGS.model_dir,
        iterations_per_loop=FLAGS.iterations_per_loop,
        transpose_input=FLAGS.transpose_input)

    tpu_config = tf.contrib.tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_shards=num_shards,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.
        PER_HOST_V2)

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        evaluation_master=FLAGS.eval_master,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        tpu_config=tpu_config,
    )

    if FLAGS.mode == 'train':
        if FLAGS.model_dir:
            save_config(config, FLAGS.model_dir)

        tf.logging.info(params)
        train_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=mask_rcnn_model.mask_rcnn_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=config.train_batch_size,
            config=run_config,
            params=params)
        train_estimator.train(input_fn=dataloader.InputReader(
            config.training_file_pattern,
            mode=tf.estimator.ModeKeys.TRAIN,
            use_fake_data=FLAGS.use_fake_data),
                              max_steps=config.total_steps)

        if FLAGS.eval_after_training:
            # Run evaluation after training finishes.
            eval_params_dict = dict(
                params,
                use_tpu=FLAGS.use_tpu,
                input_rand_hflip=False,
                is_training_bn=False,
                transpose_input=False,
            )

            eval_estimator = tf.contrib.tpu.TPUEstimator(
                model_fn=mask_rcnn_model.mask_rcnn_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=config.train_batch_size,
                eval_batch_size=config.eval_batch_size,
                predict_batch_size=config.eval_batch_size,
                config=run_config,
                params=eval_params_dict)

            output_dir = os.path.join(FLAGS.model_dir, 'eval')
            tf.gfile.MakeDirs(output_dir)
            # Summary writer writes out eval metrics.
            summary_writer = tf.summary.FileWriter(output_dir)
            eval_results = evaluation(eval_estimator, config)
            write_summary(eval_results, summary_writer, config.total_steps)

            summary_writer.close()

    elif FLAGS.mode == 'eval':
        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        eval_params_dict = dict(
            params,
            use_tpu=FLAGS.use_tpu,
            input_rand_hflip=False,
            is_training_bn=False,
            transpose_input=False,
        )

        eval_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=mask_rcnn_model.mask_rcnn_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            predict_batch_size=config.eval_batch_size,
            config=run_config,
            params=eval_params_dict)

        def terminate_eval():
            tf.logging.info(
                'Terminating eval after %d seconds of no checkpoints' %
                FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.contrib.training.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):
            # Terminate eval job when final checkpoint is reached
            current_step = int(os.path.basename(ckpt).split('-')[1])

            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = evaluation(eval_estimator, config)
                write_summary(eval_results, summary_writer, current_step)

                if current_step >= config.total_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)
        summary_writer.close()

        # Export saved model.
        eval_estimator.export_saved_model(
            export_dir_base=FLAGS.model_dir,
            serving_input_receiver_fn=functools.partial(
                dataloader.serving_input_fn,
                batch_size=config.eval_batch_size,
                image_size=config.image_size))

    elif FLAGS.mode == 'train_and_eval':
        if FLAGS.model_dir:
            save_config(config, FLAGS.model_dir)

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        summary_writer = tf.summary.FileWriter(output_dir)
        train_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=mask_rcnn_model.mask_rcnn_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=config.train_batch_size,
            config=run_config,
            params=params)
        eval_params_dict = dict(
            params,
            use_tpu=FLAGS.use_tpu,
            input_rand_hflip=False,
            is_training_bn=False,
        )
        eval_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=mask_rcnn_model.mask_rcnn_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            predict_batch_size=config.eval_batch_size,
            config=run_config,
            params=eval_params_dict)

        num_cycles = int(config.total_steps / config.num_steps_per_eval)
        for cycle in range(num_cycles):
            tf.logging.info('Start training cycle %d.' % cycle)
            train_estimator.train(input_fn=dataloader.InputReader(
                config.training_file_pattern,
                mode=tf.estimator.ModeKeys.TRAIN),
                                  steps=config.num_steps_per_eval)

            tf.logging.info('Start evaluation cycle %d.' % cycle)
            eval_results = evaluation(eval_estimator, config)

            current_step = int((cycle + 1) * config.num_steps_per_eval)
            write_summary(eval_results, summary_writer, current_step)

        tf.logging.info('Starting training cycle %d.' % num_cycles)
        train_estimator.train(input_fn=dataloader.InputReader(
            config.training_file_pattern, mode=tf.estimator.ModeKeys.TRAIN),
                              max_steps=config.total_steps)
        eval_results = evaluation(eval_estimator, config)
        write_summary(eval_results, summary_writer, config.total_steps)
        summary_writer.close()

        # Export saved model.
        eval_estimator.export_saved_model(
            export_dir_base=FLAGS.model_dir,
            serving_input_receiver_fn=functools.partial(
                dataloader.serving_input_fn,
                batch_size=config.eval_batch_size,
                image_size=config.image_size))

    else:
        tf.logging.info('Mode not found.')
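
A quick note on the train_and_eval loop above: it runs num_cycles full train/eval cycles of num_steps_per_eval steps each, then trains the remainder with max_steps=config.total_steps. A minimal sketch of that arithmetic, using purely illustrative numbers (not values from any config in this document):

# Illustrative sketch of the cycle arithmetic in the train_and_eval branch above.
total_steps = 22500
num_steps_per_eval = 5000
num_cycles = int(total_steps / num_steps_per_eval)    # 4 full train/eval cycles
steps_after_cycles = num_cycles * num_steps_per_eval  # 20000 steps trained so far
remaining = total_steps - steps_after_cycles          # 2500 steps in the final train call
print(num_cycles, steps_after_cycles, remaining)      # 4 20000 2500
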
Example #5
def main(argv):
    del argv  # Unused.

    # Check flag values
    if FLAGS.master is None and FLAGS.tpu_name is None:
        raise RuntimeError('You must specify either --master or --tpu_name.')

    if FLAGS.master is not None:
        if FLAGS.tpu_name is not None:
            tf.logging.warn('Both --master and --tpu_name are set. Ignoring '
                            '--tpu_name and using --master.')
        tpu_grpc_url = FLAGS.master
    else:
        tpu_cluster_resolver = (tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
        tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)

    if FLAGS.mode == 'train' and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode == 'eval':
        if FLAGS.valid_data_dir is None:
            raise RuntimeError(
                'You must specify --valid_data_dir for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

    # Parse hparams
    hparams = retinanet_model.default_hparams()
    hparams.parse(FLAGS.hparams)

    params = dict(
        hparams.values(),
        num_shards=FLAGS.num_shards,
        use_tpu=FLAGS.use_tpu,
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
    )
    run_config = tpu_config.RunConfig(
        master=FLAGS.master,
        evaluation_master=FLAGS.eval_master,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False),
        tpu_config=tpu_config.TPUConfig(FLAGS.iterations_per_loop,
                                        FLAGS.num_shards))

    # TPU Estimator
    if FLAGS.mode == 'train':
        train_estimator = tpu_estimator.TPUEstimator(
            model_fn=retinanet_model.retinanet_50_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
        train_estimator.train(
            input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                            is_training=True),
            steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

        if FLAGS.eval_after_training:
            # Run evaluation after training finishes.
            eval_params = dict(
                params,
                use_tpu=False,
                input_rand_hflip=False,
                skip_crowd=False,
                resnet_checkpoint=None,
                is_training_bn=False,
            )
            eval_estimator = tpu_estimator.TPUEstimator(
                model_fn=retinanet_model.retinanet_50_model_fn,
                use_tpu=False,
                eval_batch_size=1,
                config=run_config,
                params=eval_params)
            eval_results = eval_estimator.evaluate(
                input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                                is_training=False),
                steps=FLAGS.eval_steps)
            tf.logging.info('Eval results: %s' % eval_results)

    elif FLAGS.mode == 'eval':
        # eval only runs on CPU or GPU host with batch_size = 1

        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        eval_params = dict(
            params,
            use_tpu=False,
            input_rand_hflip=False,
            skip_crowd=False,
            resnet_checkpoint=None,
            is_training_bn=False,
        )

        eval_estimator = tpu_estimator.TPUEstimator(
            model_fn=retinanet_model.retinanet_50_model_fn,
            use_tpu=False,
            eval_batch_size=1,
            config=run_config,
            params=eval_params)

        def terminate_eval():
            tf.logging.info(
                'Terminating eval after %d seconds of no checkpoints' %
                FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_steps)
                tf.logging.info('Eval results: %s' % eval_results)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)
    else:
        tf.logging.info('Mode not found.')
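
The eval loops above repeatedly recover the training step from the checkpoint path with os.path.basename(ckpt).split('-')[1]. A minimal sketch of that pattern, with an illustrative path (tf.train.Saver names checkpoints '<prefix>-<global_step>'):

import os

def step_from_checkpoint(ckpt_path):
    # 'model.ckpt-12500' -> 12500; raises IndexError if the basename has no
    # step suffix, a case Example #9 below guards against explicitly.
    return int(os.path.basename(ckpt_path).split('-')[1])

print(step_from_checkpoint('/tmp/model_dir/model.ckpt-12500'))  # 12500
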
Example #6
def get_dataset(cfg, file_pattern, is_training):
    """Returns a tf.data.Dataset."""
    return dataloader.InputReader(
        cfg, is_training, FLAGS.use_tfrecord, FLAGS.mixed_precision)(
            file_pattern,
            cfg.TRAIN.BATCH_SIZE if is_training else cfg.TEST.BATCH_SIZE)
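
A hedged usage sketch of get_dataset above; cfg and the file pattern are placeholders assumed for illustration, not objects defined anywhere in this document:

# Assumed usage only: cfg must carry TRAIN/TEST batch sizes as the helper above expects.
train_ds = get_dataset(cfg, 'data/train-*.tfrecord', is_training=True)
for images, labels in train_ds.take(1):
    print(images.shape)
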
Example #7
  def export(self,
             output_dir: Text = None,
             tensorrt: Text = None,
             tflite: Text = None,
             file_pattern: Text = None,
             num_calibration_steps: int = 2000):
    """Export a saved model, frozen graph, and potential tflite/tensorrt model.

    Args:
      output_dir: the output folder for saved model.
      tensorrt: If not None, must be {'FP32', 'FP16', 'INT8'}.
      tflite: Type for post-training quantization.
      file_pattern: Glob for tfrecords, e.g. coco/val-*.tfrecord.
      num_calibration_steps: Number of post-training quantization calibration
        steps to run.
    """
    export_model, input_spec = self._get_model_and_spec(tflite)
    image_size = utils.parse_image_size(self.params['image_size'])
    if output_dir:
      tf.saved_model.save(
          export_model,
          output_dir,
          signatures=export_model.__call__.get_concrete_function(input_spec))
      logging.info('Model saved at %s', output_dir)

      # also save freeze pb file.
      graphdef = self.freeze(
          export_model.__call__.get_concrete_function(input_spec))
      proto_path = tf.io.write_graph(
          graphdef, output_dir, self.model_name + '_frozen.pb', as_text=False)
      logging.info('Frozen graph saved at %s', proto_path)

    if tflite:
      shape = (self.batch_size, *image_size, 3)
      input_spec = tf.TensorSpec(
          shape=shape, dtype=input_spec.dtype, name=input_spec.name)
      # from_saved_model supports advanced converter features like op fusing.
      converter = tf.lite.TFLiteConverter.from_saved_model(output_dir)
      if tflite == 'FP32':
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.target_spec.supported_types = [tf.float32]
      elif tflite == 'FP16':
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.target_spec.supported_types = [tf.float16]
      elif tflite == 'INT8':
        # Enables MLIR-based post-training quantization.
        converter.experimental_new_quantizer = True
        if file_pattern:
          config = hparams_config.get_efficientdet_config(self.model_name)
          config.override(self.params)
          ds = dataloader.InputReader(
              file_pattern,
              is_training=False,
              max_instances_per_image=config.max_instances_per_image)(
                  config, batch_size=self.batch_size)

          def representative_dataset_gen():
            for image, _ in ds.take(num_calibration_steps):
              yield [image]
        else:  # Used for debugging, can remove later.
          logging.warn('Use real representative dataset instead of fake ones.')
          num_calibration_steps = 10
          def representative_dataset_gen():  # rewrite this for real data.
            for _ in range(num_calibration_steps):
              yield [tf.ones(shape, dtype=input_spec.dtype)]

        converter.representative_dataset = representative_dataset_gen
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.inference_input_type = tf.uint8
        # TFLite's custom NMS op isn't supported by post-training quant,
        # so we add TFLITE_BUILTINS as well.
        supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS_INT8, tf.lite.OpsSet.TFLITE_BUILTINS
        ]
        converter.target_spec.supported_ops = supported_ops

      else:
        raise ValueError(f'Invalid tflite {tflite}: must be FP32, FP16, INT8.')

      tflite_path = os.path.join(output_dir, tflite.lower() + '.tflite')
      tflite_model = converter.convert()
      tf.io.gfile.GFile(tflite_path, 'wb').write(tflite_model)
      logging.info('TFLite is saved at %s', tflite_path)

    if tensorrt:
      trt_path = os.path.join(output_dir, 'tensorrt_' + tensorrt.lower())
      conversion_params = tf.experimental.tensorrt.ConversionParams(
          max_workspace_size_bytes=(2 << 20),
          maximum_cached_engines=1,
          precision_mode=tensorrt.upper())
      converter = tf.experimental.tensorrt.Converter(
          output_dir, conversion_params=conversion_params)
      converter.convert()
      converter.save(trt_path)
      logging.info('TensorRT model is saved at %s', trt_path)
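
A hedged usage sketch of the export method above. The driver object (here called exporter) is assumed for illustration only; the method itself writes a SavedModel and frozen graph, and optionally a TFLite or TensorRT model:

# Assumed usage: 'exporter' stands in for whatever object defines export();
# it is not constructed anywhere in this document.
exporter.export(
    output_dir='/tmp/efficientdet_export',   # SavedModel + frozen .pb
    tflite='INT8',                           # post-training quantization
    file_pattern='coco/val-*.tfrecord',      # representative data for calibration
    num_calibration_steps=500)
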
def main(argv):
    del argv  # Unused.

    # TODO(b/132208296): remove this workaround that uses control flow v2.
    control_flow_util.ENABLE_CONTROL_FLOW_V2 = True

    tpu = FLAGS.tpu or FLAGS.master
    tpu_cluster_resolver = runner_utils.create_tpu_cluster_resolver(
        FLAGS.use_tpu, tpu, FLAGS.tpu_zone, FLAGS.gcp_project)
    if tpu_cluster_resolver:
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)

    # Check data path
    run_train = FLAGS.mode in ('train', 'train_and_eval')
    if run_train and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    run_eval = FLAGS.mode in ('eval', 'train_and_eval') or (
        FLAGS.mode == 'train' and FLAGS.eval_after_training)
    if run_eval:
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

    # Parse hparams
    hparams = mask_rcnn_params.default_hparams()
    hparams.parse(FLAGS.hparams)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # Note: In the below code, TPUEstimator uses both `shard` and `replica` (with
    # the same meaning).
    # Note that spatial partition is part of the model-parallelism optimization.
    # See core_assignment_utils.py for more details about model parallelism.
    if FLAGS.input_partition_dims:
        labels_partition_dims = {
            'gt_boxes': None,
            'gt_classes': None,
            'cropped_gt_masks': None,
        }
        for level in range(hparams.get('min_level'),
                           hparams.get('max_level') + 1):
            labels_partition_dims['box_targets_%d' % level] = None
            labels_partition_dims['score_targets_%d' % level] = None
        num_cores_per_replica = int(np.prod(FLAGS.input_partition_dims))
        image_partition_dims = [
            FLAGS.input_partition_dims[i] for i in [1, 0, 2]
        ] if hparams.get('transpose_input') else FLAGS.input_partition_dims
        features_partition_dims = {
            'images': image_partition_dims,
            'source_ids': None,
            'image_info': None,
        }
        input_partition_dims = [features_partition_dims, labels_partition_dims]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    params = dict(hparams.values(),
                  num_shards=num_shards,
                  num_cores_per_replica=num_cores_per_replica,
                  use_tpu=FLAGS.use_tpu,
                  resnet_checkpoint=FLAGS.resnet_checkpoint,
                  val_json_file=FLAGS.val_json_file,
                  model_dir=FLAGS.model_dir)

    tpu_config = tf.contrib.tpu.TPUConfig(
        params['iterations_per_loop'],
        num_shards=num_shards,
        num_cores_per_replica=params['num_cores_per_replica'],
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.
        PER_HOST_V2,
        tpu_job_name=FLAGS.tpu_job_name,
    )

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=params['iterations_per_loop'],
        tpu_config=tpu_config,
        save_checkpoints_steps=params['iterations_per_loop'],
    )

    train_replicas_per_worker = (
        params['cores_per_worker'] // params['num_cores_per_replica']
    ) if params['num_cores_per_replica'] else params['cores_per_worker']
    train_params = dict(
        params,
        replicas_per_worker=train_replicas_per_worker,
    )
    eval_params = dict(
        params,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
    )

    # MLPerf logging.
    mlp_log.mlperf_print(key='init_start', value=None)
    mlp_log.mlperf_print(key='global_batch_size',
                         value=params['train_batch_size'])
    runner = None
    if run_train and run_eval:
        if params['train_use_tpu_estimator'] or params[
                'eval_use_tpu_estimator']:
            raise RuntimeError(
                'train_and_eval runner does not support TPUEstimator.')
        dist_eval_params = dict(
            eval_params,
            replicas_per_worker=train_replicas_per_worker,
        )
        runner = mask_rcnn_runner.TrainEvalRunner(
            model_fn=mask_rcnn_model.MaskRcnnModelFn(),
            input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                            mode=tf.estimator.ModeKeys.TRAIN,
                                            use_fake_data=FLAGS.use_fake_data),
            eval_input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern,
                mode=tf.estimator.ModeKeys.PREDICT,
                distributed_eval=True),
            eval_metric=coco_metric.EvaluationMetric(FLAGS.val_json_file,
                                                     use_cpp_extension=True),
            train_params=train_params,
            eval_params=dist_eval_params,
            run_config=run_config)
    elif run_train:
        # Check low-level train runner compatibility.
        if not params['train_use_tpu_estimator']:
            if FLAGS.mode == 'train_and_eval':
                raise RuntimeError(
                    'Low level train runner does not support mode '
                    'train_and_eval yet.')
        train_params = dict(
            params,
            replicas_per_worker=train_replicas_per_worker,
        )
        runner = mask_rcnn_runner.TrainRunner(
            model_fn=mask_rcnn_model.MaskRcnnModelFn(),
            input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                            mode=tf.estimator.ModeKeys.TRAIN,
                                            use_fake_data=FLAGS.use_fake_data),
            params=train_params,
            run_config=run_config,
            use_tpu_estimator=train_params['train_use_tpu_estimator'])
    else:
        sidecar_eval_params = dict(
            eval_params,
            # sidecar eval only uses one worker and does not use spatial partition.
            replicas_per_worker=FLAGS.num_cores,
        )
        runner = mask_rcnn_runner.EvalRunner(
            mask_rcnn_model.MaskRcnnModelFn(),
            dataloader.InputReader(FLAGS.validation_file_pattern,
                                   mode=tf.estimator.ModeKeys.PREDICT),
            coco_metric.EvaluationMetric(FLAGS.val_json_file,
                                         use_cpp_extension=True),
            sidecar_eval_params,
            run_config,
            use_tpu_estimator=sidecar_eval_params['eval_use_tpu_estimator'])

    if FLAGS.mode == 'train':
        runner.train()
    elif FLAGS.mode == 'eval':

        def terminate_eval():
            tf.logging.info(
                'Terminating eval after %d seconds of no checkpoints' %
                FLAGS.eval_timeout)
            return True

        run_success = False
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.contrib.training.checkpoints_iterator(
                params['model_dir'],
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            tf.logging.info('Starting to evaluate.')
            try:

                eval_results = runner.evaluate(ckpt)
                current_step, _ = runner.get_step_and_epoch_number(ckpt)

                if (eval_results['AP'] >= mask_rcnn_params.BOX_EVAL_TARGET
                        and eval_results['mask_AP'] >=
                        mask_rcnn_params.MASK_EVAL_TARGET):
                    mlp_log.mlperf_print(key='run_stop',
                                         metadata={'status': 'success'})
                    run_success = True
                    break

                if int(current_step) >= params['total_steps']:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)
        if not run_success:
            mlp_log.mlperf_print(key='run_stop',
                                 metadata={'status': 'aborted'})

    elif FLAGS.mode == 'train_and_eval':
        runner.train_and_eval()
    else:
        tf.logging.info('Mode not found.')
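
The runner setup above derives its parallelism from the spatial-partition dims. A minimal sketch of that arithmetic with illustrative values only (not taken from any real slice or config in this document):

import numpy as np

input_partition_dims = [1, 2, 2, 1]          # spatial partition over H and W
num_cores = 32                               # e.g. total cores in a slice
cores_per_worker = 8
num_cores_per_replica = int(np.prod(input_partition_dims))         # 4
num_shards = num_cores // num_cores_per_replica                    # 8 data-parallel replicas
replicas_per_worker = cores_per_worker // num_cores_per_replica    # 2
print(num_cores_per_replica, num_shards, replicas_per_worker)      # 4 8 2
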
Example #9
File: main.py  Project: ljun901527/automl
def main(_):
  if FLAGS.strategy == 'tpu':
    tf.disable_eager_execution()
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path
  if FLAGS.mode in ('train', 'train_and_eval'):
    if FLAGS.training_file_pattern is None:
      raise RuntimeError('Must specify --training_file_pattern for train.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('Must specify --validation_file_pattern for eval.')

  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`, see the partition logic below.
  # In the TPUEstimator context, `shard` and `replica` mean the same thing;
  # following the API, both terms are used here interchangeably.
  if FLAGS.use_spatial_partition:
    # Checks input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be a product of array '
                         'elements in --input_partition_dims.')

    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
        'image_masks': None,
    }
    # The Input Partition Logic: We partition only the partition-able tensors.
    feat_sizes = utils.get_feat_sizes(
        config.get('image_size'), config.get('max_level'))
    for level in range(config.get('min_level'), config.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = feat_sizes[level]
      if _can_partition(spatial_dim['height']) and _can_partition(
          spatial_dim['width']):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None
    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(
      config.as_dict(),
      model_name=FLAGS.model_name,
      iterations_per_loop=FLAGS.iterations_per_loop,
      model_dir=FLAGS.model_dir,
      num_shards=num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      strategy=FLAGS.strategy,
      backbone_ckpt=FLAGS.backbone_ckpt,
      ckpt=FLAGS.ckpt,
      val_json_file=FLAGS.val_json_file,
      testdev_dir=FLAGS.testdev_dir,
      mode=FLAGS.mode)
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.strategy != 'tpu':
    if FLAGS.use_xla:
      config_proto.graph_options.optimizer_options.global_jit_level = (
          tf.OptimizerOptions.ON_1)
    config_proto.gpu_options.allow_growth = True

  model_dir = FLAGS.model_dir
  strategy = None
  if FLAGS.strategy == 'tpu':
    tpu_config = tf.estimator.tpu.TPUConfig(
        FLAGS.iterations_per_loop if FLAGS.strategy == 'tpu' else 1,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
        .PER_HOST_V2)
    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tf_random_seed=FLAGS.tf_random_seed,
    )
  else:
    if FLAGS.strategy == 'gpus':
      strategy = tf.distribute.MirroredStrategy()
    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        train_distribute=strategy,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tf_random_seed=FLAGS.tf_random_seed,
    )

  model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
  max_instances_per_image = config.max_instances_per_image
  eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
  total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
  train_steps = total_examples // FLAGS.train_batch_size
  logging.info(params)

  train_input_fn = dataloader.InputReader(
      FLAGS.training_file_pattern,
      is_training=True,
      use_fake_data=FLAGS.use_fake_data,
      max_instances_per_image=max_instances_per_image)
  eval_input_fn = dataloader.InputReader(
      FLAGS.validation_file_pattern,
      is_training=False,
      use_fake_data=FLAGS.use_fake_data,
      max_instances_per_image=max_instances_per_image)

  if FLAGS.strategy == 'tpu':
    estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=params)
  else:
    params['batch_size'] = (
        FLAGS.train_batch_size // getattr(strategy, 'num_replicas_in_sync', 1))
    params['num_shards'] = getattr(strategy, 'num_replicas_in_sync', 1)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn_instance,
        config=run_config,
        params=params)

  # start train/eval flow.
  if FLAGS.mode == 'train':
    estimator.train(input_fn=train_input_fn, max_steps=train_steps)
    if FLAGS.eval_after_training:
      estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

  elif FLAGS.mode == 'eval':
    # Run evaluation when there's a new checkpoint
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout):

      logging.info('Starting to evaluate.')
      try:
        eval_results = estimator.evaluate(eval_input_fn, steps=eval_steps)
        # Terminate eval job when final checkpoint is reached.
        try:
          current_step = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          logging.info('%s has no global step info: stop!', ckpt)
          break

        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        if current_step >= train_steps:
          logging.info('Eval finished step %d/%d', current_step, train_steps)
          break

      except tf.errors.NotFoundError:
        # The checkpoint might have been deleted by the time eval finished;
        # we simply skip such cases.
        logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

  elif FLAGS.mode == 'train_and_eval':
    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn, max_steps=train_steps)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn, steps=eval_steps, throttle_secs=600)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  else:
    logging.info('Invalid mode: %s', FLAGS.mode)
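
A small worked example of the _can_partition check above, using illustrative values (image size 1536, partition dims [1, 4, 2, 1], the same figures the partition comments in these examples quote); with them only levels up to 7 are partitionable:

import numpy as np

input_partition_dims = [1, 4, 2, 1]

def can_partition(spatial_dim):
    ok = np.where(spatial_dim % np.array(input_partition_dims) == 0)
    return len(ok[0]) == len(input_partition_dims)

for level in range(3, 10):
    spatial_dim = 1536 // (2 ** level)
    print(level, spatial_dim, can_partition(spatial_dim))
# levels 3-7 are partitionable; level 8 (6 % 4 != 0) and level 9 are not.
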
Example #10
def main(_):
    config = hparams_config.get_efficientdet_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    config.batch_size = FLAGS.batch_size
    config.val_json_file = FLAGS.val_json_file
    config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS
    base_height, base_width = utils.parse_image_size(config['image_size'])

    if FLAGS.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
        ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver)
        logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
    elif FLAGS.strategy == 'gpus':
        ds_strategy = tf.distribute.MirroredStrategy()
        logging.info('All devices: %s', tf.config.list_physical_devices('GPU'))
    else:
        if tf.config.list_physical_devices('GPU'):
            ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0')
        else:
            ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0')

    # in format (height, width, flip)
    augmentations = []
    if FLAGS.enable_tta:
        for size_offset in (0, 128, 256):
            for flip in (False, True):
                augmentations.append((base_height + size_offset,
                                      base_width + size_offset, flip))
    else:
        augmentations.append((base_height, base_width, False))

    all_detections = []
    all_labels = []
    with ds_strategy.scope():
        # Network
        model = efficientdet_keras.EfficientDetNet(config=config)
        model.build((config.batch_size, base_height, base_width, 3))
        model.load_weights(tf.train.latest_checkpoint(FLAGS.model_dir))

        first_loop = True
        for height, width, flip in augmentations:
            config.image_size = (height, width)
            # dataset
            ds = dataloader.InputReader(
                FLAGS.val_file_pattern,
                is_training=False,
                use_fake_data=False,
                max_instances_per_image=config.max_instances_per_image)(config)
            if FLAGS.eval_samples:
                ds = ds.take(FLAGS.eval_samples // config.batch_size)

            # create the function once per augmentation, since it closes over the
            # value of config, which gets updated with the new image size
            @tf.function
            def f(images, labels):
                cls_outputs, box_outputs = model(images, training=False)
                return postprocess.generate_detections(config, cls_outputs,
                                                       box_outputs,
                                                       labels['image_scales'],
                                                       labels['source_ids'],
                                                       flip)

            # inference
            for images, labels in ds:
                if flip:
                    images = tf.image.flip_left_right(images)
                detections = f(images, labels)

                all_detections.append(detections)
                if first_loop:
                    all_labels.append(labels)

            first_loop = False

    # collect the giant list of detections into a map from image id to
    # detections
    detections_per_source = dict()
    for batch in all_detections:
        for d in batch:
            img_id = d[0][0]
            if img_id.numpy() in detections_per_source:
                detections_per_source[img_id.numpy()] = tf.concat(
                    [d, detections_per_source[img_id.numpy()]], 0)
            else:
                detections_per_source[img_id.numpy()] = d

    # collect the groundtruth per image id
    groundtruth_per_source = dict()
    for batch in all_labels:
        for img_id, groundtruth in zip(batch['source_ids'],
                                       batch['groundtruth_data']):
            groundtruth_per_source[img_id.numpy()] = groundtruth

    # calculate the AP scores for all the images
    label_map = label_util.get_label_map(config.label_map)
    evaluator = coco_metric.EvaluationMetric(filename=config.val_json_file,
                                             label_map=label_map)
    for img_id, d in detections_per_source.items():
        if FLAGS.enable_tta:
            d = wbf.ensemble_detections(config, d, len(augmentations))
        evaluator.update_state(
            tf.stack([groundtruth_per_source[img_id]]).numpy(),
            postprocess.transform_detections(tf.stack([d])).numpy())

    # compute the final eval results.
    if evaluator:
        metrics = evaluator.result()
        metric_dict = {}
        for i, name in enumerate(evaluator.metric_names):
            metric_dict[name] = metrics[i]

        if label_map:
            for i, cid in enumerate(sorted(label_map.keys())):
                name = 'AP_/%s' % label_map[cid]
                metric_dict[name] = metrics[i + len(evaluator.metric_names)]
        print(metric_dict)
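
The TTA loop above accumulates detections per image id across augmentation passes by concatenating along the detection axis before WBF ensembling. A minimal sketch of that merge with stand-in tensors (shapes and the image id are illustrative, not real detections):

import tensorflow as tf

detections_per_source = {}
for d in (tf.ones([10, 7]), tf.zeros([10, 7])):  # two augmentation passes
    img_id = 42                                  # stand-in for d[0][0].numpy()
    if img_id in detections_per_source:
        detections_per_source[img_id] = tf.concat(
            [d, detections_per_source[img_id]], 0)
    else:
        detections_per_source[img_id] = d
print(detections_per_source[42].shape)           # (20, 7)
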
Example #11
def main(_):
    config = hparams_config.get_efficientdet_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    config.batch_size = FLAGS.batch_size
    config.val_json_file = FLAGS.val_json_file
    config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS
    base_height, base_width = utils.parse_image_size(config['image_size'])

    # Network
    model = efficientdet_keras.EfficientDetNet(config=config)
    model.build((config.batch_size, base_height, base_width, 3))
    model.load_weights(tf.train.latest_checkpoint(FLAGS.model_dir))

    @tf.function
    def f(imgs, labels, flip):
        cls_outputs, box_outputs = model(imgs, training=False)
        return postprocess.generate_detections(config, cls_outputs,
                                               box_outputs,
                                               labels['image_scales'],
                                               labels['source_ids'], flip)

    # in format (height, width, flip)
    augmentations = []
    if FLAGS.enable_tta:
        for size_offset in (0, 128, 256):
            for flip in (False, True):
                augmentations.append((base_height + size_offset,
                                      base_width + size_offset, flip))
    else:
        augmentations.append((base_height, base_width, False))

    evaluator = None
    detections_per_source = dict()
    for height, width, flip in augmentations:
        config.image_size = (height, width)
        # dataset
        ds = dataloader.InputReader(
            FLAGS.val_file_pattern,
            is_training=False,
            use_fake_data=False,
            max_instances_per_image=config.max_instances_per_image)(config)

        # compute stats for all batches.
        total_steps = FLAGS.eval_samples // FLAGS.batch_size
        progress = tf.keras.utils.Progbar(total_steps)
        for i, (images, labels) in enumerate(ds):
            progress.update(i, values=None)
            if i > total_steps:
                break

            if flip:
                images = tf.image.flip_left_right(images)
            detections = f(images, labels, flip)

            for img_id, d in zip(labels['source_ids'], detections):
                if img_id.numpy() in detections_per_source:
                    detections_per_source[img_id.numpy()] = tf.concat(
                        [d, detections_per_source[img_id.numpy()]], 0)
                else:
                    detections_per_source[img_id.numpy()] = d

            evaluator = coco_metric.EvaluationMetric(
                filename=config.val_json_file)
            for d in detections_per_source.values():
                if FLAGS.enable_tta:
                    d = wbf.ensemble_detections(config, d, len(augmentations))
                evaluator.update_state(
                    labels['groundtruth_data'].numpy(),
                    postprocess.transform_detections(tf.stack([d])).numpy())

    # compute the final eval results.
    if evaluator:
        metrics = evaluator.result()
        metric_dict = {}
        for i, name in enumerate(evaluator.metric_names):
            metric_dict[name] = metrics[i]

        label_map = label_util.get_label_map(config.label_map)
        if label_map:
            for i, cid in enumerate(sorted(label_map.keys())):
                name = 'AP_/%s' % label_map[cid]
                metric_dict[name] = metrics[i + len(evaluator.metric_names)]
        print(metric_dict)
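
The metric handling above turns the flat vector returned by evaluator.result() into a dict: the first len(metric_names) entries are the standard COCO metrics, and the per-class APs follow when a label map is set, hence the i + len(evaluator.metric_names) indexing. A minimal sketch with made-up names and numbers:

metric_names = ['AP', 'AP50', 'AP75']            # illustrative subset
label_map = {1: 'person', 2: 'bicycle'}          # illustrative label map
metrics = [0.35, 0.55, 0.38, 0.60, 0.21]         # standard metrics, then per-class AP

metric_dict = {name: metrics[i] for i, name in enumerate(metric_names)}
for i, cid in enumerate(sorted(label_map.keys())):
    metric_dict['AP_/%s' % label_map[cid]] = metrics[i + len(metric_names)]
print(metric_dict)
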
Example #12
def main(argv):
    del argv  # Unused.

    # if given an efficientdet ckpt don't use default backbone ckpt
    if FLAGS.backbone_ckpt == BACKBONE_CKPT_DEFAULT_DIR and FLAGS.ckpt is not None:
        print("Using ckpt flag: {}, ignoring default backbone_ckpt: {}".format(
            FLAGS.ckpt, FLAGS.backbone_ckpt))
        FLAGS.backbone_ckpt = None

    if FLAGS.use_horovod is not None:
        if FLAGS.dump_all_ranks:
            FLAGS.model_dir += "/worker_" + str(hvd.rank())
        if not 'HOROVOD_CYCLE_TIME' in os.environ:
            os.environ['HOROVOD_CYCLE_TIME'] = '0.5'
        if not 'HABANA_HCCL_COMM_API' in os.environ:
            os.environ['HABANA_HCCL_COMM_API'] = '0'
        hvd_init()

    if not FLAGS.no_hpu:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

        if FLAGS.use_horovod:
            assert (horovod_enabled())

    set_env(use_amp=FLAGS.use_amp)

    # deterministic setting
    if FLAGS.sbs_test or FLAGS.deterministic:
        set_deterministic()

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if not FLAGS.val_json_file and not FLAGS.testdev_dir:
            raise RuntimeError(
                'You must specify --val_json_file or --testdev for evaluation.'
            )

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # In the TPUEstimator context, `shard` and `replica` mean the same thing;
    # following the API, both terms are used here interchangeably.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        image_size = config.get('image_size')
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = image_size // (2**level)
            if _can_partition(spatial_dim):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        if horovod_enabled():
            num_shards = hvd.size()
        else:
            num_shards = 1

    params = build_estimator_params('train', config, num_shards)
    # disabling input data scaling/flip manipulations.
    if FLAGS.sbs_test:
        sbs_params = dict(input_rand_hflip=False,
                          train_scale_min=1,
                          train_scale_max=1,
                          dropout_rate=0.0)
        params.update(sbs_params)

    tf_random_seed = 0 if FLAGS.deterministic else None
    run_config = build_estimator_config('train', config, num_shards,
                                        num_cores_per_replica,
                                        input_partition_dims)
    write_hparams_v1(FLAGS.model_dir, {
        'batch_size': FLAGS.train_batch_size,
        **FLAGS.flag_values_dict()
    })

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)

    # TPU Estimator
    logging.info(params)

    if FLAGS.mode == 'train':
        train_estimator = HorovodEstimator(model_fn=model_fn_instance,
                                           model_dir=FLAGS.model_dir,
                                           config=run_config,
                                           params=params)

        # For deterministic input, pass is_training=False so the dataloader
        # does not shuffle or augment the data.
        is_training = not FLAGS.deterministic
        use_fake_data = FLAGS.use_fake_data or FLAGS.deterministic

        input_fn = dataloader.InputReader(FLAGS.training_file_pattern,
                                          is_training=is_training,
                                          params=params,
                                          use_fake_data=use_fake_data,
                                          is_deterministic=FLAGS.deterministic)
        max_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        (FLAGS.train_batch_size * num_shards)) + 1

        # for sbs test, train under sbs callbacks
        if FLAGS.sbs_test:
            from TensorFlow.common.debug import dump_callback
            SBS_TEST_CONFIG = os.path.join(
                os.environ['TF_TESTS_ROOT'],
                "tests/tf_training_tests/side_by_side/topologies/efficientdet/dump_config.json"
            )
            with dump_callback(SBS_TEST_CONFIG):
                train_estimator.train(input_fn=input_fn, max_steps=max_steps)
        else:
            if FLAGS.ckpt is not None:
                train_estimator.train(input_fn=input_fn, steps=max_steps)
            else:
                train_estimator.train(input_fn=input_fn, max_steps=max_steps)

    elif FLAGS.mode == 'eval':
        eval_params = build_estimator_params('eval', config, num_shards)
        eval_config = build_estimator_config('eval', config, num_shards,
                                             num_cores_per_replica,
                                             input_partition_dims)

        # Eval only runs on CPU or GPU host with batch_size = 1.
        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        # Also, disable use_bfloat16 for eval on CPU/GPU.

        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=False,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=eval_config,
            params=eval_params)

        def terminate_eval():
            logging.info('Terminating eval after %d seconds of no checkpoints',
                         FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
                logging.info('Eval results: %s', eval_results)

                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                write_summary(eval_results, ckpt, current_step)

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        train_params = build_estimator_params('train', config, num_shards)
        train_config = build_estimator_config('train', config, num_shards,
                                              num_cores_per_replica,
                                              input_partition_dims)
        train_estimator = HorovodEstimator(model_fn=model_fn_instance,
                                           model_dir=FLAGS.model_dir,
                                           config=train_config,
                                           params=train_params)

        eval_estimator = None

        for cycle in range(FLAGS.num_epochs):
            logging.info('Starting training cycle, epoch: %d.', cycle)

            train_estimator.train(
                input_fn=dataloader.InputReader(
                    FLAGS.training_file_pattern,
                    is_training=True,
                    use_fake_data=FLAGS.use_fake_data),
                max_steps=(cycle + 1) *
                int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))

            # synchronization point for all ranks
            if horovod_enabled():
                hvd.allreduce(tf.constant(0))

            logging.info('Starting evaluation cycle, epoch: %d.', cycle)
            # Run evaluation after every epoch.

            if eval_estimator is None:
                eval_params = build_estimator_params('eval', config,
                                                     num_shards)
                eval_config = build_estimator_config('eval', config,
                                                     num_shards,
                                                     num_cores_per_replica,
                                                     input_partition_dims)
                eval_estimator = tf.estimator.tpu.TPUEstimator(
                    model_fn=model_fn_instance,
                    use_tpu=False,
                    train_batch_size=FLAGS.train_batch_size,
                    eval_batch_size=FLAGS.eval_batch_size,
                    config=eval_config,
                    params=eval_params)

            if is_rank0():
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)

                checkpoint_path = Path(FLAGS.model_dir)
                last_ckpt = tf.train.latest_checkpoint(str(checkpoint_path),
                                                       latest_filename=None)
                current_step = int(os.path.basename(last_ckpt).split('-')[1])
                write_summary(eval_results, FLAGS.model_dir, current_step)
                logging.info('Evaluation results: %s', eval_results)

                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    else:
        logging.info('Mode not found.')
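
The train branch above divides the total examples by train_batch_size * num_shards, so each Horovod worker trains its share of the steps. A minimal sketch of that arithmetic with illustrative numbers only:

num_epochs = 300
num_examples_per_epoch = 120000
train_batch_size = 8
num_shards = 8                               # e.g. hvd.size() with 8 workers
max_steps = int((num_epochs * num_examples_per_epoch) /
                (train_batch_size * num_shards)) + 1
print(max_steps)                             # 562501
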
def main(argv):
    del argv  # Unused.
    tpu_cluster_resolver = create_tpu_cluster_resolver()
    if tpu_cluster_resolver:
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

    # Parse hparams
    hparams = mask_rcnn_model.default_hparams()
    hparams.parse(FLAGS.hparams)

    params = dict(
        hparams.values(),
        num_shards=FLAGS.num_cores,
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        use_tpu=FLAGS.use_tpu,
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
        # The following are used by the host_call function.
        model_dir=FLAGS.model_dir,
        iterations_per_loop=FLAGS.iterations_per_loop,
        dynamic_input_shapes=FLAGS.dynamic_input_shapes,
        transpose_input=FLAGS.transpose_input)

    tpu_config = tf.contrib.tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_cores,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.
        PER_HOST_V2)

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        tpu_config=tpu_config,
    )

    if FLAGS.mode != 'eval':
        mlperf_log.maskrcnn_print(key=mlperf_log.RUN_START)
        mlperf_log.maskrcnn_print(key=mlperf_log.TRAIN_LOOP)
        mlperf_log.maskrcnn_print(key=mlperf_log.TRAIN_EPOCH, value=0)

    if FLAGS.mode == 'train':

        max_steps = int(
            (FLAGS.num_epochs * float(FLAGS.num_examples_per_epoch)) /
            float(FLAGS.train_batch_size))
        if params['dynamic_input_shapes']:
            train_with_dynamic_shapes(params, max_steps,
                                      FLAGS.iterations_per_loop)
        else:
            tf.logging.info(params)
            train_estimator = tf.contrib.tpu.TPUEstimator(
                model_fn=mask_rcnn_model.mask_rcnn_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                config=run_config,
                params=params)
            train_estimator.train(input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, mode=tf.estimator.ModeKeys.TRAIN),
                                  max_steps=max_steps)

        if FLAGS.eval_after_training:
            # Run evaluation after training finishes.
            eval_params = dict(
                params,
                use_tpu=FLAGS.use_tpu,
                input_rand_hflip=False,
                resnet_checkpoint=None,
                is_training_bn=False,
                dynamic_input_shapes=False,
                transpose_input=False,
            )

            eval_estimator = tf.contrib.tpu.TPUEstimator(
                model_fn=mask_rcnn_model.mask_rcnn_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                predict_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)

            output_dir = os.path.join(FLAGS.model_dir, 'eval')
            tf.gfile.MakeDirs(output_dir)
            # Summary writer writes out eval metrics.
            summary_writer = tf.summary.FileWriter(output_dir)
            eval_results = evaluation(eval_estimator, FLAGS.num_epochs,
                                      params['val_json_file'])
            write_summary(eval_results, summary_writer, max_steps)

            if (eval_results['AP'] >= BOX_EVAL_TARGET
                    and eval_results['mask_AP'] >= MASK_EVAL_TARGET):
                mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP,
                                          value={'success': 'true'})
            else:
                mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP,
                                          value={'success': 'false'})

            summary_writer.close()
            mlperf_log.maskrcnn_print(key=mlperf_log.RUN_FINAL)

    elif FLAGS.mode == 'eval':

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        # Summary writer writes out eval metrics.
        summary_writer = tf.summary.FileWriter(output_dir)

        eval_params = dict(
            params,
            use_tpu=FLAGS.use_tpu,
            input_rand_hflip=False,
            resnet_checkpoint=None,
            is_training_bn=False,
            transpose_input=False,
        )

        eval_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=mask_rcnn_model.mask_rcnn_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)

        def terminate_eval():
            tf.logging.info(
                'Terminating eval after %d seconds of no checkpoints' %
                FLAGS.eval_timeout)
            return True

        run_success = False
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.contrib.training.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):
            # Terminate eval job when final checkpoint is reached
            current_step = int(os.path.basename(ckpt).split('-')[1])

            tf.logging.info('Starting to evaluate.')
            try:

                current_epoch = current_step / (float(
                    FLAGS.num_examples_per_epoch) / FLAGS.train_batch_size)
                eval_results = evaluation(eval_estimator, current_epoch,
                                          params['val_json_file'])
                write_summary(eval_results, summary_writer, current_step)
                if (eval_results['AP'] >= BOX_EVAL_TARGET
                        and eval_results['mask_AP'] >= MASK_EVAL_TARGET):
                    mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP,
                                              value={'success': 'true'})
                    run_success = True
                    break

                total_step = int(
                    (FLAGS.num_epochs * float(FLAGS.num_examples_per_epoch)) /
                    float(FLAGS.train_batch_size))
                if current_step >= total_step:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)
        if not run_success:
            mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP,
                                      value={'success': 'false'})
        mlperf_log.maskrcnn_print(key=mlperf_log.RUN_FINAL)
        summary_writer.close()

    elif FLAGS.mode == 'train_and_eval':

        output_dir = os.path.join(FLAGS.model_dir, 'eval')
        tf.gfile.MakeDirs(output_dir)
        summary_writer = tf.summary.FileWriter(output_dir)
        train_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=mask_rcnn_model.mask_rcnn_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
        eval_params = dict(params,
                           use_tpu=FLAGS.use_tpu,
                           input_rand_hflip=False,
                           resnet_checkpoint=None,
                           is_training_bn=False,
                           dynamic_input_shapes=False)
        eval_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=mask_rcnn_model.mask_rcnn_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)
        run_success = False
        steps_per_epoch = int(FLAGS.num_examples_per_epoch /
                              FLAGS.train_batch_size)
        for cycle in range(int(math.floor(FLAGS.num_epochs))):
            tf.logging.info('Starting training cycle, epoch: %d.' % cycle)
            mlperf_log.maskrcnn_print(key=mlperf_log.TRAIN_EPOCH, value=cycle)
            if params['dynamic_input_shapes']:
                tf.logging.info(
                    'Using dynamic input shapes: training for %d steps, '
                    'up to %d total steps.', steps_per_epoch,
                    (cycle + 1) * steps_per_epoch)
                train_with_dynamic_shapes(params,
                                          (cycle + 1) * steps_per_epoch,
                                          FLAGS.iterations_per_loop)
            else:
                train_estimator.train(input_fn=dataloader.InputReader(
                    FLAGS.training_file_pattern,
                    mode=tf.estimator.ModeKeys.TRAIN),
                                      steps=steps_per_epoch)

            tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle)
            # Run evaluation after every epoch.
            eval_results = evaluation(eval_estimator, cycle,
                                      params['val_json_file'])
            current_step = (cycle + 1) * steps_per_epoch
            write_summary(eval_results, summary_writer, current_step)
            if (eval_results['AP'] >= BOX_EVAL_TARGET
                    and eval_results['mask_AP'] >= MASK_EVAL_TARGET):
                mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP,
                                          value={'success': 'true'})
                run_success = True
                break

        if not run_success:
            current_epoch = int(math.floor(FLAGS.num_epochs))
            max_steps = int(
                (FLAGS.num_epochs * float(FLAGS.num_examples_per_epoch)) /
                float(FLAGS.train_batch_size))
            # Final epoch.
            tf.logging.info('Starting training cycle, epoch: %d.' %
                            current_epoch)
            mlperf_log.maskrcnn_print(key=mlperf_log.TRAIN_EPOCH,
                                      value=current_epoch)
            if params['dynamic_input_shapes']:
                remaining_steps = max_steps - int(
                    current_epoch * steps_per_epoch)
                if remaining_steps > 0:
                    tf.logging.info(
                        'Using dynamic input shapes: training for %d steps, '
                        'up to %d total steps.', remaining_steps, max_steps)
                    train_with_dynamic_shapes(params, max_steps,
                                              remaining_steps)
            else:
                train_estimator.train(input_fn=dataloader.InputReader(
                    FLAGS.training_file_pattern,
                    mode=tf.estimator.ModeKeys.TRAIN),
                                      max_steps=max_steps)
            eval_results = evaluation(eval_estimator, current_epoch,
                                      params['val_json_file'])
            write_summary(eval_results, summary_writer, max_steps)
            if (eval_results['AP'] >= BOX_EVAL_TARGET
                    and eval_results['mask_AP'] >= MASK_EVAL_TARGET):
                mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP,
                                          value={'success': 'true'})
            else:
                mlperf_log.maskrcnn_print(key=mlperf_log.RUN_STOP,
                                          value={'success': 'false'})
        mlperf_log.maskrcnn_print(key=mlperf_log.RUN_FINAL)
        summary_writer.close()
    else:
        tf.logging.info('Mode not found.')


def train_with_dynamic_shapes(params, max_steps, iterations_per_loop):
    """Train with dynamic input shapes."""
    params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_cores
    params['global_batch_size'] = FLAGS.train_batch_size
    tf.logging.info(params)

    tpu_cluster_resolver = create_tpu_cluster_resolver()

    tpu_strategy = tf.contrib.distribute.TPUStrategy(tpu_cluster_resolver,
                                                     steps_per_run=1,
                                                     num_cores=FLAGS.num_cores)
    session_config = tf.ConfigProto(allow_soft_placement=True)
    tpu_strategy.configure(session_config)
    sess = tf.Session(tpu_cluster_resolver.get_master(), config=session_config)
    # Call tpu.initialize_system() before everything!
    sess.run(tpu.initialize_system())

    input_fn = dataloader.InputReader(FLAGS.training_file_pattern,
                                      mode=tf.estimator.ModeKeys.TRAIN)
    host_dataset = input_fn(params)
    multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
        host_dataset,
        devices=['/device:TPU:{}'.format(x) for x in range(FLAGS.num_cores)],
        prefetch_buffer_size=2)

    inputs_flattener = utils.InputsFlattener()
    per_host_sharded_inputs = []
    captured_scaffold_fn = utils.CapturedObject()

    def single_step_fn():
        """Function for a single TPU step."""
        all_input_data = multi_device_iterator.get_next()
        for core in range(FLAGS.num_cores):
            features_shape, features, labels = all_input_data[core]
            flattened_inputs = (inputs_flattener.flatten_features_and_labels(
                features, labels))
            per_host_sharded_inputs.append(flattened_inputs)

            if params['transpose_input']:
                is_height_short_side = tf.less(features_shape[0],
                                               features_shape[1])
            else:
                is_height_short_side = tf.less(features_shape[1],
                                               features_shape[2])

        def height_short_side_model_fn(*args):
            """Mode function for input images with height on the short side."""
            features, labels = inputs_flattener.unflatten_features_and_labels(
                args)
            features, labels = _set_feature_and_label_shapes(
                features, labels, params)
            spec = mask_rcnn_model.mask_rcnn_model_fn(
                features, labels, tf.estimator.ModeKeys.TRAIN, params)
            captured_scaffold_fn.capture(spec.scaffold_fn)
            return spec.train_op

        def height_long_side_model_fn(*args):
            """Mode function for input images with height on the long side."""
            features, labels = inputs_flattener.unflatten_features_and_labels(
                args)
            # Create a new params which has the reversed dynamic image shape.
            new_params = copy.deepcopy(params)
            new_params['dynamic_image_size'] = new_params[
                'dynamic_image_size'][::-1]
            features, labels = _set_feature_and_label_shapes(
                features, labels, new_params)
            spec = mask_rcnn_model.mask_rcnn_model_fn(
                features, labels, tf.estimator.ModeKeys.TRAIN, new_params)
            captured_scaffold_fn.capture(spec.scaffold_fn)
            return spec.train_op

        rewrite_computation = tf.cond(
            is_height_short_side,
            lambda: tpu.replicate(height_short_side_model_fn,
                                  per_host_sharded_inputs),  # pylint: disable=line-too-long
            lambda: tpu.replicate(height_long_side_model_fn,
                                  per_host_sharded_inputs)  # pylint: disable=line-too-long
        )

        return rewrite_computation

    def multiple_steps_fn():
        """function for multiple TPU steps in a host training loop."""
        return utils.wrap_computation_in_while_loop(single_step_fn,
                                                    n=iterations_per_loop,
                                                    parallel_iterations=1)

    with tpu_strategy.scope():
        # NOTE: `tpu_strategy.extended.call_for_each_replica` is not supported
        # in TF 1.12, use `tpu_strategy.call_for_each_tower` in that version.
        computation = tpu_strategy.extended.call_for_each_replica(multiple_steps_fn)  # pylint: disable=line-too-long

    saver = tf.train.Saver()
    latest_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)
    if latest_checkpoint:
        saver.restore(sess, latest_checkpoint)
    else:
        captured_scaffold_fn.get()()
        sess.run(tf.global_variables_initializer())
    sess.run(multi_device_iterator.initializer)
    current_step = sess.run(tf.train.get_global_step())

    # Save a 0-step checkpoint.
    if current_step == 0:
        saver.save(sess, FLAGS.model_dir + '/model', global_step=current_step)

    for iter_steps in range(current_step, max_steps, iterations_per_loop):
        tf.logging.info('Dynamic shape training steps: %d', iter_steps)
        _ = sess.run(computation)
        # Save checkpoints.
        saver.save(sess,
                   FLAGS.model_dir + '/model',
                   global_step=iter_steps + iterations_per_loop)

    sess.run(tpu.shutdown_system())
    sess.close()
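
Throughout the training and eval branches above, the code converts between epochs, global steps, and checkpoint filenames with the same flag arithmetic. A minimal sketch of those conversions, using assumed flag values purely for illustration:

import os

# Assumed flag values, for illustration only.
num_examples_per_epoch = 118287
train_batch_size = 128
num_epochs = 13
iterations_per_loop = 100

steps_per_epoch = num_examples_per_epoch // train_batch_size        # 924
max_steps = int((num_epochs * float(num_examples_per_epoch)) /
                float(train_batch_size))                            # 12013

# Checkpoint paths look like '<model_dir>/model.ckpt-4620'; the global step
# is parsed from the suffix and converted back to a (possibly fractional)
# epoch.
ckpt = '/tmp/model_dir/model.ckpt-4620'
current_step = int(os.path.basename(ckpt).split('-')[1])             # 4620
current_epoch = current_step / (float(num_examples_per_epoch) /
                                train_batch_size)                    # ~5.0

# The dynamic-shape host loop above runs one session.run per chunk of
# iterations_per_loop steps, resuming from the restored step.
num_host_loops = len(range(current_step, max_steps, iterations_per_loop))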
Example #15
def main(_):
  config = hparams_config.get_efficientdet_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  config.batch_size = FLAGS.batch_size
  config.val_json_file = FLAGS.val_json_file
  config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS
  base_height, base_width = utils.parse_image_size(config['image_size'])

  # Network
  model = efficientdet_keras.EfficientDetNet(config=config)
  model.build((config.batch_size, base_height, base_width, 3))
  model.load_weights(tf.train.latest_checkpoint(FLAGS.model_dir))

  # in format (height, width, flip)
  augmentations = []
  if FLAGS.enable_tta:
    for size_offset in (0, 128, 256):
      for flip in (False, True):
        augmentations.append(
            (base_height + size_offset, base_width + size_offset, flip))
  else:
    augmentations.append((base_height, base_width, False))

  detections_per_source = dict()
  groundtruth_per_source = dict()
  for height, width, flip in augmentations:
    config.image_size = (height, width)
    # dataset
    ds = dataloader.InputReader(
        FLAGS.val_file_pattern,
        is_training=False,
        use_fake_data=False,
        max_instances_per_image=config.max_instances_per_image)(
            config)

    # compute stats for all batches.
    for images, labels in ds:
      if flip:
        images = tf.image.flip_left_right(images)
      cls_outputs, box_outputs = model(images, training=False)
      detections = postprocess.generate_detections(config, cls_outputs,
                                                   box_outputs,
                                                   labels['image_scales'],
                                                   labels['source_ids'], flip)

      # Accumulate detections and ground truth per source image so that the
      # augmented passes over the same image can be ensembled afterwards.
      for image_id, d, gt in zip(labels['source_ids'], detections,
                                 labels['groundtruth_data']):
        key = image_id.numpy()
        if key in detections_per_source:
          detections_per_source[key] = tf.concat(
              [d, detections_per_source[key]], 0)
        else:
          detections_per_source[key] = d
        groundtruth_per_source[key] = gt

  evaluator = coco_metric.EvaluationMetric(filename=config.val_json_file)
  for key, d in detections_per_source.items():
    if FLAGS.enable_tta:
      d = wbf.ensemble_detections(config, d, len(augmentations))
    evaluator.update_state(
        tf.stack([groundtruth_per_source[key]]).numpy(),
        postprocess.transform_detections(tf.stack([d])).numpy())

  # compute the final eval results.
  metric_values = evaluator.result()
  metric_dict = {}
  for i, metric_value in enumerate(metric_values):
    metric_dict[evaluator.metric_names[i]] = metric_value
  print(metric_dict)
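
With --enable_tta, the example above runs each validation image through every (height, width, flip) combination and ensembles the accumulated detections per image. A small illustration of the augmentation grid it builds (the base size below is an assumption):

# Illustration of the TTA grid: three size offsets crossed with two flip
# settings give six forward passes per image (base size is assumed).
base_height, base_width = 512, 512
augmentations = [(base_height + off, base_width + off, flip)
                 for off in (0, 128, 256)
                 for flip in (False, True)]
assert len(augmentations) == 6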
Example #16
def main(_):
    config = hparams_config.get_efficientdet_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    config.val_json_file = FLAGS.val_json_file
    config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS
    config.drop_remainder = False  # eval all examples w/o drop.
    config.image_size = utils.parse_image_size(config['image_size'])

    if config.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
        ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver)
        logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
    elif config.strategy == 'gpus':
        ds_strategy = tf.distribute.MirroredStrategy()
        logging.info('All devices: %s', tf.config.list_physical_devices('GPU'))
    else:
        if tf.config.list_physical_devices('GPU'):
            ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0')
        else:
            ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0')

    with ds_strategy.scope():
        # Network
        model = efficientdet_keras.EfficientDetNet(config=config)
        model.build((1, *config.image_size, 3))
        model.load_weights(tf.train.latest_checkpoint(FLAGS.model_dir))

        @tf.function
        def model_fn(images, labels):
            cls_outputs, box_outputs = model(images, training=False)
            return postprocess.generate_detections(config, cls_outputs,
                                                   box_outputs,
                                                   labels['image_scales'],
                                                   labels['source_ids'])

        # Evaluator for AP calculation.
        label_map = label_util.get_label_map(config.label_map)
        evaluator = coco_metric.EvaluationMetric(filename=config.val_json_file,
                                                 label_map=label_map)

        @tf.function
        def eval_update(gt, pred):
            tf.numpy_function(evaluator.update_state,
                              [gt, postprocess.transform_detections(pred)], [])

        # dataset
        batch_size = FLAGS.batch_size  # global batch size.
        ds = dataloader.InputReader(
            FLAGS.val_file_pattern,
            is_training=False,
            max_instances_per_image=config.max_instances_per_image)(
                config, batch_size=batch_size)
        if FLAGS.eval_samples:
            ds = ds.take((FLAGS.eval_samples + batch_size - 1) // batch_size)
        ds = ds_strategy.experimental_distribute_dataset(ds)

        # evaluate all images.
        eval_samples = FLAGS.eval_samples or 5000
        pbar = tf.keras.utils.Progbar(
            (eval_samples + batch_size - 1) // batch_size)
        for i, (images, labels) in enumerate(ds):
            detections = ds_strategy.run(model_fn, (images, labels))
            ds_strategy.run(eval_update,
                            (labels['groundtruth_data'], detections))
            pbar.update(i)

    # compute the final eval results.
    metrics = evaluator.result()
    metric_dict = {}
    for i, name in enumerate(evaluator.metric_names):
        metric_dict[name] = metrics[i]

    if label_map:
        for i, cid in enumerate(sorted(label_map.keys())):
            name = 'AP_/%s' % label_map[cid]
            metric_dict[name] = metrics[i + len(evaluator.metric_names)]
    print(FLAGS.model_name, metric_dict)
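
The eval loop above sizes both ds.take() and the progress bar with the same ceiling division, so the last, possibly partial batch is still counted. A small worked illustration with assumed numbers:

# Ceiling division used above to turn an example budget into a batch count
# (values are assumptions, for illustration only).
eval_samples = 5000
batch_size = 64
num_batches = (eval_samples + batch_size - 1) // batch_size  # 79
assert num_batches * batch_size >= eval_samples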
Example #17
def main(_):

    if FLAGS.strategy == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        logging.info('Use horovod with multi gpus')
        hvd.init()
        os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())
    import tensorflow.compat.v1 as tf  # pylint: disable=g-import-not-at-top
    tf.enable_v2_tensorshape()
    tf.disable_eager_execution()

    if FLAGS.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`; see the partition logic below.
    # In the TPUEstimator context, `shard` and `replica` mean the same thing;
    # following the API, both terms are used here.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
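        # Worked check (assumed values): with image_size=1536, max_level=9 and
        # input_partition_dims=[1, 4, 2, 1], the level-7 feature map is
        # 1536 // 2**7 = 12, and 12 is divisible by every partition dim, so
        # level 7 is partitioned; the level-8 map is 1536 // 2**8 = 6, and
        # 6 % 4 != 0, so levels 8 and 9 keep partition dims of None.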
        feat_sizes = utils.get_feat_sizes(config.get('image_size'),
                                          config.get('max_level'))
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = feat_sizes[level]
            if _can_partition(spatial_dim['height']) and _can_partition(
                    spatial_dim['width']):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  iterations_per_loop=FLAGS.iterations_per_loop,
                  model_dir=FLAGS.model_dir,
                  num_shards=num_shards,
                  num_examples_per_epoch=FLAGS.num_examples_per_epoch,
                  strategy=FLAGS.strategy,
                  backbone_ckpt=FLAGS.backbone_ckpt,
                  ckpt=FLAGS.ckpt,
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode=FLAGS.mode)
    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.use_xla and FLAGS.strategy != 'tpu':
        config_proto.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)
        config_proto.gpu_options.allow_growth = True

    tpu_config = tf.estimator.tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_shards=num_shards,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
        PER_HOST_V2)

    if FLAGS.strategy == 'horovod':
        model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
    else:
        model_dir = FLAGS.model_dir

    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        evaluation_master=FLAGS.eval_master,
        model_dir=model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
        tf_random_seed=FLAGS.tf_random_seed,
    )

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
    max_instances_per_image = config.max_instances_per_image
    use_tpu = (FLAGS.strategy == 'tpu')

    # TPU Estimator
    logging.info(params)
    if FLAGS.mode == 'train':
        train_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
        train_estimator.train(
            input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern,
                is_training=True,
                use_fake_data=FLAGS.use_fake_data,
                max_instances_per_image=max_instances_per_image),
            max_steps=int((config.num_epochs * FLAGS.num_examples_per_epoch) /
                          FLAGS.train_batch_size))

        if FLAGS.eval_after_training:
            # Run evaluation after training finishes.
            eval_params = dict(
                params,
                strategy=FLAGS.strategy,
                input_rand_hflip=False,
                is_training_bn=False,
                mixed_precision=None,
            )
            eval_estimator = tf.estimator.tpu.TPUEstimator(
                model_fn=model_fn_instance,
                use_tpu=use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            eval_results = eval_estimator.evaluate(
                input_fn=dataloader.InputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    max_instances_per_image=max_instances_per_image),
                steps=FLAGS.eval_samples // FLAGS.eval_batch_size,
                name=FLAGS.eval_name)
            logging.info('Eval results: %s', eval_results)
            ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
            utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    elif FLAGS.mode == 'eval':
        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        eval_params = dict(
            params,
            strategy=FLAGS.strategy,
            input_rand_hflip=False,
            is_training_bn=False,
            mixed_precision=None,
        )

        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)

        def terminate_eval():
            logging.info('Terminating eval after %d seconds of no checkpoints',
                         FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern,
                        is_training=False,
                        max_instances_per_image=max_instances_per_image),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size,
                    name=FLAGS.eval_name)
                logging.info('Eval results: %s', eval_results)

                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (config.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        for cycle in range(config.num_epochs):
            logging.info('Starting training cycle, epoch: %d.', cycle)
            train_estimator = tf.estimator.tpu.TPUEstimator(
                model_fn=model_fn_instance,
                use_tpu=use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                config=run_config,
                params=params)
            train_estimator.train(input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern,
                is_training=True,
                use_fake_data=FLAGS.use_fake_data,
                max_instances_per_image=max_instances_per_image),
                                  steps=int(FLAGS.num_examples_per_epoch /
                                            FLAGS.train_batch_size))

            logging.info('Starting evaluation cycle, epoch: %d.', cycle)
            # Run evaluation after every epoch.
            eval_params = dict(
                params,
                strategy=FLAGS.strategy,
                input_rand_hflip=False,
                is_training_bn=False,
            )

            eval_estimator = tf.estimator.tpu.TPUEstimator(
                model_fn=model_fn_instance,
                use_tpu=use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            eval_results = eval_estimator.evaluate(
                input_fn=dataloader.InputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    max_instances_per_image=max_instances_per_image),
                steps=FLAGS.eval_samples // FLAGS.eval_batch_size,
                name=FLAGS.eval_name)
            logging.info('Evaluation results: %s', eval_results)
            ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
            utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    else:
        logging.info('Mode not found.')
Example #18
def main(argv):
    del argv  # Unused.

    if FLAGS.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')

    # Parse hparams
    hparams = retinanet_model.default_hparams()
    hparams.parse(FLAGS.hparams)

    params = dict(
        hparams.values(),
        num_shards=FLAGS.num_cores,
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        use_tpu=FLAGS.use_tpu,
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
    )
    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.use_xla and not FLAGS.use_tpu:
        config_proto.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)

    tpu_config = tf.contrib.tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_cores,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.
        PER_HOST_V2)

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        evaluation_master=FLAGS.eval_master,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
    )

    model_fn = retinanet_model.retinanet_model_fn

    # TPU Estimator
    if FLAGS.mode == 'train':
        tf.logging.info(params)
        train_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
        train_estimator.train(
            input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                            is_training=True),
            max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                          FLAGS.train_batch_size))

        if FLAGS.eval_after_training:
            # Run evaluation after training finishes.
            eval_params = dict(
                params,
                use_tpu=False,
                input_rand_hflip=False,
                resnet_checkpoint=None,
                is_training_bn=False,
                use_bfloat16=False,
            )
            eval_estimator = tf.contrib.tpu.TPUEstimator(
                model_fn=retinanet_model.retinanet_model_fn,
                use_tpu=False,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            eval_results = eval_estimator.evaluate(
                input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                                is_training=False),
                steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
            tf.logging.info('Eval results: %s' % eval_results)

    elif FLAGS.mode == 'eval':
        # Eval only runs on CPU or GPU host with batch_size = 1.

        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        # Also, disable use_bfloat16 for eval on CPU/GPU.
        eval_params = dict(
            params,
            use_tpu=False,
            input_rand_hflip=False,
            resnet_checkpoint=None,
            is_training_bn=False,
            use_bfloat16=False,
        )

        eval_estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=retinanet_model.retinanet_model_fn,
            use_tpu=False,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)

        def terminate_eval():
            tf.logging.info(
                'Terminating eval after %d seconds of no checkpoints' %
                FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.contrib.training.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
                tf.logging.info('Eval results: %s' % eval_results)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        for cycle in range(FLAGS.num_epochs):
            tf.logging.info('Starting training cycle, epoch: %d.' % cycle)
            train_estimator = tf.contrib.tpu.TPUEstimator(
                model_fn=retinanet_model.retinanet_model_fn,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                config=run_config,
                params=params)
            train_estimator.train(input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
                                  steps=int(FLAGS.num_examples_per_epoch /
                                            FLAGS.train_batch_size))

            tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle)
            # Run evaluation after every epoch.
            eval_params = dict(
                params,
                use_tpu=False,
                input_rand_hflip=False,
                resnet_checkpoint=None,
                is_training_bn=False,
            )

            eval_estimator = tf.contrib.tpu.TPUEstimator(
                model_fn=retinanet_model.retinanet_model_fn,
                use_tpu=False,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            eval_results = eval_estimator.evaluate(
                input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                                is_training=False),
                steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
            tf.logging.info('Evaluation results: %s' % eval_results)

    else:
        tf.logging.info('Mode not found.')
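
All of the entry points above assume a module-level FLAGS object defined elsewhere in each script. A minimal, hypothetical sketch of how a few of the flags referenced above might be declared with absl (the flag names come from the code; the types and defaults are assumptions):

from absl import app, flags

flags.DEFINE_string('mode', 'train_and_eval',
                    'One of train, eval or train_and_eval.')
flags.DEFINE_string('model_dir', None,
                    'Directory for checkpoints and summaries.')
flags.DEFINE_string('training_file_pattern', None,
                    'Glob for the training TFRecords.')
flags.DEFINE_string('validation_file_pattern', None,
                    'Glob for the evaluation TFRecords.')
flags.DEFINE_integer('train_batch_size', 64, 'Global training batch size.')
flags.DEFINE_integer('eval_batch_size', 8, 'Global evaluation batch size.')
flags.DEFINE_integer('eval_samples', 5000, 'Number of evaluation examples.')
flags.DEFINE_integer('num_examples_per_epoch', 120000,
                     'Training examples per epoch.')
flags.DEFINE_integer('num_epochs', 15, 'Number of training epochs.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
    app.run(main)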