예제 #1
0
    def __init__(self,
                 model_name: Text,
                 logdir: Text,
                 tensorrt: Text = False,
                 use_xla: bool = False,
                 ckpt_path: Text = None,
                 export_ckpt: Text = None,
                 saved_model_dir: Text = None,
                 tflite_path: Text = None,
                 batch_size: int = 1,
                 hparams: Text = ''):
        self.model_name = model_name
        self.logdir = logdir
        self.tensorrt = tensorrt
        self.use_xla = use_xla
        self.ckpt_path = ckpt_path
        self.export_ckpt = export_ckpt
        self.saved_model_dir = saved_model_dir
        self.tflite_path = tflite_path

        model_config = hparams_config.get_detection_config(model_name)
        model_config.override(hparams)  # Add custom overrides
        model_config.is_training_bn = False
        model_config.image_size = utils.parse_image_size(
            model_config.image_size)

        # If batch size is 0, then build a graph with dynamic batch size.
        self.batch_size = batch_size or None
        self.labels_shape = [batch_size, model_config.num_classes]

        height, width = model_config.image_size
        if model_config.data_format == 'channels_first':
            self.inputs_shape = [batch_size, 3, height, width]
        else:
            self.inputs_shape = [batch_size, height, width, 3]

        self.model_config = model_config
예제 #2
0
    def __init__(self,
                 model_name: Text,
                 ckpt_path: Text,
                 image_size: Union[int, Tuple[int, int]] = None,
                 num_classes: int = None,
                 enable_ema: bool = True,
                 data_format: Text = None,
                 label_id_mapping: Dict[int, Text] = None):
        """Initialize the inference driver.

    Args:
      model_name: target model name, such as efficientdet-d0.
      ckpt_path: checkpoint path, such as /tmp/efficientdet-d0/.
      image_size: user specified image size. If None, use the default image size
        defined by model_name.
      num_classes: number of classes. If None, use the default COCO classes.
      enable_ema: whether to enable moving average.
      data_format: data format such as 'channel_last'.
      label_id_mapping: a dictionary from id to name. If None, use the default
        coco_id_mapping (with 90 classes).
    """
        self.model_name = model_name
        self.ckpt_path = ckpt_path
        self.label_id_mapping = label_id_mapping or coco_id_mapping

        self.params = hparams_config.get_detection_config(
            self.model_name).as_dict()
        self.params.update(dict(is_training_bn=False, use_bfloat16=False))
        if image_size:
            self.params.update(dict(image_size=image_size))
        if num_classes:
            self.params.update(dict(num_classes=num_classes))
        if data_format:
            self.params.update(dict(data_format=data_format))

        self.disable_pyfun = True
        self.enable_ema = enable_ema
예제 #3
0
    def __init__(self,
                 model_name: Text,
                 ckpt_path: Text,
                 image_size: int = None,
                 label_id_mapping: Dict[int, Text] = None):
        """Initialize the inference driver.

    Args:
      model_name: target model name, such as efficientdet-d0.
      ckpt_path: checkpoint path, such as /tmp/efficientdet-d0/.
      image_size: user specified image size. If None, use the default image size
        defined by model_name.
      label_id_mapping: a dictionary from id to name. If None, use the default
        coco_id_mapping (with 90 classes).
    """
        self.model_name = model_name
        self.ckpt_path = ckpt_path
        self.label_id_mapping = label_id_mapping or coco_id_mapping

        self.params = hparams_config.get_detection_config(
            self.model_name).as_dict()
        self.params.update(dict(is_training_bn=False, use_bfloat16=False))
        if image_size:
            self.params.update(dict(image_size=image_size))
예제 #4
0
def main(_):
    tf.config.run_functions_eagerly(FLAGS.debug)
    devices = tf.config.list_physical_devices('GPU')
    for device in devices:
        tf.config.experimental.set_memory_growth(device, True)

    model_config = hparams_config.get_detection_config(FLAGS.model_name)
    model_config.override(FLAGS.hparams)  # Add custom overrides
    model_config.is_training_bn = False
    if FLAGS.image_size != -1:
        model_config.image_size = FLAGS.image_size
    model_config.image_size = utils.parse_image_size(model_config.image_size)

    model_params = model_config.as_dict()
    ckpt_path_or_file = FLAGS.ckpt_path
    if tf.io.gfile.isdir(ckpt_path_or_file):
        ckpt_path_or_file = tf.train.latest_checkpoint(ckpt_path_or_file)
    driver = inference.ServingDriver(FLAGS.model_name, ckpt_path_or_file,
                                     FLAGS.batch_size or None, model_params)
    if FLAGS.mode == 'export':
        if not FLAGS.saved_model_dir:
            raise ValueError('Please specify --saved_model_dir=')
        model_dir = FLAGS.saved_model_dir
        if tf.io.gfile.exists(model_dir):
            tf.io.gfile.rmtree(model_dir)
        driver.export(model_dir, FLAGS.tensorrt, FLAGS.tflite)
        print('Model are exported to %s' % model_dir)
    elif FLAGS.mode == 'infer':
        if FLAGS.saved_model_dir:
            driver.load(FLAGS.saved_model_dir)
        image_file = tf.io.read_file(FLAGS.input_image)
        image_arrays = tf.io.decode_image(image_file)
        image_arrays.set_shape((None, None, 3))
        image_arrays = tf.expand_dims(image_arrays, axis=0)
        detections_bs = driver.serve(image_arrays)
        boxes, scores, classes, _ = tf.nest.map_structure(
            np.array, detections_bs)
        raw_image = Image.open(FLAGS.input_image)
        img = driver.visualize(
            raw_image,
            boxes[0],
            classes[0],
            scores[0],
            min_score_thresh=model_config.nms_configs.score_thresh,
            max_boxes_to_draw=model_config.nms_configs.max_output_size)
        output_image_path = os.path.join(FLAGS.output_image_dir, '0.jpg')
        Image.fromarray(img).save(output_image_path)
        print('writing file to %s' % output_image_path)
    elif FLAGS.mode == 'benchmark':
        if FLAGS.saved_model_dir:
            driver.load(FLAGS.saved_model_dir)

        batch_size = FLAGS.batch_size or 1
        if FLAGS.input_image:
            image_file = tf.io.read_file(FLAGS.input_image)
            image_arrays = tf.image.decode_image(image_file)
            image_arrays.set_shape((None, None, 3))
            image_arrays = tf.expand_dims(image_arrays, 0)
            if batch_size > 1:
                image_arrays = tf.tile(image_arrays, [batch_size, 1, 1, 1])
        else:
            # use synthetic data if no image is provided.
            image_arrays = tf.ones((batch_size, *model_config.image_size, 3),
                                   dtype=tf.uint8)
        driver.benchmark(image_arrays, FLAGS.bm_runs, FLAGS.trace_filename)
    elif FLAGS.mode == 'dry':
        # transfer to tf2 format ckpt
        driver.build()
        if FLAGS.export_ckpt:
            driver.model.save_weights(FLAGS.export_ckpt)
    elif FLAGS.mode == 'video':
        import cv2  # pylint: disable=g-import-not-at-top
        if FLAGS.saved_model_dir:
            driver.load(FLAGS.saved_model_dir)
        cap = cv2.VideoCapture(FLAGS.input_video)
        if not cap.isOpened():
            print('Error opening input video: {}'.format(FLAGS.input_video))

        out_ptr = None
        if FLAGS.output_video:
            frame_width, frame_height = int(cap.get(3)), int(cap.get(4))
            out_ptr = cv2.VideoWriter(
                FLAGS.output_video, cv2.VideoWriter_fourcc('m', 'p', '4', 'v'),
                25, (frame_width, frame_height))

        while cap.isOpened():
            # Capture frame-by-frame
            ret, frame = cap.read()
            if not ret:
                break

            raw_frames = np.array([frame])
            detections_bs = driver.serve(raw_frames)
            boxes, scores, classes, _ = tf.nest.map_structure(
                np.array, detections_bs)
            new_frame = driver.visualize(
                raw_frames[0],
                boxes[0],
                scores[0],
                classes[0],
                min_score_thresh=model_config.nms_configs.score_thresh,
                max_boxes_to_draw=model_config.nms_configs.max_output_size)

            if out_ptr:
                # write frame into output file.
                out_ptr.write(new_frame)
            else:
                # show the frame online, mainly used for real-time speed test.
                cv2.imshow('Frame', new_frame)
                # Press Q on keyboard to  exit
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
예제 #5
0
def main(argv):
    del argv  # Unused.

    hvd_try_init()
    set_env(use_amp=FLAGS.use_amp)

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if not FLAGS.val_json_file and not FLAGS.testdev_dir:
            raise RuntimeError(
                'You must specify --val_json_file or --testdev for evaluation.'
            )

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # In the TPUEstimator context, the meaning of `shard` and `replica` is the
    # same; follwing the API, here has mixed use of both.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array'
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        image_size = config.get('image_size')
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = image_size // (2**level)
            if _can_partition(spatial_dim):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores
        if hvd is not None:
            num_shards = hvd.size()

    params = dict(
        config.as_dict(),
        model_name=FLAGS.model_name,
        num_epochs=FLAGS.num_epochs,
        iterations_per_loop=FLAGS.iterations_per_loop,
        model_dir=FLAGS.model_dir,
        num_shards=num_shards,
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        use_tpu=FLAGS.use_tpu,
        backbone_ckpt=FLAGS.backbone_ckpt,
        ckpt=FLAGS.ckpt,
        val_json_file=FLAGS.val_json_file,
        testdev_dir=FLAGS.testdev_dir,
        mode=FLAGS.mode,
    )

    run_config = tf.estimator.RunConfig(
        session_config=get_session_config(use_xla=FLAGS.use_xla),
        save_checkpoints_steps=600)

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)

    # TPU Estimator
    logging.info(params)
    if FLAGS.mode == 'train':
        # train_estimator = tf.estimator.tpu.TPUEstimator(
        #     model_fn=model_fn_instance,
        #     use_tpu=FLAGS.use_tpu,
        #     train_batch_size=FLAGS.train_batch_size,
        #     config=run_config,
        #     params=params)
        params['batch_size'] = FLAGS.train_batch_size
        train_estimator = HorovodEstimator(model_fn=model_fn_instance,
                                           model_dir=FLAGS.model_dir,
                                           config=run_config,
                                           params=params)

        input_fn = dataloader.InputReader(FLAGS.training_file_pattern,
                                          is_training=True,
                                          params=params,
                                          use_fake_data=FLAGS.use_fake_data)
        max_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        (FLAGS.train_batch_size * num_shards)) + 1

        train_estimator.train(input_fn=input_fn, max_steps=max_steps)

        # if FLAGS.eval_after_training:
        #   # Run evaluation after training finishes.
        #   eval_params = dict(
        #       params,
        #       use_tpu=False,
        #       input_rand_hflip=False,
        #       is_training_bn=False,
        #       use_bfloat16=False,
        #   )
        #   eval_estimator = tf.estimator.tpu.TPUEstimator(
        #       model_fn=model_fn_instance,
        #       use_tpu=False,
        #       train_batch_size=FLAGS.train_batch_size,
        #       eval_batch_size=FLAGS.eval_batch_size,
        #       config=run_config,
        #       params=eval_params)
        #   eval_results = eval_estimator.evaluate(
        #       input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
        #                                       is_training=False),
        #       steps=FLAGS.eval_samples//FLAGS.eval_batch_size)
        #   logging.info('Eval results: %s', eval_results)
        #   ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        #   utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    elif FLAGS.mode == 'eval':
        config_proto = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        if FLAGS.use_xla and not FLAGS.use_tpu:
            config_proto.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_1)

        tpu_config = tf.estimator.tpu.TPUConfig(
            FLAGS.iterations_per_loop,
            num_shards=num_shards,
            num_cores_per_replica=num_cores_per_replica,
            input_partition_dims=input_partition_dims,
            per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
            PER_HOST_V2)

        run_config = tf.estimator.tpu.RunConfig(
            cluster=None,
            evaluation_master=FLAGS.eval_master,
            model_dir=FLAGS.model_dir,
            log_step_count_steps=FLAGS.iterations_per_loop,
            session_config=config_proto,
            tpu_config=tpu_config,
        )

        # Eval only runs on CPU or GPU host with batch_size = 1.
        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        # Also, disable use_bfloat16 for eval on CPU/GPU.
        eval_params = dict(
            params,
            use_tpu=False,
            input_rand_hflip=False,
            is_training_bn=False,
            use_bfloat16=False,
        )

        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=False,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)

        def terminate_eval():
            logging.info('Terminating eval after %d seconds of no checkpoints',
                         FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
                logging.info('Eval results: %s', eval_results)

                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        for cycle in range(FLAGS.num_epochs):
            logging.info('Starting training cycle, epoch: %d.', cycle)
            train_estimator = tf.estimator.tpu.TPUEstimator(
                model_fn=model_fn_instance,
                use_tpu=FLAGS.use_tpu,
                train_batch_size=FLAGS.train_batch_size,
                config=run_config,
                params=params)
            train_estimator.train(input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern,
                is_training=True,
                use_fake_data=FLAGS.use_fake_data),
                                  steps=int(FLAGS.num_examples_per_epoch /
                                            FLAGS.train_batch_size))

            logging.info('Starting evaluation cycle, epoch: %d.', cycle)
            # Run evaluation after every epoch.
            eval_params = dict(
                params,
                use_tpu=False,
                input_rand_hflip=False,
                is_training_bn=False,
            )

            eval_estimator = tf.estimator.tpu.TPUEstimator(
                model_fn=model_fn_instance,
                use_tpu=False,
                train_batch_size=FLAGS.train_batch_size,
                eval_batch_size=FLAGS.eval_batch_size,
                config=run_config,
                params=eval_params)
            eval_results = eval_estimator.evaluate(
                input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                                is_training=False),
                steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
            logging.info('Evaluation results: %s', eval_results)
            ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
            utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        pass

    else:
        logging.info('Mode not found.')
예제 #6
0
def main(_):
    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    if FLAGS.use_xla and FLAGS.strategy != 'tpu':
        tf.config.optimizer.set_jit(True)
        for gpu in tf.config.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(gpu, True)

    if FLAGS.debug:
        tf.debugging.set_log_device_placement(True)
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        tf.random.set_seed(FLAGS.tf_random_seed)
        logging.set_verbosity(logging.DEBUG)

    if FLAGS.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
        ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver)
        logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
    elif FLAGS.strategy == 'gpus':
        gpus = tf.config.list_physical_devices('GPU')
        if FLAGS.batch_size % len(gpus):
            raise ValueError(
                'Batch size divide gpus number must be interger, but got %f' %
                (FLAGS.batch_size / len(gpus)))
        if platform.system() == 'Windows':
            # Windows doesn't support nccl use HierarchicalCopyAllReduce instead
            # TODO(fsx950223): investigate HierarchicalCopyAllReduce performance issue
            cross_device_ops = tf.distribute.HierarchicalCopyAllReduce()
        else:
            cross_device_ops = None
        ds_strategy = tf.distribute.MirroredStrategy(
            cross_device_ops=cross_device_ops)
        logging.info('All devices: %s', gpus)
    else:
        if tf.config.list_physical_devices('GPU'):
            ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0')
        else:
            ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0')

    steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.batch_size
    params = dict(profile=FLAGS.profile,
                  model_name=FLAGS.model_name,
                  steps_per_execution=FLAGS.steps_per_execution,
                  model_dir=FLAGS.model_dir,
                  steps_per_epoch=steps_per_epoch,
                  strategy=FLAGS.strategy,
                  batch_size=FLAGS.batch_size,
                  tf_random_seed=FLAGS.tf_random_seed,
                  debug=FLAGS.debug,
                  val_json_file=FLAGS.val_json_file,
                  eval_samples=FLAGS.eval_samples,
                  num_shards=ds_strategy.num_replicas_in_sync)
    config.override(params, True)
    # set mixed precision policy by keras api.
    precision = utils.get_precision(config.strategy, config.mixed_precision)
    policy = tf.keras.mixed_precision.Policy(precision)
    tf.keras.mixed_precision.set_global_policy(policy)

    def get_dataset(is_training, config):
        file_pattern = (FLAGS.train_file_pattern
                        if is_training else FLAGS.val_file_pattern)
        if not file_pattern:
            raise ValueError('No matching files.')

        return dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image,
            debug=FLAGS.debug)(config.as_dict())

    with ds_strategy.scope():
        if config.model_optimizations:
            tfmot.set_config(config.model_optimizations.as_dict())
        if FLAGS.hub_module_url:
            model = train_lib.EfficientDetNetTrainHub(
                config=config, hub_module_url=FLAGS.hub_module_url)
        else:
            model = train_lib.EfficientDetNetTrain(config=config)
        model = setup_model(model, config)
        if FLAGS.debug:
            tf.config.run_functions_eagerly(True)
        if FLAGS.pretrained_ckpt and not FLAGS.hub_module_url:
            ckpt_path = tf.train.latest_checkpoint(FLAGS.pretrained_ckpt)
            util_keras.restore_ckpt(model,
                                    ckpt_path,
                                    config.moving_average_decay,
                                    exclude_layers=['class_net'])
        init_experimental(config)
        if 'train' in FLAGS.mode:
            val_dataset = get_dataset(False,
                                      config) if 'eval' in FLAGS.mode else None
            model.fit(
                get_dataset(True, config),
                epochs=config.num_epochs,
                steps_per_epoch=steps_per_epoch,
                callbacks=train_lib.get_callbacks(config.as_dict(),
                                                  val_dataset),
                validation_data=val_dataset,
                validation_steps=(FLAGS.eval_samples // FLAGS.batch_size))
        else:
            # Continuous eval.
            for ckpt in tf.train.checkpoints_iterator(FLAGS.model_dir,
                                                      min_interval_secs=180):
                logging.info('Starting to evaluate.')
                # Terminate eval job when final checkpoint is reached.
                try:
                    current_epoch = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    current_epoch = 0

                val_dataset = get_dataset(False, config)
                logging.info('start loading model.')
                model.load_weights(tf.train.latest_checkpoint(FLAGS.model_dir))
                logging.info('finish loading model.')
                coco_eval = train_lib.COCOCallback(val_dataset, 1)
                coco_eval.set_model(model)
                eval_results = coco_eval.on_epoch_end(current_epoch)
                logging.info('eval results for %s: %s', ckpt, eval_results)

                try:
                    utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                except tf.errors.NotFoundError:
                    # Checkpoint might be not already deleted by the time eval finished.
                    logging.info('Checkpoint %s no longer exists, skipping.',
                                 ckpt)

                if current_epoch >= config.num_epochs or not current_epoch:
                    logging.info('Eval epoch %d / %d', current_epoch,
                                 config.num_epochs)
                    break
예제 #7
0
def main(_):
    if FLAGS.strategy == 'tpu':
        tf.disable_eager_execution()
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path
    if FLAGS.mode in ('train', 'train_and_eval'):
        if FLAGS.training_file_pattern is None:
            raise RuntimeError(
                'Must specify --training_file_pattern for train.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError(
                'Must specify --validation_file_pattern for eval.')

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # In the TPUEstimator context, the meaning of `shard` and `replica` is the
    # same; follwing the API, here has mixed use of both.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array'
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
            'image_masks': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        feat_sizes = utils.get_feat_sizes(config.get('image_size'),
                                          config.get('max_level'))
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = feat_sizes[level]
            if _can_partition(spatial_dim['height']) and _can_partition(
                    spatial_dim['width']):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  iterations_per_loop=FLAGS.iterations_per_loop,
                  model_dir=FLAGS.model_dir,
                  num_shards=num_shards,
                  num_examples_per_epoch=FLAGS.num_examples_per_epoch,
                  strategy=FLAGS.strategy,
                  backbone_ckpt=FLAGS.backbone_ckpt,
                  ckpt=FLAGS.ckpt,
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode=FLAGS.mode)
    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.strategy != 'tpu':
        if FLAGS.use_xla:
            config_proto.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_1)
        config_proto.gpu_options.allow_growth = True

    model_dir = FLAGS.model_dir
    strategy = None
    if FLAGS.strategy == 'tpu':
        tpu_config = tf.estimator.tpu.TPUConfig(
            FLAGS.iterations_per_loop if FLAGS.strategy == 'tpu' else 1,
            num_cores_per_replica=num_cores_per_replica,
            input_partition_dims=input_partition_dims,
            per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
            PER_HOST_V2)
        run_config = tf.estimator.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=model_dir,
            log_step_count_steps=FLAGS.iterations_per_loop,
            session_config=config_proto,
            tpu_config=tpu_config,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tf_random_seed=FLAGS.tf_random_seed,
        )
    else:
        if FLAGS.strategy == 'gpus':
            strategy = tf.distribute.MirroredStrategy()
        run_config = tf.estimator.RunConfig(
            model_dir=model_dir,
            train_distribute=strategy,
            log_step_count_steps=FLAGS.iterations_per_loop,
            session_config=config_proto,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tf_random_seed=FLAGS.tf_random_seed,
        )

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
    max_instances_per_image = config.max_instances_per_image
    eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
    total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
    train_steps = total_examples // FLAGS.train_batch_size
    logging.info(params)
    with tf.io.gfile.GFile(os.path.join(model_dir, 'config.yaml'), 'w') as f:
        f.write(str(config))

    train_input_fn = dataloader.InputReader(
        FLAGS.training_file_pattern,
        is_training=True,
        use_fake_data=FLAGS.use_fake_data,
        max_instances_per_image=max_instances_per_image)
    eval_input_fn = dataloader.InputReader(
        FLAGS.validation_file_pattern,
        is_training=False,
        use_fake_data=FLAGS.use_fake_data,
        max_instances_per_image=max_instances_per_image)

    if FLAGS.strategy == 'tpu':
        estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=params)
    else:
        params['batch_size'] = (FLAGS.train_batch_size //
                                getattr(strategy, 'num_replicas_in_sync', 1))
        params['num_shards'] = getattr(strategy, 'num_replicas_in_sync', 1)
        estimator = tf.estimator.Estimator(model_fn=model_fn_instance,
                                           config=run_config,
                                           params=params)

    # start train/eval flow.
    if FLAGS.mode == 'train':
        estimator.train(input_fn=train_input_fn, max_steps=train_steps)
        if FLAGS.eval_after_training:
            estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

    elif FLAGS.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout):

            logging.info('Starting to evaluate.')
            try:
                eval_results = estimator.evaluate(eval_input_fn,
                                                  steps=eval_steps)
                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                if current_step >= train_steps:
                    logging.info('Eval finished step %d/%d', current_step,
                                 train_steps)
                    break

            except tf.errors.NotFoundError:
                # Checkpoint might be not already deleted by the time eval finished.
                # We simply skip ssuch case.
                logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

    elif FLAGS.mode == 'train_and_eval':
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=train_steps)
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          steps=eval_steps,
                                          throttle_secs=600)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    else:
        logging.info('Invalid mode: %s', FLAGS.mode)
예제 #8
0
def main(_):

    if FLAGS.strategy == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        logging.info('Use horovod with multi gpus')
        hvd.init()
        os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())
    import tensorflow.compat.v1 as tf  # pylint: disable=g-import-not-at-top
    tf.enable_v2_tensorshape()
    tf.disable_eager_execution()

    if FLAGS.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tpu_grpc_url = tpu_cluster_resolver.get_master()
        tf.Session.reset(tpu_grpc_url)
    else:
        tpu_cluster_resolver = None

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # In the TPUEstimator context, the meaning of `shard` and `replica` is the
    # same; follwing the API, here has mixed use of both.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array'
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        feat_sizes = utils.get_feat_sizes(config.get('image_size'),
                                          config.get('max_level'))
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = feat_sizes[level]
            if _can_partition(spatial_dim['height']) and _can_partition(
                    spatial_dim['width']):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  iterations_per_loop=FLAGS.iterations_per_loop,
                  model_dir=FLAGS.model_dir,
                  num_shards=num_shards,
                  num_examples_per_epoch=FLAGS.num_examples_per_epoch,
                  strategy=FLAGS.strategy,
                  backbone_ckpt=FLAGS.backbone_ckpt,
                  ckpt=FLAGS.ckpt,
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode=FLAGS.mode)
    config_proto = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    if FLAGS.strategy != 'tpu':
        if FLAGS.use_xla:
            config_proto.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_1)
        config_proto.gpu_options.allow_growth = True

    tpu_config = tf.estimator.tpu.TPUConfig(
        FLAGS.iterations_per_loop if FLAGS.strategy == 'tpu' else 1,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
        PER_HOST_V2)

    if FLAGS.strategy == 'horovod':
        model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
    else:
        model_dir = FLAGS.model_dir

    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
        tf_random_seed=FLAGS.tf_random_seed,
    )

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
    max_instances_per_image = config.max_instances_per_image
    eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
    use_tpu = (FLAGS.strategy == 'tpu')
    logging.info(params)

    def _train(steps):
        """Build train estimator and run training if steps > 0."""
        train_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
        train_estimator.train(input_fn=dataloader.InputReader(
            FLAGS.training_file_pattern,
            is_training=True,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=max_instances_per_image),
                              max_steps=steps)

    def _eval(steps):
        """Build estimator and eval the latest checkpoint if steps > 0."""
        eval_params = dict(
            params,
            strategy=FLAGS.strategy,
            input_rand_hflip=False,
            is_training_bn=False,
        )
        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=eval_params)
        eval_results = eval_estimator.evaluate(input_fn=dataloader.InputReader(
            FLAGS.validation_file_pattern,
            is_training=False,
            max_instances_per_image=max_instances_per_image),
                                               steps=steps,
                                               name=FLAGS.eval_name)
        logging.info('Evaluation results: %s', eval_results)
        return eval_results

    # start train/eval flow.
    if FLAGS.mode == 'train':
        total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
        _train(total_examples // FLAGS.train_batch_size)
        if FLAGS.eval_after_training:
            _eval(eval_steps)

    elif FLAGS.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout):

            logging.info('Starting to evaluate.')
            try:
                eval_results = _eval(eval_steps)
                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (config.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

    elif FLAGS.mode == 'train_and_eval':
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        try:
            step = int(os.path.basename(ckpt).split("-")[1])
            current_epoch = (step * FLAGS.train_batch_size //
                             FLAGS.num_examples_per_epoch)
            logging.info('found ckpt at step %d (epoch %d)', step,
                         current_epoch)
        except (IndexError, TypeError):
            logging.info("Folder has no ckpt with valid step.",
                         FLAGS.model_dir)
            current_epoch = 0

        epochs_per_cycle = 1  # higher number has less graph construction overhead.
        for e in range(current_epoch + 1, config.num_epochs + 1,
                       epochs_per_cycle):
            print('-----------------------------------------------------\n'
                  '=====> Starting training, epoch: %d.' % e)
            _train(e * FLAGS.num_examples_per_epoch // FLAGS.train_batch_size)
            print('-----------------------------------------------------\n'
                  '=====> Starting evaluation, epoch: %d.' % e)
            eval_results = _eval(eval_steps)
            ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
            utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    else:
        logging.info('Invalid mode: %s', FLAGS.mode)
예제 #9
0
def main(_):
    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    if FLAGS.use_xla and FLAGS.strategy != 'tpu':
        tf.config.optimizer.set_jit(True)
        for gpu in tf.config.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(gpu, True)

    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        tf.random.set_seed(FLAGS.tf_random_seed)
        logging.set_verbosity(logging.DEBUG)

    if FLAGS.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
        ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver)
        logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
    elif FLAGS.strategy == 'gpus':
        ds_strategy = tf.distribute.MirroredStrategy()
        logging.info('All devices: %s', tf.config.list_physical_devices('GPU'))
    else:
        if tf.config.list_physical_devices('GPU'):
            ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0')
        else:
            ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0')

    steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.batch_size
    params = dict(profile=FLAGS.profile,
                  model_name=FLAGS.model_name,
                  iterations_per_loop=FLAGS.iterations_per_loop,
                  model_dir=FLAGS.model_dir,
                  steps_per_epoch=steps_per_epoch,
                  strategy=FLAGS.strategy,
                  batch_size=FLAGS.batch_size,
                  tf_random_seed=FLAGS.tf_random_seed,
                  debug=FLAGS.debug,
                  val_json_file=FLAGS.val_json_file,
                  eval_samples=FLAGS.eval_samples,
                  num_shards=ds_strategy.num_replicas_in_sync)
    config.override(params, True)
    # set mixed precision policy by keras api.
    precision = utils.get_precision(config.strategy, config.mixed_precision)
    policy = tf.keras.mixed_precision.experimental.Policy(precision)
    tf.keras.mixed_precision.experimental.set_policy(policy)

    def get_dataset(is_training, config):
        file_pattern = (FLAGS.training_file_pattern
                        if is_training else FLAGS.val_file_pattern)
        if not file_pattern:
            raise ValueError('No matching files.')

        return dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image,
            debug=FLAGS.debug)(config.as_dict())

    with ds_strategy.scope():
        if config.model_optimizations:
            tfmot.set_config(config.model_optimizations.as_dict())
        model = setup_model(config)
        if FLAGS.pretrained_ckpt:
            ckpt_path = tf.train.latest_checkpoint(FLAGS.pretrained_ckpt)
            util_keras.restore_ckpt(model, ckpt_path)
        init_experimental(config)
        val_dataset = get_dataset(False, config).repeat()
        model.fit(get_dataset(True, config),
                  epochs=config.num_epochs,
                  steps_per_epoch=steps_per_epoch,
                  callbacks=train_lib.get_callbacks(config.as_dict(),
                                                    val_dataset),
                  validation_data=val_dataset,
                  validation_steps=(FLAGS.eval_samples // FLAGS.batch_size))
    model.save_weights(os.path.join(FLAGS.model_dir, 'ckpt-final'))
예제 #10
0
  def test_train(self):
    tf.random.set_seed(1111)
    config = hparams_config.get_detection_config('efficientdet-d0')
    config.batch_size = 1
    config.num_examples_per_epoch = 1
    config.model_dir = tempfile.mkdtemp()
    x = tf.ones((1, 512, 512, 3))
    labels = {
        'box_targets_%d' % i: tf.ones((1, 512 // 2**i, 512 // 2**i, 36))
        for i in range(3, 8)
    }
    labels.update({
        'cls_targets_%d' % i: tf.ones((1, 512 // 2**i, 512 // 2**i, 9),
                                      dtype=tf.int32) for i in range(3, 8)
    })
    labels.update({'mean_num_positives': tf.constant([10.0])})

    params = config.as_dict()
    params['num_shards'] = 1
    model = train_lib.EfficientDetNetTrain(config=config)
    model.build((1, 512, 512, 3))
    model.compile(
        optimizer=train_lib.get_optimizer(params),
        loss={
            'box_loss':
                train_lib.BoxLoss(
                    params['delta'], reduction=tf.keras.losses.Reduction.NONE),
            'box_iou_loss':
                train_lib.BoxIouLoss(
                    params['iou_loss_type'],
                    reduction=tf.keras.losses.Reduction.NONE),
            'class_loss':
                train_lib.FocalLoss(
                    params['alpha'],
                    params['gamma'],
                    label_smoothing=params['label_smoothing'],
                    reduction=tf.keras.losses.Reduction.NONE)
        })

    # Test single-batch
    outputs = model.train_on_batch(x, labels, return_dict=True)
    expect_results = {'loss': 26278.25,
                      'det_loss': 26277.033203125,
                      'cls_loss': 5060.716796875,
                      'box_loss': 424.3263244628906,
                      'box_iou_loss': 0,
                      'gnorm': 5873.78759765625}
    self.assertAllClose(outputs, expect_results, rtol=.1, atol=100.)
    outputs = model.test_on_batch(x, labels, return_dict=True)
    expect_results = {'loss': 26079.712890625,
                      'det_loss': 26078.49609375,
                      'cls_loss': 5063.3759765625,
                      'box_loss': 420.30242919921875,
                      'box_iou_loss': 0}
    self.assertAllClose(outputs, expect_results, rtol=.1, atol=100.)

    # Test fit.
    hist = model.fit(
        x,
        labels,
        steps_per_epoch=1,
        epochs=1,
        callbacks=train_lib.get_callbacks(params))
    expect_results = {'loss': [26063.099609375],
                      'det_loss': [26061.8828125],
                      'cls_loss': [5058.1337890625],
                      'box_loss': [420.074951171875],
                      'box_iou_loss': [0],
                      'gnorm': [5107.46435546875]}
    self.assertAllClose(
        hist.history, expect_results, rtol=.1, atol=100.)
예제 #11
0
    def test_train(self):
        tf.random.set_seed(1111)
        config = hparams_config.get_detection_config('efficientdet-d0')
        config.heads = ['object_detection', 'segmentation']
        config.batch_size = 1
        config.num_examples_per_epoch = 1
        config.model_dir = tempfile.mkdtemp()
        config.steps_per_epoch = 1
        x = tf.ones((1, 512, 512, 3))
        labels = {
            'box_targets_%d' % i: tf.ones((1, 512 // 2**i, 512 // 2**i, 36))
            for i in range(3, 8)
        }
        labels.update({
            'cls_targets_%d' % i: tf.ones((1, 512 // 2**i, 512 // 2**i, 9),
                                          dtype=tf.int32)
            for i in range(3, 8)
        })
        labels.update({'image_masks': tf.ones((1, 128, 128, 1))})
        labels.update({'mean_num_positives': tf.constant([10.0])})

        params = config.as_dict()
        params['num_shards'] = 1
        model = train_lib.EfficientDetNetTrain(config=config)
        model.build((1, 512, 512, 3))
        model.compile(
            optimizer=train_lib.get_optimizer(params),
            loss={
                'box_loss':
                train_lib.BoxLoss(params['delta'],
                                  reduction=tf.keras.losses.Reduction.NONE),
                'box_iou_loss':
                train_lib.BoxIouLoss(params['iou_loss_type'],
                                     params['min_level'],
                                     params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'],
                                     reduction=tf.keras.losses.Reduction.NONE),
                'class_loss':
                train_lib.FocalLoss(params['alpha'],
                                    params['gamma'],
                                    label_smoothing=params['label_smoothing'],
                                    reduction=tf.keras.losses.Reduction.NONE),
                'seg_loss':
                tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            })

        # Test single-batch
        outputs = model.train_on_batch(x, labels, return_dict=True)
        expect_results = {
            'loss': [26278.3, 5061.9, 425.5, 1.217],
            'det_loss': 26277.033203125,
            'cls_loss': 5060.716796875,
            'box_loss': 424.3263244628906,
            'gnorm': 5873.78759765625,
            'seg_loss': 1.2215478420257568,
        }
        self.assertAllClose(outputs, expect_results, rtol=.1, atol=100.)
        outputs = model.test_on_batch(x, labels, return_dict=True)
        expect_results = {
            'loss': [26278.3, 5061.9, 425.5, 1.217],
            'det_loss': 26078.49609375,
            'cls_loss': 5063.3759765625,
            'box_loss': 420.30242919921875,
            'seg_loss': 1.2299377918243408,
        }
        self.assertAllClose(outputs, expect_results, rtol=.1, atol=100.)

        # Test fit.
        hist = model.fit(x,
                         labels,
                         steps_per_epoch=1,
                         epochs=1,
                         callbacks=train_lib.get_callbacks(params))

        self.assertAllClose(hist.history['loss'],
                            [[26067, 5057.5, 421.4, 1.2]],
                            rtol=.1,
                            atol=10.)
        self.assertAllClose(hist.history['det_loss'], [26061.],
                            rtol=.1,
                            atol=10.)
        self.assertAllClose(hist.history['cls_loss'], [5058.],
                            rtol=.1,
                            atol=10.)
        self.assertAllClose(hist.history['box_loss'], [420.],
                            rtol=.1,
                            atol=100.)
        self.assertAllClose(hist.history['seg_loss'], [1.2299],
                            rtol=.1,
                            atol=100.)
예제 #12
0
def main(_):
    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    if FLAGS.use_xla and FLAGS.strategy != 'tpu':
        tf.config.optimizer.set_jit(True)
        for gpu in tf.config.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(gpu, True)

    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        tf.random.set_seed(111111)
        logging.set_verbosity(logging.DEBUG)

    if FLAGS.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
        ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver)
        logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
    elif FLAGS.strategy == 'gpus':
        ds_strategy = tf.distribute.MirroredStrategy()
        logging.info('All devices: %s', tf.config.list_physical_devices('GPU'))
    else:
        if tf.config.list_physical_devices('GPU'):
            ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0')
        else:
            ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0')

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')

    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  iterations_per_loop=FLAGS.iterations_per_loop,
                  model_dir=FLAGS.model_dir,
                  num_examples_per_epoch=FLAGS.num_examples_per_epoch,
                  strategy=FLAGS.strategy,
                  batch_size=FLAGS.batch_size //
                  ds_strategy.num_replicas_in_sync,
                  num_shards=ds_strategy.num_replicas_in_sync,
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode=FLAGS.mode)

    # set mixed precision policy by keras api.
    precision = utils.get_precision(params['strategy'],
                                    params['mixed_precision'])
    policy = tf.keras.mixed_precision.experimental.Policy(precision)
    tf.keras.mixed_precision.experimental.set_policy(policy)

    def get_dataset(is_training, params):
        file_pattern = (FLAGS.training_file_pattern
                        if is_training else FLAGS.validation_file_pattern)
        return dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image)(params)

    with ds_strategy.scope():
        model = train_lib.EfficientDetNetTrain(params['model_name'], config)
        height, width = utils.parse_image_size(params['image_size'])
        model.build((params['batch_size'], height, width, 3))
        model.compile(
            optimizer=train_lib.get_optimizer(params),
            loss={
                'box_loss':
                train_lib.BoxLoss(params['delta'],
                                  reduction=tf.keras.losses.Reduction.NONE),
                'box_iou_loss':
                train_lib.BoxIouLoss(params['iou_loss_type'],
                                     params['min_level'],
                                     params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'],
                                     reduction=tf.keras.losses.Reduction.NONE),
                'class_loss':
                train_lib.FocalLoss(params['alpha'],
                                    params['gamma'],
                                    label_smoothing=params['label_smoothing'],
                                    reduction=tf.keras.losses.Reduction.NONE)
            })
    ckpt_path = tf.train.latest_checkpoint(FLAGS.model_dir)
    if ckpt_path:
        model.load_weights(ckpt_path)
    model.freeze_vars(params['var_freeze_expr'])
    model.fit(get_dataset(True, params=params),
              steps_per_epoch=FLAGS.num_examples_per_epoch,
              callbacks=train_lib.get_callbacks(params, FLAGS.profile),
              validation_data=get_dataset(False, params=params),
              validation_steps=FLAGS.eval_samples)
    model.save_weights(os.path.join(FLAGS.model_dir, 'model'))
예제 #13
0
파일: train.py 프로젝트: lushun666/automl
def main(_):
    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs

    # Parse image size in case it is in string format.
    config.image_size = utils.parse_image_size(config.image_size)

    if FLAGS.use_xla and FLAGS.strategy != 'tpu':
        tf.config.optimizer.set_jit(True)
        for gpu in tf.config.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(gpu, True)

    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        tf.random.set_seed(111111)
        logging.set_verbosity(logging.DEBUG)

    if FLAGS.strategy == 'tpu':
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
        ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver)
        logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
    elif FLAGS.strategy == 'gpus':
        ds_strategy = tf.distribute.MirroredStrategy()
        logging.info('All devices: %s', tf.config.list_physical_devices('GPU'))
    else:
        if tf.config.list_physical_devices('GPU'):
            ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0')
        else:
            ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0')

    steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.batch_size
    params = dict(config.as_dict(),
                  profile=FLAGS.profile,
                  model_name=FLAGS.model_name,
                  iterations_per_loop=FLAGS.iterations_per_loop,
                  model_dir=FLAGS.model_dir,
                  steps_per_epoch=steps_per_epoch,
                  strategy=FLAGS.strategy,
                  batch_size=FLAGS.batch_size,
                  num_shards=ds_strategy.num_replicas_in_sync)

    # set mixed precision policy by keras api.
    precision = utils.get_precision(params['strategy'],
                                    params['mixed_precision'])
    policy = tf.keras.mixed_precision.experimental.Policy(precision)
    tf.keras.mixed_precision.experimental.set_policy(policy)

    def get_dataset(is_training, params):
        file_pattern = (FLAGS.training_file_pattern
                        if is_training else FLAGS.validation_file_pattern)
        if not file_pattern:
            raise ValueError('No matching files.')

        return dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image)(params)

    with ds_strategy.scope():
        model = train_lib.EfficientDetNetTrain(params['model_name'], config)
        model.compile(
            optimizer=train_lib.get_optimizer(params),
            loss={
                'box_loss':
                train_lib.BoxLoss(params['delta'],
                                  reduction=tf.keras.losses.Reduction.NONE),
                'box_iou_loss':
                train_lib.BoxIouLoss(params['iou_loss_type'],
                                     params['min_level'],
                                     params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'],
                                     reduction=tf.keras.losses.Reduction.NONE),
                'class_loss':
                train_lib.FocalLoss(params['alpha'],
                                    params['gamma'],
                                    label_smoothing=params['label_smoothing'],
                                    reduction=tf.keras.losses.Reduction.NONE),
                'seg_loss':
                tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
            })

        if FLAGS.pretrained_ckpt:
            ckpt_path = tf.train.latest_checkpoint(FLAGS.pretrained_ckpt)
            util_keras.restore_ckpt(model, ckpt_path,
                                    params['moving_average_decay'])
        tf.io.gfile.makedirs(FLAGS.model_dir)
        if params['model_optimizations']:
            model_optimization.set_config(params['model_optimizations'])
        model.build((FLAGS.batch_size, *config.image_size, 3))
        model.fit(get_dataset(True, params=params),
                  epochs=params['num_epochs'],
                  steps_per_epoch=steps_per_epoch,
                  callbacks=train_lib.get_callbacks(params),
                  validation_data=get_dataset(False, params=params).repeat(),
                  validation_steps=(FLAGS.eval_samples // FLAGS.batch_size))
    model.save_weights(os.path.join(FLAGS.model_dir, 'ckpt-final'))
예제 #14
0
def main(argv):
    del argv  # Unused.

    # if given an efficentdet ckpt don't use default backbone ckpt
    if FLAGS.backbone_ckpt == BACKBONE_CKPT_DEFAULT_DIR and FLAGS.ckpt is not None:
        print("Using ckpt flag: {}, ignoring default backbone_ckpt: {}".format(
            FLAGS.ckpt, FLAGS.backbone_ckpt))
        FLAGS.backbone_ckpt = None

    if FLAGS.use_horovod is not None:
        if FLAGS.dump_all_ranks:
            FLAGS.model_dir += "/worker_" + str(hvd.rank())
        if not 'HOROVOD_CYCLE_TIME' in os.environ:
            os.environ['HOROVOD_CYCLE_TIME'] = '0.5'
        if not 'HABANA_HCCL_COMM_API' in os.environ:
            os.environ['HABANA_HCCL_COMM_API'] = '0'
        hvd_init()

    if not FLAGS.no_hpu:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

        if FLAGS.use_horovod:
            assert (horovod_enabled())

    set_env(use_amp=FLAGS.use_amp)

    # deterministic setting
    if FLAGS.sbs_test or FLAGS.deterministic:
        set_deterministic()

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if not FLAGS.val_json_file and not FLAGS.testdev_dir:
            raise RuntimeError(
                'You must specify --val_json_file or --testdev for evaluation.'
            )

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # In the TPUEstimator context, the meaning of `shard` and `replica` is the
    # same; follwing the API, here has mixed use of both.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array'
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        image_size = config.get('image_size')
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):

            def _can_partition(spatial_dim):
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = image_size // (2**level)
            if _can_partition(spatial_dim):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores
        if horovod_enabled():
            num_shards = hvd.size()
        else:
            num_shards = 1

    params = build_estimator_params('train', config, num_shards)
    # disabling input data scaling/flip manipulations.
    if FLAGS.sbs_test:
        sbs_params = dict(input_rand_hflip=False,
                          train_scale_min=1,
                          train_scale_max=1,
                          dropout_rate=0.0)
        params.update(sbs_params)

    tf_random_seed = 0 if FLAGS.deterministic else None
    run_config = build_estimator_config('train', config, num_shards,
                                        num_cores_per_replica,
                                        input_partition_dims)
    write_hparams_v1(FLAGS.model_dir, {
        'batch_size': FLAGS.train_batch_size,
        **FLAGS.flag_values_dict()
    })

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)

    # TPU Estimator
    logging.info(params)

    if FLAGS.mode == 'train':
        train_estimator = HorovodEstimator(model_fn=model_fn_instance,
                                           model_dir=FLAGS.model_dir,
                                           config=run_config,
                                           params=params)

        # for deterministic input, we pass to dataloader False for not manipulating input data
        is_training = not FLAGS.deterministic
        use_fake_data = FLAGS.use_fake_data or FLAGS.deterministic

        input_fn = dataloader.InputReader(FLAGS.training_file_pattern,
                                          is_training=is_training,
                                          params=params,
                                          use_fake_data=use_fake_data,
                                          is_deterministic=FLAGS.deterministic)
        max_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        (FLAGS.train_batch_size * num_shards)) + 1

        # for sbs test, train under sbs callbacks
        if FLAGS.sbs_test:
            from TensorFlow.common.debug import dump_callback
            SBS_TEST_CONFIG = os.path.join(
                os.environ['TF_TESTS_ROOT'],
                "tests/tf_training_tests/side_by_side/topologies/efficientdet/dump_config.json"
            )
            with dump_callback(SBS_TEST_CONFIG):
                train_estimator.train(input_fn=input_fn, max_steps=max_steps)
        else:
            if FLAGS.ckpt is not None:
                train_estimator.train(input_fn=input_fn, steps=max_steps)
            else:
                train_estimator.train(input_fn=input_fn, max_steps=max_steps)

    elif FLAGS.mode == 'eval':
        eval_params = build_estimator_params('eval', config, num_shards)
        eval_config = build_estimator_config('eval', config, num_shards,
                                             num_cores_per_replica,
                                             input_partition_dims)

        # Eval only runs on CPU or GPU host with batch_size = 1.
        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        # Also, disable use_bfloat16 for eval on CPU/GPU.

        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=False,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=eval_config,
            params=eval_params)

        def terminate_eval():
            logging.info('Terminating eval after %d seconds of no checkpoints',
                         FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
                logging.info('Eval results: %s', eval_results)

                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                write_summary(eval_results, ckpt, current_step)

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        train_params = build_estimator_params('train', config, num_shards)
        train_config = build_estimator_config('train', config, num_shards,
                                              num_cores_per_replica,
                                              input_partition_dims)
        train_estimator = HorovodEstimator(model_fn=model_fn_instance,
                                           model_dir=FLAGS.model_dir,
                                           config=train_config,
                                           params=train_params)

        eval_estimator = None

        for cycle in range(FLAGS.num_epochs):
            logging.info('Starting training cycle, epoch: %d.', cycle)

            train_estimator.train(
                input_fn=dataloader.InputReader(
                    FLAGS.training_file_pattern,
                    is_training=True,
                    use_fake_data=FLAGS.use_fake_data),
                max_steps=(cycle + 1) *
                int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))

            # synchronization point for all ranks
            if horovod_enabled():
                hvd.allreduce(tf.constant(0))

            logging.info('Starting evaluation cycle, epoch: %d.', cycle)
            # Run evaluation after every epoch.

            if eval_estimator is None:
                eval_params = build_estimator_params('eval', config,
                                                     num_shards)
                eval_config = build_estimator_config('eval', config,
                                                     num_shards,
                                                     num_cores_per_replica,
                                                     input_partition_dims)
                eval_estimator = tf.estimator.tpu.TPUEstimator(
                    model_fn=model_fn_instance,
                    use_tpu=False,
                    train_batch_size=FLAGS.train_batch_size,
                    eval_batch_size=FLAGS.eval_batch_size,
                    config=eval_config,
                    params=eval_params)

            if is_rank0():
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)

                checkpoint_path = Path(FLAGS.model_dir)
                last_ckpt = tf.train.latest_checkpoint(str(checkpoint_path),
                                                       latest_filename=None)
                current_step = int(os.path.basename(last_ckpt).split('-')[1])
                write_summary(eval_results, FLAGS.model_dir, current_step)
                logging.info('Evaluation results: %s', eval_results)

                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        pass

    else:
        logging.info('Mode not found.')
예제 #15
0
                labels['cls_targets_%d' % level] = cls_targets[level]
                labels['box_targets_%d' % level] = box_targets[level]
            # Concatenate groundtruth annotations to a tensor.
            groundtruth_data = tf.concat([boxes, areas, classes], axis=2)
            #labels['source_ids'] = source_ids
            labels['groundtruth_data'] = groundtruth_data
            labels['image_scales'] = image_scales
            return images, labels

        dataset = dataset.map(_process_example)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        #if self._use_fake_data:
        # Turn this dataset into a semi-fake dataset which always loop at the
        # first batch. This reduces variance in performance and is useful in
        # testing.
        #dataset = dataset.take(1).cache().repeat()
        return dataset


if __name__ == '__main__':
    Datatset = InputReader(
        "F:/automl-master/efficientdet/dataset/tfrecord/-00000-of-00001.tfrecord",
        is_training=True,
        use_fake_data=False)
    config = hparams_config.get_detection_config('efficientdet-d1')
    params = dict(config.as_dict())
    dataset = Datatset(params)
    iter = dataset.make_one_shot_iterator()
    images, labels = iter.get_next()
    with tf.Session() as sess:
        print(sess.run(images))
예제 #16
0
파일: main.py 프로젝트: pandora2000/automl
def main(argv):
  assert len(argv) >= 1
  if len(argv) > 1:  # Do not accept unknown args.
    raise ValueError('Received unknown arguments: {}'.format(argv[1:]))

  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu,
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')

  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`, see the partition logic below.
  # In the TPUEstimator context, the meaning of `shard` and `replica` is the
  # same; follwing the API, here has mixed use of both.
  if FLAGS.use_spatial_partition:
    # Checks input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be a product of array'
                         'elements in --input_partition_dims.')

    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
    }
    # The Input Partition Logic: We partition only the partition-able tensors.
    # Spatial partition requires that the to-be-partitioned tensors must have a
    # dimension that is a multiple of `partition_dims`. Depending on the
    # `partition_dims` and the `image_size` and the `max_level` in config, some
    # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
    # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
    # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
    # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
    # case, the level-8 and level-9 target tensors are not partition-able, and
    # the highest partition-able level is 7.
    feat_sizes = utils.get_feat_sizes(
        config.get('image_size'), config.get('max_level'))
    for level in range(config.get('min_level'), config.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = feat_sizes[level]
      if _can_partition(spatial_dim['height']) and _can_partition(
          spatial_dim['width']):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None
    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [
        FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(
      config.as_dict(),
      model_name=FLAGS.model_name,

      iterations_per_loop=FLAGS.iterations_per_loop,
      model_dir=FLAGS.model_dir,
      num_shards=num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      backbone_ckpt=FLAGS.backbone_ckpt,
      ckpt=FLAGS.ckpt,
      val_json_file=FLAGS.val_json_file,
      testdev_dir=FLAGS.testdev_dir,
      mode=FLAGS.mode,
  )
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)

  tpu_config = tf.estimator.tpu.TPUConfig(
      FLAGS.iterations_per_loop,
      num_shards=num_shards,
      num_cores_per_replica=num_cores_per_replica,
      input_partition_dims=input_partition_dims,
      per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
      .PER_HOST_V2)

  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config,
      tf_random_seed=FLAGS.tf_random_seed,
  )

  model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)

  # TPU Estimator
  logging.info(params)
  if FLAGS.mode == 'train':
    train_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                        is_training=True,
                                        use_fake_data=FLAGS.use_fake_data),
        max_steps=int((config.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          use_tpu=FLAGS.use_tpu,
          input_rand_hflip=False,
          is_training_bn=False,
          precision=None,
      )
      eval_estimator = tf.estimator.tpu.TPUEstimator(
          model_fn=model_fn_instance,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_samples//FLAGS.eval_batch_size)
      logging.info('Eval results: %s', eval_results)
      ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
      utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

  elif FLAGS.mode == 'eval':
    # Eval only runs on CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input pipeline
    # and don't run on the TPU.
    eval_params = dict(
        params,
        use_tpu=FLAGS.use_tpu,
        input_rand_hflip=False,
        is_training_bn=False,
        precision=None,
    )

    eval_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      logging.info('Terminating eval after %d seconds of no checkpoints',
                   FLAGS.eval_timeout)
      return True

    # Run evaluation when there's a new checkpoint
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):

      logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                            is_training=False),
            steps=FLAGS.eval_samples//FLAGS.eval_batch_size)
        logging.info('Eval results: %s', eval_results)

        # Terminate eval job when final checkpoint is reached.
        try:
          current_step = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          logging.info('%s has no global step info: stop!', ckpt)
          break

        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        total_step = int((config.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          logging.info('Evaluation finished after training step %d',
                       current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                     ckpt)

  elif FLAGS.mode == 'train_and_eval':
    # resumeした場合、global_stepにstep数が入っていて、cycleは関係がない
    for cycle in range(config.num_epochs):
      logging.info('Starting training cycle, epoch: %d.', cycle)
      train_estimator = tf.estimator.tpu.TPUEstimator(
          model_fn=model_fn_instance,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                          is_training=True,
                                          use_fake_data=FLAGS.use_fake_data),
          steps=int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))

      logging.info('Starting evaluation cycle, epoch: %d.', cycle)
      # Run evaluation after every epoch.
      eval_params = dict(
          params,
          use_tpu=FLAGS.use_tpu,
          input_rand_hflip=False,
          is_training_bn=False,
      )

      eval_estimator = tf.estimator.tpu.TPUEstimator(
          model_fn=model_fn_instance,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_samples//FLAGS.eval_batch_size)
      logging.info('Evaluation results: %s', eval_results)
      ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
      logging.info(f'save_checkpoint_start for epoch: {cycle}')
      now = datetime.now().strftime('%Y%m%d%H%M%S')
      with open('/tmp/main_inner.log', 'a') as f:
        f.write(f'{now}: save_checkpoint_start {cycle}\n')
      utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
      with open('/tmp/main_inner.log', 'a') as f:
        f.write(f'{now}: save_checkpoint_end {cycle}\n')
      logging.info(f'save_checkpoint_end for epoch: {cycle}')

  else:
    logging.info('Mode not found.')