Example #1
    def __init__(self, model_name='efficientnet-b0', batch_size=1):
        """Initialize internal variables."""
        self.model_name = model_name
        self.batch_size = batch_size
        self.num_classes = 1000
        # Model scaling parameters.
        _, _, self.image_size, _ = efficientnet_builder.efficientnet_params(
            model_name)
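
Note: in the reference TPU EfficientNet code, efficientnet_params returns a
(width_coefficient, depth_coefficient, resolution, dropout_rate) tuple, which
is why these examples unpack four values and keep only the third. A minimal
sketch (the printed value assumes the standard 'efficientnet-b0' row of the
reference parameter table):

    # Keep all four scaling parameters instead of discarding them.
    width, depth, resolution, dropout = efficientnet_builder.efficientnet_params(
        'efficientnet-b0')
    print(resolution)  # 224 for efficientnet-b0 in the reference table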
Example #2
    def unl_dst_parser(self, value):
        keys_to_features = {
            'probabilities':
            tf.FixedLenFeature([FLAGS.num_label_classes], tf.float32),
            'label':
            tf.FixedLenFeature([], tf.int64, -1),
            'prob':
            tf.FixedLenFeature([], tf.float32),
            'image/encoded':
            tf.FixedLenFeature((), tf.string, ''),
        }
        parsed = tf.parse_single_example(value, keys_to_features)
        image_bytes = tf.reshape(parsed['image/encoded'], shape=[])
        ori_image = tf.image.decode_jpeg(image_bytes, channels=3)

        if FLAGS.unl_aug == 'default':
            augment_name = FLAGS.augment_name
        else:
            augment_name = FLAGS.unl_aug

        image = self.image_preprocessing_fn(
            input_tensor=ori_image,
            is_training=self.is_training and not FLAGS.remove_aug,
            image_size=self.image_size,
            use_bfloat16=self.use_bfloat16,
            augment_name=augment_name,
            randaug_mag=FLAGS.randaug_mag,
            is_image_bytes=False,
        )

        label = tf.cast(tf.reshape(parsed['label'], shape=[]), dtype=tf.int32)
        probabilities = tf.cast(tf.reshape(parsed['probabilities'],
                                           shape=[FLAGS.num_label_classes]),
                                dtype=tf.float32)
        top_1_prob = tf.cast(tf.reshape(parsed['prob'], shape=[]),
                             dtype=tf.float32)
        parsed_results = {
            'unl_image': image,
            'unl_label': label,
            'unl_probs': probabilities,
            'top_1_prob': top_1_prob,
        }
        if FLAGS.teacher_model_name:
            teacher_image_size = efficientnet_builder.efficientnet_params(
                FLAGS.teacher_model_name)[2]
            if FLAGS.small_image_model:
                teacher_image_size = FLAGS.input_image_size
            teacher_image = self.image_preprocessing_fn(
                input_tensor=image_bytes,
                is_training=False,
                image_size=teacher_image_size,
                use_bfloat16=self.use_bfloat16,
                augment_name=augment_name,
                randaug_mag=FLAGS.randaug_mag)
            parsed_results['unl_teacher_image'] = teacher_image
        return parsed_results
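
For context, a parser like unl_dst_parser is typically mapped over a TFRecord
dataset. A hedged sketch of that wiring (the file pattern and batch size are
hypothetical, and `parser` stands for an instance of the class above):

    files = tf.data.Dataset.list_files('/data/unlabeled-*.tfrecord')  # hypothetical
    dataset = files.interleave(tf.data.TFRecordDataset, cycle_length=4)
    dataset = dataset.map(parser.unl_dst_parser,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(64, drop_remainder=True)  # hypothetical batch size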
Example #3
def get_eval_driver(model_name, include_background_label=False):
    """Get a eval driver."""
    if model_name.startswith('efficientnet-edgetpu'):
        _, _, image_size, _ = (efficientnet_edgetpu_builder.
                               efficientnet_edgetpu_params(model_name))
    elif model_name.startswith('efficientnet'):
        _, _, image_size, _ = efficientnet_builder.efficientnet_params(
            model_name)
    else:
        raise ValueError(
            'Model must be either efficientnet-b* or efficientnet-edgetpu*')

    return EvalCkptDriver(model_name=model_name,
                          batch_size=1,
                          image_size=image_size,
                          include_background_label=include_background_label)
Example #4
    def __init__(self,
                 model_name='efficientnet-b0',
                 num_classes=1000,
                 skip_load=None):

        self.scores = None
        self.model_name = model_name
        # Use None as the default to avoid a shared mutable default argument.
        self.skip_load_ = skip_load if skip_load is not None else []
        self.params = {"num_classes": num_classes}

        _, _, image_size, _ = efficientnet_builder.efficientnet_params(
            model_name)
        self.scale_size_ = (image_size, image_size)

        print("EfficientNet model %s size: %s" %
              (model_name, str(self.scale_size_)))
Example #5
    def __init__(self, model_name='efficientnet-b0', batch_size=1):
        """Initialize internal variables."""
        self.model_name = model_name
        self.batch_size = batch_size
        self.num_classes = 1000
        # Model scaling parameters.
        if model_name.startswith('efficientnet-edgetpu'):
            _, _, self.image_size, _ = (
                efficientnet_edgetpu_builder.efficientnet_edgetpu_params(
                    model_name))
        elif model_name.startswith('efficientnet'):
            _, _, self.image_size, _ = efficientnet_builder.efficientnet_params(
                model_name)
        else:
            raise ValueError(
                'Model must be either efficientnet-b* or efficientnet-edgetpu*')
Example #6
def get_model_input_size(model_name):
    """Get model input size for a given model name."""
    if model_name.startswith('efficientnet-lite'):
        _, _, image_size, _ = (
            efficientnet_lite_builder.efficientnet_lite_params(model_name))
    elif model_name.startswith('efficientnet-edgetpu-'):
        _, _, image_size, _ = (efficientnet_edgetpu_builder.
                               efficientnet_edgetpu_params(model_name))
    elif model_name.startswith('efficientnet-condconv-'):
        _, _, image_size, _, _ = (efficientnet_condconv_builder.
                                  efficientnet_condconv_params(model_name))
    elif model_name.startswith('efficientnet'):
        _, _, image_size, _ = efficientnet_builder.efficientnet_params(
            model_name)
    else:
        raise ValueError(
            'Model must be one of efficientnet-b*, efficientnet-edgetpu-*, '
            'efficientnet-condconv-*, or efficientnet-lite*')
    return image_size
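
A short usage sketch for get_model_input_size; the value shown assumes the
standard reference parameter table, in which efficientnet-b4 maps to a 380px
input resolution:

    image_size = get_model_input_size('efficientnet-b4')
    print(image_size)  # 380 under the reference efficientnet_params table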
Example #7
    def dataset_parser(self, value):
        """Parses an image and its label from a serialized ImageNet TFExample.

        Args:
          value: serialized string containing an ImageNet TFExample.

        Returns:
          A dict with the preprocessed image and its label (and, when a teacher
          model is configured, a separately preprocessed teacher image).
        """
        keys_to_features = {
            'image/encoded': tf.FixedLenFeature((), tf.string, ''),
            'image/class/label': tf.FixedLenFeature([], tf.int64, -1),
        }

        parsed = tf.parse_single_example(value, keys_to_features)
        image_bytes = tf.reshape(parsed['image/encoded'], shape=[])

        image = self.image_preprocessing_fn(
            input_tensor=image_bytes,
            is_training=self.is_training and not FLAGS.remove_aug,
            image_size=self.image_size,
            use_bfloat16=self.use_bfloat16,
            augment_name=FLAGS.augment_name,
            randaug_mag=FLAGS.randaug_mag,
        )
        label = tf.cast(tf.reshape(parsed['image/class/label'], shape=[]),
                        dtype=tf.int32)
        # Subtract one so that labels are in [0, 1000).
        if self.label_minus_one:
            label = label - 1
        parsed_results = {'image': image, 'label': label}
        if FLAGS.teacher_model_name:
            teacher_image_size = efficientnet_builder.efficientnet_params(
                FLAGS.teacher_model_name)[2]
            if FLAGS.small_image_model:
                teacher_image_size = FLAGS.input_image_size
            teacher_image = self.image_preprocessing_fn(
                input_tensor=image_bytes,
                is_training=False,
                image_size=teacher_image_size,
                use_bfloat16=self.use_bfloat16)
            parsed_results['teacher_image'] = teacher_image
        return parsed_results
Example #8
def get_efficientnet(model_name,
                     training: bool = True,
                     model_ckpt: str = None,
                     regression=False,
                     image_size=None):
    """ Build efficientnet_b0 and load pre-trained weights """
    model_param = efficientnet_builder.efficientnet_params(model_name)
    _, global_params = efficientnet_builder.get_model_params(model_name, {})
    image_size = model_param[2] if not image_size else image_size

    inputs = tf.keras.layers.Input(shape=(image_size, image_size, 3),
                                   dtype=tf.uint8,
                                   name="image_tensor")

    features = _get_efficientnet(inputs,
                                 model_name=model_name,
                                 training=training,
                                 model_ckpt=model_ckpt)

    with tf.variable_scope("head"):
        features = tf.keras.layers.Conv2D(
            filters=efficientnet_model.round_filters(1280, global_params),
            kernel_size=(1, 1),
            strides=(1, 1),
            padding="same",
            use_bias=False)(features)
        features = tf.keras.layers.BatchNormalization()(features)
        features = tf.keras.layers.ReLU()(features)
        features = tf.keras.layers.GlobalAveragePooling2D(
            data_format="channels_last")(features)

    if training:
        features = tf.keras.layers.Dropout(model_param[3])(features)

    if regression:
        logits = tf.keras.layers.Dense(1)(features)
    else:
        logits = tf.keras.layers.Dense(5, activation="softmax",
                                       name="scores")(features)

    return tf.keras.Model(inputs, logits)
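
The head above widens its 1x1 convolution with round_filters, which scales a
base channel count by the model's width coefficient and rounds the result to a
multiple of a divisor. A standalone sketch of that rounding rule, for
illustration only (the defaults mirror the common width_coefficient=1.0 and
depth_divisor=8; treat it as an approximation of the library helper, not the
library code itself):

    def round_filters_sketch(filters, width_coefficient=1.0, depth_divisor=8):
        """Scale filters by the width multiplier, rounding to a multiple of depth_divisor."""
        filters *= width_coefficient
        new_filters = int(filters + depth_divisor / 2) // depth_divisor * depth_divisor
        new_filters = max(depth_divisor, new_filters)
        # Do not let rounding shrink the channel count by more than 10%.
        if new_filters < 0.9 * filters:
            new_filters += depth_divisor
        return int(new_filters)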
Example #9
def main(unused_argv):

    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        if FLAGS.model_name.startswith('efficientnet-edgetpu'):
            _, _, input_image_size, _ = efficientnet_edgetpu_builder.efficientnet_edgetpu_params(
                FLAGS.model_name)
        elif FLAGS.model_name.startswith('efficientnet-tpu'):
            _, _, input_image_size, _ = efficientnet_tpu_builder.efficientnet_tpu_params(
                FLAGS.model_name)
        elif FLAGS.model_name.startswith('efficientnet'):
            _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
                FLAGS.model_name)
        else:
            raise ValueError(
                'input_image_size must be set except for EfficientNet')

    # For the ImageNet dataset, include the background label if the number of
    # output classes is 1001.
    include_background_label = (FLAGS.num_label_classes == 1001)

    if FLAGS.tpu or FLAGS.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    if FLAGS.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long
    # Initializes model parameters.
    params = dict(steps_per_epoch=FLAGS.num_train_images /
                  FLAGS.train_batch_size,
                  use_bfloat16=FLAGS.use_bfloat16)
    est = tf.contrib.tpu.TPUEstimator(use_tpu=FLAGS.use_tpu,
                                      model_fn=model_fn,
                                      config=config,
                                      train_batch_size=FLAGS.train_batch_size,
                                      eval_batch_size=FLAGS.eval_batch_size,
                                      export_to_tpu=FLAGS.export_to_tpu,
                                      params=params)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    def build_imagenet_input(is_training):
        """Generate ImageNetInput for training and eval."""
        if FLAGS.bigtable_instance:
            tf.logging.info('Using Bigtable dataset, table %s',
                            FLAGS.bigtable_table)
            select_train, select_eval = _select_tables_from_flags()
            return imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=FLAGS.use_bfloat16,
                transpose_input=FLAGS.transpose_input,
                selection=select_train if is_training else select_eval,
                include_background_label=include_background_label,
                autoaugment_name=FLAGS.autoaugment_name)
        else:
            if FLAGS.data_dir == FAKE_DATA_DIR:
                tf.logging.info('Using fake dataset.')
            else:
                tf.logging.info('Using dataset: %s', FLAGS.data_dir)

            return imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=FLAGS.transpose_input,
                cache=FLAGS.use_cache and is_training,
                image_size=input_image_size,
                num_parallel_calls=FLAGS.num_parallel_calls,
                use_bfloat16=FLAGS.use_bfloat16,
                include_background_label=include_background_label,
                autoaugment_name=FLAGS.autoaugment_name)

    imagenet_train = build_imagenet_input(is_training=True)
    imagenet_eval = build_imagenet_input(is_training=False)

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                # This time will include compilation time.
                start_timestamp = time.time()
                eval_results = est.evaluate(input_fn=imagenet_eval.input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        # This time will include compilation time.
        start_timestamp = time.time()

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            est.train(input_fn=imagenet_train.input_fn,
                      max_steps=FLAGS.train_steps,
                      hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                est.train(input_fn=imagenet_train.input_fn,
                          max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = est.evaluate(input_fn=imagenet_eval.input_fn,
                                            steps=FLAGS.num_eval_images //
                                            FLAGS.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)
    if FLAGS.export_dir:
        export(est, FLAGS.export_dir, input_image_size)
Example #10
def main(unused_argv):
    if FLAGS.task_name == 'svhn':
        FLAGS.input_image_size = 32
        FLAGS.small_image_model = True
        FLAGS.num_label_classes = 10
    if FLAGS.num_train_images is None:
        FLAGS.num_train_images = task_info.get_num_train_images(
            FLAGS.task_name)
    if FLAGS.num_eval_images is None:
        FLAGS.num_eval_images = task_info.get_num_eval_images(FLAGS.task_name)
    if FLAGS.num_test_images is None and FLAGS.task_name != 'imagenet':
        FLAGS.num_test_images = task_info.get_num_test_images(FLAGS.task_name)

    steps_per_epoch = (FLAGS.num_train_images /
                       (FLAGS.train_batch_size * FLAGS.label_data_sample_prob))
    if FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval':
        tf.gfile.MakeDirs(FLAGS.model_dir)
        flags_dict = tf.app.flags.FLAGS.flag_values_dict()
        with tf.gfile.Open(os.path.join(FLAGS.model_dir, 'FLAGS.json'),
                           'w') as ouf:
            json.dump(flags_dict, ouf)
    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
            FLAGS.model_name)
        FLAGS.input_image_size = input_image_size
    if FLAGS.train_last_step_num == -1:
        FLAGS.train_last_step_num = FLAGS.train_steps
    if FLAGS.train_ratio != 1:
        FLAGS.train_last_step_num *= FLAGS.train_ratio
        FLAGS.train_steps *= FLAGS.train_ratio
        FLAGS.train_last_step_num = int(FLAGS.train_last_step_num)
        FLAGS.train_steps = int(FLAGS.train_steps)

    if (FLAGS.tpu or FLAGS.use_tpu) and not FLAGS.master:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    # The TPUConfig is identical whether or not a TPU is in use.
    tpu_config = tf.estimator.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.
        PER_HOST_V2)
    config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=max(FLAGS.save_checkpoints_steps, FLAGS.iterations_per_loop),
        log_step_count_steps=FLAGS.log_step_count_steps,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tpu_config)  # pylint: disable=line-too-long
    # Initializes model parameters.
    params = dict(steps_per_epoch=steps_per_epoch,
                  use_bfloat16=FLAGS.use_bfloat16)
    est = tf.estimator.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=8,
        params=params)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.label_data_dir == FAKE_DATA_DIR:
        tf.logging.info('Using fake dataset.')
    else:
        tf.logging.info('Using dataset: %s', FLAGS.label_data_dir)

    train_data = data_input.DataInput(is_training=True,
                                      data_dir=FLAGS.label_data_dir,
                                      transpose_input=FLAGS.transpose_input,
                                      cache=FLAGS.use_cache,
                                      image_size=input_image_size,
                                      use_bfloat16=FLAGS.use_bfloat16)
    if FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval':
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_last_step_num,
            FLAGS.train_last_step_num / params['steps_per_epoch'],
            current_step)

        # This time will include compilation time.
        start_timestamp = time.time()

        if FLAGS.mode == 'train':
            est.train(input_fn=train_data.input_fn,
                      max_steps=FLAGS.train_last_step_num,
                      hooks=[])
    elif FLAGS.mode == 'eval':
        input_fn_mapping = {}
        for subset in ['dev', 'test']:
            input_fn_mapping[subset] = data_input.DataInput(
                is_training=False,
                data_dir=FLAGS.label_data_dir,
                transpose_input=FLAGS.transpose_input,
                cache=False,
                image_size=input_image_size,
                use_bfloat16=FLAGS.use_bfloat16,
                subset=subset).input_fn
            if subset == 'dev':
                num_images = FLAGS.num_eval_images
            else:
                num_images = FLAGS.num_test_images
            eval_results = est.evaluate(input_fn=input_fn_mapping[subset],
                                        steps=num_images //
                                        FLAGS.eval_batch_size)
            tf.logging.info('%s, results: %s', subset, eval_results)
    elif FLAGS.mode == 'predict':
        predict_label.run_prediction(est)
    else:
        raise ValueError('Unknown mode: %s' % FLAGS.mode)
Example #11
def get_model_input_size(model_name):
    """Get model input size for a given model name."""
    _, _, image_size, _ = efficientnet_builder.efficientnet_params(model_name)
    return image_size
Example #12
File: train.py  Project: yichenj/tpu
def main(unused_argv):
    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        if FLAGS.model_name.startswith('efficientnet'):
            _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
                FLAGS.model_name)
        else:
            raise ValueError(
                'input_image_size must be set except for EfficientNet')

    config = tf.estimator.RunConfig(
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True))))
    # Initializes model parameters.
    params = dict(steps_per_epoch=FLAGS.num_train_images /
                  FLAGS.train_batch_size)
    est = tf.estimator.Estimator(model_fn=model_fn,
                                 config=config,
                                 params=params)

    def build_input(is_training):
        """Input for training and eval."""
        tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        return egg_candler_input.EggCandlerInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            image_size=input_image_size)

    image_train = build_input(is_training=True)
    image_eval = build_input(is_training=False)

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(FLAGS.model_dir,
                                                  timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                # This time will include compilation time.
                start_timestamp = time.time()
                eval_results = est.evaluate(input_fn=image_eval.input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)
                utils.archive_ckpt(eval_results, eval_results['val_accuracy'],
                                   ckpt)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        # This time will include compilation time.
        start_timestamp = time.time()

        if FLAGS.mode == 'train':
            est.train(input_fn=image_train.input_fn,
                      max_steps=FLAGS.train_steps,
                      hooks=[])
        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                est.train(input_fn=image_train.input_fn,
                          max_steps=next_checkpoint,
                          hooks=[])
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = est.evaluate(input_fn=image_eval.input_fn,
                                            steps=FLAGS.num_eval_images //
                                            FLAGS.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results, eval_results['val_accuracy'],
                                   ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)
    if FLAGS.export_dir:
        export(est, FLAGS.export_dir, input_image_size)
Example #13
def main(unused_argv):

    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        if FLAGS.model_name.startswith('efficientnet'):
            _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
                FLAGS.model_name)
        else:
            raise ValueError(
                'input_image_size must be set except for EfficientNet.')

    save_checkpoints_steps = max(100, FLAGS.steps_per_eval)

    config = tf.estimator.RunConfig(
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
    )

    params = dict(steps_per_epoch=FLAGS.num_train_images /
                  FLAGS.train_batch_size,
                  use_bfloat16=FLAGS.use_bfloat16,
                  batch_size=FLAGS.train_batch_size)
    est = tf.estimator.Estimator(model_fn=model_fn,
                                 config=config,
                                 params=params)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.

    if FLAGS.data_dir == FAKE_DATA_DIR:
        tf.logging.info('Using fake dataset.')
    else:
        tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    data_train, data_eval = [
        mnist_input.ImageNetInput(is_training=is_training,
                                  data_dir=FLAGS.data_dir,
                                  transpose_input=FLAGS.transpose_input,
                                  cache=FLAGS.use_cache and is_training,
                                  image_size=input_image_size,
                                  use_bfloat16=FLAGS.use_bfloat16)
        for is_training in [True, False]
    ]

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                # This time will include compilation time.
                start_timestamp = time.time()
                eval_results = est.evaluate(input_fn=data_eval.input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

        if FLAGS.export_dir:
            export(est, FLAGS.export_dir, input_image_size)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        # This time will include compilation time.
        start_timestamp = time.time()

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            est.train(input_fn=data_train.input_fn,
                      max_steps=FLAGS.train_steps,
                      hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                est.train(input_fn=data_train.input_fn,
                          max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = est.evaluate(input_fn=data_eval.input_fn,
                                            steps=FLAGS.num_eval_images //
                                            FLAGS.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results,
                                   eval_results['top_1_accuracy'], ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)
            if FLAGS.export_dir:
                export(est, FLAGS.export_dir, input_image_size)
Example #14

MEAN_RGB = [0.485 * 255, 0.456 * 255, 0.406 * 255]
STDDEV_RGB = [0.229 * 255, 0.224 * 255, 0.225 * 255]

# Initialize internal variables.
model_name = 'efficientnet-b0'
batch_size = 256
num_classes = 1000
# Model scaling parameters.
_, _, image_size, _ = efficientnet_builder.efficientnet_params(model_name)


def restore_model(sess, ckpt_dir):
    """Restore variables from checkpoint dir."""
    checkpoint = tf.train.latest_checkpoint(ckpt_dir)
    ema = tf.train.ExponentialMovingAverage(decay=0.9999)
    ema_vars = tf.trainable_variables() + tf.get_collection('moving_vars')
    for v in tf.global_variables():
        if 'moving_mean' in v.name or 'moving_variance' in v.name:
            ema_vars.append(v)
    ema_vars = list(set(ema_vars))
    var_dict = ema.variables_to_restore(ema_vars)
    saver = tf.train.Saver(var_dict, max_to_keep=1)
    saver.restore(sess, checkpoint)
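
A hedged usage sketch for restore_model, assuming the EfficientNet graph has
already been built in the default graph so that the EMA shadow variables exist
(the checkpoint directory below is hypothetical):

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        restore_model(sess, '/tmp/ckpts/efficientnet-b0')  # hypothetical ckpt_dir
        # The graph variables now hold the exponential-moving-average weights.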