def __init__(self, model_name='efficientnet-b0', batch_size=1):
  """Initialize internal variables."""
  self.model_name = model_name
  self.batch_size = batch_size
  self.num_classes = 1000
  # Model scaling parameters.
  _, _, self.image_size, _ = efficientnet_builder.efficientnet_params(
      model_name)
def unl_dst_parser(self, value):
  """Parses an unlabeled example, its pseudo-label, and teacher probabilities."""
  keys_to_features = {
      'probabilities':
          tf.FixedLenFeature([FLAGS.num_label_classes], tf.float32),
      'label':
          tf.FixedLenFeature([], tf.int64, -1),
      'prob':
          tf.FixedLenFeature([], tf.float32),
      'image/encoded':
          tf.FixedLenFeature((), tf.string, ''),
  }
  parsed = tf.parse_single_example(value, keys_to_features)
  image_bytes = tf.reshape(parsed['image/encoded'], shape=[])
  ori_image = tf.image.decode_jpeg(image_bytes, channels=3)
  if FLAGS.unl_aug == 'default':
    augment_name = FLAGS.augment_name
  else:
    augment_name = FLAGS.unl_aug
  image = self.image_preprocessing_fn(
      input_tensor=ori_image,
      is_training=self.is_training and not FLAGS.remove_aug,
      image_size=self.image_size,
      use_bfloat16=self.use_bfloat16,
      augment_name=augment_name,
      randaug_mag=FLAGS.randaug_mag,
      is_image_bytes=False,
  )
  label = tf.cast(tf.reshape(parsed['label'], shape=[]), dtype=tf.int32)
  probabilities = tf.cast(
      tf.reshape(parsed['probabilities'], shape=[FLAGS.num_label_classes]),
      dtype=tf.float32)
  top_1_prob = tf.cast(tf.reshape(parsed['prob'], shape=[]), dtype=tf.float32)
  parsed_results = {
      'unl_image': image,
      'unl_label': label,
      'unl_probs': probabilities,
      'top_1_prob': top_1_prob,
  }
  if FLAGS.teacher_model_name:
    teacher_image_size = efficientnet_builder.efficientnet_params(
        FLAGS.teacher_model_name)[2]
    if FLAGS.small_image_model:
      teacher_image_size = FLAGS.input_image_size
    # The teacher sees the raw bytes without training-time augmentation.
    teacher_image = self.image_preprocessing_fn(
        input_tensor=image_bytes,
        is_training=False,
        image_size=teacher_image_size,
        use_bfloat16=self.use_bfloat16,
        augment_name=augment_name,
        randaug_mag=FLAGS.randaug_mag)
    parsed_results['unl_teacher_image'] = teacher_image
  return parsed_results
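# A minimal usage sketch of wiring a parser like the one above into a tf.data
# pipeline. `parser` would be the bound `unl_dst_parser` method, and
# `file_pattern` (a glob of TFRecord shards) is a placeholder, not a name from
# the original code:
def make_unlabeled_dataset(parser, file_pattern, batch_size):
  dataset = tf.data.Dataset.list_files(file_pattern, shuffle=True)
  dataset = dataset.interleave(
      tf.data.TFRecordDataset,
      cycle_length=16,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.map(
      parser, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return dataset.batch(batch_size, drop_remainder=True)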
def get_eval_driver(model_name, include_background_label=False):
  """Get an eval driver."""
  if model_name.startswith('efficientnet-edgetpu'):
    _, _, image_size, _ = (
        efficientnet_edgetpu_builder.efficientnet_edgetpu_params(model_name))
  elif model_name.startswith('efficientnet'):
    _, _, image_size, _ = efficientnet_builder.efficientnet_params(model_name)
  else:
    raise ValueError(
        'Model must be either efficientnet-b* or efficientnet-edgetpu*')
  return EvalCkptDriver(
      model_name=model_name,
      batch_size=1,
      image_size=image_size,
      include_background_label=include_background_label)
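# A minimal usage sketch; `eval_example_images` follows the EvalCkptDriver
# interface in the TPU EfficientNet reference code, and all paths below are
# placeholders:
driver = get_eval_driver('efficientnet-b0')
driver.eval_example_images(
    '/path/to/ckpt_dir', ['panda.jpg'], '/path/to/labels_map.txt')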
def __init__(self, model_name='efficientnet-b0', num_classes=1000,
             skip_load=None):
  self.scores = None
  self.model_name = model_name
  # Use None instead of a mutable default argument ([]), which would be
  # shared across instances.
  self.skip_load_ = skip_load if skip_load is not None else []
  self.params = {"num_classes": num_classes}
  _, _, image_size, _ = efficientnet_builder.efficientnet_params(model_name)
  self.scale_size_ = (image_size, image_size)
  print("EfficientNet model %s size: %s" % (model_name, str(self.scale_size_)))
def __init__(self, model_name='efficientnet-b0', batch_size=1):
  """Initialize internal variables."""
  self.model_name = model_name
  self.batch_size = batch_size
  self.num_classes = 1000
  # Model scaling parameters.
  if model_name.startswith('efficientnet-edgetpu'):
    _, _, self.image_size, _ = (
        efficientnet_edgetpu_builder.efficientnet_edgetpu_params(model_name))
  elif model_name.startswith('efficientnet'):
    _, _, self.image_size, _ = efficientnet_builder.efficientnet_params(
        model_name)
  else:
    raise ValueError(
        'Model must be either efficientnet-b* or efficientnet-edgetpu*')
def get_model_input_size(model_name):
  """Get model input size for a given model name."""
  if model_name.startswith('efficientnet-lite'):
    _, _, image_size, _ = (
        efficientnet_lite_builder.efficientnet_lite_params(model_name))
  elif model_name.startswith('efficientnet-edgetpu-'):
    _, _, image_size, _ = (
        efficientnet_edgetpu_builder.efficientnet_edgetpu_params(model_name))
  elif model_name.startswith('efficientnet-condconv-'):
    _, _, image_size, _, _ = (
        efficientnet_condconv_builder.efficientnet_condconv_params(model_name))
  elif model_name.startswith('efficientnet'):
    _, _, image_size, _ = efficientnet_builder.efficientnet_params(model_name)
  else:
    raise ValueError(
        'Model must be one of efficientnet-b*, efficientnet-edgetpu*, '
        'efficientnet-condconv*, or efficientnet-lite*')
  return image_size
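# A quick sanity check of the helper (EfficientNet-B0 and -B3 use 224x224
# and 300x300 inputs, respectively):
assert get_model_input_size('efficientnet-b0') == 224
assert get_model_input_size('efficientnet-b3') == 300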
def dataset_parser(self, value):
  """Parses an image and its label from a serialized ImageNet TFExample.

  Args:
    value: serialized string containing an ImageNet TFExample.

  Returns:
    A dict with the preprocessed image and its label (plus a separately
    preprocessed teacher image when a teacher model is configured).
  """
  keys_to_features = {
      'image/encoded': tf.FixedLenFeature((), tf.string, ''),
      'image/class/label': tf.FixedLenFeature([], tf.int64, -1),
  }
  parsed = tf.parse_single_example(value, keys_to_features)
  image_bytes = tf.reshape(parsed['image/encoded'], shape=[])
  image = self.image_preprocessing_fn(
      input_tensor=image_bytes,
      is_training=self.is_training and not FLAGS.remove_aug,
      image_size=self.image_size,
      use_bfloat16=self.use_bfloat16,
      augment_name=FLAGS.augment_name,
      randaug_mag=FLAGS.randaug_mag,
  )
  label = tf.cast(
      tf.reshape(parsed['image/class/label'], shape=[]), dtype=tf.int32)
  # Subtract one so that labels are in [0, 1000).
  if self.label_minus_one:
    label = label - 1
  parsed_results = {'image': image, 'label': label}
  if FLAGS.teacher_model_name:
    teacher_image_size = efficientnet_builder.efficientnet_params(
        FLAGS.teacher_model_name)[2]
    if FLAGS.small_image_model:
      teacher_image_size = FLAGS.input_image_size
    teacher_image = self.image_preprocessing_fn(
        input_tensor=image_bytes,
        is_training=False,
        image_size=teacher_image_size,
        use_bfloat16=self.use_bfloat16)
    parsed_results['teacher_image'] = teacher_image
  return parsed_results
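# A minimal sketch of a TFExample matching the feature spec above; the keys
# are the standard ImageNet TFRecord field names this parser expects, while
# `jpeg_bytes` and `label_id` are placeholders:
example = tf.train.Example(features=tf.train.Features(feature={
    'image/encoded': tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[jpeg_bytes])),
    'image/class/label': tf.train.Feature(
        int64_list=tf.train.Int64List(value=[label_id])),
}))
serialized = example.SerializeToString()  # feed this to dataset_parser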
def get_efficientnet(model_name,
                     training: bool = True,
                     model_ckpt: str = None,
                     regression=False,
                     image_size=None):
  """Build an EfficientNet model and optionally load pre-trained weights."""
  model_param = efficientnet_builder.efficientnet_params(model_name)
  _, global_params = efficientnet_builder.get_model_params(model_name, {})
  image_size = model_param[2] if not image_size else image_size
  inputs = tf.keras.layers.Input(
      shape=(image_size, image_size, 3), dtype=tf.uint8, name="image_tensor")
  features = _get_efficientnet(
      inputs, model_name=model_name, training=training, model_ckpt=model_ckpt)
  with tf.variable_scope("head"):
    features = tf.keras.layers.Conv2D(
        filters=efficientnet_model.round_filters(1280, global_params),
        kernel_size=(1, 1),
        strides=(1, 1),
        padding="same",
        use_bias=False)(features)
    features = tf.keras.layers.BatchNormalization()(features)
    features = tf.keras.layers.ReLU()(features)
    features = tf.keras.layers.GlobalAveragePooling2D(
        data_format="channels_last")(features)
    if training:
      # model_param[3] is the dropout rate for this model variant.
      features = tf.keras.layers.Dropout(model_param[3])(features)
    if regression:
      logits = tf.keras.layers.Dense(1)(features)
    else:
      logits = tf.keras.layers.Dense(
          5, activation="softmax", name="scores")(features)
  return tf.keras.Model(inputs, logits)
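# A minimal usage sketch (the 5-way softmax head above implies a fixed
# 5-class task; `training=False` builds the inference graph without dropout):
model = get_efficientnet("efficientnet-b0", training=False)
model.summary()  # inspect the backbone plus the custom classification head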
def main(unused_argv):
  input_image_size = FLAGS.input_image_size
  if not input_image_size:
    if FLAGS.model_name.startswith('efficientnet-edgetpu'):
      _, _, input_image_size, _ = (
          efficientnet_edgetpu_builder.efficientnet_edgetpu_params(
              FLAGS.model_name))
    elif FLAGS.model_name.startswith('efficientnet-tpu'):
      _, _, input_image_size, _ = efficientnet_tpu_builder.efficientnet_tpu_params(
          FLAGS.model_name)
    elif FLAGS.model_name.startswith('efficientnet'):
      _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
          FLAGS.model_name)
    else:
      raise ValueError('input_image_size must be set except for EfficientNet')

  # For the ImageNet dataset, include the background label if the number of
  # output classes is 1001.
  include_background_label = (FLAGS.num_label_classes == 1001)

  if FLAGS.tpu or FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster_resolver = None

  if FLAGS.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
  config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
          .PER_HOST_V2))  # pylint: disable=line-too-long

  # Initializes model parameters.
  params = dict(
      steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size,
      use_bfloat16=FLAGS.use_bfloat16)
  est = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      export_to_tpu=FLAGS.export_to_tpu,
      params=params)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  def build_imagenet_input(is_training):
    """Generate ImageNetInput for training and eval."""
    if FLAGS.bigtable_instance:
      tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
      select_train, select_eval = _select_tables_from_flags()
      return imagenet_input.ImageNetBigtableInput(
          is_training=is_training,
          use_bfloat16=FLAGS.use_bfloat16,
          transpose_input=FLAGS.transpose_input,
          selection=select_train if is_training else select_eval,
          include_background_label=include_background_label,
          autoaugment_name=FLAGS.autoaugment_name)
    else:
      if FLAGS.data_dir == FAKE_DATA_DIR:
        tf.logging.info('Using fake dataset.')
      else:
        tf.logging.info('Using dataset: %s', FLAGS.data_dir)
      return imagenet_input.ImageNetInput(
          is_training=is_training,
          data_dir=FLAGS.data_dir,
          transpose_input=FLAGS.transpose_input,
          cache=FLAGS.use_cache and is_training,
          image_size=input_image_size,
          num_parallel_calls=FLAGS.num_parallel_calls,
          use_bfloat16=FLAGS.use_bfloat16,
          include_background_label=include_background_label,
          autoaugment_name=FLAGS.autoaugment_name)

  imagenet_train = build_imagenet_input(is_training=True)
  imagenet_eval = build_imagenet_input(is_training=False)

  if FLAGS.mode == 'eval':
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time.
        eval_results = est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)
        utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / params['steps_per_epoch'], current_step)

    start_timestamp = time.time()  # This time will include compilation time.

    if FLAGS.mode == 'train':
      hooks = []
      if FLAGS.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      est.train(
          input_fn=imagenet_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=hooks)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        est.train(input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)

    if FLAGS.export_dir:
      export(est, FLAGS.export_dir, input_image_size)
def main(unused_argv):
  if FLAGS.task_name == 'svhn':
    FLAGS.input_image_size = 32
    FLAGS.small_image_model = True
    FLAGS.num_label_classes = 10
  if FLAGS.num_train_images is None:
    FLAGS.num_train_images = task_info.get_num_train_images(FLAGS.task_name)
  if FLAGS.num_eval_images is None:
    FLAGS.num_eval_images = task_info.get_num_eval_images(FLAGS.task_name)
  if FLAGS.num_test_images is None and FLAGS.task_name != 'imagenet':
    FLAGS.num_test_images = task_info.get_num_test_images(FLAGS.task_name)

  steps_per_epoch = FLAGS.num_train_images / (
      FLAGS.train_batch_size * FLAGS.label_data_sample_prob)
  if FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval':
    tf.gfile.MakeDirs(FLAGS.model_dir)
    flags_dict = tf.app.flags.FLAGS.flag_values_dict()
    with tf.gfile.Open(os.path.join(FLAGS.model_dir, 'FLAGS.json'), 'w') as ouf:
      json.dump(flags_dict, ouf)
  input_image_size = FLAGS.input_image_size
  if not input_image_size:
    _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
        FLAGS.model_name)
    FLAGS.input_image_size = input_image_size
  if FLAGS.train_last_step_num == -1:
    FLAGS.train_last_step_num = FLAGS.train_steps
  if FLAGS.train_ratio != 1:
    FLAGS.train_last_step_num *= FLAGS.train_ratio
    FLAGS.train_steps *= FLAGS.train_ratio
    FLAGS.train_last_step_num = int(FLAGS.train_last_step_num)
    FLAGS.train_steps = int(FLAGS.train_steps)

  if (FLAGS.tpu or FLAGS.use_tpu) and not FLAGS.master:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster_resolver = None
  # The original code built an identical TPUConfig in both branches of an
  # `if FLAGS.use_tpu` test, so the redundant branch is collapsed here.
  tpu_config = tf.estimator.tpu.TPUConfig(
      iterations_per_loop=FLAGS.iterations_per_loop,
      num_shards=FLAGS.num_tpu_cores,
      per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
      .PER_HOST_V2)
  config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=max(FLAGS.save_checkpoints_steps,
                                 FLAGS.iterations_per_loop),
      log_step_count_steps=FLAGS.log_step_count_steps,
      keep_checkpoint_max=FLAGS.keep_checkpoint_max,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tpu_config)  # pylint: disable=line-too-long

  # Initializes model parameters.
  params = dict(
      steps_per_epoch=steps_per_epoch, use_bfloat16=FLAGS.use_bfloat16)
  est = tf.estimator.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=8,
      params=params)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.label_data_dir == FAKE_DATA_DIR:
    tf.logging.info('Using fake dataset.')
  else:
    tf.logging.info('Using dataset: %s', FLAGS.label_data_dir)
  train_data = data_input.DataInput(
      is_training=True,
      data_dir=FLAGS.label_data_dir,
      transpose_input=FLAGS.transpose_input,
      cache=FLAGS.use_cache,
      image_size=input_image_size,
      use_bfloat16=FLAGS.use_bfloat16)

  if FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval':
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_last_step_num,
        FLAGS.train_last_step_num / params['steps_per_epoch'], current_step)
    start_timestamp = time.time()  # This time will include compilation time.

  if FLAGS.mode == 'train':
    est.train(
        input_fn=train_data.input_fn,
        max_steps=FLAGS.train_last_step_num,
        hooks=[])
  elif FLAGS.mode == 'eval':
    input_fn_mapping = {}
    for subset in ['dev', 'test']:
      input_fn_mapping[subset] = data_input.DataInput(
          is_training=False,
          data_dir=FLAGS.label_data_dir,
          transpose_input=FLAGS.transpose_input,
          cache=False,
          image_size=input_image_size,
          use_bfloat16=FLAGS.use_bfloat16,
          subset=subset).input_fn
      if subset == 'dev':
        num_images = FLAGS.num_eval_images
      else:
        num_images = FLAGS.num_test_images
      eval_results = est.evaluate(
          input_fn=input_fn_mapping[subset],
          steps=num_images // FLAGS.eval_batch_size)
      tf.logging.info('%s, results: %s', subset, eval_results)
  elif FLAGS.mode == 'predict':
    predict_label.run_prediction(est)
  else:
    assert False
def get_model_input_size(model_name):
  """Get model input size for a given model name."""
  _, _, image_size, _ = efficientnet_builder.efficientnet_params(model_name)
  return image_size
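# For example, EfficientNet-B5 trains and evaluates on 456x456 inputs:
print(get_model_input_size('efficientnet-b5'))  # -> 456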
def main(unused_argv):
  input_image_size = FLAGS.input_image_size
  if not input_image_size:
    if FLAGS.model_name.startswith('efficientnet'):
      _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
          FLAGS.model_name)
    else:
      raise ValueError('input_image_size must be set except for EfficientNet')

  config = tf.estimator.RunConfig(
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      keep_checkpoint_max=FLAGS.keep_checkpoint_max,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))))

  # Initializes model parameters.
  params = dict(
      steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size)
  est = tf.estimator.Estimator(model_fn=model_fn, config=config, params=params)

  def build_input(is_training):
    """Input for training and eval."""
    tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    return egg_candler_input.EggCandlerInput(
        is_training=is_training,
        data_dir=FLAGS.data_dir,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        image_size=input_image_size)

  image_train = build_input(is_training=True)
  image_eval = build_input(is_training=False)

  if FLAGS.mode == 'eval':
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint.
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time.
        eval_results = est.evaluate(
            input_fn=image_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)
        utils.archive_ckpt(eval_results, eval_results['val_accuracy'], ckpt)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / params['steps_per_epoch'], current_step)

    start_timestamp = time.time()  # This time will include compilation time.

    if FLAGS.mode == 'train':
      est.train(
          input_fn=image_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=[])
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        est.train(
            input_fn=image_train.input_fn,
            max_steps=next_checkpoint,
            hooks=[])
        current_step = next_checkpoint
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = est.evaluate(
            input_fn=image_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        utils.archive_ckpt(eval_results, eval_results['val_accuracy'], ckpt)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)

    if FLAGS.export_dir:
      export(est, FLAGS.export_dir, input_image_size)
def main(unused_argv):
  input_image_size = FLAGS.input_image_size
  if not input_image_size:
    if FLAGS.model_name.startswith('efficientnet'):
      _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
          FLAGS.model_name)
    else:
      raise ValueError('input_image_size must be set except for EfficientNet.')

  save_checkpoints_steps = max(100, FLAGS.steps_per_eval)
  config = tf.estimator.RunConfig(
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
  )
  params = dict(
      steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size,
      use_bfloat16=FLAGS.use_bfloat16,
      batch_size=FLAGS.train_batch_size)
  est = tf.estimator.Estimator(model_fn=model_fn, config=config, params=params)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.data_dir == FAKE_DATA_DIR:
    tf.logging.info('Using fake dataset.')
  else:
    tf.logging.info('Using dataset: %s', FLAGS.data_dir)
  data_train, data_eval = [
      mnist_input.ImageNetInput(
          is_training=is_training,
          data_dir=FLAGS.data_dir,
          transpose_input=FLAGS.transpose_input,
          cache=FLAGS.use_cache and is_training,
          image_size=input_image_size,
          use_bfloat16=FLAGS.use_bfloat16) for is_training in [True, False]
  ]

  if FLAGS.mode == 'eval':
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time.
        eval_results = est.evaluate(
            input_fn=data_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)
        utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)
    if FLAGS.export_dir:
      export(est, FLAGS.export_dir, input_image_size)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / params['steps_per_epoch'], current_step)

    start_timestamp = time.time()  # This time will include compilation time.

    if FLAGS.mode == 'train':
      hooks = []
      if FLAGS.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      est.train(
          input_fn=data_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=hooks)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        est.train(input_fn=data_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = est.evaluate(
            input_fn=data_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)

    if FLAGS.export_dir:
      export(est, FLAGS.export_dir, input_image_size)
# 'Labels map from label id to its meaning.')
# flags.DEFINE_integer('num_images', 5000,
#                      'Number of images to eval. Use -1 to eval all images.')
# FLAGS = flags.FLAGS

MEAN_RGB = [0.485 * 255, 0.456 * 255, 0.406 * 255]
STDDEV_RGB = [0.229 * 255, 0.224 * 255, 0.225 * 255]

# Internal variables at module scope. (The original repeated assignments such
# as `model_name = model_name`, which are no-ops outside a class __init__ and
# are dropped here.)
model_name = 'efficientnet-b0'
batch_size = 256
num_classes = 1000
# Model scaling parameters.
_, _, image_size, _ = efficientnet_builder.efficientnet_params(model_name)


def restore_model(sess, ckpt_dir):
  """Restore variables from checkpoint dir."""
  checkpoint = tf.train.latest_checkpoint(ckpt_dir)
  ema = tf.train.ExponentialMovingAverage(decay=0.9999)
  ema_vars = tf.trainable_variables() + tf.get_collection('moving_vars')
  for v in tf.global_variables():
    # Keep batch-norm moving statistics as well.
    if 'moving_mean' in v.name or 'moving_variance' in v.name:
      ema_vars.append(v)
  ema_vars = list(set(ema_vars))
  var_dict = ema.variables_to_restore(ema_vars)
  saver = tf.train.Saver(var_dict, max_to_keep=1)
  saver.restore(sess, checkpoint)
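# A minimal usage sketch, assuming the EfficientNet graph has already been
# built in the default graph and '/path/to/ckpt_dir' is a placeholder:
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  restore_model(sess, '/path/to/ckpt_dir')
  # ... run inference; the EMA (moving-average) weights are now loaded.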