def __init__(self,
             model_name: Text,
             logdir: Text,
             tensorrt: Text = '',
             use_xla: bool = False,
             ckpt_path: Text = None,
             export_ckpt: Text = None,
             saved_model_dir: Text = None,
             tflite_path: Text = None,
             batch_size: int = 1,
             hparams: Text = ''):
  self.model_name = model_name
  self.logdir = logdir
  self.tensorrt = tensorrt
  self.use_xla = use_xla
  self.ckpt_path = ckpt_path
  self.export_ckpt = export_ckpt
  self.saved_model_dir = saved_model_dir
  self.tflite_path = tflite_path

  model_config = hparams_config.get_detection_config(model_name)
  model_config.override(hparams)  # Add custom overrides
  model_config.is_training_bn = False
  model_config.image_size = utils.parse_image_size(model_config.image_size)

  # If batch size is 0, then build a graph with dynamic batch size.
  self.batch_size = batch_size or None
  self.labels_shape = [batch_size, model_config.num_classes]

  height, width = model_config.image_size
  if model_config.data_format == 'channels_first':
    self.inputs_shape = [batch_size, 3, height, width]
  else:
    self.inputs_shape = [batch_size, height, width, 3]

  self.model_config = model_config
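# A minimal sketch of the `utils.parse_image_size` contract assumed above: it
# accepts an int, a 'WxH' string, or an existing (height, width) tuple and
# normalizes to (height, width). Illustrative only, not the library code.
def parse_image_size_sketch(image_size):
  if isinstance(image_size, int):
    return (image_size, image_size)  # square image
  if isinstance(image_size, str):
    width, height = image_size.lower().split('x')  # e.g. '1280x640'
    return (int(height), int(width))
  return tuple(image_size)

# Example: parse_image_size_sketch('1280x640') -> (640, 1280)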
def __init__(self,
             model_name: Text,
             ckpt_path: Text,
             image_size: Union[int, Tuple[int, int]] = None,
             num_classes: int = None,
             enable_ema: bool = True,
             data_format: Text = None,
             label_id_mapping: Dict[int, Text] = None):
  """Initialize the inference driver.

  Args:
    model_name: target model name, such as efficientdet-d0.
    ckpt_path: checkpoint path, such as /tmp/efficientdet-d0/.
    image_size: user specified image size. If None, use the default image size
      defined by model_name.
    num_classes: number of classes. If None, use the default COCO classes.
    enable_ema: whether to enable moving average.
    data_format: data format such as 'channels_last'.
    label_id_mapping: a dictionary from id to name. If None, use the default
      coco_id_mapping (with 90 classes).
  """
  self.model_name = model_name
  self.ckpt_path = ckpt_path
  self.label_id_mapping = label_id_mapping or coco_id_mapping

  self.params = hparams_config.get_detection_config(self.model_name).as_dict()
  self.params.update(dict(is_training_bn=False, use_bfloat16=False))
  if image_size:
    self.params.update(dict(image_size=image_size))
  if num_classes:
    self.params.update(dict(num_classes=num_classes))
  if data_format:
    self.params.update(dict(data_format=data_format))

  self.disable_pyfun = True
  self.enable_ema = enable_ema
def __init__(self,
             model_name: Text,
             ckpt_path: Text,
             image_size: int = None,
             label_id_mapping: Dict[int, Text] = None):
  """Initialize the inference driver.

  Args:
    model_name: target model name, such as efficientdet-d0.
    ckpt_path: checkpoint path, such as /tmp/efficientdet-d0/.
    image_size: user specified image size. If None, use the default image size
      defined by model_name.
    label_id_mapping: a dictionary from id to name. If None, use the default
      coco_id_mapping (with 90 classes).
  """
  self.model_name = model_name
  self.ckpt_path = ckpt_path
  self.label_id_mapping = label_id_mapping or coco_id_mapping

  self.params = hparams_config.get_detection_config(self.model_name).as_dict()
  self.params.update(dict(is_training_bn=False, use_bfloat16=False))
  if image_size:
    self.params.update(dict(image_size=image_size))
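# Hedged usage sketch for the constructor above. The class name
# `InferenceDriver` is an assumption for illustration; substitute the actual
# class that owns this __init__.
driver = InferenceDriver(
    'efficientdet-d0',
    '/tmp/efficientdet-d0/',   # checkpoint directory
    image_size=640,            # optional override of the default size
    label_id_mapping=None)     # None falls back to coco_id_mapping
# After construction, driver.params holds the merged detection config,
# e.g. driver.params['image_size'] == 640 and is_training_bn is False.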
def main(_):
  tf.config.run_functions_eagerly(FLAGS.debug)
  devices = tf.config.list_physical_devices('GPU')
  for device in devices:
    tf.config.experimental.set_memory_growth(device, True)

  model_config = hparams_config.get_detection_config(FLAGS.model_name)
  model_config.override(FLAGS.hparams)  # Add custom overrides
  model_config.is_training_bn = False
  if FLAGS.image_size != -1:
    model_config.image_size = FLAGS.image_size
  model_config.image_size = utils.parse_image_size(model_config.image_size)

  model_params = model_config.as_dict()
  ckpt_path_or_file = FLAGS.ckpt_path
  if tf.io.gfile.isdir(ckpt_path_or_file):
    ckpt_path_or_file = tf.train.latest_checkpoint(ckpt_path_or_file)
  driver = inference.ServingDriver(FLAGS.model_name, ckpt_path_or_file,
                                   FLAGS.batch_size or None, model_params)

  if FLAGS.mode == 'export':
    if not FLAGS.saved_model_dir:
      raise ValueError('Please specify --saved_model_dir=')
    model_dir = FLAGS.saved_model_dir
    if tf.io.gfile.exists(model_dir):
      tf.io.gfile.rmtree(model_dir)
    driver.export(model_dir, FLAGS.tensorrt, FLAGS.tflite)
    print('Model exported to %s' % model_dir)
  elif FLAGS.mode == 'infer':
    if FLAGS.saved_model_dir:
      driver.load(FLAGS.saved_model_dir)

    image_file = tf.io.read_file(FLAGS.input_image)
    image_arrays = tf.io.decode_image(image_file)
    image_arrays.set_shape((None, None, 3))
    image_arrays = tf.expand_dims(image_arrays, axis=0)
    detections_bs = driver.serve(image_arrays)
    boxes, scores, classes, _ = tf.nest.map_structure(np.array, detections_bs)
    raw_image = Image.open(FLAGS.input_image)
    img = driver.visualize(
        raw_image,
        boxes[0],
        classes[0],
        scores[0],
        min_score_thresh=model_config.nms_configs.score_thresh,
        max_boxes_to_draw=model_config.nms_configs.max_output_size)
    output_image_path = os.path.join(FLAGS.output_image_dir, '0.jpg')
    Image.fromarray(img).save(output_image_path)
    print('writing file to %s' % output_image_path)
  elif FLAGS.mode == 'benchmark':
    if FLAGS.saved_model_dir:
      driver.load(FLAGS.saved_model_dir)

    batch_size = FLAGS.batch_size or 1
    if FLAGS.input_image:
      image_file = tf.io.read_file(FLAGS.input_image)
      image_arrays = tf.image.decode_image(image_file)
      image_arrays.set_shape((None, None, 3))
      image_arrays = tf.expand_dims(image_arrays, 0)
      if batch_size > 1:
        image_arrays = tf.tile(image_arrays, [batch_size, 1, 1, 1])
    else:
      # Use synthetic data if no image is provided.
      image_arrays = tf.ones((batch_size, *model_config.image_size, 3),
                             dtype=tf.uint8)
    driver.benchmark(image_arrays, FLAGS.bm_runs, FLAGS.trace_filename)
  elif FLAGS.mode == 'dry':
    # Transfer to a tf2 format checkpoint.
    driver.build()
    if FLAGS.export_ckpt:
      driver.model.save_weights(FLAGS.export_ckpt)
  elif FLAGS.mode == 'video':
    import cv2  # pylint: disable=g-import-not-at-top
    if FLAGS.saved_model_dir:
      driver.load(FLAGS.saved_model_dir)
    cap = cv2.VideoCapture(FLAGS.input_video)
    if not cap.isOpened():
      print('Error opening input video: {}'.format(FLAGS.input_video))

    out_ptr = None
    if FLAGS.output_video:
      frame_width, frame_height = int(cap.get(3)), int(cap.get(4))
      out_ptr = cv2.VideoWriter(FLAGS.output_video,
                                cv2.VideoWriter_fourcc('m', 'p', '4', 'v'), 25,
                                (frame_width, frame_height))

    while cap.isOpened():
      # Capture frame-by-frame.
      ret, frame = cap.read()
      if not ret:
        break

      raw_frames = np.array([frame])
      detections_bs = driver.serve(raw_frames)
      boxes, scores, classes, _ = tf.nest.map_structure(np.array, detections_bs)
      new_frame = driver.visualize(
          raw_frames[0],
          boxes[0],
          classes[0],
          scores[0],
          min_score_thresh=model_config.nms_configs.score_thresh,
          max_boxes_to_draw=model_config.nms_configs.max_output_size)

      if out_ptr:
        # Write the frame into the output file.
        out_ptr.write(new_frame)
      else:
        # Show the frame online, mainly used for real-time speed tests.
        cv2.imshow('Frame', new_frame)
        # Press Q on the keyboard to exit.
        if cv2.waitKey(1) & 0xFF == ord('q'):
          break
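# Small illustration of the batching trick used in 'benchmark' mode above:
# decode a single image, add a batch axis, then tile it to the requested
# batch size. Shapes here are illustrative.
import tensorflow as tf

image = tf.ones((512, 512, 3), dtype=tf.uint8)  # stands in for a decoded file
batch = tf.expand_dims(image, 0)                # shape (1, 512, 512, 3)
batch = tf.tile(batch, [4, 1, 1, 1])            # shape (4, 512, 512, 3)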
def main(argv):
  del argv  # Unused.
  hvd_try_init()
  set_env(use_amp=FLAGS.use_amp)

  # Check data path
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if not FLAGS.val_json_file and not FLAGS.testdev_dir:
      raise RuntimeError(
          'You must specify --val_json_file or --testdev for evaluation.')

  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`; see the partition logic below.
  # In the TPUEstimator context, the meaning of `shard` and `replica` is the
  # same; following the API, both terms are used here.
  if FLAGS.use_spatial_partition:
    # Checks input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be a product of array '
                         'elements in --input_partition_dims.')
    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
    }
    # The Input Partition Logic: We partition only the partition-able tensors.
    # Spatial partition requires that the to-be-partitioned tensors must have a
    # dimension that is a multiple of `partition_dims`. Depending on the
    # `partition_dims` and the `image_size` and the `max_level` in config, some
    # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
    # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
    # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
    # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
    # case, the level-8 and level-9 target tensors are not partition-able, and
    # the highest partition-able level is 7.
    image_size = config.get('image_size')
    for level in range(config.get('min_level'), config.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = image_size // (2**level)
      if _can_partition(spatial_dim):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None
    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  if hvd is not None:
    num_shards = hvd.size()

  params = dict(
      config.as_dict(),
      model_name=FLAGS.model_name,
      num_epochs=FLAGS.num_epochs,
      iterations_per_loop=FLAGS.iterations_per_loop,
      model_dir=FLAGS.model_dir,
      num_shards=num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      backbone_ckpt=FLAGS.backbone_ckpt,
      ckpt=FLAGS.ckpt,
      val_json_file=FLAGS.val_json_file,
      testdev_dir=FLAGS.testdev_dir,
      mode=FLAGS.mode,
  )
  run_config = tf.estimator.RunConfig(
      session_config=get_session_config(use_xla=FLAGS.use_xla),
      save_checkpoints_steps=600)

  model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)

  # TPU Estimator
  logging.info(params)
  if FLAGS.mode == 'train':
    # train_estimator = tf.estimator.tpu.TPUEstimator(
    #     model_fn=model_fn_instance,
    #     use_tpu=FLAGS.use_tpu,
    #     train_batch_size=FLAGS.train_batch_size,
    #     config=run_config,
    #     params=params)
    params['batch_size'] = FLAGS.train_batch_size
    train_estimator = HorovodEstimator(
        model_fn=model_fn_instance,
        model_dir=FLAGS.model_dir,
        config=run_config,
        params=params)

    input_fn = dataloader.InputReader(
        FLAGS.training_file_pattern,
        is_training=True,
        params=params,
        use_fake_data=FLAGS.use_fake_data)
    max_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    (FLAGS.train_batch_size * num_shards)) + 1
    train_estimator.train(input_fn=input_fn, max_steps=max_steps)

    # if FLAGS.eval_after_training:
    #   # Run evaluation after training finishes.
    #   eval_params = dict(
    #       params,
    #       use_tpu=False,
    #       input_rand_hflip=False,
    #       is_training_bn=False,
    #       use_bfloat16=False,
    #   )
    #   eval_estimator = tf.estimator.tpu.TPUEstimator(
    #       model_fn=model_fn_instance,
    #       use_tpu=False,
    #       train_batch_size=FLAGS.train_batch_size,
    #       eval_batch_size=FLAGS.eval_batch_size,
    #       config=run_config,
    #       params=eval_params)
    #   eval_results = eval_estimator.evaluate(
    #       input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
    #                                       is_training=False),
    #       steps=FLAGS.eval_samples//FLAGS.eval_batch_size)
    #   logging.info('Eval results: %s', eval_results)
    #   ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
    #   utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
  elif FLAGS.mode == 'eval':
    config_proto = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False)
    if FLAGS.use_xla and not FLAGS.use_tpu:
      config_proto.graph_options.optimizer_options.global_jit_level = (
          tf.OptimizerOptions.ON_1)

    tpu_config = tf.estimator.tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_shards=num_shards,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
        .PER_HOST_V2)

    run_config = tf.estimator.tpu.RunConfig(
        cluster=None,
        evaluation_master=FLAGS.eval_master,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
    )

    # Eval only runs on CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input
    # pipeline and don't run on the TPU.
    # Also, disable use_bfloat16 for eval on CPU/GPU.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        is_training_bn=False,
        use_bfloat16=False,
    )
    eval_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=False,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      logging.info('Terminating eval after %d seconds of no checkpoints',
                   FLAGS.eval_timeout)
      return True

    # Run evaluation when there's a new checkpoint
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):

      logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
        logging.info('Eval results: %s', eval_results)

        # Terminate eval job when final checkpoint is reached.
        try:
          current_step = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          logging.info('%s has no global step info: stop!', ckpt)
          break

        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          logging.info('Evaluation finished after training step %d',
                       current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                     ckpt)
  elif FLAGS.mode == 'train_and_eval':
    for cycle in range(FLAGS.num_epochs):
      logging.info('Starting training cycle, epoch: %d.', cycle)
      train_estimator = tf.estimator.tpu.TPUEstimator(
          model_fn=model_fn_instance,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              FLAGS.training_file_pattern,
              is_training=True,
              use_fake_data=FLAGS.use_fake_data),
          steps=int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))

      logging.info('Starting evaluation cycle, epoch: %d.', cycle)
      # Run evaluation after every epoch.
      eval_params = dict(
          params,
          use_tpu=False,
          input_rand_hflip=False,
          is_training_bn=False,
      )
      eval_estimator = tf.estimator.tpu.TPUEstimator(
          model_fn=model_fn_instance,
          use_tpu=False,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(
              FLAGS.validation_file_pattern, is_training=False),
          steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
      logging.info('Evaluation results: %s', eval_results)
      ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
      utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
  else:
    logging.info('Mode not found.')
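# Standalone sketch of the partition check described in the comment above: a
# feature level is partitionable only if its spatial dimension is divisible
# by every entry of input_partition_dims. Numbers mirror the comment's
# example (image size 1536, partition_dims [1, 4, 2, 1], max_level 9).
import numpy as np

partition_dims = [1, 4, 2, 1]
image_size = 1536
for level in range(3, 10):
  spatial_dim = image_size // (2**level)
  ok = np.all(spatial_dim % np.array(partition_dims) == 0)
  print(level, spatial_dim, 'partitionable' if ok else 'not partitionable')
# Levels 3..7 (spatial dims 192..12) pass; level 8 (dim 6) fails since
# 6 % 4 != 0, and level 9 (dim 3) fails too, matching the remark that
# "the highest partition-able level is 7".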
def main(_):
  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  if FLAGS.use_xla and FLAGS.strategy != 'tpu':
    tf.config.optimizer.set_jit(True)
    for gpu in tf.config.list_physical_devices('GPU'):
      tf.config.experimental.set_memory_growth(gpu, True)

  if FLAGS.debug:
    tf.debugging.set_log_device_placement(True)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    tf.random.set_seed(FLAGS.tf_random_seed)
    logging.set_verbosity(logging.DEBUG)

  if FLAGS.strategy == 'tpu':
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
    ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver)
    logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
  elif FLAGS.strategy == 'gpus':
    gpus = tf.config.list_physical_devices('GPU')
    if FLAGS.batch_size % len(gpus):
      raise ValueError(
          'Batch size must be divisible by the number of GPUs; got %f' %
          (FLAGS.batch_size / len(gpus)))
    if platform.system() == 'Windows':
      # Windows doesn't support nccl; use HierarchicalCopyAllReduce instead.
      # TODO(fsx950223): investigate HierarchicalCopyAllReduce performance issue
      cross_device_ops = tf.distribute.HierarchicalCopyAllReduce()
    else:
      cross_device_ops = None
    ds_strategy = tf.distribute.MirroredStrategy(
        cross_device_ops=cross_device_ops)
    logging.info('All devices: %s', gpus)
  else:
    if tf.config.list_physical_devices('GPU'):
      ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0')
    else:
      ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0')

  steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.batch_size
  params = dict(
      profile=FLAGS.profile,
      model_name=FLAGS.model_name,
      steps_per_execution=FLAGS.steps_per_execution,
      model_dir=FLAGS.model_dir,
      steps_per_epoch=steps_per_epoch,
      strategy=FLAGS.strategy,
      batch_size=FLAGS.batch_size,
      tf_random_seed=FLAGS.tf_random_seed,
      debug=FLAGS.debug,
      val_json_file=FLAGS.val_json_file,
      eval_samples=FLAGS.eval_samples,
      num_shards=ds_strategy.num_replicas_in_sync)
  config.override(params, True)
  # Set mixed precision policy by the Keras api.
  precision = utils.get_precision(config.strategy, config.mixed_precision)
  policy = tf.keras.mixed_precision.Policy(precision)
  tf.keras.mixed_precision.set_global_policy(policy)

  def get_dataset(is_training, config):
    file_pattern = (
        FLAGS.train_file_pattern if is_training else FLAGS.val_file_pattern)
    if not file_pattern:
      raise ValueError('No matching files.')

    return dataloader.InputReader(
        file_pattern,
        is_training=is_training,
        use_fake_data=FLAGS.use_fake_data,
        max_instances_per_image=config.max_instances_per_image,
        debug=FLAGS.debug)(
            config.as_dict())

  with ds_strategy.scope():
    if config.model_optimizations:
      tfmot.set_config(config.model_optimizations.as_dict())
    if FLAGS.hub_module_url:
      model = train_lib.EfficientDetNetTrainHub(
          config=config, hub_module_url=FLAGS.hub_module_url)
    else:
      model = train_lib.EfficientDetNetTrain(config=config)
    model = setup_model(model, config)
    if FLAGS.debug:
      tf.config.run_functions_eagerly(True)
    if FLAGS.pretrained_ckpt and not FLAGS.hub_module_url:
      ckpt_path = tf.train.latest_checkpoint(FLAGS.pretrained_ckpt)
      util_keras.restore_ckpt(
          model,
          ckpt_path,
          config.moving_average_decay,
          exclude_layers=['class_net'])
    init_experimental(config)
    if 'train' in FLAGS.mode:
      val_dataset = get_dataset(False, config) if 'eval' in FLAGS.mode else None
      model.fit(
          get_dataset(True, config),
          epochs=config.num_epochs,
          steps_per_epoch=steps_per_epoch,
          callbacks=train_lib.get_callbacks(config.as_dict(), val_dataset),
          validation_data=val_dataset,
          validation_steps=(FLAGS.eval_samples // FLAGS.batch_size))
    else:
      # Continuous eval.
      for ckpt in tf.train.checkpoints_iterator(
          FLAGS.model_dir, min_interval_secs=180):
        logging.info('Starting to evaluate.')
        # Terminate eval job when final checkpoint is reached.
        try:
          current_epoch = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          current_epoch = 0

        val_dataset = get_dataset(False, config)
        logging.info('start loading model.')
        model.load_weights(tf.train.latest_checkpoint(FLAGS.model_dir))
        logging.info('finish loading model.')
        coco_eval = train_lib.COCOCallback(val_dataset, 1)
        coco_eval.set_model(model)
        eval_results = coco_eval.on_epoch_end(current_epoch)
        logging.info('eval results for %s: %s', ckpt, eval_results)

        try:
          utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        except tf.errors.NotFoundError:
          # Checkpoint might have been deleted by the time eval finished.
          logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

        if current_epoch >= config.num_epochs or not current_epoch:
          logging.info('Eval epoch %d / %d', current_epoch, config.num_epochs)
          break
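# A hedged sketch of what `utils.get_precision` is assumed to return: a Keras
# mixed-precision policy name keyed off the strategy and the mixed_precision
# flag. The exact mapping lives in utils.py; this is illustrative only.
def get_precision_sketch(strategy, mixed_precision):
  if mixed_precision:
    return 'mixed_bfloat16' if strategy == 'tpu' else 'mixed_float16'
  return 'float32'

# Example: get_precision_sketch('gpus', True) -> 'mixed_float16'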
def main(_):
  if FLAGS.strategy == 'tpu':
    tf.disable_eager_execution()
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path
  if FLAGS.mode in ('train', 'train_and_eval'):
    if FLAGS.training_file_pattern is None:
      raise RuntimeError('Must specify --training_file_pattern for train.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('Must specify --validation_file_pattern for eval.')

  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`; see the partition logic below.
  # In the TPUEstimator context, the meaning of `shard` and `replica` is the
  # same; following the API, both terms are used here.
  if FLAGS.use_spatial_partition:
    # Checks input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be a product of array '
                         'elements in --input_partition_dims.')
    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
        'image_masks': None,
    }
    # The Input Partition Logic: We partition only the partition-able tensors.
    feat_sizes = utils.get_feat_sizes(
        config.get('image_size'), config.get('max_level'))
    for level in range(config.get('min_level'), config.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = feat_sizes[level]
      if _can_partition(spatial_dim['height']) and _can_partition(
          spatial_dim['width']):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None
    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(
      config.as_dict(),
      model_name=FLAGS.model_name,
      iterations_per_loop=FLAGS.iterations_per_loop,
      model_dir=FLAGS.model_dir,
      num_shards=num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      strategy=FLAGS.strategy,
      backbone_ckpt=FLAGS.backbone_ckpt,
      ckpt=FLAGS.ckpt,
      val_json_file=FLAGS.val_json_file,
      testdev_dir=FLAGS.testdev_dir,
      mode=FLAGS.mode)
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.strategy != 'tpu':
    if FLAGS.use_xla:
      config_proto.graph_options.optimizer_options.global_jit_level = (
          tf.OptimizerOptions.ON_1)
    config_proto.gpu_options.allow_growth = True

  model_dir = FLAGS.model_dir
  strategy = None
  if FLAGS.strategy == 'tpu':
    tpu_config = tf.estimator.tpu.TPUConfig(
        FLAGS.iterations_per_loop if FLAGS.strategy == 'tpu' else 1,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
        .PER_HOST_V2)
    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tf_random_seed=FLAGS.tf_random_seed,
    )
  else:
    if FLAGS.strategy == 'gpus':
      strategy = tf.distribute.MirroredStrategy()
    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        train_distribute=strategy,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tf_random_seed=FLAGS.tf_random_seed,
    )

  model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
  max_instances_per_image = config.max_instances_per_image
  eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
  total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
  train_steps = total_examples // FLAGS.train_batch_size
  logging.info(params)

  with tf.io.gfile.GFile(os.path.join(model_dir, 'config.yaml'), 'w') as f:
    f.write(str(config))

  train_input_fn = dataloader.InputReader(
      FLAGS.training_file_pattern,
      is_training=True,
      use_fake_data=FLAGS.use_fake_data,
      max_instances_per_image=max_instances_per_image)
  eval_input_fn = dataloader.InputReader(
      FLAGS.validation_file_pattern,
      is_training=False,
      use_fake_data=FLAGS.use_fake_data,
      max_instances_per_image=max_instances_per_image)

  if FLAGS.strategy == 'tpu':
    estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=params)
  else:
    params['batch_size'] = (
        FLAGS.train_batch_size // getattr(strategy, 'num_replicas_in_sync', 1))
    params['num_shards'] = getattr(strategy, 'num_replicas_in_sync', 1)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn_instance, config=run_config, params=params)

  # start train/eval flow.
  if FLAGS.mode == 'train':
    estimator.train(input_fn=train_input_fn, max_steps=train_steps)
    if FLAGS.eval_after_training:
      estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
  elif FLAGS.mode == 'eval':
    # Run evaluation when there's a new checkpoint
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout):
      logging.info('Starting to evaluate.')
      try:
        eval_results = estimator.evaluate(eval_input_fn, steps=eval_steps)
        # Terminate eval job when final checkpoint is reached.
        try:
          current_step = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          logging.info('%s has no global step info: stop!', ckpt)
          break

        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        if current_step >= train_steps:
          logging.info('Eval finished step %d/%d', current_step, train_steps)
          break
      except tf.errors.NotFoundError:
        # Checkpoint might have been deleted by the time eval finished.
        # We simply skip such cases.
        logging.info('Checkpoint %s no longer exists, skipping.', ckpt)
  elif FLAGS.mode == 'train_and_eval':
    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn, max_steps=train_steps)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn, steps=eval_steps, throttle_secs=600)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  else:
    logging.info('Invalid mode: %s', FLAGS.mode)
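# The eval loops above recover the global step from the checkpoint filename.
# TensorFlow checkpoints are named '<prefix>-<global_step>', so:
import os

ckpt = '/tmp/model_dir/model.ckpt-12500'
current_step = int(os.path.basename(ckpt).split('-')[1])  # -> 12500
# A name without the '-<step>' suffix raises IndexError, which the loops
# treat as "no global step info" and use as the stop signal.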
def main(_):
  if FLAGS.strategy == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    logging.info('Use horovod with multi gpus')
    hvd.init()
    os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())
  import tensorflow.compat.v1 as tf  # pylint: disable=g-import-not-at-top
  tf.enable_v2_tensorshape()
  tf.disable_eager_execution()

  if FLAGS.strategy == 'tpu':
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')

  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`; see the partition logic below.
  # In the TPUEstimator context, the meaning of `shard` and `replica` is the
  # same; following the API, both terms are used here.
  if FLAGS.use_spatial_partition:
    # Checks input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be a product of array '
                         'elements in --input_partition_dims.')
    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
    }
    # The Input Partition Logic: We partition only the partition-able tensors.
    # Spatial partition requires that the to-be-partitioned tensors must have a
    # dimension that is a multiple of `partition_dims`. Depending on the
    # `partition_dims` and the `image_size` and the `max_level` in config, some
    # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
    # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
    # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
    # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
    # case, the level-8 and level-9 target tensors are not partition-able, and
    # the highest partition-able level is 7.
    feat_sizes = utils.get_feat_sizes(
        config.get('image_size'), config.get('max_level'))
    for level in range(config.get('min_level'), config.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = feat_sizes[level]
      if _can_partition(spatial_dim['height']) and _can_partition(
          spatial_dim['width']):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None
    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(
      config.as_dict(),
      model_name=FLAGS.model_name,
      iterations_per_loop=FLAGS.iterations_per_loop,
      model_dir=FLAGS.model_dir,
      num_shards=num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      strategy=FLAGS.strategy,
      backbone_ckpt=FLAGS.backbone_ckpt,
      ckpt=FLAGS.ckpt,
      val_json_file=FLAGS.val_json_file,
      testdev_dir=FLAGS.testdev_dir,
      mode=FLAGS.mode)
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.strategy != 'tpu':
    if FLAGS.use_xla:
      config_proto.graph_options.optimizer_options.global_jit_level = (
          tf.OptimizerOptions.ON_1)
    config_proto.gpu_options.allow_growth = True

  tpu_config = tf.estimator.tpu.TPUConfig(
      FLAGS.iterations_per_loop if FLAGS.strategy == 'tpu' else 1,
      num_cores_per_replica=num_cores_per_replica,
      input_partition_dims=input_partition_dims,
      per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
      .PER_HOST_V2)

  if FLAGS.strategy == 'horovod':
    model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
  else:
    model_dir = FLAGS.model_dir

  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config,
      tf_random_seed=FLAGS.tf_random_seed,
  )

  model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)
  max_instances_per_image = config.max_instances_per_image
  eval_steps = int(FLAGS.eval_samples // FLAGS.eval_batch_size)
  use_tpu = (FLAGS.strategy == 'tpu')
  logging.info(params)

  def _train(steps):
    """Build train estimator and run training if steps > 0."""
    train_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(
            FLAGS.training_file_pattern,
            is_training=True,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=max_instances_per_image),
        max_steps=steps)

  def _eval(steps):
    """Build estimator and eval the latest checkpoint if steps > 0."""
    eval_params = dict(
        params,
        strategy=FLAGS.strategy,
        input_rand_hflip=False,
        is_training_bn=False,
    )
    eval_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=eval_params)
    eval_results = eval_estimator.evaluate(
        input_fn=dataloader.InputReader(
            FLAGS.validation_file_pattern,
            is_training=False,
            max_instances_per_image=max_instances_per_image),
        steps=steps,
        name=FLAGS.eval_name)
    logging.info('Evaluation results: %s', eval_results)
    return eval_results

  # start train/eval flow.
  if FLAGS.mode == 'train':
    total_examples = int(config.num_epochs * FLAGS.num_examples_per_epoch)
    _train(total_examples // FLAGS.train_batch_size)
    if FLAGS.eval_after_training:
      _eval(eval_steps)
  elif FLAGS.mode == 'eval':
    # Run evaluation when there's a new checkpoint
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout):
      logging.info('Starting to evaluate.')
      try:
        eval_results = _eval(eval_steps)
        # Terminate eval job when final checkpoint is reached.
        try:
          current_step = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          logging.info('%s has no global step info: stop!', ckpt)
          break

        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        total_step = int((config.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          logging.info('Evaluation finished after training step %d',
                       current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        logging.info('Checkpoint %s no longer exists, skipping.', ckpt)
  elif FLAGS.mode == 'train_and_eval':
    ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
    try:
      step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = (
          step * FLAGS.train_batch_size // FLAGS.num_examples_per_epoch)
      logging.info('found ckpt at step %d (epoch %d)', step, current_epoch)
    except (IndexError, TypeError):
      logging.info('Folder %s has no ckpt with valid step.', FLAGS.model_dir)
      current_epoch = 0

    epochs_per_cycle = 1  # higher number has less graph construction overhead.
    for e in range(current_epoch + 1, config.num_epochs + 1, epochs_per_cycle):
      print('-----------------------------------------------------\n'
            '=====> Starting training, epoch: %d.' % e)
      _train(e * FLAGS.num_examples_per_epoch // FLAGS.train_batch_size)
      print('-----------------------------------------------------\n'
            '=====> Starting evaluation, epoch: %d.' % e)
      eval_results = _eval(eval_steps)
      ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
      utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
  else:
    logging.info('Invalid mode: %s', FLAGS.mode)
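# Note on the loop above: `_train` receives an absolute `max_steps`, so each
# cycle passes the cumulative step count for epoch `e`; training resumes from
# the latest checkpoint and only runs one epoch's worth of new steps.
# Illustration with assumed numbers (not from the source):
num_examples_per_epoch = 120000
train_batch_size = 64
for e in range(1, 4):
  max_steps = e * num_examples_per_epoch // train_batch_size
  print('epoch %d -> max_steps %d' % (e, max_steps))  # 1875, 3750, 5625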
def main(_):
  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  if FLAGS.use_xla and FLAGS.strategy != 'tpu':
    tf.config.optimizer.set_jit(True)
    for gpu in tf.config.list_physical_devices('GPU'):
      tf.config.experimental.set_memory_growth(gpu, True)

  if FLAGS.debug:
    tf.config.experimental_run_functions_eagerly(True)
    tf.debugging.set_log_device_placement(True)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    tf.random.set_seed(FLAGS.tf_random_seed)
    logging.set_verbosity(logging.DEBUG)

  if FLAGS.strategy == 'tpu':
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
    ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver)
    logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
  elif FLAGS.strategy == 'gpus':
    ds_strategy = tf.distribute.MirroredStrategy()
    logging.info('All devices: %s', tf.config.list_physical_devices('GPU'))
  else:
    if tf.config.list_physical_devices('GPU'):
      ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0')
    else:
      ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0')

  steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.batch_size
  params = dict(
      profile=FLAGS.profile,
      model_name=FLAGS.model_name,
      iterations_per_loop=FLAGS.iterations_per_loop,
      model_dir=FLAGS.model_dir,
      steps_per_epoch=steps_per_epoch,
      strategy=FLAGS.strategy,
      batch_size=FLAGS.batch_size,
      tf_random_seed=FLAGS.tf_random_seed,
      debug=FLAGS.debug,
      val_json_file=FLAGS.val_json_file,
      eval_samples=FLAGS.eval_samples,
      num_shards=ds_strategy.num_replicas_in_sync)
  config.override(params, True)
  # Set mixed precision policy by the Keras api.
  precision = utils.get_precision(config.strategy, config.mixed_precision)
  policy = tf.keras.mixed_precision.experimental.Policy(precision)
  tf.keras.mixed_precision.experimental.set_policy(policy)

  def get_dataset(is_training, config):
    file_pattern = (
        FLAGS.training_file_pattern if is_training else FLAGS.val_file_pattern)
    if not file_pattern:
      raise ValueError('No matching files.')

    return dataloader.InputReader(
        file_pattern,
        is_training=is_training,
        use_fake_data=FLAGS.use_fake_data,
        max_instances_per_image=config.max_instances_per_image,
        debug=FLAGS.debug)(
            config.as_dict())

  with ds_strategy.scope():
    if config.model_optimizations:
      tfmot.set_config(config.model_optimizations.as_dict())
    model = setup_model(config)
    if FLAGS.pretrained_ckpt:
      ckpt_path = tf.train.latest_checkpoint(FLAGS.pretrained_ckpt)
      util_keras.restore_ckpt(model, ckpt_path)
    init_experimental(config)
    val_dataset = get_dataset(False, config).repeat()
    model.fit(
        get_dataset(True, config),
        epochs=config.num_epochs,
        steps_per_epoch=steps_per_epoch,
        callbacks=train_lib.get_callbacks(config.as_dict(), val_dataset),
        validation_data=val_dataset,
        validation_steps=(FLAGS.eval_samples // FLAGS.batch_size))
  model.save_weights(os.path.join(FLAGS.model_dir, 'ckpt-final'))
def test_train(self):
  tf.random.set_seed(1111)
  config = hparams_config.get_detection_config('efficientdet-d0')
  config.batch_size = 1
  config.num_examples_per_epoch = 1
  config.model_dir = tempfile.mkdtemp()
  x = tf.ones((1, 512, 512, 3))
  labels = {
      'box_targets_%d' % i: tf.ones((1, 512 // 2**i, 512 // 2**i, 36))
      for i in range(3, 8)
  }
  labels.update({
      'cls_targets_%d' % i: tf.ones((1, 512 // 2**i, 512 // 2**i, 9),
                                    dtype=tf.int32) for i in range(3, 8)
  })
  labels.update({'mean_num_positives': tf.constant([10.0])})

  params = config.as_dict()
  params['num_shards'] = 1
  model = train_lib.EfficientDetNetTrain(config=config)
  model.build((1, 512, 512, 3))
  model.compile(
      optimizer=train_lib.get_optimizer(params),
      loss={
          'box_loss':
              train_lib.BoxLoss(
                  params['delta'], reduction=tf.keras.losses.Reduction.NONE),
          'box_iou_loss':
              train_lib.BoxIouLoss(
                  params['iou_loss_type'],
                  reduction=tf.keras.losses.Reduction.NONE),
          'class_loss':
              train_lib.FocalLoss(
                  params['alpha'],
                  params['gamma'],
                  label_smoothing=params['label_smoothing'],
                  reduction=tf.keras.losses.Reduction.NONE)
      })

  # Test single-batch
  outputs = model.train_on_batch(x, labels, return_dict=True)
  expect_results = {
      'loss': 26278.25,
      'det_loss': 26277.033203125,
      'cls_loss': 5060.716796875,
      'box_loss': 424.3263244628906,
      'box_iou_loss': 0,
      'gnorm': 5873.78759765625
  }
  self.assertAllClose(outputs, expect_results, rtol=.1, atol=100.)

  outputs = model.test_on_batch(x, labels, return_dict=True)
  expect_results = {
      'loss': 26079.712890625,
      'det_loss': 26078.49609375,
      'cls_loss': 5063.3759765625,
      'box_loss': 420.30242919921875,
      'box_iou_loss': 0
  }
  self.assertAllClose(outputs, expect_results, rtol=.1, atol=100.)

  # Test fit.
  hist = model.fit(
      x,
      labels,
      steps_per_epoch=1,
      epochs=1,
      callbacks=train_lib.get_callbacks(params))
  expect_results = {
      'loss': [26063.099609375],
      'det_loss': [26061.8828125],
      'cls_loss': [5058.1337890625],
      'box_loss': [420.074951171875],
      'box_iou_loss': [0],
      'gnorm': [5107.46435546875]
  }
  self.assertAllClose(hist.history, expect_results, rtol=.1, atol=100.)
def test_train(self):
  tf.random.set_seed(1111)
  config = hparams_config.get_detection_config('efficientdet-d0')
  config.heads = ['object_detection', 'segmentation']
  config.batch_size = 1
  config.num_examples_per_epoch = 1
  config.model_dir = tempfile.mkdtemp()
  config.steps_per_epoch = 1
  x = tf.ones((1, 512, 512, 3))
  labels = {
      'box_targets_%d' % i: tf.ones((1, 512 // 2**i, 512 // 2**i, 36))
      for i in range(3, 8)
  }
  labels.update({
      'cls_targets_%d' % i: tf.ones((1, 512 // 2**i, 512 // 2**i, 9),
                                    dtype=tf.int32) for i in range(3, 8)
  })
  labels.update({'image_masks': tf.ones((1, 128, 128, 1))})
  labels.update({'mean_num_positives': tf.constant([10.0])})

  params = config.as_dict()
  params['num_shards'] = 1
  model = train_lib.EfficientDetNetTrain(config=config)
  model.build((1, 512, 512, 3))
  model.compile(
      optimizer=train_lib.get_optimizer(params),
      loss={
          'box_loss':
              train_lib.BoxLoss(
                  params['delta'], reduction=tf.keras.losses.Reduction.NONE),
          'box_iou_loss':
              train_lib.BoxIouLoss(
                  params['iou_loss_type'],
                  params['min_level'],
                  params['max_level'],
                  params['num_scales'],
                  params['aspect_ratios'],
                  params['anchor_scale'],
                  params['image_size'],
                  reduction=tf.keras.losses.Reduction.NONE),
          'class_loss':
              train_lib.FocalLoss(
                  params['alpha'],
                  params['gamma'],
                  label_smoothing=params['label_smoothing'],
                  reduction=tf.keras.losses.Reduction.NONE),
          'seg_loss':
              tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
      })

  # Test single-batch
  outputs = model.train_on_batch(x, labels, return_dict=True)
  expect_results = {
      'loss': [26278.3, 5061.9, 425.5, 1.217],
      'det_loss': 26277.033203125,
      'cls_loss': 5060.716796875,
      'box_loss': 424.3263244628906,
      'gnorm': 5873.78759765625,
      'seg_loss': 1.2215478420257568,
  }
  self.assertAllClose(outputs, expect_results, rtol=.1, atol=100.)

  outputs = model.test_on_batch(x, labels, return_dict=True)
  expect_results = {
      'loss': [26278.3, 5061.9, 425.5, 1.217],
      'det_loss': 26078.49609375,
      'cls_loss': 5063.3759765625,
      'box_loss': 420.30242919921875,
      'seg_loss': 1.2299377918243408,
  }
  self.assertAllClose(outputs, expect_results, rtol=.1, atol=100.)

  # Test fit.
  hist = model.fit(
      x,
      labels,
      steps_per_epoch=1,
      epochs=1,
      callbacks=train_lib.get_callbacks(params))
  self.assertAllClose(
      hist.history['loss'], [[26067, 5057.5, 421.4, 1.2]], rtol=.1, atol=10.)
  self.assertAllClose(hist.history['det_loss'], [26061.], rtol=.1, atol=10.)
  self.assertAllClose(hist.history['cls_loss'], [5058.], rtol=.1, atol=10.)
  self.assertAllClose(hist.history['box_loss'], [420.], rtol=.1, atol=100.)
  self.assertAllClose(hist.history['seg_loss'], [1.2299], rtol=.1, atol=100.)
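# Hedged helper mirroring how both tests build dummy labels: for each feature
# level i the spatial size is 512 // 2**i, box targets carry 36 channels
# (assumed to be 9 anchors x 4 box coordinates) and class targets carry 9
# (one per anchor). Channel counts are inferred from the tests above, not
# from library docs.
import tensorflow as tf

def make_dummy_labels(image_size=512, min_level=3, max_level=7):
  labels = {}
  for i in range(min_level, max_level + 1):
    size = image_size // 2**i
    labels['box_targets_%d' % i] = tf.ones((1, size, size, 36))
    labels['cls_targets_%d' % i] = tf.ones((1, size, size, 9), dtype=tf.int32)
  labels['mean_num_positives'] = tf.constant([10.0])
  return labels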
def main(_):
  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  if FLAGS.use_xla and FLAGS.strategy != 'tpu':
    tf.config.optimizer.set_jit(True)
    for gpu in tf.config.list_physical_devices('GPU'):
      tf.config.experimental.set_memory_growth(gpu, True)

  if FLAGS.debug:
    tf.config.experimental_run_functions_eagerly(True)
    tf.debugging.set_log_device_placement(True)
    tf.random.set_seed(111111)
    logging.set_verbosity(logging.DEBUG)

  if FLAGS.strategy == 'tpu':
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
    ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver)
    logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
  elif FLAGS.strategy == 'gpus':
    ds_strategy = tf.distribute.MirroredStrategy()
    logging.info('All devices: %s', tf.config.list_physical_devices('GPU'))
  else:
    if tf.config.list_physical_devices('GPU'):
      ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0')
    else:
      ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0')

  # Check data path
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')

  params = dict(
      config.as_dict(),
      model_name=FLAGS.model_name,
      iterations_per_loop=FLAGS.iterations_per_loop,
      model_dir=FLAGS.model_dir,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      strategy=FLAGS.strategy,
      batch_size=FLAGS.batch_size // ds_strategy.num_replicas_in_sync,
      num_shards=ds_strategy.num_replicas_in_sync,
      val_json_file=FLAGS.val_json_file,
      testdev_dir=FLAGS.testdev_dir,
      mode=FLAGS.mode)

  # Set mixed precision policy by the Keras api.
  precision = utils.get_precision(params['strategy'],
                                  params['mixed_precision'])
  policy = tf.keras.mixed_precision.experimental.Policy(precision)
  tf.keras.mixed_precision.experimental.set_policy(policy)

  def get_dataset(is_training, params):
    file_pattern = (
        FLAGS.training_file_pattern
        if is_training else FLAGS.validation_file_pattern)
    return dataloader.InputReader(
        file_pattern,
        is_training=is_training,
        use_fake_data=FLAGS.use_fake_data,
        max_instances_per_image=config.max_instances_per_image)(
            params)

  with ds_strategy.scope():
    model = train_lib.EfficientDetNetTrain(params['model_name'], config)
    height, width = utils.parse_image_size(params['image_size'])
    model.build((params['batch_size'], height, width, 3))
    model.compile(
        optimizer=train_lib.get_optimizer(params),
        loss={
            'box_loss':
                train_lib.BoxLoss(
                    params['delta'], reduction=tf.keras.losses.Reduction.NONE),
            'box_iou_loss':
                train_lib.BoxIouLoss(
                    params['iou_loss_type'],
                    params['min_level'],
                    params['max_level'],
                    params['num_scales'],
                    params['aspect_ratios'],
                    params['anchor_scale'],
                    params['image_size'],
                    reduction=tf.keras.losses.Reduction.NONE),
            'class_loss':
                train_lib.FocalLoss(
                    params['alpha'],
                    params['gamma'],
                    label_smoothing=params['label_smoothing'],
                    reduction=tf.keras.losses.Reduction.NONE)
        })

    ckpt_path = tf.train.latest_checkpoint(FLAGS.model_dir)
    if ckpt_path:
      model.load_weights(ckpt_path)
    model.freeze_vars(params['var_freeze_expr'])
    model.fit(
        get_dataset(True, params=params),
        steps_per_epoch=FLAGS.num_examples_per_epoch,
        callbacks=train_lib.get_callbacks(params, FLAGS.profile),
        validation_data=get_dataset(False, params=params),
        validation_steps=FLAGS.eval_samples)
    model.save_weights(os.path.join(FLAGS.model_dir, 'model'))
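# `freeze_vars(var_freeze_expr)` above suggests regex-based variable freezing.
# A hedged sketch of that technique on a generic Keras model; the helper name
# and semantics here are assumptions, not the library implementation.
import re
import tensorflow as tf

def freeze_vars_sketch(model: tf.keras.Model, pattern: str):
  """Mark layers whose variable names match `pattern` as non-trainable."""
  if not pattern:
    return
  regex = re.compile(pattern)
  for layer in model.layers:
    # Freeze a layer when any of its variable names matches the expression.
    if any(regex.match(v.name) for v in layer.variables):
      layer.trainable = False

# Example: freeze_vars_sketch(model, r'(efficientnet|fpn_cells)')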
def main(_):
  # Parse and override hparams
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  if FLAGS.use_xla and FLAGS.strategy != 'tpu':
    tf.config.optimizer.set_jit(True)
    for gpu in tf.config.list_physical_devices('GPU'):
      tf.config.experimental.set_memory_growth(gpu, True)

  if FLAGS.debug:
    tf.config.experimental_run_functions_eagerly(True)
    tf.debugging.set_log_device_placement(True)
    tf.random.set_seed(111111)
    logging.set_verbosity(logging.DEBUG)

  if FLAGS.strategy == 'tpu':
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
    ds_strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver)
    logging.info('All devices: %s', tf.config.list_logical_devices('TPU'))
  elif FLAGS.strategy == 'gpus':
    ds_strategy = tf.distribute.MirroredStrategy()
    logging.info('All devices: %s', tf.config.list_physical_devices('GPU'))
  else:
    if tf.config.list_physical_devices('GPU'):
      ds_strategy = tf.distribute.OneDeviceStrategy('device:GPU:0')
    else:
      ds_strategy = tf.distribute.OneDeviceStrategy('device:CPU:0')

  steps_per_epoch = FLAGS.num_examples_per_epoch // FLAGS.batch_size
  params = dict(
      config.as_dict(),
      profile=FLAGS.profile,
      model_name=FLAGS.model_name,
      iterations_per_loop=FLAGS.iterations_per_loop,
      model_dir=FLAGS.model_dir,
      steps_per_epoch=steps_per_epoch,
      strategy=FLAGS.strategy,
      batch_size=FLAGS.batch_size,
      num_shards=ds_strategy.num_replicas_in_sync)

  # Set mixed precision policy by the Keras api.
  precision = utils.get_precision(params['strategy'],
                                  params['mixed_precision'])
  policy = tf.keras.mixed_precision.experimental.Policy(precision)
  tf.keras.mixed_precision.experimental.set_policy(policy)

  def get_dataset(is_training, params):
    file_pattern = (
        FLAGS.training_file_pattern
        if is_training else FLAGS.validation_file_pattern)
    if not file_pattern:
      raise ValueError('No matching files.')

    return dataloader.InputReader(
        file_pattern,
        is_training=is_training,
        use_fake_data=FLAGS.use_fake_data,
        max_instances_per_image=config.max_instances_per_image)(
            params)

  with ds_strategy.scope():
    model = train_lib.EfficientDetNetTrain(params['model_name'], config)
    model.compile(
        optimizer=train_lib.get_optimizer(params),
        loss={
            'box_loss':
                train_lib.BoxLoss(
                    params['delta'], reduction=tf.keras.losses.Reduction.NONE),
            'box_iou_loss':
                train_lib.BoxIouLoss(
                    params['iou_loss_type'],
                    params['min_level'],
                    params['max_level'],
                    params['num_scales'],
                    params['aspect_ratios'],
                    params['anchor_scale'],
                    params['image_size'],
                    reduction=tf.keras.losses.Reduction.NONE),
            'class_loss':
                train_lib.FocalLoss(
                    params['alpha'],
                    params['gamma'],
                    label_smoothing=params['label_smoothing'],
                    reduction=tf.keras.losses.Reduction.NONE),
            'seg_loss':
                tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=True,
                    reduction=tf.keras.losses.Reduction.NONE)
        })

    if FLAGS.pretrained_ckpt:
      ckpt_path = tf.train.latest_checkpoint(FLAGS.pretrained_ckpt)
      util_keras.restore_ckpt(model, ckpt_path,
                              params['moving_average_decay'])

    tf.io.gfile.makedirs(FLAGS.model_dir)
    if params['model_optimizations']:
      model_optimization.set_config(params['model_optimizations'])
    model.build((FLAGS.batch_size, *config.image_size, 3))
    model.fit(
        get_dataset(True, params=params),
        epochs=params['num_epochs'],
        steps_per_epoch=steps_per_epoch,
        callbacks=train_lib.get_callbacks(params),
        validation_data=get_dataset(False, params=params).repeat(),
        validation_steps=(FLAGS.eval_samples // FLAGS.batch_size))
  model.save_weights(os.path.join(FLAGS.model_dir, 'ckpt-final'))
def main(argv): del argv # Unused. # if given an efficentdet ckpt don't use default backbone ckpt if FLAGS.backbone_ckpt == BACKBONE_CKPT_DEFAULT_DIR and FLAGS.ckpt is not None: print("Using ckpt flag: {}, ignoring default backbone_ckpt: {}".format( FLAGS.ckpt, FLAGS.backbone_ckpt)) FLAGS.backbone_ckpt = None if FLAGS.use_horovod is not None: if FLAGS.dump_all_ranks: FLAGS.model_dir += "/worker_" + str(hvd.rank()) if not 'HOROVOD_CYCLE_TIME' in os.environ: os.environ['HOROVOD_CYCLE_TIME'] = '0.5' if not 'HABANA_HCCL_COMM_API' in os.environ: os.environ['HABANA_HCCL_COMM_API'] = '0' hvd_init() if not FLAGS.no_hpu: from habana_frameworks.tensorflow import load_habana_module load_habana_module() if FLAGS.use_horovod: assert (horovod_enabled()) set_env(use_amp=FLAGS.use_amp) # deterministic setting if FLAGS.sbs_test or FLAGS.deterministic: set_deterministic() # Check data path if FLAGS.mode in ( 'train', 'train_and_eval') and FLAGS.training_file_pattern is None: raise RuntimeError( 'You must specify --training_file_pattern for training.') if FLAGS.mode in ('eval', 'train_and_eval'): if FLAGS.validation_file_pattern is None: raise RuntimeError('You must specify --validation_file_pattern ' 'for evaluation.') if not FLAGS.val_json_file and not FLAGS.testdev_dir: raise RuntimeError( 'You must specify --val_json_file or --testdev for evaluation.' ) # Parse and override hparams config = hparams_config.get_detection_config(FLAGS.model_name) config.override(FLAGS.hparams) # The following is for spatial partitioning. `features` has one tensor while # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input # partition is performed on `features` and all partitionable tensors of # `labels`, see the partition logic below. # In the TPUEstimator context, the meaning of `shard` and `replica` is the # same; follwing the API, here has mixed use of both. if FLAGS.use_spatial_partition: # Checks input_partition_dims agrees with num_cores_per_replica. if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims): raise RuntimeError( '--num_cores_per_replica must be a product of array' 'elements in --input_partition_dims.') labels_partition_dims = { 'mean_num_positives': None, 'source_ids': None, 'groundtruth_data': None, 'image_scales': None, } # The Input Partition Logic: We partition only the partition-able tensors. # Spatial partition requires that the to-be-partitioned tensors must have a # dimension that is a multiple of `partition_dims`. Depending on the # `partition_dims` and the `image_size` and the `max_level` in config, some # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this # case, the level-8 and level-9 target tensors are not partition-able, and # the highest partition-able level is 7. 
image_size = config.get('image_size') for level in range(config.get('min_level'), config.get('max_level') + 1): def _can_partition(spatial_dim): partitionable_index = np.where( spatial_dim % np.array(FLAGS.input_partition_dims) == 0) return len(partitionable_index[0]) == len( FLAGS.input_partition_dims) spatial_dim = image_size // (2**level) if _can_partition(spatial_dim): labels_partition_dims['box_targets_%d' % level] = FLAGS.input_partition_dims labels_partition_dims['cls_targets_%d' % level] = FLAGS.input_partition_dims else: labels_partition_dims['box_targets_%d' % level] = None labels_partition_dims['cls_targets_%d' % level] = None num_cores_per_replica = FLAGS.num_cores_per_replica input_partition_dims = [ FLAGS.input_partition_dims, labels_partition_dims ] num_shards = FLAGS.num_cores // num_cores_per_replica else: num_cores_per_replica = None input_partition_dims = None num_shards = FLAGS.num_cores if horovod_enabled(): num_shards = hvd.size() else: num_shards = 1 params = build_estimator_params('train', config, num_shards) # disabling input data scaling/flip manipulations. if FLAGS.sbs_test: sbs_params = dict(input_rand_hflip=False, train_scale_min=1, train_scale_max=1, dropout_rate=0.0) params.update(sbs_params) tf_random_seed = 0 if FLAGS.deterministic else None run_config = build_estimator_config('train', config, num_shards, num_cores_per_replica, input_partition_dims) write_hparams_v1(FLAGS.model_dir, { 'batch_size': FLAGS.train_batch_size, **FLAGS.flag_values_dict() }) model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name) # TPU Estimator logging.info(params) if FLAGS.mode == 'train': train_estimator = HorovodEstimator(model_fn=model_fn_instance, model_dir=FLAGS.model_dir, config=run_config, params=params) # for deterministic input, we pass to dataloader False for not manipulating input data is_training = not FLAGS.deterministic use_fake_data = FLAGS.use_fake_data or FLAGS.deterministic input_fn = dataloader.InputReader(FLAGS.training_file_pattern, is_training=is_training, params=params, use_fake_data=use_fake_data, is_deterministic=FLAGS.deterministic) max_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) / (FLAGS.train_batch_size * num_shards)) + 1 # for sbs test, train under sbs callbacks if FLAGS.sbs_test: from TensorFlow.common.debug import dump_callback SBS_TEST_CONFIG = os.path.join( os.environ['TF_TESTS_ROOT'], "tests/tf_training_tests/side_by_side/topologies/efficientdet/dump_config.json" ) with dump_callback(SBS_TEST_CONFIG): train_estimator.train(input_fn=input_fn, max_steps=max_steps) else: if FLAGS.ckpt is not None: train_estimator.train(input_fn=input_fn, steps=max_steps) else: train_estimator.train(input_fn=input_fn, max_steps=max_steps) elif FLAGS.mode == 'eval': eval_params = build_estimator_params('eval', config, num_shards) eval_config = build_estimator_config('eval', config, num_shards, num_cores_per_replica, input_partition_dims) # Eval only runs on CPU or GPU host with batch_size = 1. # Override the default options: disable randomization in the input pipeline # and don't run on the TPU. # Also, disable use_bfloat16 for eval on CPU/GPU. 
    eval_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=False,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=eval_config,
        params=eval_params)

    def terminate_eval():
      logging.info('Terminating eval after %d seconds of no checkpoints',
                   FLAGS.eval_timeout)
      return True

    # Run evaluation whenever there's a new checkpoint.
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):
      logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
        logging.info('Eval results: %s', eval_results)

        # Terminate the eval job when the final checkpoint is reached.
        try:
          current_step = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          logging.info('%s has no global step info: stop!', ckpt)
          break

        write_summary(eval_results, ckpt, current_step)
        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          logging.info('Evaluation finished after training step %d',
                       current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                     ckpt)

  elif FLAGS.mode == 'train_and_eval':
    train_params = build_estimator_params('train', config, num_shards)
    train_config = build_estimator_config('train', config, num_shards,
                                          num_cores_per_replica,
                                          input_partition_dims)
    train_estimator = HorovodEstimator(
        model_fn=model_fn_instance,
        model_dir=FLAGS.model_dir,
        config=train_config,
        params=train_params)
    eval_estimator = None

    for cycle in range(FLAGS.num_epochs):
      logging.info('Starting training cycle, epoch: %d.', cycle)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              FLAGS.training_file_pattern,
              is_training=True,
              use_fake_data=FLAGS.use_fake_data),
          max_steps=(cycle + 1) *
          int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))

      # Synchronization point for all ranks.
      if horovod_enabled():
        hvd.allreduce(tf.constant(0))

      logging.info('Starting evaluation cycle, epoch: %d.', cycle)
      # Run evaluation after every epoch.
      if eval_estimator is None:
        eval_params = build_estimator_params('eval', config, num_shards)
        eval_config = build_estimator_config('eval', config, num_shards,
                                             num_cores_per_replica,
                                             input_partition_dims)
        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=False,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=eval_config,
            params=eval_params)

      if is_rank0():
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            steps=FLAGS.eval_samples // FLAGS.eval_batch_size)

        checkpoint_path = Path(FLAGS.model_dir)
        last_ckpt = tf.train.latest_checkpoint(str(checkpoint_path),
                                               latest_filename=None)
        current_step = int(os.path.basename(last_ckpt).split('-')[1])
        write_summary(eval_results, FLAGS.model_dir, current_step)
        logging.info('Evaluation results: %s', eval_results)

        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
  else:
    logging.info('Mode not found.')
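

# A minimal sketch of how this entry point might be invoked (assuming the
# file is named main.py; all paths and values below are illustrative
# placeholders, not taken from this repository):
#
#   python main.py \
#     --mode=train_and_eval \
#     --model_name=efficientdet-d0 \
#     --model_dir=/tmp/efficientdet-d0 \
#     --training_file_pattern=/data/coco/train-*.tfrecord \
#     --validation_file_pattern=/data/coco/val-*.tfrecord \
#     --val_json_file=/data/coco/instances_val2017.json \
#     --num_examples_per_epoch=117000 \
#     --num_epochs=1 \
#     --train_batch_size=8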
        labels['cls_targets_%d' % level] = cls_targets[level]
        labels['box_targets_%d' % level] = box_targets[level]
      # Concatenate groundtruth annotations into a single tensor.
      groundtruth_data = tf.concat([boxes, areas, classes], axis=2)
      # labels['source_ids'] = source_ids
      labels['groundtruth_data'] = groundtruth_data
      labels['image_scales'] = image_scales
      return images, labels

    dataset = dataset.map(_process_example)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    # if self._use_fake_data:
    #   # Turn this dataset into a semi-fake dataset which always loops over
    #   # the first batch. This reduces variance in performance and is useful
    #   # for testing.
    #   dataset = dataset.take(1).cache().repeat()
    return dataset


if __name__ == '__main__':
  input_reader = InputReader(
      'F:/automl-master/efficientdet/dataset/tfrecord/-00000-of-00001.tfrecord',
      is_training=True,
      use_fake_data=False)
  config = hparams_config.get_detection_config('efficientdet-d1')
  params = dict(config.as_dict())
  dataset = input_reader(params)
  iterator = dataset.make_one_shot_iterator()
  images, labels = iterator.get_next()
  with tf.Session() as sess:
    print(sess.run(images))
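

# Note: the smoke test above uses the TF1 iterator/session API
# (make_one_shot_iterator / tf.Session). Under TF2 eager execution, an
# equivalent check could iterate the dataset directly; a sketch, assuming
# the same `dataset` object as above:
#
#   for images, labels in dataset.take(1):
#     print(images.shape)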
def main(argv):
  assert len(argv) >= 1
  if len(argv) > 1:  # Do not accept unknown args.
    raise ValueError('Received unknown arguments: {}'.format(argv[1:]))

  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path.
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError(
        'You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')

  # Parse and override hparams.
  config = hparams_config.get_detection_config(FLAGS.model_name)
  config.override(FLAGS.hparams)
  if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
    config.num_epochs = FLAGS.num_epochs

  # Parse image size in case it is in string format.
  config.image_size = utils.parse_image_size(config.image_size)

  # The following is for spatial partitioning. `features` has one tensor,
  # while `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The
  # input partition is performed on `features` and all partitionable tensors
  # of `labels`; see the partition logic below.
  # In the TPUEstimator context, `shard` and `replica` mean the same thing;
  # following the API, both terms are used here.
  if FLAGS.use_spatial_partition:
    # Check that input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be a product of array '
                         'elements in --input_partition_dims.')
    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
    }
    # The input partition logic: we partition only the partitionable tensors.
    # Spatial partitioning requires that each to-be-partitioned tensor has a
    # spatial dimension that is a multiple of `partition_dims`. Depending on
    # `partition_dims` and the `image_size` and `max_level` in config, some
    # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
    # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], the
    # image size is 1536 and `max_level` is 9, `cls_targets_8` has a shape of
    # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
    # case, the level-8 and level-9 target tensors are not partitionable, and
    # the highest partitionable level is 7.
    feat_sizes = utils.get_feat_sizes(
        config.get('image_size'), config.get('max_level'))
    for level in range(config.get('min_level'), config.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = feat_sizes[level]
      if _can_partition(spatial_dim['height']) and _can_partition(
          spatial_dim['width']):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None
    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  params = dict(
      config.as_dict(),
      model_name=FLAGS.model_name,
      iterations_per_loop=FLAGS.iterations_per_loop,
      model_dir=FLAGS.model_dir,
      num_shards=num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      backbone_ckpt=FLAGS.backbone_ckpt,
      ckpt=FLAGS.ckpt,
      val_json_file=FLAGS.val_json_file,
      testdev_dir=FLAGS.testdev_dir,
      mode=FLAGS.mode,
  )
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)

  tpu_config = tf.estimator.tpu.TPUConfig(
      FLAGS.iterations_per_loop,
      num_shards=num_shards,
      num_cores_per_replica=num_cores_per_replica,
      input_partition_dims=input_partition_dims,
      per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
      .PER_HOST_V2)

  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config,
      tf_random_seed=FLAGS.tf_random_seed,
  )

  model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)

  # TPU Estimator.
  logging.info(params)
  if FLAGS.mode == 'train':
    train_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(
            FLAGS.training_file_pattern,
            is_training=True,
            use_fake_data=FLAGS.use_fake_data),
        max_steps=int((config.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          use_tpu=FLAGS.use_tpu,
          input_rand_hflip=False,
          is_training_bn=False,
          precision=None,
      )
      eval_estimator = tf.estimator.tpu.TPUEstimator(
          model_fn=model_fn_instance,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(
              FLAGS.validation_file_pattern, is_training=False),
          steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
      logging.info('Eval results: %s', eval_results)
      ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
      utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

  elif FLAGS.mode == 'eval':
    # Eval only runs on CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input
    # pipeline and don't run on the TPU.
    eval_params = dict(
        params,
        use_tpu=FLAGS.use_tpu,
        input_rand_hflip=False,
        is_training_bn=False,
        precision=None,
    )
    eval_estimator = tf.estimator.tpu.TPUEstimator(
        model_fn=model_fn_instance,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      logging.info('Terminating eval after %d seconds of no checkpoints',
                   FLAGS.eval_timeout)
      return True

    # Run evaluation whenever there's a new checkpoint.
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):
      logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
        logging.info('Eval results: %s', eval_results)

        # Terminate the eval job when the final checkpoint is reached.
        try:
          current_step = int(os.path.basename(ckpt).split('-')[1])
        except IndexError:
          logging.info('%s has no global step info: stop!', ckpt)
          break

        utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        total_step = int((config.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          logging.info('Evaluation finished after training step %d',
                       current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                     ckpt)

  elif FLAGS.mode == 'train_and_eval':
    # When resuming, global_step already holds the trained step count, so the
    # cycle index here is unrelated to it.
    for cycle in range(config.num_epochs):
      logging.info('Starting training cycle, epoch: %d.', cycle)
      train_estimator = tf.estimator.tpu.TPUEstimator(
          model_fn=model_fn_instance,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              FLAGS.training_file_pattern,
              is_training=True,
              use_fake_data=FLAGS.use_fake_data),
          steps=int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))

      logging.info('Starting evaluation cycle, epoch: %d.', cycle)
      # Run evaluation after every epoch.
      eval_params = dict(
          params,
          use_tpu=FLAGS.use_tpu,
          input_rand_hflip=False,
          is_training_bn=False,
      )
      eval_estimator = tf.estimator.tpu.TPUEstimator(
          model_fn=model_fn_instance,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(
              FLAGS.validation_file_pattern, is_training=False),
          steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
      logging.info('Evaluation results: %s', eval_results)
      ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)

      logging.info('save_checkpoint_start for epoch: %d', cycle)
      now = datetime.now().strftime('%Y%m%d%H%M%S')
      with open('/tmp/main_inner.log', 'a') as f:
        f.write(f'{now}: save_checkpoint_start {cycle}\n')
      utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
      # Refresh the timestamp so the end marker does not reuse the start time.
      now = datetime.now().strftime('%Y%m%d%H%M%S')
      with open('/tmp/main_inner.log', 'a') as f:
        f.write(f'{now}: save_checkpoint_end {cycle}\n')
      logging.info('save_checkpoint_end for epoch: %d', cycle)
  else:
    logging.info('Mode not found.')
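

# For reference, the `current_step` parsing above relies on TF checkpoint
# basenames of the form 'model.ckpt-<global_step>'. A minimal sketch of the
# same extraction (the path below is an illustrative placeholder):
#
#   ckpt = '/tmp/model_dir/model.ckpt-12345'
#   int(os.path.basename(ckpt).split('-')[1])  # -> 12345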