def coco_eval(predictions, current_epoch, current_step, summary_writer):
  """Call the coco library to get the eval metrics."""
  global SUCCESS
  eval_results = coco_metric.compute_map(predictions, FLAGS.val_json_file)
  mlperf_log.ssd_print(key=mlperf_log.EVAL_STOP, value=current_epoch)
  mlperf_log.ssd_print(key=mlperf_log.EVAL_SIZE, value=FLAGS.eval_samples)
  mlperf_log.ssd_print(
      key=mlperf_log.EVAL_ACCURACY,
      value={
          'epoch': current_epoch,
          'value': eval_results['COCO/AP']
      })
  mlperf_log.ssd_print(
      key=mlperf_log.EVAL_TARGET, value=ssd_constants.EVAL_TARGET)
  mlperf_log.ssd_print(
      key=mlperf_log.EVAL_ITERATION_ACCURACY,
      value={
          'iteration': current_step,
          'value': eval_results['COCO/AP']
      })
  print('The coco AP is: {}\n'.format(eval_results['COCO/AP']))
  if eval_results['COCO/AP'] >= ssd_constants.EVAL_TARGET and not SUCCESS:
    mlperf_log.ssd_print(key=mlperf_log.RUN_STOP, value={'success': 'true'})
    mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
    SUCCESS = True
  tf.logging.info('Eval results: %s' % eval_results)

  # Write out eval results for the checkpoint.
  with tf.Graph().as_default():
    summaries = []
    for metric in eval_results:
      summaries.append(
          tf.Summary.Value(tag=metric, simple_value=eval_results[metric]))
    tf_summary = tf.Summary(value=list(summaries))
    summary_writer.add_summary(tf_summary, current_step)
def predict_post_processing(q_in, q_out):
  """Run post-processing on CPU for predictions."""
  coco_gt = coco_metric.create_coco(
      FLAGS.val_json_file, use_cpp_extension=True)
  current_step, predictions = q_in.get()
  while current_step != _STOP and q_out is not None:
    tf.logging.info('Start to predict for step %d.', current_step)
    q_out.put((current_step,
               coco_metric.compute_map(
                   predictions, coco_gt, use_cpp_extension=True,
                   nms_on_tpu=True)))
    current_step, predictions = q_in.get()
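# Illustrative sketch (not from the original source): how a caller would be
# expected to wire `predict_post_processing` to a worker process and shut it
# down with the `_STOP` sentinel. The `current_step` and `predictions` values
# here are placeholders.
#
#   import multiprocessing
#
#   q_in = multiprocessing.Queue()
#   q_out = multiprocessing.Queue()
#   worker = multiprocessing.Process(
#       target=predict_post_processing, args=(q_in, q_out))
#   worker.daemon = True
#   worker.start()
#
#   q_in.put((current_step, predictions))  # one message per eval cycle
#   step, eval_results = q_out.get()       # blocks until metrics are ready
#   q_in.put((_STOP, None))                # sentinel ends the worker loop
#   worker.join()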
def postprocess(self, results):
  """Postprocess results returned from model."""
  try:
    import coco_metric  # pylint: disable=g-import-not-at-top
  except ImportError:
    raise ImportError('To use the COCO dataset, you must clone the '
                      'repo https://github.com/tensorflow/models and add '
                      'tensorflow/models and tensorflow/models/research to '
                      'the PYTHONPATH, and compile the protobufs by '
                      'following https://github.com/tensorflow/models/blob/'
                      'master/research/object_detection/g3doc/installation.md'
                      '#protobuf-compilation ; To evaluate using the COCO '
                      'metric, download and install the Python COCO API from '
                      'https://github.com/cocodataset/cocoapi')

  pred_boxes = results[ssd_constants.PRED_BOXES]
  pred_scores = results[ssd_constants.PRED_SCORES]
  # TODO(haoyuzhang): maybe use these values for visualization.
  # gt_boxes = results['gt_boxes']
  # gt_classes = results['gt_classes']
  source_id = results[ssd_constants.SOURCE_ID]
  raw_shape = results[ssd_constants.RAW_SHAPE]

  for i in range(self.get_batch_size()):
    self.predictions[int(source_id[i])] = {
        ssd_constants.PRED_BOXES: pred_boxes[i],
        ssd_constants.PRED_SCORES: pred_scores[i],
        ssd_constants.SOURCE_ID: source_id[i],
        ssd_constants.RAW_SHAPE: raw_shape[i]
    }

  # The COCO metric calculates mAP only after a full epoch of evaluation.
  # Return dummy results for top_N_accuracy to be compatible with
  # benchmark_cnn.py.
  if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES:
    annotation_file = os.path.join(self.params.data_dir,
                                   ssd_constants.ANNOTATION_FILE)
    eval_results = coco_metric.compute_map(self.predictions.values(),
                                           annotation_file)
    ret = {'top_1_accuracy': 0., 'top_5_accuracy': 0.}
    for metric_key, metric_value in eval_results.items():
      ret['simple_value:' + metric_key] = metric_value
    return ret
  log_fn('Got {:d} out of {:d} eval examples.'
         ' Waiting for the remaining to calculate mAP...'.format(
             len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES))
  return {'top_1_accuracy': 0., 'top_5_accuracy': 0.}
def coco_eval(predictions,
              current_step,
              summary_writer,
              coco_gt,
              use_cpp_extension=True,
              nms_on_tpu=True):
  """Call the coco library to get the eval metrics."""
  global SUCCESS
  eval_results = coco_metric.compute_map(
      predictions,
      coco_gt,
      use_cpp_extension=use_cpp_extension,
      nms_on_tpu=nms_on_tpu)
  if eval_results['COCO/AP'] >= ssd_constants.EVAL_TARGET and not SUCCESS:
    SUCCESS = True
  tf.logging.info('Eval results: %s' % eval_results)

  # Write out eval results for the checkpoint.
  with tf.Graph().as_default():
    summaries = []
    for metric in eval_results:
      summaries.append(
          tf.Summary.Value(tag=metric, simple_value=eval_results[metric]))
    tf_summary = tf.Summary(value=list(summaries))
    summary_writer.add_summary(tf_summary, current_step)
def postprocess(self, results):
  """Postprocess results returned from model."""
  try:
    import coco_metric  # pylint: disable=g-import-not-at-top
  except ImportError:
    raise ImportError(
        'To use the COCO dataset, you must clone the '
        'repo https://github.com/tensorflow/models and add '
        'tensorflow/models and tensorflow/models/research to '
        'the PYTHONPATH, and compile the protobufs by '
        'following https://github.com/tensorflow/models/blob/'
        'master/research/object_detection/g3doc/installation.md'
        '#protobuf-compilation ; To evaluate using the COCO '
        'metric, download and install the Python COCO API from '
        'https://github.com/cocodataset/cocoapi')

  pred_boxes = results[ssd_constants.PRED_BOXES]
  pred_scores = results[ssd_constants.PRED_SCORES]
  # TODO(haoyuzhang): maybe use these values for visualization.
  # gt_boxes = results['gt_boxes']
  # gt_classes = results['gt_classes']
  source_id = results[ssd_constants.SOURCE_ID]
  raw_shape = results[ssd_constants.RAW_SHAPE]

  # COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once. Due
  # to rounding errors (i.e., COCO_NUM_VAL_IMAGES % batch_size != 0), setting
  # `num_eval_epochs` to 1 is not enough and will often miss some images. We
  # expect the user to set `num_eval_epochs` to >1, which will leave some
  # unused images from previous steps in `predictions`. Here we check if we
  # are doing eval at a new global step.
  if results['global_step'] > self.eval_global_step:
    self.eval_global_step = results['global_step']
    self.predictions.clear()

  for i, sid in enumerate(source_id):
    self.predictions[int(sid)] = {
        ssd_constants.PRED_BOXES: pred_boxes[i],
        ssd_constants.PRED_SCORES: pred_scores[i],
        ssd_constants.SOURCE_ID: source_id[i],
        ssd_constants.RAW_SHAPE: raw_shape[i]
    }

  # The COCO metric calculates mAP only after a full epoch of evaluation.
  # Return dummy results for top_N_accuracy to be compatible with
  # benchmark_cnn.py.
  if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES:
    log_fn('Got results for all {:d} eval examples. Calculating mAP...'.format(
        ssd_constants.COCO_NUM_VAL_IMAGES))
    annotation_file = os.path.join(self.params.data_dir,
                                   ssd_constants.ANNOTATION_FILE)
    eval_results = coco_metric.compute_map(self.predictions.values(),
                                           annotation_file)
    self.predictions.clear()
    ret = {'top_1_accuracy': 0., 'top_5_accuracy': 0.}
    for metric_key, metric_value in eval_results.items():
      ret[constants.SIMPLE_VALUE_RESULT_PREFIX + metric_key] = metric_value
    return ret
  log_fn('Got {:d} out of {:d} eval examples.'
         ' Waiting for the remaining to calculate mAP...'.format(
             len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES))
  return {'top_1_accuracy': 0., 'top_5_accuracy': 0.}
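# Worked example of the rounding issue handled above (numbers assumed:
# ssd_constants.COCO_NUM_VAL_IMAGES = 5000 and an effective eval batch of 32).
# One eval epoch yields floor(5000 / 32) = 156 full batches, i.e. 4992 images,
# so a single epoch can never reach the 5000-image threshold. With
# `num_eval_epochs` > 1 the threshold is reached, and keying `predictions` by
# source_id deduplicates images that are seen more than once.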
def postprocess(self, results):
  """Postprocess results returned from model."""
  try:
    import coco_metric  # pylint: disable=g-import-not-at-top
  except ImportError:
    raise ImportError('To use the COCO dataset, you must clone the '
                      'repo https://github.com/tensorflow/models and add '
                      'tensorflow/models and tensorflow/models/research to '
                      'the PYTHONPATH, and compile the protobufs by '
                      'following https://github.com/tensorflow/models/blob/'
                      'master/research/object_detection/g3doc/installation.md'
                      '#protobuf-compilation ; To evaluate using the COCO '
                      'metric, download and install the Python COCO API from '
                      'https://github.com/cocodataset/cocoapi')

  pred_boxes = results[ssd_constants.PRED_BOXES]
  pred_scores = results[ssd_constants.PRED_SCORES]
  # TODO(haoyuzhang): maybe use these values for visualization.
  # gt_boxes = results['gt_boxes']
  # gt_classes = results['gt_classes']
  source_id = results[ssd_constants.SOURCE_ID]
  raw_shape = results[ssd_constants.RAW_SHAPE]

  # COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once. Due
  # to rounding errors (i.e., COCO_NUM_VAL_IMAGES % batch_size != 0), setting
  # `num_eval_epochs` to 1 is not enough and will often miss some images. We
  # expect the user to set `num_eval_epochs` to >1, which will leave some
  # unused images from previous steps in `predictions`. Here we check if we
  # are doing eval at a new global step.
  if results['global_step'] > self.eval_global_step:
    self.eval_global_step = results['global_step']
    self.predictions.clear()

  for i, sid in enumerate(source_id):
    self.predictions[int(sid)] = {
        ssd_constants.PRED_BOXES: pred_boxes[i],
        ssd_constants.PRED_SCORES: pred_scores[i],
        ssd_constants.SOURCE_ID: source_id[i],
        ssd_constants.RAW_SHAPE: raw_shape[i]
    }

  # The COCO metric calculates mAP only after a full epoch of evaluation.
  # Return dummy results for top_N_accuracy to be compatible with
  # benchmark_cnn.py.
  if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES:
    log_fn('Got results for all {:d} eval examples. Calculating mAP...'.format(
        ssd_constants.COCO_NUM_VAL_IMAGES))
    annotation_file = os.path.join(self.params.data_dir,
                                   ssd_constants.ANNOTATION_FILE)
    # The size of predictions before decoding is about 15--30GB, while the
    # size after decoding is 100--200MB. When using async eval mode, decoding
    # takes 20--30 seconds of main-thread time but is necessary to avoid OOM
    # during inter-process communication.
    decoded_preds = coco_metric.decode_predictions(self.predictions.values())
    self.predictions.clear()

    if self.params.collect_eval_results_async:

      def _eval_results_getter():
        """Iteratively get eval results from the async eval process."""
        while True:
          step, eval_results = self.async_eval_results_queue.get()
          self.eval_coco_ap = eval_results['COCO/AP']
          mlperf.logger.log_eval_accuracy(
              self.eval_coco_ap, step,
              self.batch_size * self.params.num_gpus,
              ssd_constants.COCO_NUM_TRAIN_IMAGES)
          if self.reached_target():
            # Reached the target: clear all pending messages in the
            # predictions queue and insert a poison pill to stop the async
            # eval process.
            while not self.async_eval_predictions_queue.empty():
              self.async_eval_predictions_queue.get()
            self.async_eval_predictions_queue.put('STOP')
            break

      if not self.async_eval_process:
        # Limit the number of messages in the predictions queue to prevent
        # OOM. Each message (predictions data) can potentially consume a lot
        # of memory, and normally there should only be a few messages in the
        # queue. If this often blocks, consider reducing the eval frequency.
        self.async_eval_predictions_queue = multiprocessing.Queue(2)
        self.async_eval_results_queue = multiprocessing.Queue()

        # The reason to use a Process rather than a Thread is the
        # computationally intensive eval runner: Python threads do not run
        # truly in parallel, so a runner thread would be significantly
        # delayed (or would significantly delay the main thread).
        self.async_eval_process = multiprocessing.Process(
            target=coco_metric.async_eval_runner,
            args=(self.async_eval_predictions_queue,
                  self.async_eval_results_queue, annotation_file))
        self.async_eval_process.daemon = True
        self.async_eval_process.start()

        self.async_eval_results_getter_thread = threading.Thread(
            target=_eval_results_getter, args=())
        self.async_eval_results_getter_thread.daemon = True
        self.async_eval_results_getter_thread.start()

      self.async_eval_predictions_queue.put(
          (self.eval_global_step, decoded_preds))
      return {'top_1_accuracy': 0., 'top_5_accuracy': 0.}

    eval_results = coco_metric.compute_map(decoded_preds, annotation_file)
    self.eval_coco_ap = eval_results['COCO/AP']
    ret = {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
    for metric_key, metric_value in eval_results.items():
      ret[constants.SIMPLE_VALUE_RESULT_PREFIX + metric_key] = metric_value
    mlperf.logger.log_eval_accuracy(self.eval_coco_ap, self.eval_global_step,
                                    self.batch_size * self.params.num_gpus,
                                    ssd_constants.COCO_NUM_TRAIN_IMAGES)
    return ret

  log_fn('Got {:d} out of {:d} eval examples.'
         ' Waiting for the remaining to calculate mAP...'.format(
             len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES))
  return {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
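# Minimal, self-contained sketch (not from the original source) of the
# poison-pill pattern the async eval path above relies on; `run_eval` is a
# hypothetical stand-in for the per-message work that
# coco_metric.async_eval_runner performs.
#
#   def async_eval_runner_sketch(preds_queue, results_queue):
#     """Drain prediction messages until the 'STOP' poison pill arrives."""
#     while True:
#       message = preds_queue.get()
#       if message == 'STOP':
#         break
#       step, decoded_preds = message
#       results_queue.put((step, run_eval(decoded_preds)))
#
# The bounded multiprocessing.Queue(2) above provides backpressure: when eval
# is slower than training, the main thread blocks on put() instead of
# accumulating multi-GB prediction payloads in the queue.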
def main(argv):
  del argv  # Unused.
  global SUCCESS

  if not FLAGS.model_dir:
    raise RuntimeError('You must specify --model_dir.')

  # Check data path
  if FLAGS.mode in (
      'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError(
        'You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  run_config, params = construct_run_config(FLAGS.iterations_per_loop)

  if FLAGS.mode != 'eval' and FLAGS.mode != 'eval_once':
    if params['train_with_low_level_api']:
      params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
      trunner = train_low_level_runner.TrainLowLevelRunner(
          iterations=FLAGS.iterations_per_loop)
      input_fn = dataloader.SSDInputReader(
          FLAGS.training_file_pattern,
          params['transpose_input'],
          is_training=True,
          use_fake_data=FLAGS.use_fake_data)
      mlperf_log.ssd_print(key=mlperf_log.RUN_START)
      trunner.initialize(input_fn, ssd_model.ssd_model_fn, params)
    else:
      mlperf_log.ssd_print(key=mlperf_log.RUN_START)

  if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
    if params['eval_with_low_level_api']:
      params['batch_size'] = FLAGS.eval_batch_size // FLAGS.num_shards
      erunner = eval_low_level_runner.EvalLowLevelRunner(
          eval_steps=int(FLAGS.eval_samples / FLAGS.eval_batch_size))
      input_fn = dataloader.SSDInputReader(
          FLAGS.validation_file_pattern,
          is_training=False,
          use_fake_data=FLAGS.use_fake_data)
      erunner.initialize(input_fn, params)
      erunner.build_model(ssd_model.ssd_model_fn, params)

  # TPU Estimator
  if FLAGS.mode == 'train':
    if params['train_with_low_level_api']:
      train_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        FLAGS.train_batch_size)
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=0)
      trunner.train(train_steps)
      trunner.shutdown()
    else:
      if FLAGS.device == 'gpu':
        params['dataset_num_shards'] = 1
        params['dataset_index'] = 0
        train_params = dict(params)
        train_params['batch_size'] = FLAGS.train_batch_size
        train_estimator = tf.estimator.Estimator(
            model_fn=ssd_model.ssd_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=train_params)
      else:
        train_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
      tf.logging.info(params)

      mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=0)
      hooks = []
      if FLAGS.use_async_checkpoint:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      train_estimator.train(
          input_fn=dataloader.SSDInputReader(
              FLAGS.training_file_pattern,
              params['transpose_input'],
              is_training=True,
              use_fake_data=FLAGS.use_fake_data),
          steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size),
          hooks=hooks)

    if FLAGS.eval_after_training:
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=ssd_model.ssd_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=params)
      predictions = list(
          eval_estimator.predict(
              input_fn=dataloader.SSDInputReader(
                  FLAGS.validation_file_pattern,
                  is_training=False,
                  use_fake_data=FLAGS.use_fake_data)))
      eval_results = coco_metric.compute_map(predictions, FLAGS.val_json_file)
      tf.logging.info('Eval results: %s' % eval_results)

  elif FLAGS.mode == 'train_and_eval':
    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)
    current_step = 0
    mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
    threads = []
    for eval_step in ssd_constants.EVAL_STEPS:
      # Compute the actual eval steps based on the actual train_batch_size.
      steps = int(eval_step * ssd_constants.DEFAULT_BATCH_SIZE /
                  FLAGS.train_batch_size)
      current_epoch = current_step // params['steps_per_epoch']
      # TODO(wangtao): figure out how to log for each epoch.
      mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=current_epoch)
      tf.logging.info('Starting training cycle for %d steps.' % steps)
      if params['train_with_low_level_api']:
        trunner.train(steps)
      else:
        run_config, params = construct_run_config(steps)
        if FLAGS.device == 'gpu':
          train_params = dict(params)
          train_params['batch_size'] = FLAGS.train_batch_size
          train_estimator = tf.estimator.Estimator(
              model_fn=ssd_model.ssd_model_fn,
              model_dir=FLAGS.model_dir,
              config=run_config,
              params=train_params)
        else:
          train_estimator = tpu_estimator.TPUEstimator(
              model_fn=ssd_model.ssd_model_fn,
              use_tpu=FLAGS.use_tpu,
              train_batch_size=FLAGS.train_batch_size,
              config=run_config,
              params=params)
        tf.logging.info(params)
        train_estimator.train(
            input_fn=dataloader.SSDInputReader(
                FLAGS.training_file_pattern,
                params['transpose_input'],
                is_training=True,
                use_fake_data=FLAGS.use_fake_data),
            steps=steps)

      if SUCCESS:
        break

      current_step = current_step + steps
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('Starting evaluation cycle at step %d.' % current_step)
      mlperf_log.ssd_print(key=mlperf_log.EVAL_START, value=current_epoch)
      # Run evaluation at the given step.
      if params['eval_with_low_level_api']:
        predictions = list(erunner.predict())
      else:
        if FLAGS.device == 'gpu':
          eval_params = dict(params)
          eval_params['batch_size'] = FLAGS.eval_batch_size
          eval_estimator = tf.estimator.Estimator(
              model_fn=ssd_model.ssd_model_fn,
              model_dir=FLAGS.model_dir,
              config=run_config,
              params=eval_params)
        else:
          eval_estimator = tpu_estimator.TPUEstimator(
              model_fn=ssd_model.ssd_model_fn,
              use_tpu=FLAGS.use_tpu,
              train_batch_size=FLAGS.train_batch_size,
              predict_batch_size=FLAGS.eval_batch_size,
              config=run_config,
              params=params)
        predictions = list(
            eval_estimator.predict(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    use_fake_data=FLAGS.use_fake_data)))

      t = threading.Thread(
          target=coco_eval,
          args=(predictions, current_epoch, current_step, summary_writer))
      threads.append(t)
      t.start()

    if params['train_with_low_level_api']:
      trunner.shutdown()
    for t in threads:
      t.join()

    # 'success' is logged as a string because a boolean is not JSON
    # serializable.
    if not SUCCESS:
      mlperf_log.ssd_print(key=mlperf_log.RUN_STOP, value={'success': 'false'})
      mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
    summary_writer.close()

  elif FLAGS.mode == 'eval':
    if not params['eval_with_low_level_api']:
      if FLAGS.device == 'gpu':
        eval_params = dict(params)
        eval_params['batch_size'] = FLAGS.eval_batch_size
        eval_estimator = tf.estimator.Estimator(
            model_fn=ssd_model.ssd_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=eval_params)
      else:
        eval_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=params)

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    eval_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
    eval_epochs = [
        steps * ssd_constants.DEFAULT_BATCH_SIZE / FLAGS.train_batch_size //
        params['steps_per_epoch'] for steps in eval_steps
    ]
    # For 8x8 slices and above.
    if FLAGS.train_batch_size >= 4096:
      eval_epochs = [i * 2 for i in eval_epochs]
    tf.logging.info('Eval epochs: %s' % eval_epochs)

    # Run evaluation whenever there is a new checkpoint.
    threads = []
    count = 1
    for ckpt in next_checkpoint(FLAGS.model_dir):
      print('current count is {}\n'.format(count))
      count += 1
      if SUCCESS:
        break
      current_step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('current step: %s' % current_step)
      tf.logging.info('current epoch: %s' % current_epoch)
      if not params[
          'eval_every_checkpoint'] and current_epoch not in eval_epochs:
        continue

      tf.logging.info('Starting to evaluate.')
      try:
        mlperf_log.ssd_print(key=mlperf_log.EVAL_START, value=current_epoch)
        if params['eval_with_low_level_api']:
          predictions = list(erunner.predict(checkpoint_path=ckpt))
        else:
          predictions = list(
              eval_estimator.predict(
                  checkpoint_path=ckpt,
                  input_fn=dataloader.SSDInputReader(
                      FLAGS.validation_file_pattern,
                      is_training=False,
                      use_fake_data=FLAGS.use_fake_data)))
        t = threading.Thread(
            target=coco_eval,
            args=(predictions, current_epoch, current_step, summary_writer))
        threads.append(t)
        t.start()

        # Terminate the eval job when the final checkpoint is reached.
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

    for t in threads:
      t.join()
    if not SUCCESS:
      mlperf_log.ssd_print(key=mlperf_log.RUN_STOP, value={'success': 'false'})
      mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
    summary_writer.close()

  elif FLAGS.mode == 'eval_once':
    if not params['eval_with_low_level_api']:
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=ssd_model.ssd_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=params)

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    # Run evaluation whenever there is a new checkpoint.
    for ckpt in next_checkpoint(FLAGS.model_dir):
      current_step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = current_step // params['steps_per_epoch']
      print('current epoch: %s' % current_epoch)
      if FLAGS.eval_epoch < current_epoch:
        break
      if FLAGS.eval_epoch > current_epoch:
        continue

      tf.logging.info('Starting to evaluate.')
      try:
        mlperf_log.ssd_print(key=mlperf_log.EVAL_START, value=current_epoch)
        if params['eval_with_low_level_api']:
          predictions = list(erunner.predict(checkpoint_path=ckpt))
        else:
          predictions = list(
              eval_estimator.predict(
                  checkpoint_path=ckpt,
                  input_fn=dataloader.SSDInputReader(
                      FLAGS.validation_file_pattern,
                      is_training=False,
                      use_fake_data=FLAGS.use_fake_data)))
        coco_eval(predictions, current_epoch, current_step, summary_writer)

        # Terminate the eval job when the final checkpoint is reached.
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          if not SUCCESS:
            mlperf_log.ssd_print(key=mlperf_log.RUN_STOP,
                                 value={'success': 'false'})
            mlperf_log.ssd_print(key=mlperf_log.RUN_FINAL)
          print('Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        print('Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

    print('%d ending' % FLAGS.eval_epoch)
    summary_writer.close()
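# Sketch of the checkpoint-naming convention the eval loops above depend on:
# TensorFlow checkpoints are written as '<prefix>-<global_step>', so the
# global step is recovered from the basename (the path is assumed for the
# example).
#
#   ckpt = '/tmp/model_dir/model.ckpt-1234'
#   current_step = int(os.path.basename(ckpt).split('-')[1])  # -> 1234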
def main(argv):
  del argv  # Unused.
  global SUCCESS

  # Check data path
  if FLAGS.mode in (
      'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError(
        'You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval', 'eval_once'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  run_config, params = construct_run_config(FLAGS.iterations_per_loop)
  mlp_log.mlperf_print('global_batch_size', FLAGS.train_batch_size)
  mlp_log.mlperf_print('opt_base_learning_rate', params['base_learning_rate'])
  mlp_log.mlperf_print('opt_weight_decay', params['weight_decay'])
  mlp_log.mlperf_print(
      'model_bn_span', FLAGS.train_batch_size // FLAGS.num_shards *
      params['distributed_group_size'])

  if FLAGS.mode in ('eval', 'eval_once'):
    coco_gt = coco_metric.create_coco(
        FLAGS.val_json_file, use_cpp_extension=params['use_cocoeval_cc'])

  if FLAGS.mode == 'train_and_eval' and params[
      'in_memory_eval'] and FLAGS.train_batch_size != FLAGS.eval_batch_size:
    raise RuntimeError('Train batch size must equal eval batch size for '
                       'in-memory eval.')

  if FLAGS.mode != 'eval' and FLAGS.mode != 'eval_once' and not params[
      'in_memory_eval']:
    if params['train_with_low_level_api'] and not params['in_memory_eval']:
      params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
      input_partition_dims = FLAGS.input_partition_dims
      if input_partition_dims is not None and params['transpose_input']:
        if params['batch_size'] > 8:
          input_partition_dims = [
              input_partition_dims[i] for i in [1, 2, 3, 0]
          ]
        else:
          input_partition_dims = [
              input_partition_dims[i] for i in [1, 2, 0, 3]
          ]
      trunner = train_low_level_runner.TrainLowLevelRunner(
          input_partition_dims=[input_partition_dims, None]
          if FLAGS.input_partition_dims else None,
          num_cores_per_shard=int(np.prod(FLAGS.input_partition_dims))
          if FLAGS.input_partition_dims else 1,
          iterations=FLAGS.iterations_per_loop,
      )
      input_fn = dataloader.SSDInputReader(
          FLAGS.training_file_pattern,
          params['transpose_input'],
          is_training=True,
          use_fake_data=FLAGS.use_fake_data)
      trunner.initialize(input_fn, ssd_model.ssd_model_fn, params)

  if params[
      'eval_with_low_level_api'] and FLAGS.mode != 'train' and not params[
          'in_memory_eval']:
    params['batch_size'] = FLAGS.eval_batch_size // FLAGS.num_shards
    eval_steps = int(math.ceil(FLAGS.eval_samples / FLAGS.eval_batch_size))
    if params['distributed_eval']:
      erunner = dist_eval_low_level_runner.DistEvalLowLevelRunner(
          eval_steps=eval_steps)
    else:
      erunner = eval_low_level_runner.EvalLowLevelRunner(
          eval_steps=eval_steps)
    input_fn = dataloader.SSDInputReader(
        FLAGS.validation_file_pattern,
        is_training=False,
        use_fake_data=FLAGS.use_fake_data,
        distributed_eval=params['distributed_eval'],
        count=eval_steps * FLAGS.eval_batch_size)
    erunner.initialize(input_fn, params)
    erunner.build_model(ssd_model.ssd_model_fn, params)

  # TPU Estimator
  if FLAGS.mode == 'train':
    if params['train_with_low_level_api']:
      train_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        FLAGS.train_batch_size)
      trunner.train(train_steps)
      trunner.shutdown()
    else:
      if FLAGS.device == 'gpu':
        train_params = dict(params)
        train_params['batch_size'] = FLAGS.train_batch_size
        train_estimator = tf.estimator.Estimator(
            model_fn=ssd_model.ssd_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=train_params)
      else:
        train_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            config=run_config,
            params=params)
      tf.logging.info(params)

      hooks = []
      if FLAGS.use_async_checkpoint:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      train_estimator.train(
          input_fn=dataloader.SSDInputReader(
              FLAGS.training_file_pattern,
              params['transpose_input'],
              is_training=True,
              use_fake_data=FLAGS.use_fake_data),
          steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size),
          hooks=hooks)

    if FLAGS.eval_after_training:
      if params['eval_with_low_level_api']:
        predictions = list(erunner.predict())
      else:
        eval_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=params)
        predictions = list(
            eval_estimator.predict(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    use_fake_data=FLAGS.use_fake_data)))
      eval_results = coco_metric.compute_map(
          predictions,
          coco_gt,
          use_cpp_extension=params['use_cocoeval_cc'],
          nms_on_tpu=params['nms_on_tpu'])
      tf.logging.info('Eval results: %s' % eval_results)

  elif FLAGS.mode == 'train_and_eval':
    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    if params['in_memory_eval']:
      params['batch_size'] = FLAGS.train_batch_size // FLAGS.num_shards
      eval_steps = int(math.ceil(FLAGS.eval_samples / FLAGS.eval_batch_size))
      input_partition_dims = FLAGS.input_partition_dims
      if input_partition_dims is not None and params['transpose_input']:
        if params['batch_size'] > 8:
          input_partition_dims = [
              input_partition_dims[i] for i in [1, 2, 3, 0]
          ]
        else:
          input_partition_dims = [
              input_partition_dims[i] for i in [1, 2, 0, 3]
          ]
      runner = train_and_eval_low_level_runner.TrainAndEvalLowLevelRunner(
          iterations=FLAGS.iterations_per_loop,
          eval_steps=eval_steps,
          input_partition_dims=input_partition_dims
          if FLAGS.input_partition_dims else None,
          num_cores_per_shard=int(np.prod(FLAGS.input_partition_dims))
          if FLAGS.input_partition_dims else 1,
      )
      input_fn = dataloader.SSDInputReader(
          FLAGS.training_file_pattern,
          params['transpose_input'],
          is_training=True,
          use_fake_data=FLAGS.use_fake_data)
      # Init for eval.
      eval_input_fn = dataloader.SSDInputReader(
          FLAGS.validation_file_pattern,
          is_training=False,
          use_fake_data=FLAGS.use_fake_data,
          distributed_eval=True,
          count=eval_steps * FLAGS.eval_batch_size)
      runner.initialize(input_fn, eval_input_fn, ssd_model.ssd_model_fn,
                        params)
      train_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        FLAGS.train_batch_size)
      runner.train_and_eval(train_steps)
      runner.shutdown()
      return

    current_step = 0
    threads = []
    for eval_step in ssd_constants.EVAL_STEPS:
      # Compute the actual eval steps based on the actual train_batch_size.
      steps = int(eval_step * ssd_constants.DEFAULT_BATCH_SIZE /
                  FLAGS.train_batch_size)
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('Starting training cycle for %d steps.' % steps)
      if params['train_with_low_level_api']:
        trunner.train(steps, current_step)
      else:
        run_config, params = construct_run_config(steps)
        if FLAGS.device == 'gpu':
          train_params = dict(params)
          train_params['batch_size'] = FLAGS.train_batch_size
          train_estimator = tf.estimator.Estimator(
              model_fn=ssd_model.ssd_model_fn,
              model_dir=FLAGS.model_dir,
              config=run_config,
              params=train_params)
        else:
          train_estimator = tpu_estimator.TPUEstimator(
              model_fn=ssd_model.ssd_model_fn,
              use_tpu=FLAGS.use_tpu,
              train_batch_size=FLAGS.train_batch_size,
              config=run_config,
              params=params)
        tf.logging.info(params)
        train_estimator.train(
            input_fn=dataloader.SSDInputReader(
                FLAGS.training_file_pattern,
                params['transpose_input'],
                is_training=True,
                use_fake_data=FLAGS.use_fake_data),
            steps=steps)

      if SUCCESS:
        break

      current_step = current_step + steps
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('Starting evaluation cycle at step %d.' % current_step)
      # Run evaluation at the given step.
      if params['eval_with_low_level_api']:
        # TODO(b/123313070): Fix convergence discrepancy
        # for train and distributed eval on POD with low level API.
        predictions = list(erunner.predict())
      else:
        if FLAGS.device == 'gpu':
          eval_params = dict(params)
          eval_params['batch_size'] = FLAGS.eval_batch_size
          eval_estimator = tf.estimator.Estimator(
              model_fn=ssd_model.ssd_model_fn,
              model_dir=FLAGS.model_dir,
              config=run_config,
              params=eval_params)
        else:
          eval_estimator = tpu_estimator.TPUEstimator(
              model_fn=ssd_model.ssd_model_fn,
              use_tpu=FLAGS.use_tpu,
              train_batch_size=FLAGS.train_batch_size,
              predict_batch_size=FLAGS.eval_batch_size,
              config=run_config,
              params=params)
        predictions = list(
            eval_estimator.predict(
                input_fn=dataloader.SSDInputReader(
                    FLAGS.validation_file_pattern,
                    is_training=False,
                    use_fake_data=FLAGS.use_fake_data)))

      t = threading.Thread(
          target=coco_eval,
          args=(predictions, current_step, summary_writer, coco_gt,
                params['use_cocoeval_cc'], params['nms_on_tpu']))
      threads.append(t)
      t.start()

    if params['train_with_low_level_api']:
      trunner.shutdown()
    for t in threads:
      t.join()
    summary_writer.close()

  elif FLAGS.mode == 'eval':
    if not params['eval_with_low_level_api']:
      if FLAGS.device == 'gpu':
        eval_params = dict(params)
        eval_params['batch_size'] = FLAGS.eval_batch_size
        eval_estimator = tf.estimator.Estimator(
            model_fn=ssd_model.ssd_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=eval_params)
      else:
        eval_estimator = tpu_estimator.TPUEstimator(
            model_fn=ssd_model.ssd_model_fn,
            use_tpu=FLAGS.use_tpu,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.eval_batch_size,
            config=run_config,
            params=params)

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    eval_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
    eval_epochs = [
        steps * ssd_constants.DEFAULT_BATCH_SIZE / FLAGS.train_batch_size //
        params['steps_per_epoch'] for steps in eval_steps
    ]
    # For 8x8 slices and above.
    if FLAGS.train_batch_size >= 4096:
      eval_epochs = [i * 2 for i in eval_epochs]
    tf.logging.info('Eval epochs: %s' % eval_epochs)

    # Run evaluation whenever there is a new checkpoint.
    threads = []
    for ckpt in next_checkpoint(FLAGS.model_dir):
      if SUCCESS:
        break
      current_step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = current_step // params['steps_per_epoch']
      tf.logging.info('current epoch: %s' % current_epoch)
      if not params[
          'eval_every_checkpoint'] and current_epoch not in eval_epochs:
        continue

      tf.logging.info('Starting to evaluate.')
      try:
        if params['eval_with_low_level_api']:
          predictions = list(erunner.predict(checkpoint_path=ckpt))
        else:
          predictions = list(
              eval_estimator.predict(
                  checkpoint_path=ckpt,
                  input_fn=dataloader.SSDInputReader(
                      FLAGS.validation_file_pattern,
                      is_training=False,
                      use_fake_data=FLAGS.use_fake_data)))
        t = threading.Thread(
            target=coco_eval,
            args=(predictions, current_step, summary_writer, coco_gt))
        threads.append(t)
        t.start()

        # Terminate the eval job when the final checkpoint is reached.
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

    for t in threads:
      t.join()
    summary_writer.close()

  elif FLAGS.mode == 'eval_once':
    if not params['eval_with_low_level_api']:
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=ssd_model.ssd_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=params)

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)

    # Run evaluation whenever there is a new checkpoint.
    for ckpt in next_checkpoint(FLAGS.model_dir):
      current_step = int(os.path.basename(ckpt).split('-')[1])
      current_epoch = current_step // params['steps_per_epoch']
      print('current epoch: %s' % current_epoch)
      if FLAGS.eval_epoch < current_epoch:
        break
      if FLAGS.eval_epoch > current_epoch:
        continue

      tf.logging.info('Starting to evaluate.')
      try:
        if params['eval_with_low_level_api']:
          predictions = list(erunner.predict(checkpoint_path=ckpt))
        else:
          predictions = list(
              eval_estimator.predict(
                  checkpoint_path=ckpt,
                  input_fn=dataloader.SSDInputReader(
                      FLAGS.validation_file_pattern,
                      is_training=False,
                      use_fake_data=FLAGS.use_fake_data)))
        coco_eval(predictions, current_step, summary_writer, coco_gt,
                  params['use_cocoeval_cc'], params['nms_on_tpu'])

        # Terminate the eval job when the final checkpoint is reached.
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        print('Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

    print('%d ending' % FLAGS.eval_epoch)
    summary_writer.close()
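# Worked example of the eval-schedule rescaling above (all numbers assumed):
# ssd_constants.EVAL_STEPS is expressed in steps at DEFAULT_BATCH_SIZE, so a
# run at a different global batch size rescales each cumulative milestone by
# DEFAULT_BATCH_SIZE / train_batch_size before converting to an epoch. For
# instance, if DEFAULT_BATCH_SIZE were 256, a milestone of 40000 steps under
# train_batch_size = 1024 becomes 40000 * 256 / 1024 = 10000 steps, and
# 10000 // steps_per_epoch gives the epoch at which that eval fires; for
# batches >= 4096 (8x8 slices and above) the resulting epochs are doubled.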