def test_checkpoint_max_to_keep(self):
  """Test that only the most recent checkpoints are kept."""
  with mock.patch.object(
      model_builder, 'build', autospec=True) as mock_builder:
    mock_builder.return_value = SimpleModel()
    hparams = model_hparams.create_hparams(
        hparams_overrides='load_pretrained=false')
    pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
    config_kwarg_overrides = _get_config_kwarg_overrides()
    model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())

    model_lib_v2.train_loop(
        hparams, pipeline_config_path, model_dir=model_dir, train_steps=20,
        checkpoint_every_n=2, checkpoint_max_to_keep=3,
        **config_kwarg_overrides)
    ckpt_files = tf.io.gfile.glob(os.path.join(model_dir, 'ckpt-*.index'))
    self.assertEqual(len(ckpt_files), 3,
                     '{} not of length 3.'.format(ckpt_files))

def main(unused_argv):
  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  tf.config.set_soft_device_placement(True)

  if FLAGS.checkpoint_dir:
    model_lib_v2.eval_continuously(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples),
        checkpoint_dir=FLAGS.checkpoint_dir,
        wait_interval=300,
        timeout=FLAGS.eval_timeout)
  else:
    if FLAGS.use_tpu:
      resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
      tf.config.experimental_connect_to_cluster(resolver)
      tf.tpu.experimental.initialize_tpu_system(resolver)
      strategy = tf.distribute.experimental.TPUStrategy(resolver)
    elif FLAGS.num_workers > 1:
      strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    else:
      strategy = tf.compat.v2.distribute.MirroredStrategy()

    with strategy.scope():
      model_lib_v2.train_loop(
          pipeline_config_path=FLAGS.pipeline_config_path,
          model_dir=FLAGS.model_dir,
          train_steps=FLAGS.num_train_steps,
          use_tpu=FLAGS.use_tpu)

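# Note: the main() variants in this listing reference absl FLAGS that are
# defined at module level in the original scripts and are not shown here.
# Below is a minimal sketch of the flag definitions and entry point they
# assume. The flag names are taken from the FLAGS usages in the code above
# and below; the default values and help strings are illustrative
# assumptions, not the originals.
from absl import flags
import tensorflow.compat.v2 as tf

FLAGS = flags.FLAGS

flags.DEFINE_string('pipeline_config_path', None,
                    'Path to the pipeline.config file.')
flags.DEFINE_string('model_dir', None,
                    'Directory where checkpoints and summaries are written.')
flags.DEFINE_integer('num_train_steps', None, 'Number of training steps.')
flags.DEFINE_string('checkpoint_dir', None,
                    'If set, run evaluation on checkpoints in this directory '
                    'instead of training.')
flags.DEFINE_integer('sample_1_of_n_eval_examples', None,
                     'Sample one of every n eval examples.')
flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5,
                     'Sample one of every n train examples when evaluating '
                     'on training data.')
flags.DEFINE_integer('eval_timeout', 3600,
                     'Seconds to wait for a new checkpoint before giving up.')
flags.DEFINE_bool('eval_all_checkpoints', False,
                  'Evaluate every checkpoint in checkpoint_dir once.')
flags.DEFINE_bool('use_tpu', False, 'Whether to train on a TPU.')
flags.DEFINE_string('tpu_name', None, 'Name of the Cloud TPU to use.')
flags.DEFINE_integer('num_workers', 1,
                     'Number of workers for MultiWorkerMirroredStrategy.')
flags.DEFINE_integer('checkpoint_every_n', 1000,
                     'Write a checkpoint every n training steps.')
flags.DEFINE_bool('record_summaries', True,
                  'Whether to record TensorBoard summaries during training.')

if __name__ == '__main__':
  tf.compat.v1.app.run()
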
def test_train_loop_then_eval_loop(self):
  """Tests that Estimator and input function are constructed correctly."""
  hparams = model_hparams.create_hparams(
      hparams_overrides='load_pretrained=false')
  pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
  config_kwarg_overrides = _get_config_kwarg_overrides()
  model_dir = tf.test.get_temp_dir()

  train_steps = 2
  model_lib_v2.train_loop(
      hparams,
      pipeline_config_path,
      model_dir=model_dir,
      train_steps=train_steps,
      checkpoint_every_n=1,
      **config_kwarg_overrides)

  model_lib_v2.eval_continuously(
      hparams,
      pipeline_config_path,
      model_dir=model_dir,
      checkpoint_dir=model_dir,
      train_steps=train_steps,
      wait_interval=10,
      **config_kwarg_overrides)

def test_checkpoint_max_to_keep(self):
  """Test that only the most recent checkpoints are kept."""
  strategy = tf2.distribute.OneDeviceStrategy(device='/cpu:0')
  with mock.patch.object(
      model_builder, 'build', autospec=True) as mock_builder:
    with strategy.scope():
      mock_builder.return_value = SimpleModel()
      model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
      pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
      new_pipeline_config_path = os.path.join(model_dir, 'new_pipeline.config')
      config_util.clear_fine_tune_checkpoint(pipeline_config_path,
                                             new_pipeline_config_path)
      config_kwarg_overrides = _get_config_kwarg_overrides()

      with strategy.scope():
        model_lib_v2.train_loop(
            new_pipeline_config_path, model_dir=model_dir, train_steps=20,
            checkpoint_every_n=2, checkpoint_max_to_keep=3,
            **config_kwarg_overrides)
      ckpt_files = tf.io.gfile.glob(os.path.join(model_dir, 'ckpt-*.index'))
      self.assertEqual(len(ckpt_files), 3,
                       '{} not of length 3.'.format(ckpt_files))

def test_train_loop_then_eval_loop(self):
  """Tests that Estimator and input function are constructed correctly."""
  model_dir = tf.test.get_temp_dir()
  pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
  new_pipeline_config_path = os.path.join(model_dir, 'new_pipeline.config')
  config_util.clear_fine_tune_checkpoint(pipeline_config_path,
                                         new_pipeline_config_path)
  config_kwarg_overrides = _get_config_kwarg_overrides()

  train_steps = 2
  strategy = tf2.distribute.MirroredStrategy(['/cpu:0', '/cpu:1'])
  with strategy.scope():
    model_lib_v2.train_loop(
        new_pipeline_config_path,
        model_dir=model_dir,
        train_steps=train_steps,
        checkpoint_every_n=1,
        **config_kwarg_overrides)

  model_lib_v2.eval_continuously(
      new_pipeline_config_path,
      model_dir=model_dir,
      checkpoint_dir=model_dir,
      train_steps=train_steps,
      wait_interval=1,
      timeout=10,
      **config_kwarg_overrides)

def test_export_metrics_json_serializable(self):
  """Tests that Estimator and input function are constructed correctly."""
  strategy = tf2.distribute.OneDeviceStrategy(device='/cpu:0')

  def export(data, _):
    json.dumps(data)

  with mock.patch.dict(exporter_lib_v2.INPUT_BUILDER_UTIL_MAP,
                       FAKE_BUILDER_MAP):
    with strategy.scope():
      model_dir = tf.test.get_temp_dir()
      new_pipeline_config_path = os.path.join(model_dir,
                                              'new_pipeline.config')
      pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
      config_util.clear_fine_tune_checkpoint(pipeline_config_path,
                                             new_pipeline_config_path)
      train_steps = 2
      with strategy.scope():
        model_lib_v2.train_loop(
            new_pipeline_config_path,
            model_dir=model_dir,
            train_steps=train_steps,
            checkpoint_every_n=100,
            performance_summary_exporter=export,
            **_get_config_kwarg_overrides())

def test_checkpoint_max_to_keep(self):
  """Test that only the most recent checkpoints are kept."""
  strategy = tf2.distribute.OneDeviceStrategy(device='/cpu:0')
  with mock.patch.dict(exporter_lib_v2.INPUT_BUILDER_UTIL_MAP,
                       FAKE_BUILDER_MAP):
    model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
    new_pipeline_config_path = os.path.join(model_dir, 'new_pipeline.config')
    config_util.clear_fine_tune_checkpoint(pipeline_config_path,
                                           new_pipeline_config_path)
    config_kwarg_overrides = _get_config_kwarg_overrides()

    with strategy.scope():
      model_lib_v2.train_loop(
          new_pipeline_config_path, model_dir=model_dir, train_steps=20,
          checkpoint_every_n=2, checkpoint_max_to_keep=3,
          **config_kwarg_overrides)
    ckpt_files = tf.io.gfile.glob(os.path.join(model_dir, 'ckpt-*.index'))
    self.assertEqual(len(ckpt_files), 3,
                     '{} not of length 3.'.format(ckpt_files))

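# The test methods above come from an object_detection model_lib test module
# and rely on helpers defined there (SimpleModel, FAKE_BUILDER_MAP,
# get_pipeline_config_path, _get_config_kwarg_overrides, MODEL_NAME_FOR_TEST).
# A minimal sketch of the harness they assume -- a tf.test.TestCase subclass
# run via tf.test.main(); the class name here is illustrative, not from the
# original module.
import tensorflow.compat.v2 as tf


class ModelLibTest(tf.test.TestCase):
  """Hypothetical container for the test methods shown above."""
  # test_checkpoint_max_to_keep, test_train_loop_then_eval_loop, and
  # test_export_metrics_json_serializable would be defined on this class.


if __name__ == '__main__':
  tf.test.main()
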
def main(unused_argv):
  # Set the GPU (device:GPU:0).
  print("Num GPUs Available: ",
        len(tf.config.experimental.list_physical_devices('GPU')))
  gpus = tf.config.experimental.list_physical_devices('GPU')
  if gpus:
    # Restrict TensorFlow to only use the first GPU.
    try:
      tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
      tf.config.experimental.set_memory_growth(gpus[0], True)
      logical_gpus = tf.config.experimental.list_logical_devices('GPU')
      print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
      # Visible devices must be set before GPUs have been initialized.
      print(e)

  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  tf.config.set_soft_device_placement(True)

  if FLAGS.checkpoint_dir:
    model_lib_v2.eval_continuously(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples),
        checkpoint_dir=FLAGS.checkpoint_dir,
        wait_interval=300,
        timeout=FLAGS.eval_timeout)
  else:
    if FLAGS.use_tpu:
      # TPU is automatically inferred if tpu_name is None and
      # we are running under cloud ai-platform.
      resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name)
      tf.config.experimental_connect_to_cluster(resolver)
      tf.tpu.experimental.initialize_tpu_system(resolver)
      strategy = tf.distribute.experimental.TPUStrategy(resolver)
    # elif FLAGS.num_workers > 1:
    #   strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    # else:
    #   strategy = tf.compat.v2.distribute.MirroredStrategy()
    # with strategy.scope():
    model_lib_v2.train_loop(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        use_tpu=FLAGS.use_tpu,
        checkpoint_every_n=FLAGS.checkpoint_every_n,
        record_summaries=FLAGS.record_summaries)

def main(unused_argv):
  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  tf.config.set_soft_device_placement(True)

  if FLAGS.checkpoint_dir:
    if FLAGS.eval_all_checkpoints:
      model_lib_v2.eval_all_checkpoints(
          pipeline_config_path=FLAGS.pipeline_config_path,
          model_dir=FLAGS.model_dir,
          train_steps=FLAGS.num_train_steps,
          sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
          sample_1_of_n_eval_on_train_examples=(
              FLAGS.sample_1_of_n_eval_on_train_examples),
          checkpoint_dir=FLAGS.checkpoint_dir)
    else:
      model_lib_v2.eval_continuously(
          pipeline_config_path=FLAGS.pipeline_config_path,
          model_dir=FLAGS.model_dir,
          train_steps=FLAGS.num_train_steps,
          sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
          sample_1_of_n_eval_on_train_examples=(
              FLAGS.sample_1_of_n_eval_on_train_examples),
          checkpoint_dir=FLAGS.checkpoint_dir,
          wait_interval=300,
          timeout=FLAGS.eval_timeout)
  else:
    if FLAGS.use_tpu:
      # TPU is automatically inferred if tpu_name is None and
      # we are running under cloud ai-platform.
      resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name)
      tf.config.experimental_connect_to_cluster(resolver)
      tf.tpu.experimental.initialize_tpu_system(resolver)
      strategy = tf.distribute.experimental.TPUStrategy(resolver)
    elif FLAGS.num_workers > 1:
      strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    else:
      strategy = tf.compat.v2.distribute.MirroredStrategy()

    with strategy.scope():
      model_lib_v2.train_loop(
          pipeline_config_path=FLAGS.pipeline_config_path,
          model_dir=FLAGS.model_dir,
          train_steps=FLAGS.num_train_steps,
          use_tpu=FLAGS.use_tpu,
          checkpoint_every_n=FLAGS.checkpoint_every_n,
          record_summaries=FLAGS.record_summaries)

def train(self, steps_per_epoch, checkpoint_every_n_epochs=10):
  """Trains the model.

  Args:
    steps_per_epoch: Number of steps that are to be trained for one epoch.
    checkpoint_every_n_epochs: Epoch interval in which to save a checkpoint
      while training.
  """
  checkpoints_every_n_steps = steps_per_epoch * checkpoint_every_n_epochs
  strategy = tf.compat.v2.distribute.MirroredStrategy()
  with strategy.scope():
    model_lib_v2.train_loop(
        pipeline_config_path=self.config_path,
        model_dir=self.checkpoint_path,
        checkpoint_every_n=checkpoints_every_n_steps,
        checkpoint_max_to_keep=150,
        record_summaries=True)

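# Worked example of the epoch-to-step conversion performed by train() above:
# with 500 steps per epoch and a checkpoint every 10 epochs, train_loop is
# asked to write a checkpoint every 5000 steps. The numbers are illustrative.
steps_per_epoch = 500
checkpoint_every_n_epochs = 10
checkpoints_every_n_steps = steps_per_epoch * checkpoint_every_n_epochs
assert checkpoints_every_n_steps == 5000
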
def main(_):
  with open('system_dict.json') as json_file:
    args = json.load(json_file)

  tf.config.set_soft_device_placement(True)

  if args["checkpoint_dir"]:
    model_lib_v2.eval_continuously(
        pipeline_config_path=args["pipeline_config_path"],
        model_dir=args["model_dir"],
        train_steps=args["num_train_steps"],
        sample_1_of_n_eval_examples=args["sample_1_of_n_eval_examples"],
        sample_1_of_n_eval_on_train_examples=(
            args["sample_1_of_n_eval_on_train_examples"]),
        checkpoint_dir=args["checkpoint_dir"],
        wait_interval=300,
        timeout=args["eval_timeout"])
  else:
    if args["use_tpu"]:
      # TPU is automatically inferred if tpu_name is None and
      # we are running under cloud ai-platform.
      resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
          args["tpu_name"])
      tf.config.experimental_connect_to_cluster(resolver)
      tf.tpu.experimental.initialize_tpu_system(resolver)
      strategy = tf.distribute.experimental.TPUStrategy(resolver)
    elif args["num_workers"] > 1:
      strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    else:
      strategy = tf.compat.v2.distribute.MirroredStrategy()

    with strategy.scope():
      model_lib_v2.train_loop(
          pipeline_config_path=args["pipeline_config_path"],
          model_dir=args["model_dir"],
          train_steps=args["num_train_steps"],
          use_tpu=args["use_tpu"],
          checkpoint_every_n=args["checkpoint_every_n"],
          record_summaries=args["record_summaries"])

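# Sketch of a system_dict.json matching the keys read by main(_) above. The
# key names come directly from the lookups in that function; every value is
# an illustrative assumption. Written in Python (rather than raw JSON) for
# consistency with the rest of this listing.
import json

example_system_dict = {
    "pipeline_config_path": "pipeline.config",
    "model_dir": "training/",
    "num_train_steps": 25000,
    "sample_1_of_n_eval_examples": 1,
    "sample_1_of_n_eval_on_train_examples": 5,
    "checkpoint_dir": "",  # empty/falsy -> run training, not evaluation
    "eval_timeout": 3600,
    "use_tpu": False,
    "tpu_name": None,
    "num_workers": 1,
    "checkpoint_every_n": 1000,
    "record_summaries": True,
}

with open('system_dict.json', 'w') as json_file:
  json.dump(example_system_dict, json_file, indent=2)
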
def run(self):
  self._validate_pipeline_config()

  if self._memory_growth:
    gpus = tf.config.experimental.list_physical_devices("GPU")
    if gpus:
      try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
          tf.config.experimental.set_memory_growth(gpu, True)
      except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

  print("Running train loop...")
  strategy = tf.distribute.MirroredStrategy()
  with strategy.scope():
    model_lib_v2.train_loop(
        pipeline_config_path=self._pipeline_config_path,
        model_dir=self._training_loop_path,
        checkpoint_max_to_keep=None  # keep all checkpoints
    )

testboundingbox = decoded_tensors['groundtruth_boxes'].numpy()
# show_oneimage_category(testimage, testlabel, testboundingbox, IMAGE_SIZE)
# cv2.imwrite('result.jpg', resultimage)

cwd = os.getcwd()
# Print the current working directory
print("Current working directory: {0}".format(cwd))

# Start the training, ref:
# https://github.com/tensorflow/models/blob/master/research/object_detection/model_main_tf2.py
pipeline_config_path = '/Developer/MyRepo/WaymoObjectDetection/2DObject/tfobjectdetection/tf_ssdresnet50_1024_pipeline_P100.config'
model_dir = '/Developer/MyRepo/mymodels/tf_ssdresnet50_output'
num_train_steps = 150000
steps_per_sec_list = []
checkpoint_every_n = 1000

tf.config.set_soft_device_placement(True)
strategy = tf.compat.v2.distribute.MirroredStrategy()
with strategy.scope():
  # in: https://github.com/tensorflow/models/blob/master/research/object_detection/model_lib_v2.py
  model_lib_v2.train_loop(
      pipeline_config_path=pipeline_config_path,
      model_dir=model_dir,
      train_steps=num_train_steps,
      use_tpu=False,
      checkpoint_every_n=1000,
      record_summaries=True)

def main(unused_argv):
  if FLAGS.checkpoint_dir:
    print("\n-------Running evaluation")
  else:
    print("\n-------Running training!")

  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  tf.config.set_soft_device_placement(True)

  print(
      "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
  )
  gpus = tf.config.experimental.list_physical_devices('GPU')
  if gpus:
    try:
      # Currently, memory growth needs to be the same across GPUs
      for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
      logical_gpus = tf.config.experimental.list_logical_devices('GPU')
      print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
      # Memory growth must be set before GPUs have been initialized
      print(e)
  print(
      "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
  )

  if FLAGS.checkpoint_dir:
    model_lib_v2.eval_continuously(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples),
        checkpoint_dir=FLAGS.checkpoint_dir,
        wait_interval=300,
        timeout=FLAGS.eval_timeout)
  else:
    if FLAGS.use_tpu:
      # TPU is automatically inferred if tpu_name is None and
      # we are running under cloud ai-platform.
      resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name)
      tf.config.experimental_connect_to_cluster(resolver)
      tf.tpu.experimental.initialize_tpu_system(resolver)
      strategy = tf.distribute.experimental.TPUStrategy(resolver)
    elif FLAGS.num_workers > 1:
      strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    else:
      strategy = tf.compat.v2.distribute.MirroredStrategy()

    with strategy.scope():
      model_lib_v2.train_loop(
          pipeline_config_path=FLAGS.pipeline_config_path,
          model_dir=FLAGS.model_dir,
          save_final_config=True,
          train_steps=FLAGS.num_train_steps,
          use_tpu=FLAGS.use_tpu,
          checkpoint_every_n=FLAGS.checkpoint_every_n,
          record_summaries=FLAGS.record_summaries)