def testKeyValueOverrideBadKey(self):
  """Tests that overwriting with a bad key causes an exception."""
  configs = self._create_and_load_test_configs(
      pipeline_pb2.TrainEvalPipelineConfig())
  # A dotted key naming a field that does not exist must be rejected.
  bad_hparams = tf.contrib.training.HParams(
      **{"train_config.no_such_field": 10})
  with self.assertRaises(ValueError):
    config_util.merge_external_params_with_configs(configs, bad_hparams)
def testOverwriteBatchSizeWithBadValueType(self):
  """Tests that overwriting with a bad value type causes an exception."""
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_config.batch_size = 2
  configs = self._create_and_load_test_configs(pipeline_config)
  # Type should be an integer, but we're passing a string "10".
  hparams = tf.contrib.training.HParams(**{"train_config.batch_size": "10"})
  with self.assertRaises(TypeError):
    config_util.merge_external_params_with_configs(configs, hparams)
def _assertOptimizerWithNewLearningRate(self, optimizer_name):
  """Asserts successful updating of all learning rate schemes.

  Args:
    optimizer_name: Name of the optimizer field on `TrainConfig.optimizer`
      (e.g. "rms_prop_optimizer") whose learning rate schemes are exercised.
  """
  original_learning_rate = 0.7
  learning_rate_scaling = 0.1
  # BUGFIX: `tf.HParams` does not exist; use `tf.contrib.training.HParams`,
  # consistent with the other tests in this file.
  hparams = tf.contrib.training.HParams(learning_rate=0.15)
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")

  # Constant learning rate.
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  optimizer = getattr(pipeline_config.train_config.optimizer, optimizer_name)
  _update_optimizer_with_constant_learning_rate(optimizer,
                                                original_learning_rate)
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(configs, hparams)
  optimizer = getattr(configs["train_config"].optimizer, optimizer_name)
  constant_lr = optimizer.learning_rate.constant_learning_rate
  self.assertAlmostEqual(hparams.learning_rate, constant_lr.learning_rate)

  # Exponential decay learning rate.
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  optimizer = getattr(pipeline_config.train_config.optimizer, optimizer_name)
  _update_optimizer_with_exponential_decay_learning_rate(
      optimizer, original_learning_rate)
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(configs, hparams)
  optimizer = getattr(configs["train_config"].optimizer, optimizer_name)
  exponential_lr = optimizer.learning_rate.exponential_decay_learning_rate
  self.assertAlmostEqual(hparams.learning_rate,
                         exponential_lr.initial_learning_rate)

  # Manual step learning rate.
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  optimizer = getattr(pipeline_config.train_config.optimizer, optimizer_name)
  _update_optimizer_with_manual_step_learning_rate(
      optimizer, original_learning_rate, learning_rate_scaling)
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(configs, hparams)
  optimizer = getattr(configs["train_config"].optimizer, optimizer_name)
  manual_lr = optimizer.learning_rate.manual_step_learning_rate
  self.assertAlmostEqual(hparams.learning_rate,
                         manual_lr.initial_learning_rate)
  # Each scheduled rate scales geometrically from the new base rate.
  for i, schedule in enumerate(manual_lr.schedule):
    self.assertAlmostEqual(hparams.learning_rate * learning_rate_scaling**i,
                           schedule.learning_rate)
def testGenericConfigOverride(self):
  """Tests generic config overrides for all top-level configs."""
  # Seed each top-level pipeline config with a distinguishable value.
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.model.ssd.num_classes = 1
  pipeline_config.train_config.batch_size = 1
  pipeline_config.eval_config.num_visualizations = 1
  pipeline_config.train_input_reader.label_map_path = "/some/path"
  pipeline_config.eval_input_reader.add().label_map_path = "/some/path"
  pipeline_config.graph_rewriter.quantization.weight_bits = 1
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  _write_config(pipeline_config, config_path)

  # Override each seeded parameter via a dotted-key hyperparameter.
  configs = config_util.get_configs_from_pipeline_file(config_path)
  overrides = {
      "model.ssd.num_classes": 2,
      "train_config.batch_size": 2,
      "train_input_config.label_map_path": "/some/other/path",
      "eval_config.num_visualizations": 2,
      "graph_rewriter_config.quantization.weight_bits": 2
  }
  configs = config_util.merge_external_params_with_configs(
      configs, tf.contrib.training.HParams(**overrides))

  # Every override must be reflected in the merged configs.
  self.assertEqual(2, configs["model"].ssd.num_classes)
  self.assertEqual(2, configs["train_config"].batch_size)
  self.assertEqual("/some/other/path",
                   configs["train_input_config"].label_map_path)
  self.assertEqual(2, configs["eval_config"].num_visualizations)
  self.assertEqual(2,
                   configs["graph_rewriter_config"].quantization.weight_bits)
def testOverwriteBatchSizeWithKeyValue(self):
  """Tests that batch size is overwritten based on key/value."""
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_config.batch_size = 2
  configs = self._create_and_load_test_configs(pipeline_config)
  # Override the batch size through a dotted-key hyperparameter.
  override = tf.contrib.training.HParams(**{"train_config.batch_size": 10})
  merged = config_util.merge_external_params_with_configs(configs, override)
  self.assertEqual(10, merged["train_config"].batch_size)
def testUseMovingAverageForEval(self):
  """Tests the `eval_with_moving_averages` keyword override."""
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.eval_config.use_moving_averages = False
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  # The keyword argument flips the flag from its written value.
  configs = config_util.merge_external_params_with_configs(
      configs, eval_with_moving_averages=True)
  self.assertEqual(True, configs["eval_config"].use_moving_averages)
def _get_configs_for_model(model_name):
  """Returns configurations for model."""
  pipeline_file = get_pipeline_config_path(model_name)
  records_path = _get_data_path()
  labelmap = _get_labelmap_path()
  configs = config_util.get_configs_from_pipeline_file(pipeline_file)
  # Point both input readers at the test record file and label map.
  return config_util.merge_external_params_with_configs(
      configs,
      train_input_path=records_path,
      eval_input_path=records_path,
      label_map_path=labelmap)
def _get_configs_for_model(model_name):
  """Returns configurations for model."""
  pipeline_file = get_pipeline_config_path(model_name)
  records_path = _get_data_path()
  labelmap = _get_labelmap_path()
  configs = config_util.get_configs_from_pipeline_file(pipeline_file)
  # Route the overrides through kwargs_dict rather than keyword arguments.
  overrides = {
      'train_input_path': records_path,
      'eval_input_path': records_path,
      'label_map_path': labelmap
  }
  return config_util.merge_external_params_with_configs(
      configs, kwargs_dict=overrides)
def testNewBatchSizeWithClipping(self):
  """Tests that batch size is clipped to 1 from below."""
  hparams = tf.contrib.training.HParams(batch_size=0.5)
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_config.batch_size = 2
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  merged = config_util.merge_external_params_with_configs(configs, hparams)
  # A fractional batch size is clipped up to the minimum of 1.
  self.assertEqual(1, merged["train_config"].batch_size)
def _get_configs_for_model(model_name):
  """Returns configurations for model."""
  base_dir = tf.resource_loader.get_data_files_path()
  config_file = os.path.join(base_dir,
                             'samples/configs/' + model_name + '.config')
  labelmap = os.path.join(base_dir, 'data/pet_label_map.pbtxt')
  records = os.path.join(base_dir, 'test_data/pets_examples.record')
  configs = config_util.get_configs_from_pipeline_file(config_file)
  # Point both input readers at the test record file and label map.
  return config_util.merge_external_params_with_configs(
      configs,
      train_input_path=records,
      eval_input_path=records,
      label_map_path=labelmap)
def testNewBatchSize(self):
  """Tests that batch size is updated appropriately."""
  hparams = tf.contrib.training.HParams(batch_size=16)
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_config.batch_size = 2
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  merged = config_util.merge_external_params_with_configs(configs, hparams)
  self.assertEqual(16, merged["train_config"].batch_size)
def testTrainShuffle(self):
  """Tests that `train_shuffle` keyword arguments are applied correctly."""
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_input_reader.shuffle = True
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  # Flip the shuffle flag via the keyword override.
  configs = config_util.merge_external_params_with_configs(
      configs, train_shuffle=False)
  self.assertEqual(False, configs["train_input_config"].shuffle)
def testEvalShuffle(self):
  """Tests that `eval_shuffle` keyword arguments are applied correctly."""
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.eval_input_reader.add().shuffle = True
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  # Flip the shuffle flag via the kwargs_dict override.
  configs = config_util.merge_external_params_with_configs(
      configs, kwargs_dict={"eval_shuffle": False})
  self.assertEqual(False, configs["eval_input_configs"][0].shuffle)
def testOverwriteAllEvalNumEpochs(self):
  """Tests that `eval_num_epochs` is applied to every eval input config."""
  original_num_epochs = 10
  new_num_epochs = 1
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  # Two eval readers, both seeded with the original epoch count.
  pipeline_config.eval_input_reader.add().num_epochs = original_num_epochs
  pipeline_config.eval_input_reader.add().num_epochs = original_num_epochs
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  override_dict = {"eval_num_epochs": new_num_epochs}
  configs = config_util.merge_external_params_with_configs(
      configs, kwargs_dict=override_dict)
  # The single override must fan out to every eval input config.
  for eval_input_config in configs["eval_input_configs"]:
    self.assertEqual(new_num_epochs, eval_input_config.num_epochs)
def testNewMomentumOptimizerValue(self):
  """Tests that new momentum value is updated appropriately."""
  hparams = tf.contrib.training.HParams(momentum_optimizer_value=1.1)
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  rms_prop = pipeline_config.train_config.optimizer.rms_prop_optimizer
  rms_prop.momentum_optimizer_value = 0.4
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  merged = config_util.merge_external_params_with_configs(configs, hparams)
  updated = merged["train_config"].optimizer.rms_prop_optimizer
  # The requested 1.1 exceeds the valid range and is clipped to 1.0.
  self.assertAlmostEqual(1.0, updated.momentum_optimizer_value)
def testOverWriteRetainOriginalImages(self):
  """Tests that `retain_original_images_in_eval` is applied correctly."""
  original_retain_original_images = True
  desired_retain_original_images = False
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.eval_config.retain_original_images = (
      original_retain_original_images)
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  # Keyword override flips the flag from its written value.
  configs = config_util.merge_external_params_with_configs(
      configs, retain_original_images_in_eval=desired_retain_original_images)
  retain_original_images = configs["eval_config"].retain_original_images
  self.assertEqual(desired_retain_original_images, retain_original_images)
def testMergingKeywordArguments(self):
  """Tests that keyword arguments get merged as do hyperparameters."""
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_config.num_steps = 100
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  # `train_steps` in kwargs_dict maps onto `train_config.num_steps`.
  configs = config_util.merge_external_params_with_configs(
      configs, kwargs_dict={"train_steps": 10})
  self.assertEqual(10, configs["train_config"].num_steps)
def testNewTrainInputPathList(self):
  """Tests that train input path can be overwritten with multiple files."""
  replacement_paths = ["another/path/to/data", "yet/another/path/to/data"]
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_input_reader.tf_record_input_reader.input_path.extend(
      ["path/to/data"])
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  # A list-valued override replaces the whole repeated input_path field.
  configs = config_util.merge_external_params_with_configs(
      configs, train_input_path=replacement_paths)
  self.assertEqual(
      replacement_paths,
      configs["train_input_config"].tf_record_input_reader.input_path)
def testErrorOverwritingMultipleInputConfig(self):
  """Tests that an unnamed override of multiple eval inputs raises."""
  original_shuffle = False
  new_shuffle = True
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  # Two named eval readers; the override below does not say which to update.
  eval_1 = pipeline_config.eval_input_reader.add()
  eval_1.shuffle = original_shuffle
  eval_1.name = "eval_1"
  eval_2 = pipeline_config.eval_input_reader.add()
  eval_2.shuffle = original_shuffle
  eval_2.name = "eval_2"
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  override_dict = {"eval_shuffle": new_shuffle}
  with self.assertRaises(ValueError):
    configs = config_util.merge_external_params_with_configs(
        configs, kwargs_dict=override_dict)
def testNewTrainInputPath(self):
  """Tests that train input path can be overwritten with single file."""
  replacement_path = "another/path/to/data"
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_input_reader.tf_record_input_reader.input_path.extend(
      ["path/to/data"])
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  configs = config_util.merge_external_params_with_configs(
      configs, kwargs_dict={"train_input_path": replacement_path})
  # A scalar path replaces the repeated field with a one-element list.
  self.assertEqual(
      [replacement_path],
      configs["train_input_config"].tf_record_input_reader.input_path)
def testNewMaskType(self):
  """Tests that mask type can be overwritten in input readers."""
  original_mask_type = input_reader_pb2.NUMERICAL_MASKS
  new_mask_type = input_reader_pb2.PNG_MASKS
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  train_input_reader = pipeline_config.train_input_reader
  train_input_reader.mask_type = original_mask_type
  # NOTE(review): `eval_input_reader` is accessed as a singular message here,
  # while other tests in this file call `.add()` on it — confirm this matches
  # the pipeline proto version in use.
  eval_input_reader = pipeline_config.eval_input_reader
  eval_input_reader.mask_type = original_mask_type
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(
      configs, mask_type=new_mask_type)
  self.assertEqual(new_mask_type, configs["train_input_config"].mask_type)
  self.assertEqual(new_mask_type, configs["eval_input_config"].mask_type)
def _get_configs_for_model(model_name):
  """Returns configurations for model."""
  # Paths are resolved relative to the google3 test source tree root.
  fname = os.path.join(
      FLAGS.test_srcdir,
      ('google3/third_party/tensorflow_models/'
       'object_detection/samples/configs/' + model_name + '.config'))
  label_map_path = os.path.join(FLAGS.test_srcdir,
                                ('google3/third_party/tensorflow_models/'
                                 'object_detection/data/pet_label_map.pbtxt'))
  data_path = os.path.join(FLAGS.test_srcdir,
                           ('google3/third_party/tensorflow_models/'
                            'object_detection/test_data/pets_examples.record'))
  configs = config_util.get_configs_from_pipeline_file(fname)
  # Point both input readers at the test record file and label map.
  return config_util.merge_external_params_with_configs(
      configs,
      train_input_path=data_path,
      eval_input_path=data_path,
      label_map_path=label_map_path)
def testOverwriteAllEvalSampling(self):
  """Tests that `sample_1_of_n_eval_examples` fans out to all eval inputs."""
  original_num_eval_examples = 1
  new_num_eval_examples = 10
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  # Two eval readers, both seeded with the original sampling rate.
  pipeline_config.eval_input_reader.add().sample_1_of_n_examples = (
      original_num_eval_examples)
  pipeline_config.eval_input_reader.add().sample_1_of_n_examples = (
      original_num_eval_examples)
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  override_dict = {"sample_1_of_n_eval_examples": new_num_eval_examples}
  configs = config_util.merge_external_params_with_configs(
      configs, kwargs_dict=override_dict)
  # The single override must reach every eval input config.
  for eval_input_config in configs["eval_input_configs"]:
    self.assertEqual(new_num_eval_examples,
                     eval_input_config.sample_1_of_n_examples)
def testDontOverwriteEmptyLabelMapPath(self):
  """Tests that label map path will not be overwritten with empty string."""
  original_path = "path/to/original/label_map"
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_input_reader.label_map_path = original_path
  pipeline_config.eval_input_reader.label_map_path = original_path
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  # An empty override is ignored; the original paths survive the merge.
  configs = config_util.merge_external_params_with_configs(
      configs, label_map_path="")
  self.assertEqual(original_path,
                   configs["train_input_config"].label_map_path)
  self.assertEqual(original_path,
                   configs["eval_input_config"].label_map_path)
def testNewLabelMapPath(self):
  """Tests that label map path can be overwritten in input readers."""
  seed_path = "path/to/original/label_map"
  replacement_path = "path//to/new/label_map"
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_input_reader.label_map_path = seed_path
  pipeline_config.eval_input_reader.add().label_map_path = seed_path
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  configs = config_util.merge_external_params_with_configs(
      configs, kwargs_dict={"label_map_path": replacement_path})
  # The override applies to the train reader and every eval reader.
  self.assertEqual(replacement_path,
                   configs["train_input_config"].label_map_path)
  for eval_input_config in configs["eval_input_configs"]:
    self.assertEqual(replacement_path, eval_input_config.label_map_path)
def testNewClassificationLocalizationWeightRatio(self):
  """Tests that the loss weight ratio is updated appropriately."""
  requested_ratio = 5.0
  hparams = tf.contrib.training.HParams(
      classification_localization_weight_ratio=requested_ratio)
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.model.ssd.loss.localization_weight = 0.1
  pipeline_config.model.ssd.loss.classification_weight = 0.2
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  merged = config_util.merge_external_params_with_configs(configs, hparams)
  loss = merged["model"].ssd.loss
  # Localization weight normalizes to 1.0 and classification weight takes on
  # the requested ratio, regardless of the seeded values.
  self.assertAlmostEqual(1.0, loss.localization_weight)
  self.assertAlmostEqual(requested_ratio, loss.classification_weight)
def testDontOverwriteEmptyLabelMapPath(self):
  """Tests that label map path will not be overwritten with empty string."""
  original_label_map_path = "path/to/original/label_map"
  new_label_map_path = ""
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  train_input_reader = pipeline_config.train_input_reader
  train_input_reader.label_map_path = original_label_map_path
  eval_input_reader = pipeline_config.eval_input_reader
  eval_input_reader.label_map_path = original_label_map_path
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(
      pipeline_config_path)
  # The empty override must be ignored; the original paths survive the merge.
  configs = config_util.merge_external_params_with_configs(
      configs, label_map_path=new_label_map_path)
  self.assertEqual(original_label_map_path,
                   configs["train_input_config"].label_map_path)
  self.assertEqual(original_label_map_path,
                   configs["eval_input_config"].label_map_path)
def testOverWriteRetainOriginalImages(self):
  """Tests that `retain_original_images_in_eval` is applied correctly."""
  original_retain_original_images = True
  desired_retain_original_images = False
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.eval_config.retain_original_images = (
      original_retain_original_images)
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(
      pipeline_config_path)
  # kwargs_dict override flips the flag from its written value.
  override_dict = {
      "retain_original_images_in_eval": desired_retain_original_images
  }
  configs = config_util.merge_external_params_with_configs(
      configs, kwargs_dict=override_dict)
  retain_original_images = configs["eval_config"].retain_original_images
  self.assertEqual(desired_retain_original_images, retain_original_images)
def testNewFocalLossParameters(self):
  """Tests that focal loss alpha/gamma are updated appropriately."""
  original_alpha = 1.0
  original_gamma = 1.0
  new_alpha = 0.3
  new_gamma = 2.0
  # BUGFIX: `tf.HParams` does not exist; use `tf.contrib.training.HParams`,
  # consistent with the other tests in this file.
  hparams = tf.contrib.training.HParams(
      focal_loss_alpha=new_alpha, focal_loss_gamma=new_gamma)
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  classification_loss = pipeline_config.model.ssd.loss.classification_loss
  classification_loss.weighted_sigmoid_focal.alpha = original_alpha
  classification_loss.weighted_sigmoid_focal.gamma = original_gamma
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(configs, hparams)
  classification_loss = configs["model"].ssd.loss.classification_loss
  self.assertAlmostEqual(new_alpha,
                         classification_loss.weighted_sigmoid_focal.alpha)
  self.assertAlmostEqual(new_gamma,
                         classification_loss.weighted_sigmoid_focal.gamma)
def testMergingKeywordArguments(self):
  """Tests that keyword arguments get merged as do hyperparameters."""
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_config.num_steps = 100
  pipeline_config.eval_config.num_examples = 5
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  # `train_steps`/`eval_steps` map onto num_steps/num_examples respectively.
  configs = config_util.merge_external_params_with_configs(
      configs, train_steps=10, eval_steps=1)
  self.assertEqual(10, configs["train_config"].num_steps)
  self.assertEqual(1, configs["eval_config"].num_examples)
def testNewBatchSize(self):
  """Tests that batch size is updated appropriately."""
  # Removed leftover debug print statements: they only cluttered the test
  # log and had no effect on the assertions.
  original_batch_size = 2
  hparams = tf.contrib.training.HParams(batch_size=16)
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_config.batch_size = original_batch_size
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(configs, hparams)
  new_batch_size = configs["train_config"].batch_size
  self.assertEqual(16, new_batch_size)
def _set_config_paths(self):
  """Rewrites input, label-map and checkpoint paths in the pipeline proto."""
  configs = create_configs_from_pipeline_proto(self.pipeline)
  # NOTE(review): the train input is pointed at the *val* record file and the
  # eval inputs at the *train* record file — this looks swapped; confirm
  # whether it is intentional before relying on it.
  update_input_reader_config(configs,
                             key_name="train_input_config",
                             input_name=None,
                             field_name="input_path",
                             value=str(self._val_record_file),
                             path_updater=_update_tf_record_input_path)
  update_input_reader_config(configs,
                             key_name="eval_input_configs",
                             input_name=None,
                             field_name="input_path",
                             value=str(self._train_record_file),
                             path_updater=_update_tf_record_input_path)
  # Remaining overrides go through the generic merge helper.
  update_dict = {
      "label_map_path": str(self._labels_map_file),
      "train_config.fine_tune_checkpoint":
          str(self._checkpoint_model_folder.joinpath("model.ckpt"))
  }
  configs = merge_external_params_with_configs(configs,
                                               kwargs_dict=update_dict)
  self._pipeline = create_pipeline_proto_from_configs(configs)
def testNewMaskType(self):
  """Tests that mask type can be overwritten in input readers."""
  seed_mask_type = input_reader_pb2.NUMERICAL_MASKS
  new_mask_type = input_reader_pb2.PNG_MASKS
  config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.train_input_reader.mask_type = seed_mask_type
  pipeline_config.eval_input_reader.add().mask_type = seed_mask_type
  _write_config(pipeline_config, config_path)
  configs = config_util.get_configs_from_pipeline_file(config_path)
  configs = config_util.merge_external_params_with_configs(
      configs, kwargs_dict={"mask_type": new_mask_type})
  # Both readers pick up the new mask type.
  self.assertEqual(new_mask_type, configs["train_input_config"].mask_type)
  self.assertEqual(new_mask_type, configs["eval_input_configs"][0].mask_type)
def testGenericConfigOverride(self): """Tests generic config overrides for all top-level configs.""" # Set one parameter for each of the top-level pipeline configs: pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() pipeline_config.model.ssd.num_classes = 1 pipeline_config.train_config.batch_size = 1 pipeline_config.eval_config.num_visualizations = 1 pipeline_config.train_input_reader.label_map_path = "/some/path" pipeline_config.eval_input_reader.add().label_map_path = "/some/path" pipeline_config.graph_rewriter.quantization.weight_bits = 1 pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config") _write_config(pipeline_config, pipeline_config_path) # Override each of the parameters: configs = config_util.get_configs_from_pipeline_file( pipeline_config_path) hparams = tf.contrib.training.HParams( **{ "model.ssd.num_classes": 2, "train_config.batch_size": 2, "train_input_config.label_map_path": "/some/other/path", "eval_config.num_visualizations": 2, "graph_rewriter_config.quantization.weight_bits": 2 }) configs = config_util.merge_external_params_with_configs( configs, hparams) # Ensure that the parameters have the overridden values: self.assertEqual(2, configs["model"].ssd.num_classes) self.assertEqual(2, configs["train_config"].batch_size) self.assertEqual("/some/other/path", configs["train_input_config"].label_map_path) self.assertEqual(2, configs["eval_config"].num_visualizations) self.assertEqual( 2, configs["graph_rewriter_config"].quantization.weight_bits)
def testUpdateMaskTypeForAllInputConfigs(self):
  """Tests that `mask_type` is applied to the train and all eval inputs."""
  original_mask_type = input_reader_pb2.NUMERICAL_MASKS
  new_mask_type = input_reader_pb2.PNG_MASKS
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  train_config = pipeline_config.train_input_reader
  train_config.mask_type = original_mask_type
  # Two named eval readers, both seeded with the original mask type.
  eval_1 = pipeline_config.eval_input_reader.add()
  eval_1.mask_type = original_mask_type
  eval_1.name = "eval_1"
  eval_2 = pipeline_config.eval_input_reader.add()
  eval_2.mask_type = original_mask_type
  eval_2.name = "eval_2"
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  override_dict = {"mask_type": new_mask_type}
  configs = config_util.merge_external_params_with_configs(
      configs, kwargs_dict=override_dict)
  # The single override must reach the train reader and every eval reader.
  self.assertEqual(configs["train_input_config"].mask_type, new_mask_type)
  for eval_input_config in configs["eval_input_configs"]:
    self.assertEqual(eval_input_config.mask_type, new_mask_type)
def testNewLabelMapPath(self):
  """Tests that label map path can be overwritten in input readers."""
  original_label_map_path = "path/to/original/label_map"
  new_label_map_path = "path//to/new/label_map"
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  train_input_reader = pipeline_config.train_input_reader
  train_input_reader.label_map_path = original_label_map_path
  eval_input_reader = pipeline_config.eval_input_reader.add()
  eval_input_reader.label_map_path = original_label_map_path
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(
      pipeline_config_path)
  override_dict = {"label_map_path": new_label_map_path}
  configs = config_util.merge_external_params_with_configs(
      configs, kwargs_dict=override_dict)
  # The single override fans out to the train reader and every eval reader.
  self.assertEqual(new_label_map_path,
                   configs["train_input_config"].label_map_path)
  for eval_input_config in configs["eval_input_configs"]:
    self.assertEqual(new_label_map_path, eval_input_config.label_map_path)
def testNewClassificationLocalizationWeightRatio(self):
  """Tests that the loss weight ratio is updated appropriately."""
  original_localization_weight = 0.1
  original_classification_weight = 0.2
  new_weight_ratio = 5.0
  hparams = tf.contrib.training.HParams(
      classification_localization_weight_ratio=new_weight_ratio)
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  pipeline_config.model.ssd.loss.localization_weight = (
      original_localization_weight)
  pipeline_config.model.ssd.loss.classification_weight = (
      original_classification_weight)
  _write_config(pipeline_config, pipeline_config_path)
  configs = config_util.get_configs_from_pipeline_file(
      pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(
      configs, hparams)
  loss = configs["model"].ssd.loss
  # Localization weight is normalized to 1.0 and the classification weight
  # takes on the requested ratio, regardless of the seeded values.
  self.assertAlmostEqual(1.0, loss.localization_weight)
  self.assertAlmostEqual(new_weight_ratio, loss.classification_weight)
def create_estimator(run_config,
                     hparams,
                     pipeline_config_path,
                     train_steps=None,
                     eval_steps=None,
                     train_batch_size=None,
                     model_fn_creator=model.create_model_fn,
                     use_tpu=False,
                     num_shards=1,
                     params=None,
                     **kwargs):
  """Builds a `TPUEstimator` plus its input functions and step counts.

  Args:
    run_config: A `RunConfig`.
    hparams: A `HParams`.
    pipeline_config_path: A path to a pipeline config file.
    train_steps: Number of training steps. Falls back to the `TrainConfig`
      proto when None.
    eval_steps: Number of evaluation steps per evaluation cycle. Falls back
      to the `EvalConfig` proto when None.
    train_batch_size: Training batch size. Falls back to the `TrainConfig`
      proto when None.
    model_fn_creator: A function that creates a `model_fn` for the estimator.
      Called as `model_fn_creator(detection_model_fn, configs, hparams,
      use_tpu)` and expected to return a `model_fn`.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    num_shards: Number of shards (TPU cores).
    params: Parameter dictionary passed from the estimator. Defaults to {}.
    **kwargs: Additional keyword arguments for configuration override.

  Returns:
    A tuple of:
      estimator: An estimator object used for training and evaluation.
      train_input_fn: Input function for the training loop.
      eval_input_fn: Input function for the evaluation run.
      train_steps: Number of training steps, from the argument or the
        `TrainConfig` proto.
      eval_steps: Number of evaluation steps, from the argument or the
        `EvalConfig` proto.
  """
  # Load the pipeline config and fold in every caller-supplied override.
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(
      configs,
      hparams,
      train_steps=train_steps,
      eval_steps=eval_steps,
      batch_size=train_batch_size,
      **kwargs)

  model_config = configs['model']
  train_config = configs['train_config']
  eval_config = configs['eval_config']

  if params is None:
    params = {}

  # Zero in the proto means "unset": keep None in that case.
  if train_steps is None:
    train_steps = train_config.num_steps or None
  if eval_steps is None:
    eval_steps = eval_config.num_examples or None

  detection_model_fn = functools.partial(
      model_builder.build, model_config=model_config)

  # Input functions for TRAIN/EVAL.
  train_input_fn = inputs.create_train_input_fn(
      train_config=train_config,
      train_input_config=configs['train_input_config'],
      model_config=model_config)
  eval_input_fn = inputs.create_eval_input_fn(
      eval_config=eval_config,
      eval_input_config=configs['eval_input_config'],
      model_config=model_config)

  # For each core, only batch size 1 is supported for eval.
  eval_batch_size = num_shards if use_tpu else 1

  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn_creator(detection_model_fn, configs, hparams, use_tpu),
      train_batch_size=train_config.batch_size,
      eval_batch_size=eval_batch_size,
      use_tpu=use_tpu,
      config=run_config,
      params=params)
  return estimator, train_input_fn, eval_input_fn, train_steps, eval_steps
def create_estimator(run_config,
                     hparams,
                     pipeline_config_path,
                     train_steps=None,
                     eval_steps=None,
                     train_batch_size=None,
                     model_fn_creator=model.create_model_fn,
                     use_tpu=False,
                     num_shards=1,
                     params=None,
                     **kwargs):
  """Creates an `Estimator` object.

  Args:
    run_config: A `RunConfig`.
    hparams: A `HParams`.
    pipeline_config_path: A path to a pipeline config file.
    train_steps: Number of training steps. If None, the number of training
      steps is set from the `TrainConfig` proto.
    eval_steps: Number of evaluation steps per evaluation cycle. If None, the
      number of evaluation steps is set from the `EvalConfig` proto.
    train_batch_size: Training batch size. If None, use batch size from
      `TrainConfig` proto.
    model_fn_creator: A function that creates a `model_fn` for `Estimator`.
      Follows the signature:
      * Args:
        * `detection_model_fn`: Function that returns `DetectionModel`
          instance.
        * `configs`: Dictionary of pipeline config objects.
        * `hparams`: `HParams` object.
      * Returns:
        `model_fn` for `Estimator`.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    num_shards: Number of shards (TPU cores).
    params: Parameter dictionary passed from the estimator.
    **kwargs: Additional keyword arguments for configuration override.

  Returns:
    Estimator: An estimator object used for training and evaluation.
    train_input_fn: Input function for the training loop.
    eval_validation_input_fn: Input function to run for evaluation on
      validation data.
    eval_training_input_fn: Input function to run for evaluation on training
      data.
    train_steps: Number of training steps either from arg `train_steps` or
      `TrainConfig` proto.
    eval_steps: Number of evaluation steps either from arg `eval_steps` or
      `EvalConfig` proto.
  """
  # Load the pipeline config and fold in caller-supplied overrides.
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(
      configs,
      hparams,
      train_steps=train_steps,
      eval_steps=eval_steps,
      batch_size=train_batch_size,
      **kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']
  eval_config = configs['eval_config']
  eval_input_config = configs['eval_input_config']
  # NOTE(review): this reads the module-level `FLAGS` object inside a library
  # function, hiding a dependency from the signature — presumably the
  # `eval_training_data` flag redirects validation eval onto the training
  # split; confirm against the defining module before relying on it.
  if FLAGS.eval_training_data:
    eval_input_config = configs['train_input_config']
  if params is None:
    params = {}
  # Zero in the protos means "unset", so only adopt non-zero proto values.
  if train_steps is None and train_config.num_steps:
    train_steps = train_config.num_steps
  if eval_steps is None and eval_config.num_examples:
    eval_steps = eval_config.num_examples
  detection_model_fn = functools.partial(
      model_builder.build, model_config=model_config)
  # Create the input functions for TRAIN/EVAL.
  train_input_fn = inputs.create_train_input_fn(
      train_config=train_config,
      train_input_config=train_input_config,
      model_config=model_config)
  # Note: when FLAGS.eval_training_data is set, `eval_input_config` above was
  # replaced, so this "validation" eval also runs on the training input.
  eval_validation_input_fn = inputs.create_eval_input_fn(
      eval_config=eval_config,
      eval_input_config=eval_input_config,
      model_config=model_config)
  eval_training_input_fn = inputs.create_eval_input_fn(
      eval_config=eval_config,
      eval_input_config=train_input_config,
      model_config=model_config)
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn_creator(detection_model_fn, configs, hparams, use_tpu),
      train_batch_size=train_config.batch_size,
      # For each core, only batch size 1 is supported for eval.
      eval_batch_size=num_shards * 1 if use_tpu else 1,
      use_tpu=use_tpu,
      config=run_config,
      params=params)
  return (estimator, train_input_fn, eval_validation_input_fn,
          eval_training_input_fn, train_steps, eval_steps)
def _assertOptimizerWithNewLearningRate(self, optimizer_name):
  """Asserts successful updating of all learning rate schemes.

  The original implementation repeated the same build-config / write /
  reload / merge boilerplate four times (once per learning-rate schedule);
  that boilerplate is factored into a local helper.

  Args:
    optimizer_name: Name of the optimizer field on
      `train_config.optimizer` (e.g. "adam_optimizer").
  """
  original_learning_rate = 0.7
  learning_rate_scaling = 0.1
  warmup_learning_rate = 0.07
  hparams = tf.contrib.training.HParams(learning_rate=0.15)
  pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")

  def _merged_optimizer(update_fn, *update_args):
    # Builds a fresh pipeline config, installs a learning-rate schedule via
    # `update_fn`, round-trips it through disk, merges `hparams`, and
    # returns the resulting optimizer proto.
    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    optimizer = getattr(pipeline_config.train_config.optimizer,
                        optimizer_name)
    update_fn(optimizer, *update_args)
    _write_config(pipeline_config, pipeline_config_path)
    configs = config_util.get_configs_from_pipeline_file(
        pipeline_config_path)
    configs = config_util.merge_external_params_with_configs(
        configs, hparams)
    return getattr(configs["train_config"].optimizer, optimizer_name)

  # Constant learning rate.
  optimizer = _merged_optimizer(
      _update_optimizer_with_constant_learning_rate, original_learning_rate)
  constant_lr = optimizer.learning_rate.constant_learning_rate
  self.assertAlmostEqual(hparams.learning_rate, constant_lr.learning_rate)

  # Exponential decay learning rate.
  optimizer = _merged_optimizer(
      _update_optimizer_with_exponential_decay_learning_rate,
      original_learning_rate)
  exponential_lr = optimizer.learning_rate.exponential_decay_learning_rate
  self.assertAlmostEqual(hparams.learning_rate,
                         exponential_lr.initial_learning_rate)

  # Manual step learning rate: every schedule entry is rescaled relative to
  # the new initial rate.
  optimizer = _merged_optimizer(
      _update_optimizer_with_manual_step_learning_rate,
      original_learning_rate, learning_rate_scaling)
  manual_lr = optimizer.learning_rate.manual_step_learning_rate
  self.assertAlmostEqual(hparams.learning_rate,
                         manual_lr.initial_learning_rate)
  for i, schedule in enumerate(manual_lr.schedule):
    self.assertAlmostEqual(hparams.learning_rate * learning_rate_scaling**i,
                           schedule.learning_rate)

  # Cosine decay learning rate: the warmup rate keeps its original ratio to
  # the base rate.
  optimizer = _merged_optimizer(
      _update_optimizer_with_cosine_decay_learning_rate,
      original_learning_rate, warmup_learning_rate)
  cosine_lr = optimizer.learning_rate.cosine_decay_learning_rate
  self.assertAlmostEqual(hparams.learning_rate, cosine_lr.learning_rate_base)
  warmup_scale_factor = warmup_learning_rate / original_learning_rate
  self.assertAlmostEqual(hparams.learning_rate * warmup_scale_factor,
                         cosine_lr.warmup_learning_rate)
def train_loop(config_path: str,
               model_dir: str,
               config_override: Optional[
                   pipeline_pb2.TrainEvalPipelineConfig] = None,
               train_steps: Optional[int] = None,
               use_tpu: bool = False,
               save_final_config: bool = False,
               log_every_n: int = 100,
               ckpt_every_n: int = 1000,
               ckpt_max_to_keep: int = 7,
               record_summaries: bool = True,
               **kwargs) -> None:
  """Trains a model using eager + functions.

  This method:
    1. Processes the pipeline configs
    2. (Optionally) saves the as-run config
    3. Builds the model & optimizer
    4. Gets the training input data
    5. Loads a fine-tuning detection or classification checkpoint if requested
    6. Loops over the train data, executing distributed training steps inside
       tf.functions.
    7. Checkpoints the model every `ckpt_every_n` training steps.
    8. Logs the training metrics as TensorBoard summaries.

  Args:
    config_path: A path to a pipeline config file.
    model_dir: The directory to save checkpoints and summaries to.
    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
      override the config from `config_path`.
    train_steps: Number of training steps. If None, training steps from
      `TrainConfig` proto will be adopted.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    save_final_config: Whether to save final config (obtained after applying
      overrides) to `model_dir`.
    log_every_n: Log total loss every n training steps.
    ckpt_every_n: Checkpoint every n training steps.
    ckpt_max_to_keep: int, the number of most recent checkpoints to keep in
      the model directory.
    record_summaries: Boolean, whether or not to record summaries.
    **kwargs: Additional keyword arguments for configuration override.
  """
  # parse config, folding `train_steps`/`use_bfloat16` into the overrides
  configs = config_util.get_configs_from_pipeline_file(
      config_path, config_override=config_override)
  kwargs.update({
      'train_steps': train_steps,
      'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu,
  })
  configs = config_util.merge_external_params_with_configs(
      configs, None, kwargs_dict=kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  unpad_gt_tensors = train_config.unpad_groundtruth_tensors
  add_regularization_loss = train_config.add_regularization_loss
  # Zero in the proto means "no clipping".
  clip_gradient_norm = None
  if train_config.gradient_clipping_by_norm > 0:
    clip_gradient_norm = train_config.gradient_clipping_by_norm

  if kwargs['use_bfloat16']:
    tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')

  if train_config.load_all_detection_checkpoint_vars:
    raise ValueError(
        'train_pb2.load_all_detection_checkpoint_vars unsupported in TF2')

  # base checkpoint to fine-tune from
  config_util.update_fine_tune_checkpoint_type(train_config)
  base_ckpt = train_config.fine_tune_checkpoint
  base_ckpt_type = train_config.fine_tune_checkpoint_type
  base_ckpt_ver = train_config.fine_tune_checkpoint_version

  # write the as-run pipeline config to disk
  if save_final_config:
    pipeline_config_final = config_util.create_pipeline_proto_from_configs(
        configs)
    config_util.save_pipeline_config(pipeline_config_final, model_dir)

  # build model, input, optimizer
  strategy = tf.distribute.get_strategy()
  with strategy.scope():
    # build model
    model = model_builder.build(model_config=model_config, is_training=True)

    # build input
    def train_dataset_fn(
        input_context: tf.distribute.InputContext) -> tf.data.Dataset:
      """Callable to create train input."""
      train_input = inputs.train_input(
          train_config=train_config,
          train_input_config=train_input_config,
          model_config=model_config,
          model=model,
          input_context=input_context,
      )
      # Repeat indefinitely; the outer loop bounds the step count.
      train_input = train_input.repeat()
      return train_input

    train_input = strategy.experimental_distribute_datasets_from_function(
        train_dataset_fn)

    # build optimizer
    global_step = tf.Variable(
        0,
        trainable=False,
        dtype=tf.int64,
        name='global_step',
        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
    )
    optimizer, (learning_rate, ) = optimizer_builder.build(
        train_config.optimizer, global_step=global_step)
    # Normalize to a callable so the train step can always invoke it.
    if callable(learning_rate):
      learning_rate_fn = learning_rate
    else:
      learning_rate_fn = lambda: learning_rate

  # prepare for training

  # get appropriate filepath (temporary or not) based on whether the worker
  # is the chief
  summary_log_path = get_filepath(strategy, os.path.join(model_dir, 'train'))

  if record_summaries:
    summary_writer = tf.summary.create_file_writer(summary_log_path)
  else:
    summary_writer = tf.summary.create_noop_writer()

  # On TPU, amortize function-dispatch overhead over many steps per call.
  if use_tpu:
    num_steps_per_iteration = 100
  else:
    num_steps_per_iteration = 1

  with summary_writer.as_default():
    with strategy.scope():
      with tf.summary.record_if(
          lambda: global_step % num_steps_per_iteration == 0):
        # prepare checkpoint manager
        # (do not use manager.latest_checkpoint as manager_dir is not
        # model_dir while running in worker)
        ckpt = tf.train.Checkpoint(model=model,
                                   step=global_step,
                                   optimizer=optimizer)
        # Non-checkpointing workers keep only one (temporary) checkpoint.
        ckpt_max_to_keep = (
            ckpt_max_to_keep if strategy.extended.should_checkpoint else 1)
        manager_dir = get_filepath(strategy, model_dir)
        manager = tf.train.CheckpointManager(
            ckpt, manager_dir, max_to_keep=ckpt_max_to_keep)

        # Resume from an in-progress checkpoint if one exists; otherwise
        # start from the fine-tuning base checkpoint (if any).
        latest_ckpt = tf.train.latest_checkpoint(model_dir)
        if latest_ckpt:
          # load latest checkpoint being trained
          ckpt.restore(latest_ckpt).expect_partial()
        elif base_ckpt:
          # load a pre-trained checkpoint
          load_base_ckpt(model, base_ckpt, base_ckpt_type, base_ckpt_ver,
                         train_input, unpad_gt_tensors)

        # get trainable variables
        train_vars = get_train_vars(model, train_config)

        # define training step
        def train_step_fn(features: Dict, labels: Dict):
          """Single train step."""
          loss = eager_train_step(
              model,
              train_vars,
              features,
              labels,
              unpad_gt_tensors,
              optimizer,
              learning_rate=learning_rate_fn(),
              add_regularization_loss=add_regularization_loss,
              clip_gradient_norm=clip_gradient_norm,
              global_step=global_step,
              num_replicas=strategy.num_replicas_in_sync,
          )
          global_step.assign_add(1)
          return loss

        def _sample_and_train(strategy, train_step_fn, data_iterator):
          # Runs one step on every replica and sums the per-replica losses.
          features, labels = data_iterator.next()
          per_replica_losses = strategy.run(
              train_step_fn, args=(features, labels))
          return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                 per_replica_losses, axis=None)

        @tf.function
        def _dist_train_step(data_iterator):
          """A distributed train step."""
          if num_steps_per_iteration > 1:
            for _ in tf.range(num_steps_per_iteration - 1):
              with tf.name_scope(''):
                _sample_and_train(strategy, train_step_fn, data_iterator)
          # Only the final step's loss of the iteration is returned.
          return _sample_and_train(strategy, train_step_fn, data_iterator)

        train_input_iter = iter(train_input)

        # save initialized version of checkpoint
        if int(global_step.value()) == 0:
          manager.save()

        ckpt_step = int(global_step.value())
        logged_step = global_step.value()

        # proceed with training
        last_step_time = time.time()
        for _ in range(global_step.value(), train_config.num_steps,
                       num_steps_per_iteration):
          # execute a step (forward pass + backward pass)
          loss = _dist_train_step(train_input_iter)

          # log time
          curr_step = global_step.value()
          time_taken = time.time() - last_step_time
          last_step_time = time.time()
          tf.summary.scalar(
              'steps_per_sec',
              num_steps_per_iteration * 1.0 / time_taken,
              step=global_step,
          )

          # log loss
          if curr_step - logged_step >= log_every_n:
            step_time = time_taken / num_steps_per_iteration
            step_msg = 'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                curr_step, step_time, loss)
            v1.logging.info(step_msg)
            logged_step = curr_step

          # save checkpoint regularly
          if (curr_step - ckpt_step) >= ckpt_every_n:
            manager.save()
            ckpt_step = curr_step

  # remove checkpoint directories of non-chief workers that
  # MultiWorkerMirroredStrategy forces us to save during sync distributed
  # training.
  clean_temporary_directories(strategy, manager_dir)
  clean_temporary_directories(strategy, summary_log_path)
def shuffle_params(configs, models):
    """Samples new hyper-parameters from random models and merges them in.

    For each tunable key, a model is drawn at random from `models` and the
    corresponding value is read from its `params` mapping, so different keys
    may come from different models. Threshold pairs are clamped so that
    unmatched <= matched and per-class detections <= total detections.

    Fixes over the original: removed a leftover debug `print(len(models))`
    and commented-out debug code, and factored the repeated
    `models[random.choice(...)].params[...]` lookup into a helper.

    Args:
        configs: Dictionary of pipeline config objects (as returned by
            `config_util.get_configs_from_pipeline_file`).
        models: Mapping of model identifier -> object exposing a `params`
            mapping of config-key -> value.

    Returns:
        A `(configs, sampled_params)` tuple for SSD or Faster R-CNN models.
        NOTE(review): as in the original, any other meta-architecture falls
        through and implicitly returns None — confirm callers handle that.
    """

    def _sample(key):
        # Read `key` from the params of one randomly chosen model.
        return models[random.choice(list(models.keys()))].params[key]

    if configs['model'].HasField('ssd'):
        matched_threshold = _sample(
            "model.ssd.matcher.argmax_matcher.matched_threshold")
        # Keep the invariant unmatched_threshold <= matched_threshold.
        unmatched_threshold = min(
            _sample("model.ssd.matcher.argmax_matcher.unmatched_threshold"),
            matched_threshold)
        ssd_generative_params = {
            "model.ssd.matcher.argmax_matcher.matched_threshold":
                matched_threshold,
            "model.ssd.matcher.argmax_matcher.unmatched_threshold":
                unmatched_threshold,
            "model.ssd.box_predictor.convolutional_box_predictor.dropout_keep_probability":
                _sample("model.ssd.box_predictor.convolutional_box_predictor.dropout_keep_probability"),
            "model.ssd.feature_extractor.min_depth":
                _sample("model.ssd.feature_extractor.min_depth"),
        }
        configs = config_util.merge_external_params_with_configs(
            configs, kwargs_dict=ssd_generative_params)
        return configs, ssd_generative_params

    elif configs['model'].HasField('faster_rcnn'):
        max_total_detections = _sample(
            "model.faster_rcnn.second_stage_post_processing.batch_non_max_suppression.max_total_detections")
        faster_config_params = {
            'model.faster_rcnn.first_stage_nms_iou_threshold':
                _sample("model.faster_rcnn.first_stage_nms_iou_threshold"),
            "model.faster_rcnn.first_stage_max_proposals":
                _sample("model.faster_rcnn.first_stage_max_proposals"),
            "model.faster_rcnn.first_stage_localization_loss_weight":
                _sample("model.faster_rcnn.first_stage_localization_loss_weight"),
            "model.faster_rcnn.second_stage_localization_loss_weight":
                _sample("model.faster_rcnn.second_stage_localization_loss_weight"),
            "model.faster_rcnn.second_stage_post_processing.batch_non_max_suppression.score_threshold":
                _sample("model.faster_rcnn.second_stage_post_processing.batch_non_max_suppression.score_threshold"),
            "model.faster_rcnn.second_stage_post_processing.batch_non_max_suppression.iou_threshold":
                _sample("model.faster_rcnn.second_stage_post_processing.batch_non_max_suppression.iou_threshold"),
            "model.faster_rcnn.second_stage_post_processing.batch_non_max_suppression.max_total_detections":
                max_total_detections,
            # Per-class cap may never exceed the total-detection cap.
            "model.faster_rcnn.second_stage_post_processing.batch_non_max_suppression.max_detections_per_class":
                min(
                    max_total_detections,
                    _sample("model.faster_rcnn.second_stage_post_processing.batch_non_max_suppression.max_detections_per_class")),
        }
        configs = config_util.merge_external_params_with_configs(
            configs, kwargs_dict=faster_config_params)
        return configs, faster_config_params
def eval_continuously(
    pipeline_config_path,
    config_override=None,
    train_steps=None,
    sample_1_of_n_eval_examples=1,
    sample_1_of_n_eval_on_train_examples=1,
    use_tpu=False,
    override_eval_num_epochs=True,
    postprocess_on_cpu=False,
    model_dir=None,
    checkpoint_dir=None,
    wait_interval=180,
    timeout=3600,
    eval_index=0,
    save_final_config=False,
    **kwargs):
  """Run continuous evaluation of a detection model eagerly.

  This method builds the model, and continously restores it from the most
  recent training checkpoint in the checkpoint directory & evaluates it on the
  evaluation data.

  Args:
    pipeline_config_path: A path to a pipeline config file.
    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
      override the config from `pipeline_config_path`.
    train_steps: Number of training steps. If None, the number of training
      steps is set from the `TrainConfig` proto.
    sample_1_of_n_eval_examples: Integer representing how often an eval example
      should be sampled. If 1, will sample all examples.
    sample_1_of_n_eval_on_train_examples: Similar to
      `sample_1_of_n_eval_examples`, except controls the sampling of training
      data for evaluation.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    override_eval_num_epochs: Whether to overwrite the number of epochs to 1
      for eval_input.
    postprocess_on_cpu: When use_tpu and postprocess_on_cpu are true,
      postprocess is scheduled on the host cpu.
    model_dir: Directory to output resulting evaluation summaries to.
    checkpoint_dir: Directory that contains the training checkpoints.
    wait_interval: The mimmum number of seconds to wait before checking for a
      new checkpoint.
    timeout: The maximum number of seconds to wait for a checkpoint. Execution
      will terminate if no new checkpoints are found after these many seconds.
    eval_index: int, If given, only evaluate the dataset at the given index.
      By default, evaluates dataset at 0'th index.
    save_final_config: Whether to save the pipeline config file to the model
      directory.
    **kwargs: Additional keyword arguments for configuration override.
  """
  # BUG FIX: the original unconditionally reset `config_override = None` here,
  # silently discarding any override the caller passed in. (Commented-out dead
  # code was also removed throughout.)
  configs = config_util.get_configs_from_pipeline_file(
      pipeline_config_path, config_override=config_override)
  kwargs.update({
      'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples,
      'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
  })
  if train_steps is not None:
    kwargs['train_steps'] = train_steps
  if override_eval_num_epochs:
    # Force a single pass over the eval data per evaluation cycle.
    kwargs.update({'eval_num_epochs': 1})
  configs = config_util.merge_external_params_with_configs(
      configs, None, kwargs_dict=kwargs)
  if model_dir and save_final_config:
    # Store the as-run config for traceability.
    pipeline_config_final = config_util.create_pipeline_proto_from_configs(
        configs)
    config_util.save_pipeline_config(pipeline_config_final, model_dir)

  model_config = configs['model']
  train_input_config = configs['train_input_config']
  eval_config = configs['eval_config']
  eval_input_configs = configs['eval_input_configs']
  eval_on_train_input_config = copy.deepcopy(train_input_config)
  eval_on_train_input_config.sample_1_of_n_examples = (
      sample_1_of_n_eval_on_train_examples)
  if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1:
    eval_on_train_input_config.num_epochs = 1

  if kwargs['use_bfloat16']:
    tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16')

  eval_input_config = eval_input_configs[eval_index]
  strategy = tf.compat.v2.distribute.get_strategy()
  with strategy.scope():
    detection_model = model_builder.build(
        model_config=model_config, is_training=True)

  eval_input = strategy.experimental_distribute_dataset(
      inputs.eval_input(
          eval_config=eval_config,
          eval_input_config=eval_input_config,
          model_config=model_config,
          model=detection_model))

  global_step = tf.compat.v2.Variable(
      0, trainable=False, dtype=tf.compat.v2.dtypes.int64)

  optimizer, _ = optimizer_builder.build(
      configs['train_config'].optimizer, global_step=global_step)

  for latest_checkpoint in tf.train.checkpoints_iterator(
      checkpoint_dir, timeout=timeout, min_interval_secs=wait_interval):
    ckpt = tf.compat.v2.train.Checkpoint(
        step=global_step, model=detection_model, optimizer=optimizer)

    # We run the detection_model on dummy inputs in order to ensure that the
    # model and all its variables have been properly constructed. Specifically,
    # this is currently necessary prior to (potentially) creating shadow copies
    # of the model variables for the EMA optimizer.
    if eval_config.use_moving_averages:
      unpad_groundtruth_tensors = (eval_config.batch_size == 1 and not use_tpu)
      _ensure_model_is_built(detection_model, eval_input,
                             unpad_groundtruth_tensors)
      optimizer.shadow_copy(detection_model)

    ckpt.restore(latest_checkpoint).expect_partial()

    # Swap in the EMA weights for evaluation.
    if eval_config.use_moving_averages:
      optimizer.swap_weights()

    summary_writer = tf.compat.v2.summary.create_file_writer(
        os.path.join(model_dir, 'eval', eval_input_config.name))
    with summary_writer.as_default():
      eager_eval_loop(
          detection_model,
          configs,
          eval_input,
          use_tpu=use_tpu,
          postprocess_on_cpu=postprocess_on_cpu,
          global_step=global_step,
      )
def update_pipeline_config(params, eval_type):
    """Rewrites a pipeline config from mounted paths and run parameters.

    Loads the pipeline config under `params.config_mnt/params.config_dir`,
    updates class count, fine-tune checkpoint, eval settings, TFRecord paths
    and batch size, logs a few metrics to the (Azure ML) run context, and
    saves the resulting config.

    Args:
        params: Object with mount/dir/step/batch attributes (config_mnt,
            config_dir, label_dir, transfer_learning_dir, base_mnt,
            source_data_name, eval_num_examples, batch_size, num_steps).
            Assumed shape — confirm against the caller.
        eval_type: Basename (without extension) of the eval TFRecord file.

    Returns:
        Tuple of (updated pipeline config proto, original config file path).
        NOTE(review): the returned path is the *input* config location, while
        the config is saved under
        `<base_mnt>/<source_data_name>/model_config` — verify this mismatch
        is intentional.
    """
    cfg = config_util.get_configs_from_pipeline_file(
        os.path.join(params.config_mnt, params.config_dir))
    # update num_of_classes: the meta-architecture is inferred from the
    # directory basename prefix ("ssd..." or "faster_rcnn...").
    model_name = os.path.basename(
        os.path.normpath(os.path.join(params.config_mnt,
                                      params.config_dir))).lower()
    print("model name: ", model_name)
    if model_name.startswith("ssd"):
        model_cfg = cfg['model'].ssd
    elif model_name.startswith("faster_rcnn"):
        model_cfg = cfg['model'].faster_rcnn
    else:
        raise ValueError(
            'unknown base model {}, we can only handle ssd or faster_rcnn'.
            format(model_name))
    # Class count comes from the label map's entries.
    label_map = os.path.join(params.config_mnt, params.label_dir)
    label_map_dict = label_map_util.get_label_map_dict(label_map)
    num_classes = len(label_map_dict)
    model_cfg.num_classes = num_classes
    # update base_model_dir: point fine-tuning at the mounted pre-trained
    # checkpoint.
    train_cfg = cfg['train_config']
    train_cfg.fine_tune_checkpoint = os.path.join(params.config_mnt,
                                                  params.transfer_learning_dir,
                                                  'model.ckpt')
    eval_cfg = cfg['eval_config']
    eval_cfg.max_evals = 1
    eval_cfg.num_examples = int(params.eval_num_examples)
    # update num_train_steps, label_map_path, train_tfrecords, val_tfrecords,
    # batch size
    print(
        os.path.join(os.path.sep, params.base_mnt, params.source_data_name,
                     'tf_records', 'train.record'))
    hparams = tf.contrib.training.HParams(
        batch_size=int(params.batch_size),
        train_steps=int(params.num_steps),
        label_map_path=label_map,
        train_input_path=os.path.join(os.path.sep, params.base_mnt,
                                      params.source_data_name, 'tf_records',
                                      'train.record'),
        eval_input_path=os.path.join(os.path.sep, params.base_mnt,
                                     params.source_data_name, 'tf_records',
                                     eval_type + '.record'),
    )
    cfg = config_util.merge_external_params_with_configs(cfg, hparams)
    # log metrics to the experiment run
    run_context = Run.get_context()
    run_context.log("Batch Size", int(params.batch_size))
    run_context.log("Training Steps", int(params.num_steps))
    # run.log("Maximum Evaluations",max_evals)
    updated_pipeline_config = config_util.create_pipeline_proto_from_configs(
        cfg)
    print("updated_pipeline_config: ",
          updated_pipeline_config)
    updated_pipeline_config_file = os.path.join(params.config_mnt,
                                                params.config_dir)
    print("updated_pipeline_config_file: ", updated_pipeline_config_file)
    print("dir name: ",
          os.path.dirname(os.path.join(params.config_mnt, params.config_dir)))
    config_util.save_pipeline_config(
        updated_pipeline_config,
        os.path.join(params.base_mnt, params.source_data_name, 'model_config'))
    return updated_pipeline_config, updated_pipeline_config_file
config_updata = { 'model.{}.num_classes'.format(config['model'].WhichOneof('model')): num_classes, 'train_config.fine_tune_checkpoint': os.path.join(os.getcwd(), 'pretrained_models', model_name, 'model.ckpt'), 'label_map_path': label_pbtxt, 'train_input_path': train_record, 'eval_input_path': eval_record, } # 更新config文件 tf.logging.info("Updata config file {}".format(pipeline_config)) config = config_util.merge_external_params_with_configs( config, kwargs_dict=config_updata) config = config_util.create_pipeline_proto_from_configs(config) with tf.gfile.Open(pipeline_config, "wb") as f: f.write(text_format.MessageToString(config)) # 训练结果保存位置 save_path = os.path.join(os.getcwd(), 'weights', model_name) if not os.path.exists(save_path): os.mkdir(save_path) def main(unused_argv): train_and_eval_dict = model_lib.create_estimator_and_inputs( run_config=tf.estimator.RunConfig(model_dir=save_path), hparams=model_hparams.create_hparams(None), pipeline_config_path=pipeline_config,
def populate_experiment(run_config,
                        hparams,
                        pipeline_config_path,
                        train_steps=None,
                        eval_steps=None,
                        model_fn_creator=create_model_fn,
                        **kwargs):
  """Builds an `Experiment` covering training, evaluation, and export.

  Args:
    run_config: A `RunConfig`.
    hparams: A `HParams`.
    pipeline_config_path: Path to a pipeline config file.
    train_steps: Number of training steps; falls back to the `TrainConfig`
      proto when None.
    eval_steps: Number of evaluation steps per cycle; falls back to the
      `EvalConfig` proto when None.
    model_fn_creator: Callable producing the estimator's `model_fn`; invoked
      as `model_fn_creator(detection_model_fn, configs, hparams)`.
    **kwargs: Additional keyword arguments for configuration override.

  Returns:
    An `Experiment` that defines all aspects of training, evaluation, and
    export.
  """
  # Load the pipeline config and fold in every caller-supplied override.
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(
      configs,
      hparams,
      train_steps=train_steps,
      eval_steps=eval_steps,
      **kwargs)

  model_config = configs['model']
  train_config = configs['train_config']
  eval_config = configs['eval_config']

  # Zero in the protos means "unset": only adopt non-zero proto values.
  if train_steps is None and train_config.num_steps:
    train_steps = train_config.num_steps
  if eval_steps is None and eval_config.num_examples:
    eval_steps = eval_config.num_examples

  detection_model_fn = functools.partial(
      model_builder.build, model_config=model_config)

  # Input functions for TRAIN/EVAL plus the serving export strategy.
  train_input_fn = inputs.create_train_input_fn(
      train_config=train_config,
      train_input_config=configs['train_input_config'],
      model_config=model_config)
  eval_input_fn = inputs.create_eval_input_fn(
      eval_config=eval_config,
      eval_input_config=configs['eval_input_config'],
      model_config=model_config)
  export_strategies = [
      tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy(
          serving_input_fn=inputs.create_predict_input_fn(
              model_config=model_config))
  ]

  estimator = tf.estimator.Estimator(
      model_fn=model_fn_creator(detection_model_fn, configs, hparams),
      config=run_config)

  if run_config.is_chief:
    # Store the final pipeline config for traceability.
    pipeline_config_final = config_util.create_pipeline_proto_from_configs(
        configs)
    pipeline_config_final_path = os.path.join(estimator.model_dir,
                                              'pipeline.config')
    config_text = text_format.MessageToString(pipeline_config_final)
    with tf.gfile.Open(pipeline_config_final_path, 'wb') as f:
      tf.logging.info('Writing as-run pipeline config file to %s',
                      pipeline_config_final_path)
      f.write(config_text)

  return tf.contrib.learn.Experiment(
      estimator=estimator,
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      train_steps=train_steps,
      eval_steps=eval_steps,
      export_strategies=export_strategies,
      eval_delay_secs=120,)
def train_loop(pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               checkpoint_every_n=1000,
               checkpoint_max_to_keep=7,
               record_summaries=True,
               performance_summary_exporter=None,
               num_steps_per_iteration=NUM_STEPS_PER_ITERATION,
               **kwargs):
  """Trains a model using eager + functions.

  Args:
    pipeline_config_path: A path to a pipeline config file.
    model_dir: The directory to save checkpoints and summaries to.
    config_override: Optional override passed through to
      `config_util.get_configs_from_pipeline_file` — presumably a
      `TrainEvalPipelineConfig` text proto; TODO confirm expected type.
    train_steps: Number of training steps. If None, the value from the
      `TrainConfig` proto is used (when non-zero).
    use_tpu: Whether training is happening on a TPU. Also gates bfloat16
      mixed precision.
    save_final_config: NOTE(review): accepted but not used in this portion of
      the function — confirm whether the as-run config should be written out.
    checkpoint_every_n: Checkpoint every n training steps.
    checkpoint_max_to_keep: Maximum number of checkpoints to keep.
    record_summaries: Whether to record TF summaries.
    performance_summary_exporter: NOTE(review): accepted but not used in this
      portion of the function — confirm intended export of `steps_per_sec`.
    num_steps_per_iteration: Number of train steps executed per tf.function
      iteration.
    **kwargs: Additional keyword arguments merged into the configs via
      `config_util.merge_external_params_with_configs`.

  Raises:
    ValueError: If `train_config.load_all_detection_checkpoint_vars` is set
      (unsupported in TF2).
  """
  # BUGFIX: the caller-supplied `config_override` was previously clobbered
  # with None before use; it is now honored.
  configs = config_util.get_configs_from_pipeline_file(
      pipeline_config_path, config_override=config_override)
  kwargs.update({
      'train_steps': train_steps,
      'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
  })
  configs = config_util.merge_external_params_with_configs(
      configs, None, kwargs_dict=kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
  add_regularization_loss = train_config.add_regularization_loss
  clip_gradients_value = None
  if train_config.gradient_clipping_by_norm > 0:
    clip_gradients_value = train_config.gradient_clipping_by_norm

  # Update train_steps from the merged config, but only when a non-zero value
  # is provided there and the caller did not pass one explicitly.
  # BUGFIX: removed `train_steps = num_train_steps`, which referenced an
  # undefined name and raised NameError.
  if train_steps is None and train_config.num_steps != 0:
    train_steps = train_config.num_steps

  # BUGFIX: only enable bfloat16 mixed precision when it was actually
  # requested (config + TPU); applying the policy unconditionally silently
  # changes numerics on CPU/GPU.
  if kwargs['use_bfloat16']:
    tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16')

  if train_config.load_all_detection_checkpoint_vars:
    raise ValueError('train_pb2.load_all_detection_checkpoint_vars '
                     'unsupported in TF2')

  config_util.update_fine_tune_checkpoint_type(train_config)
  fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type
  fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version

  # Build the model, optimizer, and training input.
  strategy = tf.compat.v2.distribute.get_strategy()
  # NOTE(review): these imports look like they belong at module top level;
  # kept local to avoid changing import-time behavior of the module.
  from object_detection import inputs
  from object_detection.builders import optimizer_builder
  from object_detection.utils import variables_helper

  with strategy.scope():
    detection_model = model_builder.build(
        model_config=model_config, is_training=True)

    def train_dataset_fn(input_context):
      """Callable to create train input."""
      train_input = inputs.train_input(
          train_config=train_config,
          train_input_config=train_input_config,
          model_config=model_config,
          model=detection_model,
          input_context=input_context)
      train_input = train_input.repeat()
      return train_input

    train_input = strategy.experimental_distribute_datasets_from_function(
        train_dataset_fn)

    global_step = tf.Variable(
        0,
        trainable=False,
        dtype=tf.compat.v2.dtypes.int64,
        name='global_step',
        aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA)
    optimizer, (learning_rate,) = optimizer_builder.build(
        train_config.optimizer, global_step=global_step)

    if callable(learning_rate):
      learning_rate_fn = learning_rate
    else:
      def learning_rate_fn():
        return learning_rate

  # Train the model.
  # Get the appropriate filepath (temporary or not) based on whether the
  # worker is the chief.
  summary_writer_filepath = get_filepath(strategy,
                                         os.path.join(model_dir, 'train'))
  if record_summaries:
    summary_writer = tf.compat.v2.summary.create_file_writer(
        summary_writer_filepath)
  else:
    # Consistently use the tf.compat.v2 namespace (a dead commented-out
    # variant was removed).
    summary_writer = tf.compat.v2.summary.create_noop_writer()

  with summary_writer.as_default():
    with strategy.scope():
      with tf.compat.v2.summary.record_if(
          lambda: global_step % num_steps_per_iteration == 0):
        # Load a fine-tuning checkpoint.
        if train_config.fine_tune_checkpoint:
          variables_helper.ensure_checkpoint_supported(
              train_config.fine_tune_checkpoint, fine_tune_checkpoint_type,
              model_dir)
          load_fine_tune_checkpoint(
              detection_model, train_config.fine_tune_checkpoint,
              fine_tune_checkpoint_type, fine_tune_checkpoint_version,
              train_config.run_fine_tune_checkpoint_dummy_computation,
              train_input, unpad_groundtruth_tensors)

        ckpt = tf.compat.v2.train.Checkpoint(
            step=global_step, model=detection_model, optimizer=optimizer)

        manager_dir = get_filepath(strategy, model_dir)
        if not strategy.extended.should_checkpoint:
          checkpoint_max_to_keep = 1
        manager = tf.compat.v2.train.CheckpointManager(
            ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep)

        # We use the following instead of manager.latest_checkpoint because
        # manager_dir does not point to the model directory when we are
        # running in a worker.
        latest_checkpoint = tf.train.latest_checkpoint(model_dir)
        ckpt.restore(latest_checkpoint)

        def train_step_fn(features, labels):
          """Single train step."""
          loss = eager_train_step(
              detection_model,
              features,
              labels,
              unpad_groundtruth_tensors,
              optimizer,
              learning_rate=learning_rate_fn(),
              add_regularization_loss=add_regularization_loss,
              clip_gradients_value=clip_gradients_value,
              global_step=global_step,
              num_replicas=strategy.num_replicas_in_sync)
          # BUGFIX: advance the global step so the summary gating, logging,
          # and checkpoint cadence below actually make progress, and return
          # the loss so `strategy.reduce` is not handed None.
          global_step.assign_add(1)
          return loss

        def _sample_and_train(strategy, train_step_fn, data_iterator):
          """Pulls one batch and runs a replicated train step; returns loss."""
          features, labels = data_iterator.next()
          if hasattr(tf.distribute.Strategy, 'run'):
            per_replica_losses = strategy.run(
                train_step_fn, args=(features, labels))
          else:
            per_replica_losses = strategy.experimental_run_v2(
                train_step_fn, args=(features, labels))
          # TODO(anjalisridhar): explore if it is safe to remove the
          # num_replicas scaling of the loss and switch this to a
          # ReduceOp.Mean
          return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                 per_replica_losses, axis=None)

        @tf.function
        def _dist_train_step(data_iterator):
          """A distributed train step."""
          if num_steps_per_iteration > 1:
            for _ in tf.range(num_steps_per_iteration - 1):
              # Following suggestion on yaqs/5402607292645376
              with tf.name_scope(''):
                _sample_and_train(strategy, train_step_fn, data_iterator)
          return _sample_and_train(strategy, train_step_fn, data_iterator)

        train_input_iter = iter(train_input)

        if int(global_step.value()) == 0:
          manager.save()

        checkpointed_step = int(global_step.value())
        logged_step = global_step.value()

        # BUGFIX: `steps_per_sec_list` was appended to without ever being
        # initialized, raising NameError on the first iteration.
        steps_per_sec_list = []

        last_step_time = time.time()
        for _ in range(global_step.value(), train_steps,
                       num_steps_per_iteration):
          loss = _dist_train_step(train_input_iter)

          time_taken = time.time() - last_step_time
          last_step_time = time.time()
          steps_per_sec = num_steps_per_iteration * 1.0 / time_taken

          tf.compat.v2.summary.scalar(
              'steps_per_sec', steps_per_sec, step=global_step)
          steps_per_sec_list.append(steps_per_sec)

          if global_step.value() - logged_step >= 100:
            tf.logging.info(
                'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                    global_step.value(), time_taken / num_steps_per_iteration,
                    loss))
            logged_step = global_step.value()

          if ((int(global_step.value()) - checkpointed_step) >=
              checkpoint_every_n):
            manager.save()
            checkpointed_step = int(global_step.value())
def populate_experiment(run_config,
                        hparams,
                        pipeline_config_path,
                        train_steps=None,
                        eval_steps=None,
                        model_fn_creator=create_model_fn,
                        **kwargs):
  """Populates an `Experiment` object for training, evaluation, and export.

  NOTE(review): this appears to be a byte-for-byte logic duplicate of the
  `populate_experiment` defined earlier in this file; at import time this
  later definition shadows the earlier one. Confirm which copy is intended
  and remove the other.

  Args:
    run_config: A `RunConfig`.
    hparams: A `HParams`.
    pipeline_config_path: A path to a pipeline config file.
    train_steps: Number of training steps. If None, taken from the
      `TrainConfig` proto (when non-zero).
    eval_steps: Number of evaluation steps per evaluation cycle. If None,
      taken from the `EvalConfig` proto (when non-zero).
    model_fn_creator: A function that creates a `model_fn` for `Estimator`,
      called as `model_fn_creator(detection_model_fn, configs, hparams)`.
    **kwargs: Additional keyword arguments for configuration override.

  Returns:
    A `tf.contrib.learn.Experiment` wiring the estimator, input functions,
    step counts, and export strategies together.
  """
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(
      configs,
      hparams,
      train_steps=train_steps,
      eval_steps=eval_steps,
      **kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']
  eval_config = configs['eval_config']
  eval_input_config = configs['eval_input_config']

  # Zero in the protos means "unset", so the steps stay None in that case.
  if train_steps is None:
    train_steps = train_config.num_steps if train_config.num_steps else None
  if eval_steps is None:
    eval_steps = eval_config.num_examples if eval_config.num_examples else None

  detection_model_fn = functools.partial(
      model_builder.build, model_config=model_config)

  # Input functions for TRAIN/EVAL and the serving export strategy.
  train_input_fn = inputs.create_train_input_fn(
      train_config=train_config,
      train_input_config=train_input_config,
      model_config=model_config)
  eval_input_fn = inputs.create_eval_input_fn(
      eval_config=eval_config,
      eval_input_config=eval_input_config,
      model_config=model_config)
  export_strategies = [
      tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy(
          serving_input_fn=inputs.create_predict_input_fn(
              model_config=model_config))
  ]

  estimator = tf.estimator.Estimator(
      model_fn=model_fn_creator(detection_model_fn, configs, hparams),
      config=run_config)

  if run_config.is_chief:
    # Only the chief writes the as-run pipeline config, for traceability.
    pipeline_config_final = config_util.create_pipeline_proto_from_configs(
        configs)
    pipeline_config_final_path = os.path.join(estimator.model_dir,
                                              'pipeline.config')
    config_text = text_format.MessageToString(pipeline_config_final)
    with tf.gfile.Open(pipeline_config_final_path, 'wb') as f:
      tf.logging.info('Writing as-run pipeline config file to %s',
                      pipeline_config_final_path)
      f.write(config_text)

  return tf.contrib.learn.Experiment(
      estimator=estimator,
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      train_steps=train_steps,
      eval_steps=eval_steps,
      export_strategies=export_strategies,
      eval_delay_secs=120,)