# NOTE: these snippets come from the smdebug test suite and core modules.
# The imports below cover the shared stdlib helpers; smdebug-specific names
# (has_training_ended, training_has_ended, create_trial, SaveConfig, the
# hooks, is_s3, TSAccessS3, the exceptions, the config constants, and the
# run_* / delete_s3_* test helpers) are assumed to be imported from the
# corresponding smdebug and test-utility modules for your library version.
import os
import shutil
import subprocess
import sys
import time
import uuid
from datetime import datetime


def test_end_local_training():
    run_id = str(uuid.uuid4())
    out_dir = "/tmp/newlogsRunTest/" + run_id
    assert has_training_ended(out_dir) is False
    subprocess.check_call(
        [
            sys.executable,
            "examples/mxnet/scripts/mnist_gluon_basic_hook_demo.py",
            "--output-uri",
            out_dir,
            "--num_steps",
            "10",
        ]
    )
    assert has_training_ended(out_dir)
    shutil.rmtree(out_dir)
def maybe_refresh(self, name=None):
    if self.loaded_all_steps or not self.dynamic_refresh:
        return
    retry_count = 1
    training_ended = has_training_ended(self.path)
    if training_ended and self.loaded_all_steps is False:
        retry_count = 2
    while retry_count > 0:
        if name is None:
            self.refresh_data()
        else:
            self.refresh_tensor(name)
        if retry_count > 1:
            self.logger.info(
                f"Training has ended, will refresh one final time in "
                f"{self._training_end_delay_refresh} sec."
            )
            time.sleep(self._training_end_delay_refresh)
        retry_count -= 1
    if training_ended is True and self.loaded_all_steps is False:
        self.loaded_all_steps = True
        # Update last_complete_step to the last step written
        self.last_complete_step = (
            sorted(self._global_to_mode.keys())[-1]
            if len(self._global_to_mode)
            else self.last_complete_step
        )
        self.logger.info("Loaded all steps")
        self.logger.debug(
            f"Training Has Ended : last_complete_step was: {self.last_complete_step}"
        )
        self.logger.debug(
            f"Training Has Ended : last_index_token was: {self.last_index_token}"
        )
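# Usage sketch: a consumer polling a trial for new steps. maybe_refresh() and
# loaded_all_steps are the trial attributes used in the method above; the
# polling loop itself is illustrative, not part of smdebug.
def poll_until_complete(trial, interval=5):
    while not trial.loaded_all_steps:
        trial.maybe_refresh()  # refreshes data; does one final pass once training ends
        time.sleep(interval)
    return trial.steps()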
def test_loss_collection_with_no_other_collections():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/" + run_id
    hook = t_hook(out_dir=out_dir, save_config=save_config, include_collections=[])
    assert has_training_ended(out_dir) is False
    run_mnist_gluon_model(
        hook=hook, num_steps_train=10, num_steps_eval=10, register_to_loss_block=True
    )
    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4
    print(tr.tensor_names())
    tname = tr.tensor_names(regex=".*loss")[0]
    loss_tensor = tr.tensor(tname)
    loss_val = loss_tensor.value(step_num=1)
    assert len(loss_val) > 0
    shutil.rmtree(out_dir)
def test_end_s3_training():
    run_id = str(uuid.uuid4())
    bucket = "smdebugcodebuildtest"
    key = "newlogsRunTest/" + run_id
    # The output URI must carry the s3:// scheme; without it the path is
    # treated as local and has_training_ended looks in the wrong place.
    out_dir = "s3://" + bucket + "/" + key
    assert has_training_ended(out_dir) is False
    subprocess.check_call(
        [
            sys.executable,
            "examples/mxnet/scripts/mnist_gluon_basic_hook_demo.py",
            "--output-uri",
            out_dir,
            "--num_steps",
            "10",
        ]
    )
    assert has_training_ended(out_dir)
    delete_s3_prefix(bucket, key)
def test_hook_from_json_config_for_losses(tmpdir, monkeypatch, params):
    out_dir = tmpdir.join("test_hook_from_json_config_for_losses")
    config_file = tmpdir.join("config.json")
    config_file.write(get_json_config_for_losses(str(out_dir)))
    monkeypatch.setenv(CONFIG_FILE_PATH_ENV_STR, str(config_file))
    hook = Hook.create_from_json_file()
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook, params=params)
    trial = create_trial(str(out_dir))
    eval_metric = params["eval_metric"]
    test_metric = f"test-{eval_metric}"
    train_metric = f"train-{eval_metric}"
    if eval_metric == "rmse":
        assert train_metric in trial.tensor_names(collection=CollectionKeys.METRICS)
        assert train_metric in trial.tensor_names(collection=CollectionKeys.LOSSES)
        assert test_metric in trial.tensor_names(collection=CollectionKeys.METRICS)
        assert test_metric in trial.tensor_names(collection=CollectionKeys.LOSSES)
    if eval_metric == "auc" or eval_metric == "map":
        assert train_metric in trial.tensor_names(collection=CollectionKeys.METRICS)
        assert train_metric not in trial.tensor_names(collection=CollectionKeys.LOSSES)
        assert test_metric in trial.tensor_names(collection=CollectionKeys.METRICS)
        assert test_metric not in trial.tensor_names(collection=CollectionKeys.LOSSES)
def test_end_s3_training():
    run_id = str(uuid.uuid4())
    bucket = "smdebug-testing"
    key = f"outputs/{run_id}"  # use run_id so the key is unique per run
    out_dir = "s3://" + bucket + "/" + key
    assert has_training_ended(out_dir) is False
    subprocess.check_call(
        [
            sys.executable,
            "tests/resources/mxnet/mnist_gluon_basic_hook_demo.py",
            "--output-uri",
            out_dir,
            "--num_steps",
            "10",
        ]
    )
    assert has_training_ended(out_dir)
    delete_s3_prefix(bucket, key)
def event_file_present_loop(self, tensor_location: TensorLocation):
    event_file_name = tensor_location.event_file_name
    event_file_present = self._is_event_file_present(event_file_name)
    num_retry = 0
    while not event_file_present and num_retry < self.event_file_retry_limit:
        if self._has_event_file_been_skipped(event_file_name):
            raise TensorUnavailableForStep(
                tname=tensor_location.tensorname,
                mode=tensor_location.mode,
                step=tensor_location.mode_step,
            )
        elif has_training_ended(self.path) is True:
            self.logger.warn(
                f"IndexReader: Training Has Ended"
                f"\nIndexReader: {event_file_name} was written but not found."
            )
            raise TensorUnavailableForStep(
                tname=tensor_location.tensorname,
                mode=tensor_location.mode,
                step=tensor_location.mode_step,
            )
        event_file_present = self._is_event_file_present(event_file_name)
        num_retry += 1
        time.sleep(2)
    if num_retry >= self.event_file_retry_limit:
        self.logger.warn(
            f"IndexReader: {event_file_name} was written but not found "
            f"after {num_retry} retries."
        )
        raise TensorUnavailableForStep(
            tname=tensor_location.tensorname,
            mode=tensor_location.mode,
            step=tensor_location.mode_step,
        )
    return
# Note: this is a nested helper; collection_files, _fetch, and self come from
# the enclosing scope.
def _wait_for_collection_files(number_of_collection_file_to_wait_for):
    while len(collection_files) < number_of_collection_file_to_wait_for:
        time.sleep(2)
        _fetch()
        if has_training_ended(self.path):
            # _fetch should have returned all the collection files
            # if the training job has ended.
            if len(collection_files) < number_of_collection_file_to_wait_for:
                raise MissingCollectionFiles
def test_hook_from_json_config_full(tmpdir, monkeypatch):
    out_dir = tmpdir.join("test_hook_from_json_config_full")
    config_file = tmpdir.join("config.json")
    config_file.write(get_json_config_full(str(out_dir)))
    monkeypatch.setenv(CONFIG_FILE_PATH_ENV_STR, str(config_file))
    hook = Hook.create_from_json_file()
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook)
def test_s3_training_end():
    s3dir = "s3://smdebugcodebuildtest/training_end_test_dir"
    _, bucket, key = is_s3(s3dir)
    f = TSAccessS3(bucket_name=bucket, key_name=key)
    f.close()
    training_has_ended(s3dir)
    assert has_training_ended(s3dir) is True
    delete_s3_prefixes(bucket, key)
def test_s3_training_end():
    s3key = str(uuid.uuid4())
    s3dir = f"s3://smdebugcodebuildtest/ok_to_delete_{s3key}"
    _, bucket, key = is_s3(s3dir)
    f = TSAccessS3(bucket_name=bucket, key_name=key)
    f.close()
    training_has_ended(s3dir)
    assert has_training_ended(s3dir) is True
    delete_s3_prefixes(bucket, key)
def test_hook():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/newlogsRunTest/" + run_id
    hook = t_hook(out_dir=out_dir, save_config=save_config)
    assert has_training_ended(out_dir) is False
    run_mnist_gluon_model(
        hook=hook, num_steps_train=10, num_steps_eval=10, register_to_loss_block=True
    )
    shutil.rmtree(out_dir)
def test_hook_from_json_config_full():
    out_dir = "/tmp/newlogsRunTest2/test_hook_from_json_config_full"
    shutil.rmtree(out_dir, ignore_errors=True)
    os.environ[CONFIG_FILE_PATH_ENV_STR] = (
        "tests/mxnet/test_json_configs/test_hook_from_json_config_full.json"
    )
    hook = t_hook.create_from_json_file()
    assert has_training_ended(out_dir) is False
    run_mnist_gluon_model(
        hook=hook, num_steps_train=10, num_steps_eval=10, register_to_loss_block=True
    )
    shutil.rmtree(out_dir, ignore_errors=True)
def test_training_job_has_ended(out_dir):
    tf.reset_default_graph()
    subprocess.check_call(
        [
            sys.executable,
            "examples/tensorflow/local/simple.py",
            "--out_dir",
            out_dir,
            "--steps",
            "10",
            "--save_interval",
            "5",
        ],
        env={"CUDA_VISIBLE_DEVICES": "-1", "SMDEBUG_LOG_LEVEL": "debug"},
    )
    assert has_training_ended(out_dir) is True
def test_negative_s3_training_end():
    s3dir = "s3://smdebugcodebuildtest/training_end_test_dir_negative"
    assert has_training_ended(s3dir) is False
def test_negative_local_training_end():
    localdir = "/tmp/training_end_test_dir_negative"
    assert has_training_ended(localdir) is False
def test_local_training_end():
    localdir = "/tmp/training_end_test_dir"
    ensure_dir(localdir, is_file=False)
    training_has_ended(localdir)
    assert has_training_ended(localdir) is True
    shutil.rmtree(localdir)
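# For reference, the contract these tests exercise: training_has_ended() drops
# a sentinel file under the trial prefix and has_training_ended() checks for
# it. A minimal local-only sketch, assuming the sentinel file name
# "training_job_end.ts" (the name smdebug used at the time of writing; treat
# it as an assumption of this sketch):
def _has_training_ended_local(trial_prefix):
    return os.path.exists(os.path.join(trial_prefix, "training_job_end.ts"))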
def test_spot_hook():
    os.environ[CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR] = (
        "./tests/mxnet/test_json_configs/checkpointconfig.json"
    )
    checkpoint_path = "/tmp/savedParams"
    if not os.path.exists(checkpoint_path):
        os.mkdir(checkpoint_path)
    save_config = SaveConfig(save_steps=[10, 11, 12, 13, 14, 40, 50, 60, 70, 80])

    # Run the training for 2 epochs and save the parameters after every epoch.
    # We expect that steps 0 to 14 will be written.
    run_id_1 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir_1 = "/tmp/newlogsRunTest/" + run_id_1
    hook = t_hook(
        out_dir=out_dir_1,
        save_config=save_config,
        include_collections=["weights", "gradients"],
    )
    assert has_training_ended(out_dir_1) is False
    run_mnist(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        epochs=2,
        save_interval=1,
        save_path=checkpoint_path,
    )

    # Run the training again for 4 epochs, resuming from the saved checkpoint.
    # We do NOT expect steps 0 to 14 to be written again; we expect to read
    # steps 40, 50, 60, 70 and 80.
    run_id_2 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir_2 = "/tmp/newlogsRunTest/" + run_id_2
    hook = t_hook(
        out_dir=out_dir_2,
        save_config=save_config,
        include_collections=["weights", "gradients"],
    )
    assert has_training_ended(out_dir_2) is False
    run_mnist(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        epochs=4,
        save_interval=1,
        save_path=checkpoint_path,
    )

    # Unset the environment variable before validation so that it does not
    # affect the other scripts in the pytest environment.
    del os.environ[CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR]

    # Validation
    print("Created the trial with out_dir {0} for the first training".format(out_dir_1))
    tr = create_trial(out_dir_1)
    assert tr
    available_steps_1 = tr.steps()
    assert 40 not in available_steps_1
    assert 80 not in available_steps_1
    print(available_steps_1)

    print("Created the trial with out_dir {0} for the second training".format(out_dir_2))
    tr = create_trial(out_dir_2)
    assert tr
    available_steps_2 = tr.steps()
    assert 40 in available_steps_2
    assert 50 in available_steps_2
    assert 60 in available_steps_2
    assert 70 in available_steps_2
    assert 80 in available_steps_2
    assert 0 not in available_steps_2
    assert 10 not in available_steps_2
    assert 11 not in available_steps_2
    assert 12 not in available_steps_2
    print(available_steps_2)

    print("Cleaning up.")
    shutil.rmtree(os.path.dirname(out_dir_1))
    shutil.rmtree(checkpoint_path, ignore_errors=True)
def job_finished(self):
    training_ended = has_training_ended(self.path + "/system") or has_training_ended(
        self.path + "/framework"
    )
    # The rule job should finish if the training job has ended or if the rule
    # job has been signalled.
    return training_ended or is_rule_signalled_gracetime_passed(self.path)
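# Usage sketch: stop a monitoring loop once the job is finished. The monitor
# function is hypothetical; only job_finished() comes from the method above.
def monitor(trial, interval=10):
    while not trial.job_finished():
        time.sleep(interval)  # poll until the end markers or rule signal appear
    print("job finished; stopping monitoring")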
def test_hook(tmpdir):
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir, save_config=save_config)
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook)