# Standard library
import os
import shutil
import socket
import uuid
from datetime import datetime
from pathlib import Path

# Third party
import numpy as np

# First party (smdebug). Note: the module paths for get_path_to_collections and
# S3Trial are assumed to follow the library's usual layout.
from smdebug import modes
from smdebug.core.collection import Collection
from smdebug.core.collection_manager import CollectionManager
from smdebug.core.config_constants import DEFAULT_COLLECTIONS_FILE_NAME
from smdebug.core.tensor import StepState
from smdebug.core.utils import get_path_to_collections, is_s3
from smdebug.core.writer import FileWriter
from smdebug.trials import create_trial
from smdebug.trials.s3_trial import S3Trial


def help_test_multiple_trials(num_steps=20, num_tensors=10):
    # Write `num_steps` steps of dummy tensors to S3 and return a trial that reads them back.
    trial_name = str(uuid.uuid4())
    bucket = "smdebug-testing"
    path = "s3://" + os.path.join(bucket, "outputs/")

    c = CollectionManager()
    c.add("default")
    c.get("default").tensor_names = ["foo_" + str(i) for i in range(num_tensors)]
    c.export(path + trial_name, DEFAULT_COLLECTIONS_FILE_NAME)

    for i in range(num_steps):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
            rank=0,
        )

    _, bucket, prefix = is_s3(os.path.join(path, trial_name))
    trial_obj = S3Trial(name=prefix, bucket_name=bucket, prefix_name=prefix)
    return trial_obj, trial_name
def dummy_trial_creator(trial_dir, num_workers, job_ended):
    # Create an empty trial directory with one collections file per worker and,
    # optionally, the marker file that signals the training job has ended.
    Path(trial_dir).mkdir(parents=True, exist_ok=True)
    cm = CollectionManager()
    for i in range(num_workers):
        collection_file_name = f"worker_{i}_collections.json"
        cm.export(trial_dir, collection_file_name)
    if job_ended:
        Path(os.path.join(trial_dir, "training_job_end.ts")).touch()
def test_manager_export_load():
    cm = CollectionManager()
    cm.create_collection("default")
    cm.get("default").include("loss")
    cm.add(Collection("trial1"))
    cm.add("trial2")
    cm.get("trial2").include("total_loss")
    cm.export("/tmp/dummy_trial", DEFAULT_COLLECTIONS_FILE_NAME)
    cm2 = CollectionManager.load(
        os.path.join(get_path_to_collections("/tmp/dummy_trial"), DEFAULT_COLLECTIONS_FILE_NAME)
    )
    assert cm == cm2
def test_mode_data():
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = "/tmp/ts_outputs/" + run_id

    c = CollectionManager()
    c.add("default")
    # Register both tensors in one assignment; assigning twice would overwrite the first name.
    c.get("default").tensor_names = ["arr_1", "arr_2"]
    c.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)

    trial = create_trial(trial_dir)
    worker = socket.gethostname()
    for s in range(0, 10):
        fw = FileWriter(trial_dir=trial_dir, step=s, worker=worker)
        if s % 2 == 0:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr_1",
                mode=modes.TRAIN,
                mode_step=s // 2,
            )
        else:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr_2",
                mode=modes.EVAL,
                mode_step=s // 2,
            )
        fw.close()

    assert trial.tensor_names() == ["arr_1", "arr_2"]
    assert trial.tensor_names(step=0) == ["arr_1"]
    assert trial.tensor_names(step=1) == ["arr_2"]
    assert trial.tensor_names(step=0, mode=modes.TRAIN) == ["arr_1"]
    assert trial.tensor_names(step=0, mode=modes.EVAL) == ["arr_2"]
    assert trial.tensor_names(mode=modes.TRAIN) == ["arr_1"]
    assert trial.tensor_names(mode=modes.EVAL) == ["arr_2"]
def generate_data(
    path,
    trial,
    step,
    tname_prefix,
    num_tensors,
    worker,
    shape,
    dtype=np.float32,
    rank=None,
    mode=None,
    mode_step=None,
    export_colls=True,
    data=None,
):
    # Write `num_tensors` dummy tensors for one step and, optionally, export a
    # collections file listing them under the "default" and "gradients" collections.
    with FileWriter(trial_dir=os.path.join(path, trial), step=step, worker=worker) as fw:
        for i in range(num_tensors):
            if data is None:
                data = np.ones(shape=shape, dtype=dtype) * step
            fw.write_tensor(tdata=data, tname=f"{tname_prefix}_{i}", mode=mode, mode_step=mode_step)
    if export_colls:
        c = CollectionManager()
        c.add("default")
        c.get("default").tensor_names = [f"{tname_prefix}_{i}" for i in range(num_tensors)]
        c.add("gradients")
        c.get("gradients").tensor_names = [f"{tname_prefix}_{i}" for i in range(num_tensors)]
        c.export(os.path.join(path, trial), DEFAULT_COLLECTIONS_FILE_NAME)
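# A minimal local usage sketch for generate_data (illustrative only: the path, trial
# name, and expected outputs below are assumptions, not part of the test suite).
def _example_generate_and_read():
    for step in range(3):
        generate_data(
            path="/tmp/ts_outputs",
            trial="example_trial",
            num_tensors=2,
            step=step,
            tname_prefix="foo",
            worker=socket.gethostname(),
            shape=(3, 3),
        )
    tr = create_trial("/tmp/ts_outputs/example_trial")
    print(tr.steps())                    # expected: [0, 1, 2]
    print(tr.tensor_names())             # expected to include "foo_0" and "foo_1"
    print(tr.tensor("foo_1").value(2))   # expected: np.ones((3, 3)) * 2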
def write_dummy_collection_file(trial):
    cm = CollectionManager()
    cm.create_collection("default")
    cm.add(Collection(trial))
    cm.export(trial, DEFAULT_COLLECTIONS_FILE_NAME)
def test_mode_data():
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = "/tmp/ts_outputs/" + run_id

    c = CollectionManager()
    c.add("default")
    c.get("default").tensor_names = ["arr"]
    c.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)

    tr = create_trial(trial_dir)
    worker = socket.gethostname()
    # Write even global steps as TRAIN and odd global steps as EVAL.
    for s in range(0, 10):
        fw = FileWriter(trial_dir=trial_dir, step=s, worker=worker)
        if s % 2 == 0:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=modes.TRAIN,
                mode_step=s // 2,
            )
        else:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=modes.EVAL,
                mode_step=s // 2,
            )
        fw.close()

        if s % 2 == 0:
            assert tr.has_passed_step(s // 2, mode=modes.TRAIN) == StepState.AVAILABLE
            assert tr.has_passed_step(s // 2, mode=modes.EVAL) == StepState.NOT_YET_AVAILABLE
        else:
            assert tr.has_passed_step(s // 2, mode=modes.EVAL) == StepState.AVAILABLE
        assert tr.has_passed_step(s) == StepState.AVAILABLE
        assert tr.has_passed_step(s + 1) == StepState.NOT_YET_AVAILABLE
        assert tr.has_passed_step(s + 1, mode=modes.TRAIN) == StepState.NOT_YET_AVAILABLE

    assert len(tr.tensor_names()) == 1
    assert len(tr.steps()) == 10
    assert len(tr.steps(mode=modes.TRAIN)) == 5
    assert len(tr.steps(mode=modes.EVAL)) == 5
    assert len(tr.modes()) == 2

    for i in range(10):
        if i % 2 == 0:
            assert tr.mode(i) == modes.TRAIN
        else:
            assert tr.mode(i) == modes.EVAL
        assert tr.mode_step(i) == i // 2

    for i in range(5):
        assert tr.global_step(modes.TRAIN, i) == (i * 2)
        assert tr.global_step(modes.EVAL, i) == (i * 2) + 1

    assert len(tr.tensor("arr").steps()) == 10
    assert len(tr.tensor("arr").steps(mode=modes.TRAIN)) == 5
    assert len(tr.tensor("arr").steps(mode=modes.EVAL)) == 5

    for i in range(10):
        assert tr.tensor("arr").value(i) is not None
        if i < 5:
            assert tr.tensor("arr").value(i, mode=modes.TRAIN) is not None
            assert tr.tensor("arr").value(i, mode=modes.EVAL) is not None

    shutil.rmtree("/tmp/ts_outputs/" + run_id)