def _initialize_writers(self, only_initialize_if_missing=False) -> None: # In keras, sometimes we are not sure if writer is initialized # (such as metrics at end of epoch), that's why it passes the flag only_init_if_missing if self.dry_run: return if self.distribution_strategy in [ TFDistributionStrategy.PARAMETER_SERVER, TFDistributionStrategy.HOROVOD, ]: if self.save_all_workers is True or self.worker == self.chief_worker: if self.writer is None or only_initialize_if_missing is False: self.writer = FileWriter( trial_dir=self.out_dir, step=self.step, worker=self.worker ) elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: if len(self.device_map): for device, device_string in self.device_map.items(): if device_string in self.writer_map and only_initialize_if_missing is True: continue if self.save_all_workers is True or device == self.chief_worker: self.writer_map[device_string] = FileWriter( trial_dir=self.out_dir, step=self.step, worker=device_string ) else: # training on CPU when all device strings have cpu if self.writer is None or only_initialize_if_missing is False: self.writer = FileWriter( trial_dir=self.out_dir, step=self.step, worker=self.worker ) elif self.distribution_strategy == TFDistributionStrategy.NONE: if self.writer is None or only_initialize_if_missing is False: self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker) else: raise NotImplementedError
def _initialize_writers(self, only_initialize_if_missing=False) -> None: # Function is overridden in smdebug/tensorflow/base_hook.py if only_initialize_if_missing and self.writer: return if self.dry_run: return if self.first_process is False: return elif self.first_process is None: if self._get_num_workers() == 1: if is_first_process(self.out_dir): self.first_process = True self.logger.info( f"Hook is writing from the hook with pid: {os.getpid()}\n" ) else: if self.first_process is None: self.logger.warn( f"Unsupported Distributed Training Strategy Detected. \ Sagemaker-Debugger will only write from one process. \ The process with pid: {os.getpid()} will not be writing any data. \n" ) self.first_process = False return if self.save_all_workers is False: if self.worker != self.chief_worker: return self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker)
def _initialize_writers(self) -> None: if self.dry_run: return if self.save_all_workers is False: if self.worker != self.chief_worker: return self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker)
def test_string():
    """Round-trip a string ndarray through FileWriter / FileReader."""
    with FileWriter(trial_dir="/tmp/ts_output/my_trial", step=20, worker="algo-1") as fw:
        s_written = np.array(["foo", "barz"])
        # Plain string: the original used an f-string with no placeholders
        # (F541); the runtime value is identical.
        fw.write_tensor(tdata=s_written, tname="foo_string")
        fname = fw.name()
    fr = FileReader(fname=fname)
    read = list(fr.read_tensors())
    assert len(read) == 1
    # read_tensors yields (name, step, data, ...) tuples; index 2 is the data.
    s_read = np.array(read[0][2])
    assert np.all(s_written == s_read)
def _initialize_writers(self, only_initialize_if_missing=False) -> None: # In keras, sometimes we are not sure if writer is initialized # (such as metrics at end of epoch), that's why it passes the flag only_init_if_missing if self.dry_run: return if (self.save_all_workers is False and self.distribution_strategy != TFDistributionStrategy.MIRRORED_STRATEGY): """ If include_workers is False, we assign we check if the hook has been created by the chief worker. If not we do not initialize a writer. """ if self.chief_worker != self.worker: return if (len(self.device_map) and self.distribution_strategy != TFDistributionStrategy.PARAMETER_SERVER_STRATEGY): """ Initialize one writer per device string If save_all_workers is False, we only initialize a writer for the chief worker """ for device, device_string in self.device_map.items(): if device_string in self.writer_map and only_initialize_if_missing is True: continue if self.save_all_workers is True: self.writer_map[device_string] = FileWriter( trial_dir=self.out_dir, step=self.step, worker=device_string) elif self.save_all_workers is False and device == self.chief_worker: self.writer_map[device_string] = FileWriter( trial_dir=self.out_dir, step=self.step, worker=device_string) else: if self.writer is None or only_initialize_if_missing is False: self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker)
def test_mode_writing():
    """Write alternating TRAIN/EVAL steps and verify modes on read-back."""
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    worker = socket.gethostname()
    trial_root = "/tmp/ts_outputs/" + run_id
    for step in range(0, 10):
        writer = FileWriter(trial_dir=trial_root, step=step, worker=worker)
        # Even global steps are TRAIN, odd ones are EVAL.
        current_mode = ModeKeys.TRAIN if step % 2 == 0 else ModeKeys.EVAL
        writer.write_tensor(
            tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
            tname="arr",
            mode=current_mode,
            mode_step=step // 2,
        )
        writer.close()
    write_dummy_collection_file(trial_root)
    event_files = glob.glob(trial_root + "/**/*.tfevents", recursive=True)
    global_steps = []
    train_steps = []
    eval_steps = []
    for event_file in event_files:
        reader = FileReader(fname=event_file)
        for tensor_name, step, tensor_data, mode, mode_step in reader.read_tensors():
            if step % 2 == 0:
                assert mode == ModeKeys.TRAIN
                train_steps.append(step // 2)
            else:
                assert mode == ModeKeys.EVAL
                eval_steps.append(step // 2)
            assert mode_step == step // 2
            global_steps.append(step)
    trial = create_trial(trial_root)
    assert trial.steps() == sorted(global_steps)
    assert trial.steps(ModeKeys.TRAIN) == sorted(train_steps)
    assert trial.steps(ModeKeys.EVAL) == sorted(eval_steps)
    shutil.rmtree(trial_root)
def test_index():
    """Verify the index file's (start_idx, length) entries locate each tensor
    record in the event file: re-read each byte span through FileReader and
    compare against the tensors that were written."""
    numpy_tensor = [
        np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
        np.array([[1.0, 2.0, 4.0], [3.0, 4.0, 5.0]], dtype=np.float32),
    ]
    runid = "default"
    logdir = "."
    step = 0
    worker = "worker_0"
    run_dir = os.path.join(logdir, runid)
    writer = FileWriter(trial_dir=run_dir, step=step, worker=worker, verbose=True)
    for i in range(len(numpy_tensor)):
        writer.write_tensor(tdata=numpy_tensor[i], tname="tensor" + str(i))
    writer.flush()
    writer.close()
    efl = TensorFileLocation(step_num=step, worker_name=worker)
    eventfile = efl.get_file_location(trial_dir=run_dir)
    indexfile = IndexFileLocationUtils.get_index_key_for_step(run_dir, step, worker)
    # Context managers so the files are closed even if an assertion fails.
    with open(indexfile) as idx_file:
        index_data = json.load(idx_file)
    tensor_payload = index_data["tensor_payload"]
    with open(eventfile, "rb") as fo:
        for i, tensor in enumerate(tensor_payload):
            fo.seek(int(tensor["start_idx"]), 0)
            line = fo.read(int(tensor["length"]))
            with open("test.txt", "wb") as zoo:
                zoo.write(line)
            testfile_reader = FileReader("./test.txt")
            tensor_values = list(testfile_reader.read_tensors())
            # Compare the arrays themselves. The old form compared
            # `a.all()` to `b.all()` — two booleans — which passes for any
            # pair of all-nonzero tensors regardless of their values.
            assert np.allclose(tensor_values[0][2], numpy_tensor[i]), "indexwriter not working"
    shutil.rmtree(run_dir)
    os.remove("test.txt")
def rw(path):
    """Checks that we can save data and read it back the way it was."""
    with FileWriter(trial_dir=path + "/my_trial", step=20, worker="algo-1") as fw:
        for idx in range(10):
            tensor = np.ones(shape=(4, 4), dtype=np.float32) * idx
            fw.write_tensor(tdata=tensor, tname=f"foo_{idx}")
        fname = fw.name()
    fr = FileReader(fname=fname)
    for idx, record in enumerate(fr.read_tensors()):
        # read_tensors yields (name, step, data) when read_data is True;
        # tensor idx was written as ones * idx, so its data must equal idx.
        print(idx, record)
        assert np.all(record[2] == idx)
def test_mode_data():
    """Write arr_1 on TRAIN steps and arr_2 on EVAL steps, then check that
    the trial reports tensor names correctly per step and per mode."""
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = "/tmp/ts_outputs/" + run_id
    c = CollectionManager()
    c.add("default")
    # Register both tensors in one assignment. Previously tensor_names was
    # assigned ["arr_1"] and then immediately ["arr_2"], so the second
    # assignment overwrote the first and dropped arr_1 from the collection.
    c.get("default").tensor_names = ["arr_1", "arr_2"]
    c.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)
    trial = create_trial(trial_dir)
    worker = socket.gethostname()
    for s in range(0, 10):
        fw = FileWriter(trial_dir=trial_dir, step=s, worker=worker)
        if s % 2 == 0:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr_1",
                mode=modes.TRAIN,
                mode_step=s // 2,
            )
        else:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr_2",
                mode=modes.EVAL,
                mode_step=s // 2,
            )
        fw.close()
    assert trial.tensor_names() == ["arr_1", "arr_2"]
    assert trial.tensor_names(step=0) == ["arr_1"]
    assert trial.tensor_names(step=1) == ["arr_2"]
    assert trial.tensor_names(step=0, mode=modes.TRAIN) == ["arr_1"]
    assert trial.tensor_names(step=0, mode=modes.EVAL) == ["arr_2"]
    assert trial.tensor_names(mode=modes.TRAIN) == ["arr_1"]
    assert trial.tensor_names(mode=modes.EVAL) == ["arr_2"]
def _maybe_get_tb_writer(self) -> Optional[FileWriter]:
    """Returns a FileWriter object if `hook.tensorboard_dir` has been
    specified, else None.

    Creates a writer if it does not exist for the current mode.
    """
    if not self.tensorboard_dir:
        return None
    if self.mode in self.tb_writers:
        existing = self.tb_writers[self.mode]
        # The entry would have been populated when set_mode was called.
        assert existing is not None
        return existing
    writer = FileWriter(
        trial_dir=self.tensorboard_dir,
        step=self.step,
        worker=get_tb_worker(),
        write_checksum=True,
        wtype="tensorboard",
        mode=self.mode,
    )
    self.tb_writers[self.mode] = writer
    return writer
def generate_data(
    path,
    trial,
    step,
    tname_prefix,
    num_tensors,
    worker,
    shape,
    dtype=np.float32,
    rank=None,
    mode=None,
    mode_step=None,
    export_colls=True,
    data=None,
):
    """Write `num_tensors` tensors of the same value for one step of a trial,
    optionally exporting a collections file listing them.

    :param data: tensor value to write; defaults to ones(shape) * step.
    :param rank: unused here; kept so existing callers remain compatible.
    """
    if data is None:
        # Hoisted out of the write loop: the default value is the same for
        # every tensor, so there is no need to re-check `data is None` (and
        # recompute nothing) on each iteration.
        data = np.ones(shape=shape, dtype=dtype) * step
    with FileWriter(trial_dir=os.path.join(path, trial), step=step, worker=worker) as fw:
        for i in range(num_tensors):
            fw.write_tensor(tdata=data, tname=f"{tname_prefix}_{i}", mode=mode, mode_step=mode_step)
    if export_colls:
        c = CollectionManager()
        c.add("default")
        c.get("default").tensor_names = [f"{tname_prefix}_{i}" for i in range(num_tensors)]
        c.add("gradients")
        c.get("gradients").tensor_names = [f"{tname_prefix}_{i}" for i in range(num_tensors)]
        c.export(os.path.join(path, trial), DEFAULT_COLLECTIONS_FILE_NAME)
def test_mode_data():
    """End-to-end check of mode-aware step bookkeeping: write ten global
    steps alternating TRAIN/EVAL and verify steps, modes, and values."""
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = "/tmp/ts_outputs/" + run_id
    coll_manager = CollectionManager()
    coll_manager.add("default")
    coll_manager.get("default").tensor_names = ["arr"]
    coll_manager.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)
    tr = create_trial(trial_dir)
    worker = socket.gethostname()
    payload = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
    for s in range(0, 10):
        fw = FileWriter(trial_dir=trial_dir, step=s, worker=worker)
        # Even global steps are TRAIN, odd ones are EVAL.
        current_mode = modes.TRAIN if s % 2 == 0 else modes.EVAL
        fw.write_tensor(tdata=payload, tname="arr", mode=current_mode, mode_step=s // 2)
        fw.close()
        if s % 2 == 0:
            assert tr.has_passed_step(s // 2, mode=modes.TRAIN) == StepState.AVAILABLE
            assert tr.has_passed_step(s // 2, mode=modes.EVAL) == StepState.NOT_YET_AVAILABLE
        else:
            assert tr.has_passed_step(s // 2, mode=modes.EVAL) == StepState.AVAILABLE
        assert tr.has_passed_step(s) == StepState.AVAILABLE
        assert tr.has_passed_step(s + 1) == StepState.NOT_YET_AVAILABLE
        assert tr.has_passed_step(s + 1, mode=modes.TRAIN) == StepState.NOT_YET_AVAILABLE
    assert len(tr.tensor_names()) == 1
    assert len(tr.steps()) == 10
    assert len(tr.steps(mode=modes.TRAIN)) == 5
    assert len(tr.steps(mode=modes.EVAL)) == 5
    assert len(tr.modes()) == 2
    for g in range(10):
        expected_mode = modes.TRAIN if g % 2 == 0 else modes.EVAL
        assert tr.mode(g) == expected_mode
        assert tr.mode_step(g) == g // 2
    for m in range(5):
        assert tr.global_step(modes.TRAIN, m) == 2 * m
        assert tr.global_step(modes.EVAL, m) == 2 * m + 1
    arr_tensor = tr.tensor("arr")
    assert len(arr_tensor.steps()) == 10
    assert len(arr_tensor.steps(mode=modes.TRAIN)) == 5
    assert len(arr_tensor.steps(mode=modes.EVAL)) == 5
    for g in range(10):
        assert arr_tensor.value(g) is not None
        if g < 5:
            assert arr_tensor.value(g, mode=modes.TRAIN) is not None
            assert arr_tensor.value(g, mode=modes.EVAL) is not None
    shutil.rmtree(trial_dir)