Example #1
    def _initialize_writers(self, only_initialize_if_missing=False) -> None:
        # In Keras we sometimes cannot tell whether the writer has already been
        # initialized (e.g. metrics at the end of an epoch), so callers pass
        # only_initialize_if_missing to avoid re-creating it.
        if self.dry_run:
            return

        if self.distribution_strategy in [
            TFDistributionStrategy.PARAMETER_SERVER,
            TFDistributionStrategy.HOROVOD,
        ]:
            if self.save_all_workers is True or self.worker == self.chief_worker:
                if self.writer is None or only_initialize_if_missing is False:
                    self.writer = FileWriter(
                        trial_dir=self.out_dir, step=self.step, worker=self.worker
                    )
        elif self.distribution_strategy == TFDistributionStrategy.MIRRORED:
            if self.device_map:
                for device, device_string in self.device_map.items():
                    if device_string in self.writer_map and only_initialize_if_missing is True:
                        continue
                    if self.save_all_workers is True or device == self.chief_worker:
                        self.writer_map[device_string] = FileWriter(
                            trial_dir=self.out_dir, step=self.step, worker=device_string
                        )
            else:
                # Training on CPU: the device map is empty when all device strings are CPU devices.
                if self.writer is None or only_initialize_if_missing is False:
                    self.writer = FileWriter(
                        trial_dir=self.out_dir, step=self.step, worker=self.worker
                    )
        elif self.distribution_strategy == TFDistributionStrategy.NONE:
            if self.writer is None or only_initialize_if_missing is False:
                self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker)
        else:
            raise NotImplementedError
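
Every branch above applies the same gating rule: a worker gets a writer only when save_all_workers is set or the worker is the chief. A minimal sketch of that rule, independent of smdebug (the helper name is hypothetical):

# Hypothetical helper (not smdebug API) distilling the writer-gating rule.
def should_create_writer(save_all_workers: bool, worker: str, chief_worker: str) -> bool:
    return save_all_workers or worker == chief_worker

assert should_create_writer(True, "worker_1", "worker_0") is True
assert should_create_writer(False, "worker_1", "worker_0") is False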
Example #2
    def _initialize_writers(self, only_initialize_if_missing=False) -> None:
        # Function is overridden in smdebug/tensorflow/base_hook.py
        if only_initialize_if_missing and self.writer:
            return
        if self.dry_run:
            return
        if self.first_process is False:
            return
        elif self.first_process is None:
            if self._get_num_workers() == 1:
                if is_first_process(self.out_dir):
                    self.first_process = True
                    self.logger.info(
                        f"Hook is writing from the hook with pid: {os.getpid()}\n"
                    )
                else:
                    # first_process is still None in this branch: warn once and
                    # mark this process as a non-writer.
                    self.logger.warn(
                        f"Unsupported Distributed Training Strategy Detected. \
                        Sagemaker-Debugger will only write from one process. \
                        The process with pid: {os.getpid()} will not be writing any data. \n"
                    )
                    self.first_process = False
                    return

        if self.save_all_workers is False:
            if self.worker != self.chief_worker:
                return

        self.writer = FileWriter(trial_dir=self.out_dir,
                                 step=self.step,
                                 worker=self.worker)
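
Here is_first_process(self.out_dir) elects a single writing process when the strategy is unknown. Purely as a hypothetical illustration (not smdebug's actual implementation), such a check can be built on atomic file creation:

import os

def is_first_process_sketch(out_dir: str) -> bool:
    # The first process to atomically create a claim file in out_dir wins.
    os.makedirs(out_dir, exist_ok=True)
    try:
        fd = os.open(os.path.join(out_dir, ".claim"), os.O_CREAT | os.O_EXCL)
        os.close(fd)
        return True
    except FileExistsError:
        return False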
Example #3
    def _initialize_writers(self) -> None:
        if self.dry_run:
            return
        if self.save_all_workers is False:
            if self.worker != self.chief_worker:
                return
        self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker)
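
For context, a writer initialized this way is driven end to end as in the tests below; a minimal sketch using only FileWriter/FileReader calls that appear in these examples (the /tmp/fw_demo path is arbitrary):

import numpy as np

with FileWriter(trial_dir="/tmp/fw_demo/my_trial", step=0, worker="algo-1") as fw:
    fw.write_tensor(tdata=np.zeros((2, 2), dtype=np.float32), tname="demo")
    fname = fw.name()  # path of the event file just written

fr = FileReader(fname=fname)
for ts in fr.read_tensors():
    # ts is (tensor_name, step, tensor_data, ...); the data round-trips intact
    assert np.all(ts[2] == 0)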
Example #4
def test_string():
    with FileWriter(trial_dir="/tmp/ts_output/my_trial", step=20, worker="algo-1") as fw:
        s_written = np.array(["foo", "barz"])
        fw.write_tensor(tdata=s_written, tname="foo_string")
        fname = fw.name()
    fr = FileReader(fname=fname)
    read = list(fr.read_tensors())
    assert len(read) == 1
    s_read = np.array(read[0][2])
    assert np.all(s_written == s_read)
Example #5
    def _initialize_writers(self, only_initialize_if_missing=False) -> None:
        # In Keras we sometimes cannot tell whether the writer has already been
        # initialized (e.g. metrics at the end of an epoch), so callers pass
        # only_initialize_if_missing to avoid re-creating it.

        if self.dry_run:
            return

        if (self.save_all_workers is False and self.distribution_strategy !=
                TFDistributionStrategy.MIRRORED_STRATEGY):
            """
            If include_workers is False, we assign we check if the hook has been created by
            the chief worker. If not we do not initialize a writer.
            """
            if self.chief_worker != self.worker:
                return

        if (self.device_map and self.distribution_strategy !=
                TFDistributionStrategy.PARAMETER_SERVER_STRATEGY):
            """
                Initialize one writer per device string
                If save_all_workers is False, we only initialize a writer
                for the chief worker
            """
            for device, device_string in self.device_map.items():
                if device_string in self.writer_map and only_initialize_if_missing is True:
                    continue
                if self.save_all_workers or device == self.chief_worker:
                    self.writer_map[device_string] = FileWriter(
                        trial_dir=self.out_dir,
                        step=self.step,
                        worker=device_string)
        else:
            if self.writer is None or only_initialize_if_missing is False:
                self.writer = FileWriter(trial_dir=self.out_dir,
                                         step=self.step,
                                         worker=self.worker)
Example #6
def test_mode_writing():
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    worker = socket.gethostname()
    for s in range(0, 10):
        fw = FileWriter(trial_dir="/tmp/ts_outputs/" + run_id,
                        step=s,
                        worker=worker)
        if s % 2 == 0:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=ModeKeys.TRAIN,
                mode_step=s // 2,
            )
        else:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=ModeKeys.EVAL,
                mode_step=s // 2,
            )
        fw.close()
    write_dummy_collection_file("/tmp/ts_outputs/" + run_id)
    files = glob.glob("/tmp/ts_outputs/" + run_id + "/**/*.tfevents",
                      recursive=True)

    global_steps = []
    train_steps = []
    eval_steps = []
    for f in files:
        fr = FileReader(fname=f)
        for tu in fr.read_tensors():
            tensor_name, step, tensor_data, mode, mode_step = tu
            if step % 2 == 0:
                assert mode == ModeKeys.TRAIN
                train_steps.append(step // 2)
            else:
                assert mode == ModeKeys.EVAL
                eval_steps.append(step // 2)
            assert mode_step == step // 2
            global_steps.append(step)

    trial = create_trial("/tmp/ts_outputs/" + run_id)
    assert trial.steps() == sorted(global_steps)
    assert trial.steps(ModeKeys.TRAIN) == sorted(train_steps)
    assert trial.steps(ModeKeys.EVAL) == sorted(eval_steps)
    shutil.rmtree("/tmp/ts_outputs/" + run_id)
Example #7
def test_index():
    numpy_tensor = [
        np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
        np.array([[1.0, 2.0, 4.0], [3.0, 4.0, 5.0]], dtype=np.float32),
    ]
    runid = "default"
    logdir = "."
    step = 0
    worker = "worker_0"
    run_dir = os.path.join(logdir, runid)
    writer = FileWriter(trial_dir=run_dir,
                        step=step,
                        worker=worker,
                        verbose=True)
    for i in range(len(numpy_tensor)):
        n = "tensor" + str(i)
        writer.write_tensor(tdata=numpy_tensor[i], tname=n)
    writer.flush()
    writer.close()
    efl = TensorFileLocation(step_num=step, worker_name=worker)
    eventfile = efl.get_file_location(trial_dir=run_dir)
    indexfile = IndexFileLocationUtils.get_index_key_for_step(
        run_dir, step, worker)

    with open(eventfile, "rb") as fo, open(indexfile) as idx_file:
        index_data = json.load(idx_file)
        tensor_payload = index_data["tensor_payload"]
        for i, tensor in enumerate(tensor_payload):
            # Each index entry records the byte offset and length of one
            # serialized tensor record inside the event file.
            start_idx = int(tensor["start_idx"])
            length = int(tensor["length"])
            fo.seek(start_idx, 0)
            line = fo.read(length)
            with open("test.txt", "wb") as zoo:
                zoo.write(line)
            testfile_reader = FileReader("./test.txt")
            tensor_values = list(testfile_reader.read_tensors())
            # Element-wise comparison of the read-back tensor with the source.
            assert np.allclose(tensor_values[0][2],
                               numpy_tensor[i]), "indexwriter not working"

    shutil.rmtree(run_dir)
    os.remove("test.txt")
Example #8
def rw(path):
    """
    Checks that we can save data and read it back the way it was
    """
    with FileWriter(trial_dir=path + "/my_trial", step=20, worker="algo-1") as fw:
        for i in range(10):
            data = np.ones(shape=(4, 4), dtype=np.float32) * i
            fw.write_tensor(tdata=data, tname=f"foo_{i}")
        fname = fw.name()

    fr = FileReader(fname=fname)
    for i, ts in enumerate(fr.read_tensors()):
        """
        read_data returns name, step and data (if read_data==True)
        """
        print(i, ts)
        assert np.all(ts[2] == i)
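
A possible invocation (the path is arbitrary; the cleanup just keeps repeated runs independent):

import shutil

rw("/tmp/rw_demo")
shutil.rmtree("/tmp/rw_demo")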
Example #9
def test_mode_data():
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = "/tmp/ts_outputs/" + run_id

    c = CollectionManager()
    c.add("default")
    c.get("default").tensor_names = ["arr_1"]
    c.get("default").tensor_names = ["arr_2"]
    c.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)
    trial = create_trial(trial_dir)
    worker = socket.gethostname()
    for s in range(0, 10):
        fw = FileWriter(trial_dir=trial_dir, step=s, worker=worker)
        if s % 2 == 0:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr_1",
                mode=modes.TRAIN,
                mode_step=s // 2,
            )
        else:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr_2",
                mode=modes.EVAL,
                mode_step=s // 2,
            )
        fw.close()

    assert trial.tensor_names() == ["arr_1", "arr_2"]
    assert trial.tensor_names(step=0) == ["arr_1"]
    assert trial.tensor_names(step=1) == ["arr_2"]
    assert trial.tensor_names(step=0, mode=modes.TRAIN) == ["arr_1"]
    assert trial.tensor_names(step=0, mode=modes.EVAL) == ["arr_2"]

    assert trial.tensor_names(mode=modes.TRAIN) == ["arr_1"]
    assert trial.tensor_names(mode=modes.EVAL) == ["arr_2"]
Example #10
    def _maybe_get_tb_writer(self) -> Optional[FileWriter]:
        """ Returns a FileWriter object if `hook.tensorboard_dir` has been specified, else None.

        Creates a writer if does not exist.
        """
        if not self.tensorboard_dir:
            return None

        if self.mode in self.tb_writers:
            # A writer for this mode already exists (created when set_mode was called).
            assert self.tb_writers[self.mode] is not None
            return self.tb_writers[self.mode]
        else:
            self.tb_writers[self.mode] = FileWriter(
                trial_dir=self.tensorboard_dir,
                step=self.step,
                worker=get_tb_worker(),
                write_checksum=True,
                wtype="tensorboard",
                mode=self.mode,
            )
            return self.tb_writers[self.mode]
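
The method keeps one writer per mode, so repeated calls within a mode return the same object. A hypothetical distillation of that caching pattern (not smdebug API):

tb_writers = {}

def get_writer(mode, make_writer):
    # Create the writer lazily on first use, then reuse it for this mode.
    if mode not in tb_writers:
        tb_writers[mode] = make_writer()
    return tb_writers[mode]

w1 = get_writer("TRAIN", object)
w2 = get_writer("TRAIN", object)
assert w1 is w2  # same cached writer for the same mode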
Example #11
def generate_data(
    path,
    trial,
    step,
    tname_prefix,
    num_tensors,
    worker,
    shape,
    dtype=np.float32,
    rank=None,
    mode=None,
    mode_step=None,
    export_colls=True,
    data=None,
):
    with FileWriter(trial_dir=os.path.join(path, trial),
                    step=step,
                    worker=worker) as fw:
        for i in range(num_tensors):
            if data is None:
                data = np.ones(shape=shape, dtype=dtype) * step
            fw.write_tensor(tdata=data,
                            tname=f"{tname_prefix}_{i}",
                            mode=mode,
                            mode_step=mode_step)
    if export_colls:
        c = CollectionManager()
        c.add("default")
        c.get("default").tensor_names = [
            f"{tname_prefix}_{i}" for i in range(num_tensors)
        ]
        c.add("gradients")
        c.get("gradients").tensor_names = [
            f"{tname_prefix}_{i}" for i in range(num_tensors)
        ]
        c.export(os.path.join(path, trial), DEFAULT_COLLECTIONS_FILE_NAME)
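
A possible call, matching the signature above (paths and names are arbitrary); exporting the collections file is what lets create_trial, used in the other examples, discover the tensors:

generate_data(
    path="/tmp/gen_demo",
    trial="my_trial",
    step=0,
    tname_prefix="foo",
    num_tensors=3,
    worker="algo-1",
    shape=(4, 4),
)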
Example #12
def test_mode_data():
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = "/tmp/ts_outputs/" + run_id

    c = CollectionManager()
    c.add("default")
    c.get("default").tensor_names = ["arr"]
    c.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)
    tr = create_trial(trial_dir)
    worker = socket.gethostname()
    for s in range(0, 10):
        fw = FileWriter(trial_dir=trial_dir, step=s, worker=worker)
        if s % 2 == 0:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=modes.TRAIN,
                mode_step=s // 2,
            )
        else:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=modes.EVAL,
                mode_step=s // 2,
            )
        fw.close()

        if s % 2 == 0:
            assert tr.has_passed_step(s // 2,
                                      mode=modes.TRAIN) == StepState.AVAILABLE
            assert tr.has_passed_step(
                s // 2, mode=modes.EVAL) == StepState.NOT_YET_AVAILABLE
        else:
            assert tr.has_passed_step(s // 2,
                                      mode=modes.EVAL) == StepState.AVAILABLE

        assert tr.has_passed_step(s) == StepState.AVAILABLE
        assert tr.has_passed_step(s + 1) == StepState.NOT_YET_AVAILABLE
        assert tr.has_passed_step(
            s + 1, mode=modes.TRAIN) == StepState.NOT_YET_AVAILABLE

    assert len(tr.tensor_names()) == 1
    assert len(tr.steps()) == 10
    assert len(tr.steps(mode=modes.TRAIN)) == 5
    assert len(tr.steps(mode=modes.EVAL)) == 5
    assert len(tr.modes()) == 2

    for i in range(10):
        if i % 2 == 0:
            assert tr.mode(i) == modes.TRAIN
        else:
            assert tr.mode(i) == modes.EVAL
        assert tr.mode_step(i) == i // 2

    for i in range(5):
        assert tr.global_step(modes.TRAIN, i) == (i * 2)
        assert tr.global_step(modes.EVAL, i) == (i * 2) + 1

    assert len(tr.tensor("arr").steps()) == 10
    assert len(tr.tensor("arr").steps(mode=modes.TRAIN)) == 5
    assert len(tr.tensor("arr").steps(mode=modes.EVAL)) == 5

    for i in range(10):
        assert tr.tensor("arr").value(i) is not None
        if i < 5:
            assert tr.tensor("arr").value(i, mode=modes.TRAIN) is not None
            assert tr.tensor("arr").value(i, mode=modes.EVAL) is not None

    shutil.rmtree("/tmp/ts_outputs/" + run_id)