예제 #1
0
 def write_checkpoint(self, checkpoint: Dict):
     """Transfer a reported checkpoint into a Tune checkpoint directory.

     The node IP and on-node path of the checkpoint data are read from
     ``checkpoint``. When the data already sits on this node its contents
     are moved into the Tune checkpoint directory; otherwise the directory
     is synced from the source node. In both cases the source directory is
     removed afterwards. The preprocessor and the current checkpoint id are
     then written into the Tune checkpoint directory.

     Args:
         checkpoint: Mapping holding ``NODE_IP_KEY`` and
             ``CHECKPOINT_PATH_ON_NODE_KEY`` entries.
     """
     # If inside a Tune Trainable, checkpointing is delegated to Tune.
     with tune.checkpoint_dir(
             step=self._latest_checkpoint_id) as checkpoint_dir:
         source_ip = checkpoint[NODE_IP_KEY]
         source_path = checkpoint[CHECKPOINT_PATH_ON_NODE_KEY]
         target_ip = get_node_ip_address()
         if source_ip != target_ip:
             # Data lives on a different node: sync it here, then drop
             # the copy left on the source node.
             sync_dir_between_nodes(
                 source_ip=source_ip,
                 source_path=source_path,
                 target_ip=target_ip,
                 target_path=checkpoint_dir,
                 return_futures=False,
                 max_size_bytes=None,
             )
             delete_on_node(node_ip=source_ip, path=source_path)
         else:
             # Same node: relocate every entry inside source_path (not
             # source_path itself); shutil.move recurses on its own.
             for entry in Path(source_path).iterdir():
                 shutil.move(str(entry.absolute()), checkpoint_dir)
             shutil.rmtree(source_path, ignore_errors=True)
         tune_dir = Path(checkpoint_dir)
         save_preprocessor_to_dir(self.preprocessor, tune_dir)
         # Record the Tune checkpoint id next to the checkpoint data.
         id_file = tune_dir / TUNE_CHECKPOINT_ID
         with open(id_file, "w") as f:
             f.write(str(self._latest_checkpoint_id))
예제 #2
0
    def commit(self, path: Optional[Path] = None) -> None:
        """Materialize the tracked checkpoint data at ``path``.

        Nothing happens for in-memory checkpoints, when no ``path`` is
        given, or when ``self.dir_or_data`` is not a dict. Otherwise the
        checkpoint directory described by the dict is moved (same node) or
        synced (other node) into ``path`` and the source directory is
        removed. The preprocessor stored under ``PREPROCESSOR_KEY`` is
        popped from the dict and saved to ``path``, and the checkpoint id
        is written to the ``TUNE_CHECKPOINT_ID`` file.

        Args:
            path: Destination directory for the checkpoint contents.
        """
        if self.storage_mode == CheckpointStorage.MEMORY:
            return
        if not path:
            return
        if not isinstance(self.dir_or_data, dict):
            return

        source_ip = self.dir_or_data[NODE_IP_KEY]
        source_path = self.dir_or_data[CHECKPOINT_PATH_ON_NODE_KEY]
        target_ip = get_node_ip_address()
        destination = str(path)

        if source_ip != target_ip:
            # Checkpoint sits on another node: sync it over and clean up
            # the remote copy.
            sync_dir_between_nodes(
                source_ip=source_ip,
                source_path=source_path,
                target_ip=target_ip,
                target_path=destination,
                return_futures=False,
                max_size_bytes=None,
            )
            delete_on_node(node_ip=source_ip, path=source_path)
        else:
            # Same node: move the entries of source_path rather than the
            # directory itself; shutil.move already handles recursion.
            for entry in Path(source_path).iterdir():
                shutil.move(str(entry.absolute()), destination)
            shutil.rmtree(source_path, ignore_errors=True)

        preprocessor = self.dir_or_data.pop(PREPROCESSOR_KEY, None)
        save_preprocessor_to_dir(preprocessor, path)
        # Record the Tune checkpoint id next to the checkpoint data.
        id_file = path / TUNE_CHECKPOINT_ID
        with open(id_file, "w") as f:
            f.write(str(self.id))
예제 #3
0
    def _testSyncInNodeAndDelete(self, num_workers: int = 1):
        """Sync a directory tree on the local node twice, then delete it.

        A small tree is created in a temp directory, synced "up" into a
        second directory and from there "down" into a third one, using
        ``num_workers`` concurrent sync calls each time. Finally the
        intermediate directory is deleted via ``delete_on_node``.

        Args:
            num_workers: Number of sync calls launched per direction.
        """
        temp_source = tempfile.mkdtemp()
        temp_up_target = tempfile.mkdtemp()
        temp_down_target = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, temp_source)
        # The up target is deleted by the test itself, so tolerate absence.
        self.addCleanup(shutil.rmtree, temp_up_target, ignore_errors=True)
        self.addCleanup(shutil.rmtree, temp_down_target)

        level0 = os.path.join(temp_source, "dir_level0")
        os.makedirs(os.path.join(level0, "dir_level1"))
        with open(os.path.join(level0, "file_level1.txt"), "w") as f:
            f.write("Data\n")

        # Sanity check on the freshly created tree
        self._check_dir_contents(temp_source)
        node_ip = ray.util.get_node_ip_address()

        def launch_syncs(source: str, target: str):
            # One future per worker, all syncing the same source and target.
            return [
                _sync_dir_on_same_node(
                    ip=node_ip,
                    source_path=source,
                    target_path=target,
                    return_futures=True,
                ) for _ in range(num_workers)
            ]

        # Sync up and verify
        ray.get(launch_syncs(temp_source, temp_up_target))
        self._check_dir_contents(temp_up_target)

        assert os.listdir(temp_down_target) == []

        # Sync down and verify
        ray.get(launch_syncs(temp_up_target, temp_down_target))
        self._check_dir_contents(temp_down_target)

        # Delete the intermediate directory and confirm it is gone
        delete_on_node(node_ip=node_ip, path=temp_up_target)

        assert not os.path.exists(temp_up_target)
예제 #4
0
    def delete(self, target: str):
        """Best-effort removal of ``target`` on the last known target node.

        The node IP is taken from the first element of
        ``self._last_target_tuple``. If no such tuple is recorded, or if
        the deletion raises, a warning is logged and nothing is raised.

        Args:
            target: Path to delete on the node.
        """
        last_target = self._last_target_tuple
        if last_target:
            node_ip = last_target[0]
            try:
                delete_on_node(node_ip=node_ip, path=target)
            except Exception as e:
                # Deletion is only cleanup, so report the failure and move on.
                failure = (
                    f"Could not delete path {target} on remote node {node_ip}: {e}"
                )
                logger.warning(failure)
        else:
            unknown_node = (
                f"Could not delete path {target} as the target node is not known."
            )
            logger.warning(unknown_node)
예제 #5
0
def test_delete_on_node(ray_start_2_cpus, temp_data_dirs):
    """Verify that ``delete_on_node`` removes a directory's files."""
    tmp_source, _tmp_target = temp_data_dirs
    tracked_files = ("level0.txt", "subdir/level1.txt")

    # Files are present before the deletion ...
    for relative in tracked_files:
        assert_file(True, tmp_source, relative)

    local_ip = ray.util.get_node_ip_address()
    delete_on_node(node_ip=local_ip, path=tmp_source)

    # ... and gone afterwards.
    for relative in tracked_files:
        assert_file(False, tmp_source, relative)

    # Re-create the directory so the fixture teardown finds it.
    os.makedirs(tmp_source, exist_ok=True)
예제 #6
0
    def testSyncBetweenNodesAndDelete(self):
        """Sync a tree up and down via ``sync_dir_between_nodes``, then delete.

        Source and target IP are both this node's IP. The test also checks
        that a sync with ``max_size_bytes=2`` raises ``RayTaskError`` and
        leaves the target directory empty.
        """
        temp_source = tempfile.mkdtemp()
        temp_up_target = tempfile.mkdtemp()
        temp_down_target = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, temp_source)
        # The up target is deleted by the test itself, so tolerate absence.
        self.addCleanup(shutil.rmtree, temp_up_target, ignore_errors=True)
        self.addCleanup(shutil.rmtree, temp_down_target)

        source_level0 = os.path.join(temp_source, "dir_level0")
        os.makedirs(os.path.join(source_level0, "dir_level1"))
        with open(os.path.join(source_level0, "file_level1.txt"), "w") as f:
            f.write("Data\n")

        def check_dir_contents(path: str):
            # The expected layout is dir_level0/{dir_level1, file_level1.txt}.
            level0 = os.path.join(path, "dir_level0")
            data_file = os.path.join(level0, "file_level1.txt")
            expected_paths = (
                level0,
                os.path.join(level0, "dir_level1"),
                data_file,
            )
            for expected in expected_paths:
                assert os.path.exists(expected)
            with open(data_file, "r") as f:
                assert f.read() == "Data\n"

        # Sanity check
        check_dir_contents(temp_source)

        node_ip = ray.util.get_node_ip_address()

        def sync(source: str, target: str, **options):
            # Both ends of the sync are this node.
            return sync_dir_between_nodes(
                source_ip=node_ip,
                source_path=source,
                target_ip=node_ip,
                target_path=target,
                **options,
            )

        # Sync up and verify
        sync(temp_source, temp_up_target)
        check_dir_contents(temp_up_target)

        # Max size exceeded: the sync must fail ...
        with self.assertRaises(RayTaskError):
            sync(temp_up_target, temp_down_target, max_size_bytes=2)

        # ... without having written anything to the target.
        assert os.listdir(temp_down_target) == []

        # Sync down and verify
        sync(temp_up_target, temp_down_target)
        check_dir_contents(temp_down_target)

        # Delete the intermediate directory and confirm it is gone
        delete_on_node(node_ip=node_ip, path=temp_up_target)

        assert not os.path.exists(temp_up_target)
예제 #7
0
    def _testSyncBetweenNodesAndDelete(self, num_workers: int = 1):
        """Sync a tree up and down with the cross-node routine, then delete.

        ``_sync_dir_between_different_nodes`` is called with this node's IP
        as both source and target, ``num_workers`` times per direction. The
        test also checks that a sync with ``max_size_bytes=2`` raises
        ``RayTaskError`` and leaves the target directory empty.

        Args:
            num_workers: Number of sync calls launched per direction.
        """
        temp_source = tempfile.mkdtemp()
        temp_up_target = tempfile.mkdtemp()
        temp_down_target = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, temp_source)
        # The up target is deleted by the test itself, so tolerate absence.
        self.addCleanup(shutil.rmtree, temp_up_target, ignore_errors=True)
        self.addCleanup(shutil.rmtree, temp_down_target)

        level0 = os.path.join(temp_source, "dir_level0")
        os.makedirs(os.path.join(level0, "dir_level1"))
        with open(os.path.join(level0, "file_level1.txt"), "w") as f:
            f.write("Data\n")

        # Sanity check on the freshly created tree
        self._check_dir_contents(temp_source)
        node_ip = ray.util.get_node_ip_address()

        def launch_syncs(source: str, target: str):
            # Keep the first element of each call's result, one per worker.
            return [
                _sync_dir_between_different_nodes(
                    source_ip=node_ip,
                    source_path=source,
                    target_ip=node_ip,
                    target_path=target,
                    return_futures=True,
                )[0] for _ in range(num_workers)
            ]

        # Sync up and verify
        ray.get(launch_syncs(temp_source, temp_up_target))
        self._check_dir_contents(temp_up_target)

        # Max size exceeded: the sync must fail ...
        with self.assertRaises(RayTaskError):
            _sync_dir_between_different_nodes(
                source_ip=node_ip,
                source_path=temp_up_target,
                target_ip=node_ip,
                target_path=temp_down_target,
                max_size_bytes=2,
            )

        # ... without having written anything to the target.
        assert os.listdir(temp_down_target) == []

        # Sync down and verify
        ray.get(launch_syncs(temp_up_target, temp_down_target))
        self._check_dir_contents(temp_down_target)

        # Delete the intermediate directory and confirm it is gone
        delete_on_node(node_ip=node_ip, path=temp_up_target)

        assert not os.path.exists(temp_up_target)