def write_checkpoint(self, checkpoint: Dict):
    """Persist ``checkpoint`` into a Tune-managed checkpoint directory.

    The checkpoint dict carries the IP of the node that produced the data
    and the path on that node. The data is moved (same node) or synced
    (remote node) into the directory Tune provides, the preprocessor is
    saved alongside it, and the checkpoint id is recorded.

    Args:
        checkpoint: Dict with at least ``NODE_IP_KEY`` and
            ``CHECKPOINT_PATH_ON_NODE_KEY`` entries.
    """
    # If inside a Tune Trainable, then checkpoint with Tune.
    with tune.checkpoint_dir(
            step=self._latest_checkpoint_id) as checkpoint_dir:
        origin_ip = checkpoint[NODE_IP_KEY]
        origin_path = checkpoint[CHECKPOINT_PATH_ON_NODE_KEY]
        local_ip = get_node_ip_address()

        if origin_ip == local_ip:
            # Same node: move the *contents* of origin_path rather than
            # the directory itself; shutil.move is already recursive.
            for entry in Path(origin_path).iterdir():
                shutil.move(str(entry.absolute()), checkpoint_dir)
            shutil.rmtree(origin_path, ignore_errors=True)
        else:
            # Remote node: pull the directory over, then remove the
            # now-redundant copy on the source node.
            sync_dir_between_nodes(
                source_ip=origin_ip,
                source_path=origin_path,
                target_ip=local_ip,
                target_path=checkpoint_dir,
                return_futures=False,
                max_size_bytes=None,
            )
            delete_on_node(node_ip=origin_ip, path=origin_path)

        checkpoint_dir = Path(checkpoint_dir)
        save_preprocessor_to_dir(self.preprocessor, checkpoint_dir)
        # add tune checkpoint id
        with open(checkpoint_dir.joinpath(TUNE_CHECKPOINT_ID), "w") as f:
            f.write(str(self._latest_checkpoint_id))
def commit(self, path: Optional[Path] = None) -> None:
    """Materialize this checkpoint's data at ``path``.

    Does nothing for in-memory checkpoints, when no target path is
    given, or when ``self.dir_or_data`` is not the node-ip/path dict
    produced by a worker. Otherwise the data is moved (same node) or
    synced (remote node) into ``path``, the preprocessor is saved, and
    the checkpoint id is written out.

    Args:
        path: Target directory for the committed checkpoint.
    """
    skip = (
        self.storage_mode == CheckpointStorage.MEMORY
        or not path
        or not isinstance(self.dir_or_data, dict)
    )
    if skip:
        return

    origin_ip = self.dir_or_data[NODE_IP_KEY]
    origin_path = self.dir_or_data[CHECKPOINT_PATH_ON_NODE_KEY]
    local_ip = get_node_ip_address()

    if origin_ip == local_ip:
        # Same node: move the *contents* of origin_path rather than the
        # directory itself; shutil.move is already recursive.
        for entry in Path(origin_path).iterdir():
            shutil.move(str(entry.absolute()), str(path))
        shutil.rmtree(origin_path, ignore_errors=True)
    else:
        # Remote node: pull the directory over, then remove the copy
        # left behind on the source node.
        sync_dir_between_nodes(
            source_ip=origin_ip,
            source_path=origin_path,
            target_ip=local_ip,
            target_path=str(path),
            return_futures=False,
            max_size_bytes=None,
        )
        delete_on_node(node_ip=origin_ip, path=origin_path)

    save_preprocessor_to_dir(
        self.dir_or_data.pop(PREPROCESSOR_KEY, None), path)

    # add tune checkpoint id
    with open(path.joinpath(TUNE_CHECKPOINT_ID), "w") as f:
        f.write(str(self.id))
def _testSyncInNodeAndDelete(self, num_workers: int = 1):
    """Sync a nested directory up and down on the same node, then delete.

    Runs ``num_workers`` concurrent sync tasks for each direction to
    exercise parallel same-node syncing.
    """
    src_dir = tempfile.mkdtemp()
    up_dir = tempfile.mkdtemp()
    down_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, src_dir)
    # up_dir is deleted by the test itself; ignore errors on teardown.
    self.addCleanup(shutil.rmtree, up_dir, ignore_errors=True)
    self.addCleanup(shutil.rmtree, down_dir)

    os.makedirs(os.path.join(src_dir, "dir_level0", "dir_level1"))
    with open(os.path.join(src_dir, "dir_level0", "file_level1.txt"),
              "w") as f:
        f.write("Data\n")

    # Sanity check
    self._check_dir_contents(src_dir)

    node_ip = ray.util.get_node_ip_address()
    ray.get([
        _sync_dir_on_same_node(
            ip=node_ip,
            source_path=src_dir,
            target_path=up_dir,
            return_futures=True,
        ) for _ in range(num_workers)
    ])

    # Check sync up
    self._check_dir_contents(up_dir)

    assert not os.listdir(down_dir)

    ray.get([
        _sync_dir_on_same_node(
            ip=node_ip,
            source_path=up_dir,
            target_path=down_dir,
            return_futures=True,
        ) for _ in range(num_workers)
    ])

    # Check sync down
    self._check_dir_contents(down_dir)

    # Delete in some dir
    delete_on_node(node_ip=node_ip, path=up_dir)
    assert not os.path.exists(up_dir)
def delete(self, target: str):
    """Best-effort deletion of ``target`` on the last known sync target node.

    If no sync has happened yet (so no target node is recorded), or the
    remote deletion fails, a warning is logged and the method returns
    without raising.
    """
    if not self._last_target_tuple:
        logger.warning(
            f"Could not delete path {target} as the target node is not known."
        )
        return

    node_ip = self._last_target_tuple[0]

    try:
        delete_on_node(node_ip=node_ip, path=target)
    except Exception as e:
        # Deliberately broad: deletion is best-effort and must not
        # propagate failures to the caller.
        logger.warning(
            f"Could not delete path {target} on remote node {node_ip}: {e}"
        )
def test_delete_on_node(ray_start_2_cpus, temp_data_dirs):
    """Check that delete on node works."""
    tmp_source, tmp_target = temp_data_dirs

    # Fixture seeds these files; confirm they exist before deleting.
    assert_file(True, tmp_source, "level0.txt")
    assert_file(True, tmp_source, "subdir/level1.txt")

    delete_on_node(
        node_ip=ray.util.get_node_ip_address(),
        path=tmp_source,
    )

    # Everything under tmp_source should be gone.
    assert_file(False, tmp_source, "level0.txt")
    assert_file(False, tmp_source, "subdir/level1.txt")

    # Re-create dir for teardown
    os.makedirs(tmp_source, exist_ok=True)
def testSyncBetweenNodesAndDelete(self):
    """Sync between nodes (same IP here), enforce max size, then delete."""
    src_dir = tempfile.mkdtemp()
    up_dir = tempfile.mkdtemp()
    down_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, src_dir)
    # up_dir is deleted by the test itself; ignore errors on teardown.
    self.addCleanup(shutil.rmtree, up_dir, ignore_errors=True)
    self.addCleanup(shutil.rmtree, down_dir)

    os.makedirs(os.path.join(src_dir, "dir_level0", "dir_level1"))
    with open(os.path.join(src_dir, "dir_level0", "file_level1.txt"),
              "w") as f:
        f.write("Data\n")

    def check_dir_contents(path: str):
        # A correctly synced tree has the nested dir and the data file.
        assert os.path.exists(os.path.join(path, "dir_level0"))
        assert os.path.exists(
            os.path.join(path, "dir_level0", "dir_level1"))
        assert os.path.exists(
            os.path.join(path, "dir_level0", "file_level1.txt"))
        with open(os.path.join(path, "dir_level0", "file_level1.txt"),
                  "r") as f:
            assert f.read() == "Data\n"

    node_ip = ray.util.get_node_ip_address()

    # Sanity check
    check_dir_contents(src_dir)

    sync_dir_between_nodes(
        source_ip=node_ip,
        source_path=src_dir,
        target_ip=node_ip,
        target_path=up_dir,
    )

    # Check sync up
    check_dir_contents(up_dir)

    # Max size exceeded
    with self.assertRaises(RayTaskError):
        sync_dir_between_nodes(
            source_ip=node_ip,
            source_path=up_dir,
            target_ip=node_ip,
            target_path=down_dir,
            max_size_bytes=2,
        )
    assert not os.listdir(down_dir)

    sync_dir_between_nodes(
        source_ip=node_ip,
        source_path=up_dir,
        target_ip=node_ip,
        target_path=down_dir,
    )

    # Check sync down
    check_dir_contents(down_dir)

    # Delete in some dir
    delete_on_node(node_ip=node_ip, path=up_dir)
    assert not os.path.exists(up_dir)
def _testSyncBetweenNodesAndDelete(self, num_workers: int = 1):
    """Sync between "different" nodes (same IP here), then delete.

    Runs ``num_workers`` concurrent sync tasks per direction and also
    verifies that ``max_size_bytes`` is enforced.
    """
    src_dir = tempfile.mkdtemp()
    up_dir = tempfile.mkdtemp()
    down_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, src_dir)
    # up_dir is deleted by the test itself; ignore errors on teardown.
    self.addCleanup(shutil.rmtree, up_dir, ignore_errors=True)
    self.addCleanup(shutil.rmtree, down_dir)

    os.makedirs(os.path.join(src_dir, "dir_level0", "dir_level1"))
    with open(os.path.join(src_dir, "dir_level0", "file_level1.txt"),
              "w") as f:
        f.write("Data\n")

    # Sanity check
    self._check_dir_contents(src_dir)

    node_ip = ray.util.get_node_ip_address()
    ray.get([
        _sync_dir_between_different_nodes(
            source_ip=node_ip,
            source_path=src_dir,
            target_ip=node_ip,
            target_path=up_dir,
            return_futures=True,
        )[0] for _ in range(num_workers)
    ])

    # Check sync up
    self._check_dir_contents(up_dir)

    # Max size exceeded
    with self.assertRaises(RayTaskError):
        _sync_dir_between_different_nodes(
            source_ip=node_ip,
            source_path=up_dir,
            target_ip=node_ip,
            target_path=down_dir,
            max_size_bytes=2,
        )
    assert not os.listdir(down_dir)

    ray.get([
        _sync_dir_between_different_nodes(
            source_ip=node_ip,
            source_path=up_dir,
            target_ip=node_ip,
            target_path=down_dir,
            return_futures=True,
        )[0] for _ in range(num_workers)
    ])

    # Check sync down
    self._check_dir_contents(down_dir)

    # Delete in some dir
    delete_on_node(node_ip=node_ip, path=up_dir)
    assert not os.path.exists(up_dir)