class HDFSTensorboardManager(base.TensorboardManager):
    """
    Store tfevents files to HDFS.
    """

    @util.preserve_random_state
    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.hdfs_url = hdfs_url
        self.hdfs_path = hdfs_path
        self.user = user

        self.client = InsecureClient(self.hdfs_url, root=self.hdfs_path, user=self.user)
        self.client.makedirs(str(self.sync_path))

    @util.preserve_random_state
    def sync(self) -> None:
        for path in self.to_sync():
            file_name = str(self.sync_path.joinpath(path.name))
            logging.debug(f"Uploading {path} to {self.hdfs_path}")
            self.client.upload(file_name, str(path))

    def delete(self) -> None:
        self.client.delete(str(self.sync_path), recursive=True)
class HDFSStorageManager(StorageManager):
    """
    Store and load checkpoints from HDFS.
    """

    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        temp_dir: Optional[str] = None,
    ) -> None:
        super().__init__(temp_dir if temp_dir is not None else tempfile.gettempdir())
        self.hdfs_url = hdfs_url
        self.hdfs_path = hdfs_path
        self.user = user

        self.client = InsecureClient(self.hdfs_url, root=self.hdfs_path, user=self.user)

    def post_store_path(self, storage_id: str, storage_dir: str, metadata: StorageMetadata) -> None:
        """post_store_path uploads the checkpoint to HDFS and deletes the original files."""
        try:
            logging.info("Uploading storage {} to HDFS".format(storage_id))
            result = self.client.upload(metadata.storage_id, storage_dir)
            logging.info("Uploaded storage {} to HDFS path {}".format(storage_id, result))
        finally:
            self._remove_checkpoint_directory(metadata.storage_id)

    @contextlib.contextmanager
    def restore_path(self, metadata: StorageMetadata) -> Iterator[str]:
        logging.info("Downloading storage {} from HDFS".format(metadata.storage_id))
        self.client.download(metadata.storage_id, self._base_path, overwrite=True)
        try:
            yield os.path.join(self._base_path, metadata.storage_id)
        finally:
            self._remove_checkpoint_directory(metadata.storage_id)

    def delete(self, metadata: StorageMetadata) -> None:
        logging.info("Deleting storage {} from HDFS".format(metadata.storage_id))
        self.client.delete(metadata.storage_id, recursive=True)
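# For reference, a minimal sketch of the underlying hdfs.InsecureClient calls
# that the manager above wraps. The namenode URL and paths below are
# illustrative placeholders, not values taken from the class.
from hdfs import InsecureClient

client = InsecureClient("http://namenode.example.com:50070", root="/checkpoints")
client.upload("ckpt-1", "/tmp/ckpt-1")                     # (hdfs_path, local_path)
client.download("ckpt-1", "/tmp/restore", overwrite=True)  # (hdfs_path, local_path)
client.delete("ckpt-1", recursive=True)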
import os
import pathlib
import shutil

from hdfs import HdfsError, InsecureClient


def run_remove_files(configData: ConfigData):
    f_date_str = configData.get_f_date()  # StrTool.get_the_date_str(the_date, delta_day)  # "20181101"
    data_path = os.path.join(configData.get_data_path(), f_date_str)  # allinpay_data_bl
    utf8_path = os.path.join(configData.get_utf8_path(), f_date_str)  # allinpay_utf8_bl
    hdfs_path = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))  # hdfs_dir_bl

    a_client = InsecureClient(configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"

    # Remove the local copies, ignoring directories that are already gone.
    shutil.rmtree(data_path, ignore_errors=True)
    shutil.rmtree(utf8_path, ignore_errors=True)

    # Remove the HDFS copy; tolerate a missing path rather than swallowing all errors.
    try:
        a_client.delete(hdfs_path, recursive=True)
    except HdfsError:
        pass
class HDFSTensorboardManager(base.TensorboardManager):
    """
    Store tfevents files to HDFS.
    """

    @util.preserve_random_state
    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.hdfs_url = hdfs_url
        self.hdfs_path = hdfs_path
        self.user = user

        self.client = InsecureClient(self.hdfs_url, root=self.hdfs_path, user=self.user)
        self.client.makedirs(str(self.sync_path))

    @util.preserve_random_state
    def sync(
        self,
        selector: Callable[[pathlib.Path], bool] = lambda _: True,
        mangler: Callable[[pathlib.Path, int], pathlib.Path] = lambda p, __: p,
        rank: int = 0,
    ) -> None:
        for path in self.to_sync(selector):
            relative_path = path.relative_to(self.base_path)
            mangled_relative_path = mangler(relative_path, rank)
            mangled_path = self.sync_path.joinpath(mangled_relative_path)
            file_name = str(mangled_path)

            logging.debug(f"Uploading {path} to {self.hdfs_path}")
            self.client.upload(file_name, str(path))

    def delete(self) -> None:
        self.client.delete(str(self.sync_path), recursive=True)
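# A small illustration of the sync() hooks above. The mangler below is a
# hypothetical example, not part of the class: it prefixes each synced file
# with the worker rank so files from different workers do not collide.
def rank_mangler(p: pathlib.Path, rank: int) -> pathlib.Path:
    return p.with_name(f"rank{rank}.{p.name}")

# manager.sync(mangler=rank_mangler, rank=2) would then upload a local
# "events.out.tfevents..." file as "rank2.events.out.tfevents...".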
class HDFSStorageManager(storage.CloudStorageManager):
    """
    Store and load checkpoints from HDFS.
    """

    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        temp_dir: Optional[str] = None,
    ) -> None:
        super().__init__(temp_dir if temp_dir is not None else tempfile.gettempdir())
        self.hdfs_url = hdfs_url
        self.hdfs_path = hdfs_path
        self.user = user

        self.client = InsecureClient(self.hdfs_url, root=self.hdfs_path, user=self.user)

    @util.preserve_random_state
    def upload(self, src: Union[str, os.PathLike], dst: str) -> None:
        src = os.fspath(src)
        logging.info(f"Uploading to HDFS: {dst}")
        self.client.upload(dst, src)

    @util.preserve_random_state
    def download(self, src: str, dst: Union[str, os.PathLike]) -> None:
        dst = os.fspath(dst)
        logging.info(f"Downloading {src} from HDFS")
        self.client.download(src, dst, overwrite=True)

    @util.preserve_random_state
    def delete(self, tgt: str) -> None:
        logging.info(f"Deleting {tgt} from HDFS")
        self.client.delete(tgt, recursive=True)
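# A minimal usage sketch for the manager above; the namenode URL and the
# paths are illustrative placeholders.
manager = HDFSStorageManager(
    hdfs_url="http://namenode.example.com:50070",
    hdfs_path="/checkpoints",
)
manager.upload("/tmp/ckpt-1", "ckpt-1")      # local dir -> HDFS "ckpt-1"
manager.download("ckpt-1", "/tmp/restored")  # HDFS -> local dir
manager.delete("ckpt-1")                     # remove the HDFS copy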
import datetime

from hdfs import HdfsError
from hdfs.client import Client, InsecureClient

# client = Client("http://master.hadoop:50070")
client = InsecureClient("http://master.hadoop:50070", user="******")

# print(dir(client))
# print(client.list("/user"))
# print(client.makedirs("/tmp/test"))

client.delete("/user/spider", recursive=True)

# current_date = datetime.datetime.now().strftime("%Y%m%d")
# hdfs_dir = "/user/a/b/c/"
# hdfs_path = hdfs_dir + current_date
# data = "\ntest"
# try:
#     client.write(hdfs_path=hdfs_path, data=data, append=True)
# except HdfsError as e:
#     client.write(hdfs_path=hdfs_path, data=data)
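# A runnable Python 3 sketch of the commented-out append pattern above: try
# to append to a dated file, and fall back to creating it if it does not
# exist yet. The directory "/user/a/b/c/" is kept from the original comments.
current_date = datetime.datetime.now().strftime("%Y%m%d")
hdfs_path = "/user/a/b/c/" + current_date
data = "\ntest"
try:
    client.write(hdfs_path, data=data, append=True, encoding="utf-8")
except HdfsError:
    client.write(hdfs_path, data=data, encoding="utf-8")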