def _upload_tensorboard_on_hdfs(local_dir: str, hdfs_dir: str) -> None:
    resolved_fs, _ = filesystem.resolve_filesystem_and_path(hdfs_dir)
    if not resolved_fs.exists(hdfs_dir):
        resolved_fs.mkdir(hdfs_dir)
    for f in os.listdir(local_dir):
        hdfs_file_path = os.path.join(hdfs_dir, f)
        local_file_path = os.path.join(local_dir, f)
        resolved_fs.put(local_file_path, hdfs_file_path)

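# Hypothetical usage sketch (not part of the source): mirror a local
# TensorBoard run directory to HDFS. Both paths below are made up.
_upload_tensorboard_on_hdfs(
    local_dir="/tmp/tensorboard_logs",
    hdfs_dir="hdfs:///user/alice/tensorboard_logs")
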
def test_readlines(size, expected_lines):
    with tempfile.TemporaryDirectory() as temp_dir:
        file = _create_temp_file(temp_dir)
        resolved_fs, path = filesystem.resolve_filesystem_and_path(file)
        with resolved_fs.open(file, "rb") as fs_file:
            lines = fs_file.readlines(size)
            assert lines == expected_lines

def test_file_as_lines_list():
    with tempfile.TemporaryDirectory() as temp_dir:
        file = _create_temp_file(temp_dir)
        resolved_fs, path = filesystem.resolve_filesystem_and_path(file)
        with resolved_fs.open(file, "rb") as fs_file:
            lines = list(fs_file)
            assert lines == [b"abcdef\n", b"\n", b"\n",
                             b"123456789\n", b"\n", b"\n"]

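# The tests above depend on a _create_temp_file helper that is not shown, and
# test_readlines presumably carries a pytest.mark.parametrize decorator that
# supplies size and expected_lines. A minimal sketch of the helper, consistent
# with the assertions in test_file_as_lines_list and the two-argument call in
# test_rm below (the default filename is an assumption):
def _create_temp_file(temp_dir: str, filename: str = "file.txt") -> str:
    file = os.path.join(temp_dir, filename)
    with open(file, "wb") as f:
        f.write(b"abcdef\n\n\n123456789\n\n\n")
    return file
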
def load_ckpt(
    model_ckpt_path: str,
    model: Union[DDP, torch.nn.Module],
    optimizer: torch.optim.Optimizer,
    device: Union[int, str]
) -> Dict[Any, Any]:
    resolved_fs, _ = filesystem.resolve_filesystem_and_path(model_ckpt_path)
    _logger.info(f"Loading model checkpoint {model_ckpt_path}")
    with resolved_fs.open(model_ckpt_path, "rb") as fd:
        checkpoint = torch.load(fd, map_location=torch.device(device))
    _unwrap_model(model).load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint

def test_file_iterator():
    with tempfile.TemporaryDirectory() as temp_dir:
        file = _create_temp_file(temp_dir)
        resolved_fs, path = filesystem.resolve_filesystem_and_path(file)
        with resolved_fs.open(file, "rb") as fs_file:
            it = iter(fs_file)
            line = next(it)
            assert line == b"abcdef\n"
            line = next(it)
            assert line == b"\n"

def find_latest_ckpt(model_dir: str) -> Optional[str]:
    latest_ckpt = None
    latest_epoch = -1
    pattern = r".*model_(\d+)\.pt"
    resolved_fs, _ = filesystem.resolve_filesystem_and_path(model_dir)
    if resolved_fs.exists(model_dir):
        for p in resolved_fs.ls(model_dir):
            groups = re.match(pattern, p)
            if groups:
                epoch = int(groups.group(1))
                if epoch > latest_epoch:
                    latest_ckpt = groups.group(0)
                    latest_epoch = epoch
    return latest_ckpt

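# Hypothetical resume-from-checkpoint sketch combining find_latest_ckpt with
# load_ckpt above; model_dir, model, optimizer and device are assumed to be
# defined elsewhere.
latest = find_latest_ckpt(model_dir)
start_epoch = 0
if latest:
    checkpoint = load_ckpt(latest, model, optimizer, device)
    start_epoch = checkpoint['epoch'] + 1
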
def path_to_hdfs():
    file_content = "Hello!"
    path_on_hdfs = f"hdfs:///tmp/{uuid.uuid4()}"
    filepath_on_hdfs = f"{path_on_hdfs}/hello.txt"
    fs, _ = filesystem.resolve_filesystem_and_path(path_on_hdfs)
    with tempfile.TemporaryDirectory() as temp_dir:
        with open(f"{temp_dir}/hello.txt", 'w') as fd:
            fd.write(file_content)
        fs.mkdir(path_on_hdfs)
        fs.put(f"{temp_dir}/hello.txt", filepath_on_hdfs)

    yield filepath_on_hdfs, file_content

    fs.rm(path_on_hdfs, recursive=True)
    fs.close()

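# path_to_hdfs yields once and cleans up afterwards, which is the shape of a
# pytest fixture; a sketch of how it could be wired up and consumed (the
# fixture registration and the test below are assumptions, not from the source):
@pytest.fixture
def hdfs_file():
    yield from path_to_hdfs()


def test_read_back(hdfs_file):
    path, content = hdfs_file
    fs, _ = filesystem.resolve_filesystem_and_path(path)
    with fs.open(path, "rb") as f:
        assert f.read().decode() == content
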
def test_chmod():
    with tempfile.TemporaryDirectory() as temp_dir:
        file = f"{temp_dir}/script.sh"
        with open(file, "wb") as f:
            lines = ("#! /bin/bash\n"
                     "echo 'Hello world'\n")
            f.write(lines.encode())
        fs, _ = filesystem.resolve_filesystem_and_path(file)
        with pytest.raises(PermissionError):
            subprocess.check_output([file])
        fs.chmod(file, 0o755)
        output = subprocess.check_output([file])
        assert "Hello world" in output.decode()

def test_put():
    with tempfile.TemporaryDirectory() as temp_dir:
        file = f"{temp_dir}/script.sh"
        with open(file, "wb") as f:
            lines = ("#! /bin/bash\n"
                     "echo 'Hello world'\n")
            f.write(lines.encode())
        os.chmod(file, 0o755)
        fs, _ = filesystem.resolve_filesystem_and_path(file)
        remote_file = f"{temp_dir}/copied_script.sh"
        fs.put(file, remote_file)
        assert os.path.exists(remote_file)
        assert os.stat(remote_file).st_mode & 0o777 == 0o755

def upload_spec(spec_file: str,
                package_path: str = None,
                force_upload: bool = False,
                fs_args: Dict[str, Any] = {}) -> str:
    """Upload an environment from a spec file.

    :param spec_file: the spec file, must be a requirements.txt or a conda.yaml
    :param package_path: the path where to upload the package
    :param force_upload: whether the cache should be cleared
    :param fs_args: specific arguments for special file systems (like S3)
    :return: package_path
    """
    packer = packaging.detect_packer_from_spec(spec_file)
    if not package_path:
        package_path = (
            f"{packaging.get_default_fs()}/user/{getpass.getuser()}"
            f"/envs/{_unique_filename(spec_file, packer)}")
    elif not package_path.endswith(packer.extension()):
        package_path = os.path.join(package_path,
                                    _unique_filename(spec_file, packer))
    resolved_fs, path = filesystem.resolve_filesystem_and_path(
        package_path, **fs_args)
    spec_hash = _get_hash(spec_file)
    _logger.info(f"Packaging from {spec_file} with hash={spec_hash}")
    reqs = [spec_hash]
    if force_upload or not _is_archive_up_to_date(package_path, reqs, resolved_fs):
        _logger.info(f"Zipping and uploading your env to {package_path}")
        with tempfile.TemporaryDirectory() as tempdir:
            archive_local = packer.pack_from_spec(
                spec_file=spec_file,
                output=f"{tempdir}/{packer.env_name()}.{packer.extension()}")
            parent_dir = os.path.dirname(package_path)
            if not resolved_fs.exists(parent_dir):
                resolved_fs.mkdir(parent_dir)
            resolved_fs.put(archive_local, package_path)
            _dump_archive_metadata(package_path, reqs, resolved_fs)
    else:
        _logger.info(f"{package_path} already exists")
    return package_path

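# Hypothetical call sketch: package the environment described by a
# requirements.txt and upload it under the caller's HDFS home directory (the
# default when package_path is omitted). The spec file path is illustrative.
archive_path = upload_spec("requirements.txt")
_logger.info(f"environment available at {archive_path}")
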
def upload_zip(zip_file: str, package_path: str = None):
    packer = packaging.detect_packer_from_file(zip_file)
    package_path, _, _ = packaging.detect_archive_names(packer, package_path)
    resolved_fs, path = filesystem.resolve_filesystem_and_path(package_path)
    with tempfile.TemporaryDirectory() as tempdir:
        parsed_url = parse.urlparse(zip_file)
        if parsed_url.scheme == "http":
            tmp_zip_file = os.path.join(tempdir,
                                        os.path.basename(parsed_url.path))
            request.urlretrieve(zip_file, tmp_zip_file)
            zip_file = tmp_zip_file
        _upload_zip(zip_file, package_path, resolved_fs)
    return package_path

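# Hypothetical usage sketch: upload_zip accepts a local archive or an http
# URL, which it first downloads into a temporary directory. The URL and the
# target path below are made up.
package = upload_zip("http://example.com/envs/my_env.pex",
                     package_path="hdfs:///tmp/envs")
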
def _submit_and_await_app_master(func, assert_result_status=True,
                                 assert_log_content=None):
    with skein.Client() as client:
        log_output_path = f"hdfs:///tmp/{uuid.uuid4()}.log"
        app_id = skein_launcher.submit_func(
            client,
            func=func,
            args=[],
            memory="2 GiB",
            process_logs=functools.partial(
                skein_launcher.upload_logs_to_hdfs, log_output_path))
        result = skein_launcher.wait_for_finished(client, app_id)
        fs, _ = filesystem.resolve_filesystem_and_path(log_output_path)
        with fs.open(log_output_path, "rb") as f:
            logs = f.read().decode()
        assert result == assert_result_status
        _logger.info(f"appmaster logs:\n{logs}")
        assert assert_log_content in logs

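# Hypothetical test sketch built on _submit_and_await_app_master: submit a
# trivial function and check that its output reached the uploaded logs.
def test_app_master_logs():
    def hello():
        print("hello from the app master")

    _submit_and_await_app_master(
        hello,
        assert_result_status=True,
        assert_log_content="hello from the app master")
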
def upload_env(package_path: str = None,
               packer=None,
               additional_packages: Dict[str, str] = {},
               ignored_packages: Collection[str] = []) -> Tuple[str, str]:
    if packer is None:
        packer = packaging.detect_packer_from_env()
    package_path, env_name, pex_file = packaging.detect_archive_names(
        packer, package_path)
    resolved_fs, path = filesystem.resolve_filesystem_and_path(package_path)
    if not packaging._running_from_pex():
        _upload_env_from_venv(package_path, packer, additional_packages,
                              ignored_packages, resolved_fs)
    else:
        _upload_zip(pex_file, package_path, resolved_fs)
    return (package_path, env_name)

def __init__(self,
             dataset_path: str,
             batch_size: int,
             num_samples: Optional[int] = None,
             columns: List[str] = None) -> None:
    self.fs, _ = resolve_filesystem_and_path(dataset_path)
    self.columns = columns
    self.num_samples = (num_samples if num_samples
                        else _read_num_samples(dataset_path, self.fs))
    self.dataset_file_paths = [
        f for f in self.fs.base_fs.ls(dataset_path) if f.endswith(".parquet")
    ]
    self.batch_size = batch_size
    self.worker_id = dist.get_rank() if dist.is_initialized() else 0
    self.num_workers = dist.get_world_size() if dist.is_initialized() else 1
    logger.info(f"worker_id: {self.worker_id}; num_workers: {self.num_workers}")

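# The __init__ above belongs to a distributed parquet dataset class whose
# name is not shown; assuming a hypothetical ParquetDataset class, a
# construction call would look like this (path and columns are made up):
dataset = ParquetDataset(
    dataset_path="hdfs:///user/alice/dataset",
    batch_size=128,
    columns=["feature_a", "feature_b", "label"])
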
def test_rm():
    with tempfile.TemporaryDirectory() as temp_dir:
        d = os.path.join(temp_dir, "a", "b", "c")
        os.makedirs(d)
        file1 = _create_temp_file(d, "file1.txt")
        file2 = _create_temp_file(d, "file2.txt")
        fs, _ = filesystem.resolve_filesystem_and_path(file1)
        assert fs.exists(file1)
        assert fs.exists(file2)
        assert fs.exists(d)
        fs.rm(file1)
        fs.rm(d, recursive=True)
        assert not fs.exists(file1)
        assert not fs.exists(file2)
        assert not fs.exists(d)

def save_ckpt(
    model_dir: str,
    model: Union[DDP, torch.nn.Module],
    optimizer: torch.optim.Optimizer,
    epoch: int,
    **kwargs: Dict[Any, Any]
) -> Optional[str]:
    if int(os.environ[PYTORCH_DPP_RANK]) != 0:
        return None
    state = {
        'model': _unwrap_model(model).state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
        **kwargs
    }
    resolved_fs, _ = filesystem.resolve_filesystem_and_path(model_dir)
    if not resolved_fs.exists(model_dir):
        resolved_fs.mkdir(model_dir)
    model_ckpt_path = os.path.join(model_dir, f"model_{epoch}.pt")
    with resolved_fs.open(model_ckpt_path, "wb") as fd:
        torch.save(state, fd)
    return model_ckpt_path

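# Hypothetical training-loop sketch around save_ckpt: only the rank-0 worker
# writes a file, the others get None back. num_epochs, train_one_epoch and
# the model directory are assumptions; the extra loss kwarg ends up in the
# checkpoint dict via **kwargs.
for epoch in range(num_epochs):
    loss = train_one_epoch(model, optimizer)
    ckpt_path = save_ckpt("hdfs:///user/alice/models", model, optimizer,
                          epoch, loss=loss)
    if ckpt_path:
        _logger.info(f"saved checkpoint to {ckpt_path}")
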
def main():
    fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    if not fs.exists(WINE_EQUALITY_FILE):
        raise Exception(f"{WINE_EQUALITY_FILE} not found")
    run_on_yarn(
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4,
                               instances=(HVD_SIZE - 1)),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1)
        },
        files={
            os.path.basename(winequality.__file__): winequality.__file__,
            os.path.basename(__file__): __file__,
        },
        custom_task_module="tf_yarn.tensorflow.tasks.gloo_allred_task")

def upload_env(package_path: str = None,
               packer: packaging.Packer = None,
               additional_packages: Dict[str, str] = {},
               ignored_packages: Collection[str] = [],
               force_upload: bool = False,
               include_editable: bool = False,
               fs_args: Dict[str, Any] = {}) -> Tuple[str, str]:
    if packer is None:
        packer = packaging.detect_packer_from_env()
    package_path, env_name, pex_file = packaging.detect_archive_names(
        packer, package_path)
    resolved_fs, path = filesystem.resolve_filesystem_and_path(
        package_path, **fs_args)
    if not packaging._running_from_pex():
        _upload_env_from_venv(package_path, packer, additional_packages,
                              ignored_packages, resolved_fs, force_upload,
                              include_editable)
    else:
        _upload_zip(pex_file, package_path, resolved_fs, force_upload)
    return (package_path, env_name)

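# Hypothetical call sketch for the extended upload_env: force a re-upload and
# include editable installs; with package_path omitted, the archive location
# is derived by packaging.detect_archive_names.
package_path, env_name = upload_env(force_upload=True, include_editable=True)
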
def main():
    fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    if not fs.exists(WINE_EQUALITY_FILE):
        raise Exception(f"{WINE_EQUALITY_FILE} not found")

    # Force a first call to model_to_estimator._save_first_checkpoint (l. 457 in
    # https://github.com/tensorflow/estimator/blob/1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py);
    # otherwise there is a race condition when all workers try to save the
    # first checkpoint at the same time.
    experiment_fn()

    run_on_yarn(
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
            "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1)
        },
        files={
            os.path.basename(winequality.__file__): winequality.__file__,
            os.path.basename(__file__): __file__
        })

    estimator = tf.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=f"{HDFS_DIR}",
        n_classes=winequality.get_n_classes(),
        config=tf.estimator.RunConfig(save_checkpoints_steps=1000))
    return Experiment(
        estimator,
        tf.estimator.TrainSpec(train_input_fn, max_steps=100),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30))


if __name__ == "__main__":
    fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    if not fs.exists(WINE_EQUALITY_FILE):
        raise Exception(f"{WINE_EQUALITY_FILE} not found")

    # MLflow must be installed (`pip install mlflow`) and an MLflow tracking
    # URI must be set for this example to run.
    mlflow.set_tracking_uri(os.getenv("CRITEO_MLFLOW_TRACKING_URI", ""))
    experiment_name = "tf-yarn-tests"
    exp = mlflow.get_experiment_by_name(experiment_name)
    if not exp:
        experiment_id = mlflow.create_experiment(
            experiment_name,
            f"{_get_fs_for_tests()}/user/{USER}/mlflow_artifacts")
    else:
        experiment_id = exp.experiment_id

def _get_num_rows(file: str) -> int:
    fs, _ = resolve_filesystem_and_path(file)
    with fs.base_fs.open(file) as f:
        parquet_file = pq.ParquetFile(f)
        return parquet_file.metadata.num_rows

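# Hypothetical usage sketch: total row count across a parquet dataset,
# reusing the listing pattern from the dataset __init__ above; dataset_path
# is an assumed variable.
fs, _ = resolve_filesystem_and_path(dataset_path)
total_rows = sum(_get_num_rows(f)
                 for f in fs.base_fs.ls(dataset_path)
                 if f.endswith(".parquet"))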