Example No. 1
def _upload_tensorboard_on_hdfs(local_dir: str, hdfs_dir: str) -> None:
    resolved_fs, _ = filesystem.resolve_filesystem_and_path(hdfs_dir)
    if not resolved_fs.exists(hdfs_dir):
        resolved_fs.mkdir(hdfs_dir)
    for f in os.listdir(local_dir):
        hdfs_file_path = os.path.join(hdfs_dir, f)
        local_file_path = os.path.join(local_dir, f)
        resolved_fs.put(local_file_path, hdfs_file_path)
Example No. 2
def test_readlines(size, expected_lines):
    with tempfile.TemporaryDirectory() as temp_dir:
        file = _create_temp_file(temp_dir)

        resolved_fs, path = filesystem.resolve_filesystem_and_path(file)

        with resolved_fs.open(file, "rb") as fs_file:
            lines = fs_file.readlines(size)
            assert lines == expected_lines
Example No. 3
def test_file_as_lines_list():
    with tempfile.TemporaryDirectory() as temp_dir:
        file = _create_temp_file(temp_dir)

        resolved_fs, path = filesystem.resolve_filesystem_and_path(file)

        with resolved_fs.open(file, "rb") as fs_file:
            lines = list(fs_file)
            assert lines == [b"abcdef\n", b"\n", b"\n", b"123456789\n", b"\n", b"\n"]
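
These tests rely on a _create_temp_file helper that is not shown. A minimal sketch, assuming the helper writes the byte content implied by the assertions above (the default filename is a guess; Example No. 15 shows it also accepts an explicit name):

def _create_temp_file(temp_dir: str, filename: str = "file.txt") -> str:
    # Hypothetical helper: writes the content the tests above assert on.
    path = os.path.join(temp_dir, filename)
    with open(path, "wb") as f:
        f.write(b"abcdef\n\n\n123456789\n\n\n")
    return path
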
Example No. 4
def load_ckpt(
    model_ckpt_path: str, model: Union[DDP, torch.nn.Module],
    optimizer: torch.optim.Optimizer, device: Union[int, str]
) -> Dict[Any, Any]:
    resolved_fs, _ = filesystem.resolve_filesystem_and_path(model_ckpt_path)
    _logger.info(f"Loading model checkpoint {model_ckpt_path}")
    with resolved_fs.open(model_ckpt_path, "rb") as fd:
        checkpoint = torch.load(fd, map_location=torch.device(device))
    _unwrap_model(model).load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint
Example No. 5
def test_file_iterator():
    with tempfile.TemporaryDirectory() as temp_dir:
        file = _create_temp_file(temp_dir)

        resolved_fs, path = filesystem.resolve_filesystem_and_path(file)

        with resolved_fs.open(file, "rb") as fs_file:
            it = iter(fs_file)
            line = next(it)
            assert line == b"abcdef\n"
            line = next(it)
            assert line == b"\n"
Example No. 6
def find_latest_ckpt(model_dir: str) -> Optional[str]:
    latest_ckpt = None
    latest_epoch = -1
    pattern = r".*model_(\d+).pt"
    resolved_fs, _ = filesystem.resolve_filesystem_and_path(model_dir)
    if resolved_fs.exists(model_dir):
        for p in resolved_fs.ls(model_dir):
            groups = re.match(pattern, p)
            if groups:
                epoch = int(groups.group(1))
                if epoch > latest_epoch:
                    latest_ckpt = groups.group(0)
                    latest_epoch = epoch
    return latest_ckpt
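
Combined with load_ckpt from Example No. 4, this enables resuming from the most recent checkpoint. A minimal sketch, assuming model, optimizer, device, and model_dir are already set up:

latest = find_latest_ckpt(model_dir)
if latest is not None:
    # Restores model and optimizer state in place; extra saved keys
    # (such as the epoch) come back in the returned dict.
    checkpoint = load_ckpt(latest, model, optimizer, device)
    start_epoch = checkpoint['epoch'] + 1
else:
    start_epoch = 0
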
Example No. 7
def path_to_hdfs():
    file_content = "Hello!"
    path_on_hdfs = f"hdfs:///tmp/{uuid.uuid4()}"
    filepath_on_hdfs = f"{path_on_hdfs}/hello.txt"

    fs, _ = filesystem.resolve_filesystem_and_path(path_on_hdfs)

    with tempfile.TemporaryDirectory() as temp_dir:
        with open(f"{temp_dir}/hello.txt", 'w') as fd:
            fd.write(file_content)
        fs.mkdir(path_on_hdfs)
        fs.put(f"{temp_dir}/hello.txt", filepath_on_hdfs)
    yield filepath_on_hdfs, file_content
    fs.rm(path_on_hdfs, recursive=True)
    fs.close()
Example No. 8
def test_chmod():
    with tempfile.TemporaryDirectory() as temp_dir:
        file = f"{temp_dir}/script.sh"
        with open(file, "wb") as f:
            lines = ("#! /bin/bash\n" "echo 'Hello world'\n")
            f.write(lines.encode())

        fs, _ = filesystem.resolve_filesystem_and_path(file)

        with pytest.raises(PermissionError):
            subprocess.check_output([file])
        fs.chmod(file, 0o755)

        output = subprocess.check_output([file])
        assert "Hello world" in output.decode()
Example No. 9
def test_put():
    with tempfile.TemporaryDirectory() as temp_dir:
        file = f"{temp_dir}/script.sh"
        with open(file, "wb") as f:
            lines = ("#! /bin/bash\n" "echo 'Hello world'\n")
            f.write(lines.encode())
        os.chmod(file, 0o755)

        fs, _ = filesystem.resolve_filesystem_and_path(file)

        remote_file = f"{temp_dir}/copied_script.sh"
        fs.put(file, remote_file)

        assert os.path.exists(remote_file)
        assert os.stat(remote_file).st_mode & 0o777 == 0o755
Example No. 10
def upload_spec(spec_file: str,
                package_path: str = None,
                force_upload: bool = False,
                fs_args: Dict[str, Any] = {}) -> str:
    """Upload an environment from a spec file

    :param spec_file: the spec file, must be requirements.txt or conda.yaml
    :param package_path: the path where to upload the package
    :param force_upload: whether the cache should be cleared
    :param fs_args: specific arguments for special file systems (like S3)
    :return: package_path
    """
    packer = packaging.detect_packer_from_spec(spec_file)
    if not package_path:
        package_path = (
            f"{packaging.get_default_fs()}/user/{getpass.getuser()}"
            f"/envs/{_unique_filename(spec_file, packer)}")
    elif not package_path.endswith(packer.extension()):
        package_path = os.path.join(package_path,
                                    _unique_filename(spec_file, packer))

    resolved_fs, path = filesystem.resolve_filesystem_and_path(
        package_path, **fs_args)

    hash = _get_hash(spec_file)
    _logger.info(f"Packaging from {spec_file} with hash={hash}")
    reqs = [hash]

    if force_upload or not _is_archive_up_to_date(package_path, reqs,
                                                  resolved_fs):
        _logger.info(f"Zipping and uploading your env to {package_path}")

        with tempfile.TemporaryDirectory() as tempdir:
            archive_local = packer.pack_from_spec(
                spec_file=spec_file,
                output=f"{tempdir}/{packer.env_name()}.{packer.extension()}")

            dir = os.path.dirname(package_path)
            if not resolved_fs.exists(dir):
                resolved_fs.mkdir(dir)
            resolved_fs.put(archive_local, package_path)

            _dump_archive_metadata(package_path, reqs, resolved_fs)
    else:
        _logger.info(f"{package_path} already exists")

    return package_path
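
A typical call, per the docstring, points at a requirements.txt and lets the function derive the destination; a minimal usage sketch (the spec file name is a placeholder):

# With no package_path given, the default under the user's HDFS home is used.
uploaded_path = upload_spec("requirements.txt")

# Re-running with an unchanged spec hash skips the upload
# unless force_upload=True is passed.
upload_spec("requirements.txt", force_upload=True)
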
Example No. 11
def upload_zip(zip_file: str, package_path: str = None):
    packer = packaging.detect_packer_from_file(zip_file)
    package_path, _, _ = packaging.detect_archive_names(packer, package_path)

    resolved_fs, path = filesystem.resolve_filesystem_and_path(package_path)

    with tempfile.TemporaryDirectory() as tempdir:
        parsed_url = parse.urlparse(zip_file)
        if parsed_url.scheme == "http":
            tmp_zip_file = os.path.join(tempdir,
                                        os.path.basename(parsed_url.path))
            request.urlretrieve(zip_file, tmp_zip_file)
            zip_file = tmp_zip_file

        _upload_zip(zip_file, package_path, resolved_fs)

        return package_path
Example No. 12
def _submit_and_await_app_master(func, assert_result_status=True, assert_log_content=None):
    with skein.Client() as client:
        log_output_path = f"hdfs:///tmp/{uuid.uuid4()}.log"
        app_id = skein_launcher.submit_func(
            client,
            func=func,
            args=[],
            memory="2 GiB",
            process_logs=functools.partial(skein_launcher.upload_logs_to_hdfs, log_output_path))
        result = skein_launcher.wait_for_finished(client, app_id)

        fs, _ = filesystem.resolve_filesystem_and_path(log_output_path)
        with fs.open(log_output_path, "rb") as f:
            logs = f.read().decode()
            assert result == assert_result_status
            _logger.info(f"appmaster logs:\n{logs}")
            assert assert_log_content in logs
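
A minimal sketch of driving this helper, assuming a trivial function to run on the application master:

def _hello():
    print("hello from the app master")

# Submits _hello to YARN, waits for completion, and checks the uploaded logs.
_submit_and_await_app_master(_hello, assert_log_content="hello from the app master")
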
Example No. 13
def upload_env(package_path: str = None,
               packer=None,
               additional_packages: Dict[str, str] = {},
               ignored_packages: Collection[str] = []) -> Tuple[str, str]:
    if packer is None:
        packer = packaging.detect_packer_from_env()
    package_path, env_name, pex_file = packaging.detect_archive_names(
        packer, package_path)

    resolved_fs, path = filesystem.resolve_filesystem_and_path(package_path)

    if not packaging._running_from_pex():
        _upload_env_from_venv(package_path, packer, additional_packages,
                              ignored_packages, resolved_fs)
    else:
        _upload_zip(pex_file, package_path, resolved_fs)

    return (package_path, env_name)
Example No. 14
def __init__(self,
             dataset_path: str,
             batch_size: int,
             num_samples: Optional[int] = None,
             columns: List[str] = None) -> None:
    self.fs, _ = resolve_filesystem_and_path(dataset_path)
    self.columns = columns
    self.num_samples = num_samples if num_samples \
        else _read_num_samples(dataset_path, self.fs)
    self.dataset_file_paths = [
        f for f in self.fs.base_fs.ls(dataset_path)
        if f.endswith(".parquet")
    ]
    self.batch_size = batch_size
    self.worker_id = dist.get_rank() if dist.is_initialized() else 0
    self.num_workers = dist.get_world_size() if dist.is_initialized() else 1
    logger.info(
        f"worker_id: {self.worker_id}; num_workers: {self.num_workers}")
Example No. 15
def test_rm():
    with tempfile.TemporaryDirectory() as temp_dir:
        d = os.path.join(temp_dir, "a", "b", "c")
        os.makedirs(d)
        file1 = _create_temp_file(d, "file1.txt")
        file2 = _create_temp_file(d, "file2.txt")

        fs, _ = filesystem.resolve_filesystem_and_path(file1)

        assert fs.exists(file1)
        assert fs.exists(file2)
        assert fs.exists(d)

        fs.rm(file1)
        fs.rm(d, recursive=True)

        assert not fs.exists(file1)
        assert not fs.exists(file2)
        assert not fs.exists(d)
Example No. 16
def save_ckpt(
    model_dir: str, model: Union[DDP, torch.nn.Module], optimizer: torch.optim.Optimizer,
    epoch: int, **kwargs: Dict[Any, Any]
) -> Optional[str]:
    if int(os.environ[PYTORCH_DPP_RANK]) != 0:
        return None

    state = {
        'model': _unwrap_model(model).state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
        **kwargs
    }
    resolved_fs, _ = filesystem.resolve_filesystem_and_path(model_dir)
    if not resolved_fs.exists(model_dir):
        resolved_fs.mkdir(model_dir)
    model_ckpt_path = os.path.join(model_dir, f"model_{epoch}.pt")
    with resolved_fs.open(model_ckpt_path, "wb") as fd:
        torch.save(state, fd)
    return model_ckpt_path
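
Together with Examples No. 4 and No. 6, this supports a per-epoch checkpointing loop. A minimal sketch (train_one_epoch, start_epoch, and num_epochs are placeholders for the actual training setup):

for epoch in range(start_epoch, num_epochs):
    train_one_epoch(model, optimizer)  # hypothetical training step
    # Rank 0 writes model_<epoch>.pt under model_dir; other ranks get None.
    save_ckpt(model_dir, model, optimizer, epoch)
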
Example No. 17
def main():
    fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    if not fs.exists(WINE_EQUALITY_FILE):
        raise Exception(f"{WINE_EQUALITY_FILE} not found")

    run_on_yarn(experiment_fn,
                task_specs={
                    "chief":
                    TaskSpec(memory="2 GiB", vcores=4),
                    "worker":
                    TaskSpec(memory="2 GiB",
                             vcores=4,
                             instances=(HVD_SIZE - 1)),
                    "evaluator":
                    TaskSpec(memory="2 GiB", vcores=1)
                },
                files={
                    os.path.basename(winequality.__file__):
                    winequality.__file__,
                    os.path.basename(__file__): __file__,
                },
                custom_task_module="tf_yarn.tensorflow.tasks.gloo_allred_task")
Example No. 18
def upload_env(package_path: str = None,
               packer: packaging.Packer = None,
               additional_packages: Dict[str, str] = {},
               ignored_packages: Collection[str] = [],
               force_upload: bool = False,
               include_editable: bool = False,
               fs_args: Dict[str, Any] = {}) -> Tuple[str, str]:
    if packer is None:
        packer = packaging.detect_packer_from_env()
    package_path, env_name, pex_file = packaging.detect_archive_names(
        packer, package_path)

    resolved_fs, path = filesystem.resolve_filesystem_and_path(
        package_path, **fs_args)

    if not packaging._running_from_pex():
        _upload_env_from_venv(package_path, packer, additional_packages,
                              ignored_packages, resolved_fs, force_upload,
                              include_editable)
    else:
        _upload_zip(pex_file, package_path, resolved_fs, force_upload)

    return (package_path, env_name)
Example No. 19
def main():
    fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    if not fs.exists(WINE_EQUALITY_FILE):
        raise Exception(f"{WINE_EQUALITY_FILE} not found")

    # forcing call to model_to_estimator._save_first_checkpoint l457
    # https://github.com/tensorflow/estimator/blob/ \
    # 1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py
    # otherwise there is a race condition
    # when all workers try to save the first checkpoint at the same time
    experiment_fn()

    run_on_yarn(experiment_fn,
                task_specs={
                    "chief": TaskSpec(memory="2 GiB", vcores=4),
                    "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
                    "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
                    "evaluator": TaskSpec(memory="2 GiB", vcores=1)
                },
                files={
                    os.path.basename(winequality.__file__):
                    winequality.__file__,
                    os.path.basename(__file__): __file__
                })
Example No. 20
    estimator = tf.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=f"{HDFS_DIR}",
        n_classes=winequality.get_n_classes(),
        config=tf.estimator.RunConfig(save_checkpoints_steps=1000, ))
    return Experiment(
        estimator, tf.estimator.TrainSpec(train_input_fn, max_steps=100),
        tf.estimator.EvalSpec(eval_input_fn,
                              steps=10,
                              start_delay_secs=0,
                              throttle_secs=30))


if __name__ == "__main__":
    fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    if not fs.exists(WINE_EQUALITY_FILE):
        raise Exception(f"{WINE_EQUALITY_FILE} not found")

    # you need to install mlflow `pip install mlflow`
    # and set MLflow tracking uri
    mlflow.set_tracking_uri(os.getenv("CRITEO_MLFLOW_TRACKING_URI", ""))

    experiment_name = "tf-yarn-tests"
    exp = mlflow.get_experiment_by_name(experiment_name)
    if not exp:
        experiment_id = mlflow.create_experiment(
            experiment_name,
            f"{_get_fs_for_tests()}/user/{USER}/mlflow_artifacts")
    else:
        experiment_id = exp.experiment_id
Example No. 21
def _get_num_rows(file: str) -> int:
    fs, _ = resolve_filesystem_and_path(file)
    with fs.base_fs.open(file) as f:
        parquet_file = pq.ParquetFile(f)
        return parquet_file.metadata.num_rows
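
Example No. 14 calls an unshown _read_num_samples; given _get_num_rows here, a plausible composition is to sum the Parquet row counts across the dataset's files (this is an assumption, not the library's confirmed implementation):

def _read_num_samples(dataset_path: str, fs) -> int:
    # Assumed helper: totals Parquet metadata row counts for the dataset.
    return sum(
        _get_num_rows(f)
        for f in fs.base_fs.ls(dataset_path)
        if f.endswith(".parquet")
    )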