Example #1
import filecmp
import os

from mlflow.projects import databricks
from mlflow.projects.databricks import DatabricksJobRunner
from mlflow.utils import file_utils

# TEST_PROJECT_DIR and the *_mock fixtures are helpers from MLflow's test
# suite; a sketch of the fixtures follows the test below.


def test_upload_project_to_dbfs(dbfs_root_mock, tmpdir, dbfs_path_exists_mock,
                                upload_to_dbfs_mock):  # pylint: disable=unused-argument
    # Upload project to a mock directory
    dbfs_path_exists_mock.return_value = False
    runner = DatabricksJobRunner(databricks_profile="DEFAULT")
    dbfs_uri = runner._upload_project_to_dbfs(project_dir=TEST_PROJECT_DIR,
                                              experiment_id=0)
    # Get expected tar
    local_tar_path = os.path.join(dbfs_root_mock, dbfs_uri.split("/dbfs/")[1])
    expected_tar_path = str(tmpdir.join("expected.tar.gz"))
    file_utils.make_tarfile(output_filename=expected_tar_path,
                            source_dir=TEST_PROJECT_DIR,
                            archive_name=databricks.DB_TARFILE_ARCHIVE_NAME)
    # Compare the uploaded tarball against the expected one byte-for-byte
    assert filecmp.cmp(local_tar_path, expected_tar_path, shallow=False)
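The test assumes pytest fixtures that patch DatabricksJobRunner's DBFS helpers. A minimal sketch of what they might look like, assuming these patch targets and a temp directory standing in for the DBFS root (the real fixtures live in MLflow's test suite and may differ):

import os
import shutil
from unittest import mock

import pytest

from mlflow.projects.databricks import DatabricksJobRunner


@pytest.fixture()
def dbfs_root_mock(tmpdir):
    # Hypothetical: a local temp directory standing in for the DBFS root.
    return str(tmpdir.join("dbfs-root"))


@pytest.fixture()
def dbfs_path_exists_mock():
    # Hypothetical: stub out the REST call that checks for an existing tarball.
    with mock.patch.object(DatabricksJobRunner, "_dbfs_path_exists") as path_exists_mock:
        yield path_exists_mock


@pytest.fixture()
def upload_to_dbfs_mock(dbfs_root_mock):
    # Hypothetical: copy "uploads" into the fake DBFS root instead of calling the API.
    def upload_mock_fn(self, src_path, dbfs_fuse_uri):  # pylint: disable=unused-argument
        mock_dst = os.path.join(dbfs_root_mock, dbfs_fuse_uri.split("/dbfs/")[1])
        os.makedirs(os.path.dirname(mock_dst))
        shutil.copy(src_path, mock_dst)

    with mock.patch.object(DatabricksJobRunner, "_upload_to_dbfs", new=upload_mock_fn):
        yield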
Example #2
import os
import shutil
import tempfile

from mlflow.utils import file_utils

# _GENERATED_DOCKERFILE_NAME and _PROJECT_TAR_ARCHIVE_NAME are module-level
# constants defined in the same MLflow module as this helper.


def _create_docker_build_ctx(work_dir, dockerfile_contents):
    """
    Creates a build-context tarfile containing the Dockerfile and project code, returning
    the path to the tarfile.
    """
    directory = tempfile.mkdtemp()
    try:
        dst_path = os.path.join(directory, "mlflow-project-contents")
        shutil.copytree(src=work_dir, dst=dst_path)
        with open(os.path.join(dst_path, _GENERATED_DOCKERFILE_NAME), "w") as handle:
            handle.write(dockerfile_contents)
        # mkstemp returns an open OS-level file descriptor; close it so the
        # handle isn't leaked (make_tarfile reopens the path itself).
        fd, result_path = tempfile.mkstemp()
        os.close(fd)
        file_utils.make_tarfile(
            output_filename=result_path,
            source_dir=dst_path, archive_name=_PROJECT_TAR_ARCHIVE_NAME)
    finally:
        shutil.rmtree(directory)
    return result_path
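The returned tarball is a self-contained Docker build context, so it can be streamed straight to the Docker daemon. A minimal sketch using the docker SDK for Python; the image tag and the exact dockerfile path inside the archive are illustrative assumptions:

import posixpath

import docker


def build_image_from_ctx(build_ctx_path, image_tag):
    # custom_context=True tells the SDK that fileobj is already a tar archive;
    # dockerfile is the path of the generated Dockerfile inside that archive.
    client = docker.from_env()
    with open(build_ctx_path, "rb") as ctx:
        image, _ = client.images.build(
            fileobj=ctx,
            custom_context=True,
            dockerfile=posixpath.join(_PROJECT_TAR_ARCHIVE_NAME, _GENERATED_DOCKERFILE_NAME),
            tag=image_tag,
        )
    return image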
Example #3
    def _upload_project_to_dbfs(self, project_dir, experiment_id):
        """
        Tars a project directory into an archive in a temp dir and uploads it to DBFS, returning
        the HDFS-style URI of the tarball in DBFS (e.g. dbfs:/path/to/tar).

        :param project_dir: Path to a directory containing an MLflow project to upload to DBFS (e.g.
                            a directory containing an MLproject file).
        """
        temp_tarfile_dir = tempfile.mkdtemp()
        temp_tar_filename = os.path.join(temp_tarfile_dir, "project.tar.gz")

        def custom_filter(x):
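            # Drop the local "mlruns" directory (tracking data) from the tarball.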
            return None if os.path.basename(x.name) == "mlruns" else x

        try:
            directory_size = file_utils._get_local_project_dir_size(project_dir)
            _logger.info(
                f"=== Creating tarball from {project_dir} in temp directory {temp_tarfile_dir} ==="
            )
            _logger.info(f"=== Total file size to compress: {directory_size} KB ===")
            file_utils.make_tarfile(
                temp_tar_filename, project_dir, DB_TARFILE_ARCHIVE_NAME, custom_filter=custom_filter
            )
            with open(temp_tar_filename, "rb") as tarred_project:
                tarfile_hash = hashlib.sha256(tarred_project.read()).hexdigest()
            # TODO: Get subdirectory for experiment from the tracking server
            dbfs_path = posixpath.join(
                DBFS_EXPERIMENT_DIR_BASE,
                str(experiment_id),
                "projects-code",
                "%s.tar.gz" % tarfile_hash,
            )
            tar_size = file_utils._get_local_file_size(temp_tar_filename)
            dbfs_fuse_uri = posixpath.join("/dbfs", dbfs_path)
            if not self._dbfs_path_exists(dbfs_path):
                _logger.info(
                    f"=== Uploading project tarball (size: {tar_size} KB) to {dbfs_fuse_uri} ==="
                )
                self._upload_to_dbfs(temp_tar_filename, dbfs_fuse_uri)
                _logger.info("=== Finished uploading project to %s ===", dbfs_fuse_uri)
            else:
                _logger.info("=== Project already exists in DBFS ===")
        finally:
            shutil.rmtree(temp_tarfile_dir)
        return dbfs_fuse_uri
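The custom_filter hook maps directly onto the filter parameter of TarFile.add, which receives each TarInfo and can return None to omit that entry. A simplified sketch of a make_tarfile helper under that assumption (MLflow's real file_utils.make_tarfile also normalizes archive metadata so the SHA-256 hash above is stable across runs):

import tarfile


def make_tarfile(output_filename, source_dir, archive_name, custom_filter=None):
    # Tar source_dir under the top-level name archive_name; entries for which
    # custom_filter returns None (e.g. "mlruns") are left out of the archive.
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=archive_name, filter=custom_filter)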
Example #4
    def _upload_project_to_dbfs(self, project_dir, experiment_id):
        """
        Tars a project directory into an archive in a temp dir and uploads it to DBFS, returning
        the HDFS-style URI of the tarball in DBFS (e.g. dbfs:/path/to/tar).

        :param project_dir: Path to a directory containing an MLflow project to upload to DBFS (e.g.
                            a directory containing an MLproject file).
        """
        temp_tarfile_dir = tempfile.mkdtemp()
        temp_tar_filename = file_utils.build_path(temp_tarfile_dir,
                                                  "project.tar.gz")

        def custom_filter(x):
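            # Drop the local "mlruns" directory (tracking data) from the tarball.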
            return None if os.path.basename(x.name) == "mlruns" else x

        try:
            file_utils.make_tarfile(temp_tar_filename,
                                    project_dir,
                                    DB_TARFILE_ARCHIVE_NAME,
                                    custom_filter=custom_filter)
            with open(temp_tar_filename, "rb") as tarred_project:
                tarfile_hash = hashlib.sha256(
                    tarred_project.read()).hexdigest()
            # TODO: Get subdirectory for experiment from the tracking server
            dbfs_fuse_uri = os.path.join("/dbfs", DBFS_EXPERIMENT_DIR_BASE,
                                         str(experiment_id), "projects-code",
                                         "%s.tar.gz" % tarfile_hash)
            if not self._dbfs_path_exists(dbfs_fuse_uri):
                self._upload_to_dbfs(temp_tar_filename, dbfs_fuse_uri)
                eprint("=== Finished uploading project to %s ===" %
                       dbfs_fuse_uri)
            else:
                eprint("=== Project already exists in DBFS ===")
        finally:
            shutil.rmtree(temp_tarfile_dir)
        return dbfs_fuse_uri
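The main difference from example #3 is that this older version joins the DBFS path with os.path.join, which yields backslash separators on Windows, whereas the newer version uses posixpath.join so the remote path is slash-separated on every platform. A quick illustration:

import ntpath      # what os.path resolves to on Windows
import posixpath


# On Windows, os.path.join produces a path DBFS cannot resolve:
assert ntpath.join("/dbfs", "a", "b.tar.gz") == "/dbfs\\a\\b.tar.gz"
# posixpath.join produces the same slash-separated path on every platform:
assert posixpath.join("/dbfs", "a", "b.tar.gz") == "/dbfs/a/b.tar.gz"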