Example #1
def uninstall_cli(
        api_client,
        cluster_id,
        all,
        jar,
        egg,
        maven_coordinates,
        maven_repo,  # noqa
        maven_exclusion,
        pypi_package,
        pypi_repo,
        cran_package,
        cran_repo):
    """
    Mark libraries on a cluster to be uninstalled. Libraries which are marked to be uninstalled
    will stay attached until the cluster is restarted. (see `databricks clusters restart -h`).
    """
    if all:
        library_statuses = _cluster_status(api_client, cluster_id).get(
            'library_statuses', [])
        libraries = [l_status['library'] for l_status in library_statuses]
        LibrariesApi(api_client).uninstall_libraries(cluster_id, libraries)
        _uninstall_cli_exit_help(cluster_id)
        return
    library = _get_library_from_options(jar, egg, maven_coordinates,
                                        maven_repo, maven_exclusion,
                                        pypi_package, pypi_repo, cran_package,
                                        cran_repo)
    LibrariesApi(api_client).uninstall_libraries(cluster_id, [library])
    _uninstall_cli_exit_help(cluster_id)
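The docstring above notes that uninstalled libraries stay attached until the cluster restarts. A minimal sketch of the same flow outside the click command, assuming a placeholder workspace URL, token and cluster ID:

from databricks_cli.sdk.api_client import ApiClient
from databricks_cli.clusters.api import ClusterApi
from databricks_cli.libraries.api import LibrariesApi

# Placeholder credentials and cluster ID; substitute real values.
api_client = ApiClient(host="https://<workspace-url>", token="<personal-access-token>")
cluster_id = "0123-456789-abcdef"

# Mark a PyPI library for removal; it stays attached until the cluster restarts.
LibrariesApi(api_client).uninstall_libraries(cluster_id, [{"pypi": {"package": "requests"}}])

# Restart so the uninstall takes effect (what `databricks clusters restart` does).
ClusterApi(api_client).restart_cluster(cluster_id)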
Example #2
def __init__(self, logger, **kwargs):
    """
    :param **kwargs:
        reserved python word for unlimited parameters
        keys should only include: token, host
    :type **kwargs: dict
    """
    self.api_client = ApiClient(**kwargs)
    self.cluster_client = ClusterApi(self.api_client)
    self.libraries_client = LibrariesApi(self.api_client)
    self.logger = logger
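The constructor only expects host and token in **kwargs and forwards them to ApiClient. A hypothetical instantiation, assuming the surrounding class is the ClusterManagement class shown in Example #9:

import logging

# Placeholder workspace URL and token.
logger = logging.getLogger("cluster-management")
manager = ClusterManagement(logger,
                            host="https://<workspace-url>",
                            token="<personal-access-token>")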
Example #3
def install_cli(
        api_client,
        cluster_id,
        jar,
        egg,
        maven_coordinates,
        maven_repo,
        maven_exclusion,  # noqa
        pypi_package,
        pypi_repo,
        cran_package,
        cran_repo):
    """
    Install a library on a cluster. Libraries must be first uploaded to dbfs or s3
    (see `dbfs cp -h`). Unlike the API, only one library can be installed for each execution of
    `databricks libraries install`.

    Users should only provide one of
    [--jar, --egg, --maven-coordinates, --pypi-package, --cran-package].
    """
    library = _get_library_from_options(jar, egg, maven_coordinates,
                                        maven_repo, maven_exclusion,
                                        pypi_package, pypi_repo, cran_package,
                                        cran_repo)
    LibrariesApi(api_client).install_libraries(cluster_id, [library])
Example #4
def get_library_state(profile, cluster_id):
    """Get the state of the library installation on the remote cluster

    Args:
        profile (str): databricks cli config profile; the host and token are read from its config
        cluster_id (str): Cluster ID

    Returns:
        list: list of installation status for each custom library
    """
    try:
        apiclient = connect(profile)
        client = LibrariesApi(apiclient)
        libraries = client.cluster_status(cluster_id)
    except Exception as ex:
        print_error(ex)
        return None

    if libraries.get("library_statuses", None) is None:
        return []
    else:
        return [lib["status"] for lib in libraries["library_statuses"]]
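get_library_state returns one status string per library as reported by the Libraries API (values such as "PENDING", "INSTALLING", "INSTALLED", "FAILED"). A hedged sketch of polling it until installation settles; the profile name, cluster ID, timeout and helper name are assumptions:

import time

def wait_for_libraries(profile, cluster_id, timeout=600, poll_interval=10):
    """Poll get_library_state until every library reports INSTALLED or one fails."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        states = get_library_state(profile, cluster_id) or []
        if states and all(s == "INSTALLED" for s in states):
            return True
        if any(s == "FAILED" for s in states):
            return False
        time.sleep(poll_interval)
    return False

# Example call with placeholder values.
wait_for_libraries("DEFAULT", "0123-456789-abcdef")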
Example #5
def prepare_for_operationalization(cluster_id, api_client, dbfs_path,
                                   overwrite, spark_version):
    """
    Installs appropriate versions of several libraries to support operationalization.

    Args:
        cluster_id (str): cluster_id representing the cluster to prepare for operationalization
        api_client (ApiClient): the ApiClient object used to authenticate to the workspace
        dbfs_path (str): the path on dbfs to upload libraries to
        overwrite (bool): whether to overwrite existing files on dbfs with new files of the same name
        spark_version (str): version of Spark installed on the Databricks cluster

    Returns:
        A dictionary of libraries installed
    """
    print("Preparing for operationlization...")

    cosmosdb_jar_url = COSMOSDB_JAR_FILE_OPTIONS[spark_version]

    # download the cosmosdb jar
    local_jarname = os.path.basename(cosmosdb_jar_url)
    # only download if you need it:
    if overwrite or not os.path.exists(local_jarname):
        print("Downloading {}...".format(cosmosdb_jar_url))
        local_jarname, _ = urlretrieve(cosmosdb_jar_url, local_jarname)
    else:
        print("File {} already downloaded.".format(local_jarname))

    # upload jar to dbfs:
    upload_path = Path(dbfs_path, local_jarname).as_posix()
    print("Uploading CosmosDB driver to databricks at {}".format(upload_path))
    if dbfs_file_exists(api_client, upload_path) and overwrite:
        print("Overwriting file at {}".format(upload_path))
    DbfsApi(api_client).cp(recursive=False,
                           src=local_jarname,
                           dst=upload_path,
                           overwrite=overwrite)

    # setup the list of libraries to install:
    # jar library setup
    libs2install = [{"jar": upload_path}]
    # setup libraries to install:
    libs2install.extend([{"pypi": {"package": i}} for i in PYPI_O16N_LIBS])
    print(
        "Installing jar and pypi libraries required for operationalization...")
    LibrariesApi(api_client).install_libraries(cluster_id, libs2install)
    return libs2install
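prepare_for_operationalization downloads the CosmosDB connector jar, copies it to DBFS, and queues it plus the PYPI_O16N_LIBS packages for installation. A hypothetical call; the cluster ID, DBFS path and the "2.4" key are placeholders, and the key must exist in COSMOSDB_JAR_FILE_OPTIONS:

from databricks_cli.sdk.api_client import ApiClient

api_client = ApiClient(host="https://<workspace-url>", token="<personal-access-token>")
installed = prepare_for_operationalization(
    cluster_id="0123-456789-abcdef",
    api_client=api_client,
    dbfs_path="dbfs:/FileStore/jars",
    overwrite=False,
    spark_version="2.4",
)
# `installed` holds the queued library specs: one jar entry plus the pypi entries.
print(installed)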
Example #6
def install_cli(
        api_client,
        cluster_id,
        jar,
        egg,
        whl,
        maven_coordinates,
        maven_repo,  # noqa
        maven_exclusion,
        pypi_package,
        pypi_repo,
        cran_package,
        cran_repo):
    """
    Install a library on a cluster. Libraries must be first uploaded to dbfs or s3
    (see `dbfs cp -h`). Unlike the API, only one library can be installed for each execution of
    `databricks libraries install`.

    Users should only provide one of
    [--jar, --egg, --whl, --maven-coordinates, --pypi-package, --cran-package].

    Installing a whl library on clusters running Databricks Runtime 4.2 or higher effectively runs
    the pip command against the wheel file directly on driver and executors. The library must satisfy
    the wheel file name convention.
    To install multiple wheel files, use the .wheelhouse.zip file that includes all the wheel files
    with the --whl option.

    Installing a wheel library on clusters running Databricks Runtime lower than 4.2 just adds the
    file to the PYTHONPATH variable, without installing the dependencies.
    More information is available here:
    https://docs.databricks.com/api/latest/libraries.html#managedlibrariesmanagedlibraryserviceinstalllibraries
    """
    library = _get_library_from_options(jar, egg, whl, maven_coordinates,
                                        maven_repo, maven_exclusion,
                                        pypi_package, pypi_repo, cran_package,
                                        cran_repo)
    LibrariesApi(api_client).install_libraries(cluster_id, [library])
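_get_library_from_options is not shown here, but the single-element list passed to install_libraries follows the Libraries API payload format. A sketch of the library spec shapes, with placeholder paths, coordinates and package names:

from databricks_cli.sdk.api_client import ApiClient
from databricks_cli.libraries.api import LibrariesApi

api_client = ApiClient(host="https://<workspace-url>", token="<personal-access-token>")

# One spec per library type; all values below are placeholders.
jar_lib = {"jar": "dbfs:/FileStore/jars/my-lib.jar"}
egg_lib = {"egg": "dbfs:/FileStore/jars/my_lib.egg"}
whl_lib = {"whl": "dbfs:/FileStore/jars/my_lib-0.1.0-py3-none-any.whl"}
pypi_lib = {"pypi": {"package": "simplejson", "repo": "https://my-mirror.example.com/simple"}}
maven_lib = {"maven": {"coordinates": "com.example:my-lib:1.0.0",
                       "exclusions": ["org.slf4j:slf4j-log4j12"]}}
cran_lib = {"cran": {"package": "forecast"}}

LibrariesApi(api_client).install_libraries("0123-456789-abcdef", [whl_lib])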
Example #7
def _cluster_status(api_client, cluster_id):
    click.echo(
        pretty_format(LibrariesApi(api_client).cluster_status(cluster_id)))
Example #8
def _all_cluster_statuses(config):
    click.echo(pretty_format(LibrariesApi(config).all_cluster_statuses()))
Example #9
class ClusterManagement:
    def __init__(self, logger, **kwargs):
        """
        :param **kwargs:
            reserved python word for unlimited parameters
            keys should only include: token, host
        :type **kwargs: dict
        """
        self.api_client = ApiClient(**kwargs)
        self.cluster_client = ClusterApi(self.api_client)
        self.libraries_client = LibrariesApi(self.api_client)
        self.logger = logger

    def create_cluster(self, cluster_specs):
        """function to build/edit cluster and start

        :param cluster_specs: cluster specs in clusterconf.yaml
        :type cluster_specs: dict
        """
        # self.cluster_client.get_cluster_by_name("unknown")

        try:
            cluster = self.cluster_client.get_cluster_by_name(
                cluster_specs["cluster_name"])

            self.logger.info(f"cluster {cluster['cluster_name']} exists "
                             f"with id {cluster['cluster_id']}")
            self.logger.debug(cluster_specs)
            self.logger.debug(cluster)

            if not cluster_specs.items() <= cluster.items():
                self.logger.warning(
                    "cluster spec doesn't match existing cluster")

                cluster_specs['cluster_id'] = cluster['cluster_id']
                self.cluster_client.edit_cluster(cluster_specs)
            else:
                self.logger.info("cluster spec matches")
        except Exception:
            cluster = self.cluster_client.create_cluster(cluster_specs)
            self.logger.info(f"the cluster {cluster} is being created")
            time.sleep(30)

        cluster_id = cluster['cluster_id']
        status = self._cluster_status(cluster_id)

        while status['state'] in ["RESTARTING", "RESIZING", "TERMINATING"]:
            self.logger.info(
                f"waiting for the cluster. status {status['state']}")
            time.sleep(10)
            status = self._cluster_status(cluster_id)

        while status['state'] in ["TERMINATED", "PENDING"]:
            self.logger.info(f"cluster status {status['state']}")
            if status['state'] == "TERMINATED":
                self.logger.info(f"starting cluster, status {status['state']}")
                self.cluster_client.start_cluster(cluster_id)

            time.sleep(10)
            status = self._cluster_status(cluster_id)

        self.logger.info(f"cluster is up. final status: {status['state']}")

        return cluster_id

    def install_cluster_library(self, cluster_id, cluster_libraries):
        """function to install libraries on cluster

        :param cluster_id: id of cluster in Databricks to install libs on
        :type cluster_id: str
        :param cluster_libraries: clusterlib.yaml
        :type cluster_libraries: list(dict)
        """
        try:
            if not isinstance(cluster_libraries, list):
                raise ValueError(
                    f"cluster_libraries is not a list: {cluster_libraries}")

            current_libs = self.libraries_client.cluster_status(cluster_id)

            # parse the libs to match the yaml
            parsed_currentlibs = []
            if current_libs.get("library_statuses"):
                for lib in current_libs["library_statuses"]:
                    parsed_currentlibs.append(lib["library"])

            install_libs = [
                x for x in cluster_libraries if x not in parsed_currentlibs
            ]
            self.logger.info(f"install libraries: {install_libs}")
            self.libraries_client.install_libraries(cluster_id, install_libs)

            uninstall_libs = [
                x for x in parsed_currentlibs if x not in cluster_libraries
            ]
            self.logger.warning(f"uninstall libraries: {uninstall_libs}")
            self.libraries_client.uninstall_libraries(cluster_id,
                                                      uninstall_libs)

        except Exception as error:
            self.logger.error(f"install_cluster_library error: {repr(error)}")

    def _cluster_status(self, cluster_id):
        """internal method to get cluster status

        :param cluster_id: id of databricks cluster
        :type cluster_id: str
        """
        try:
            status = self.cluster_client.get_cluster(cluster_id)
            return status
        except Exception as error:
            self.logger.error(f"cluster status error: {error}")

    def delete_unmanaged_clusters(self, cluster_config):
        """function to delete clusters that are not in clusterconf.yaml

        :param cluster_config: clusterconf.yaml
        :type cluster_config: list(dict)
        """
        existing_clusters = self.cluster_client.list_clusters()
        if existing_clusters.get("clusters"):
            existing_clusters = [
                c for c in existing_clusters.get("clusters")
                if c["cluster_source"].upper() != "JOB"
            ]
        self.logger.debug(existing_clusters)

        cluster_list = [c["cluster_name"] for c in cluster_config]
        remove_cluster = [(c["cluster_name"], c["cluster_id"])
                          for c in existing_clusters
                          if c["cluster_name"] not in cluster_list]

        self.logger.warning("removing unmanaged clusters:")
        self.logger.warning(remove_cluster)

        for c in remove_cluster:
            self.logger.debug(f"deleting {c[1]}")
            self.cluster_client.permanent_delete(c[1])

        return

    def main(self, cluster_specs, cluster_libraries):
        """main method to build/edit clusters and install libs

        :param cluster_specs: cluster specs in clusterconf.yaml
        :type cluster_spec: dict
        :param cluster_libraries: clusterlib.yaml
        :type cluster_libraries: list(dict)
        """
        # self.logger.info("=======================================================")
        self.logger.info(
            f"create/update cluster: {cluster_specs['cluster_name']}")
        cluster_id = self.create_cluster(cluster_specs)

        self.logger.info("installing libraries")
        self.install_cluster_library(cluster_id, cluster_libraries)
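main wires the two steps together: create or edit the cluster from cluster_specs, then reconcile its libraries against cluster_libraries. A hypothetical driver, with the spec and library list standing in for what clusterconf.yaml and clusterlib.yaml would contain:

import logging

logging.basicConfig(level=logging.INFO)
cm = ClusterManagement(logging.getLogger("cluster-management"),
                       host="https://<workspace-url>",
                       token="<personal-access-token>")

# Placeholder cluster spec and library list.
cluster_specs = {
    "cluster_name": "etl-cluster",
    "spark_version": "7.3.x-scala2.12",
    "node_type_id": "Standard_DS3_v2",
    "num_workers": 2,
}
cluster_libraries = [
    {"pypi": {"package": "requests"}},
    {"whl": "dbfs:/FileStore/jars/my_lib-0.1.0-py3-none-any.whl"},
]

cm.main(cluster_specs, cluster_libraries)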
Example #10
                                               status["state"])))
        sys.exit()

    # install the library and its dependencies
    print("Installing the reco_utils module onto databricks cluster {}".format(
        args.cluster_id))
    libs2install = [{"egg": upload_path}]
    # PYPI dependencies:
    libs2install.extend([{"pypi": {"package": i}} for i in PYPI_RECO_LIB_DEPS])

    # add mmlspark if selected.
    if args.mmlspark:
        print("Installing MMLSPARK package...")
        libs2install.extend([MMLSPARK_INFO])
    print(libs2install)
    LibrariesApi(my_api_client).install_libraries(args.cluster_id,
                                                  libs2install)

    # prepare for operationalization if desired:
    if args.prepare_o16n:
        prepare_for_operationalization(
            cluster_id=args.cluster_id,
            api_client=my_api_client,
            dbfs_path=args.dbfs_path,
            overwrite=args.overwrite,
            spark_version=status["spark_version"][0],
        )

    # restart the cluster for new installation(s) to take effect.
    print("Restarting databricks cluster {}".format(args.cluster_id))
    ClusterApi(my_api_client).restart_cluster(args.cluster_id)