def create(self, group_name):
    """
    Creates a new group

    Parameters
    ----------
    group_name : str
        A group to be created

    Returns
    -------
    dict :
        A Python representation of the JSON returned by the API
    """
    METHOD = 'POST'
    API_PATH = '/groups/create'

    data = {'group_name': group_name}

    # Make REST call
    resp = self._rest_call[METHOD](API_PATH, data=data)

    if resp.status_code == 200:
        return resp.json()
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        raise APIError("Response code {0}: {1} {2}".format(
            resp.status_code,
            resp.json().get('error_code'),
            resp.json().get('message')))
def list_node_types(self):
    """
    List details on all possible node types for Databricks.
    Not all node types will be available for the given subscription.

    :return: List of dicts with information on all possible node types
    """
    METHOD = 'GET'
    API_PATH = 'clusters/list-node-types'

    resp = self._rest_call[METHOD](API_PATH)

    if resp.status_code == 200:
        return resp.json()['node_types']
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            raise ERROR_CODES[resp.json().get('error_code')](
                resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
def list(self):
    """
    Lists all groups in the workspace

    Returns
    -------
    list :
        A list of group_names
    """
    METHOD = 'GET'
    API_PATH = '/groups/list'

    # Make REST call
    resp = self._rest_call[METHOD](API_PATH)

    if resp.status_code == 200:
        return resp.json().get('group_names')
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        raise APIError("Response code {0}: {1} {2}".format(
            resp.status_code,
            resp.json().get('error_code'),
            resp.json().get('message')))
def list_members(self, group_name):
    """
    Lists members of a given group

    Parameters
    ----------
    group_name : str
        A group for which members should be listed

    Returns
    -------
    list :
        A list of Python dict objects (each specifying a user_name or
        group_name)
    """
    METHOD = 'GET'
    API_PATH = '/groups/list-members'

    data = {'group_name': group_name}

    # Make REST call
    resp = self._rest_call[METHOD](API_PATH, data=data)

    if resp.status_code == 200:
        return resp.json().get('members')
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        raise APIError("Response code {0}: {1} {2}".format(
            resp.status_code,
            resp.json().get('error_code'),
            resp.json().get('message')))
def delete(self, group_name):
    """
    Deletes a group

    Parameters
    ----------
    group_name : str
        A group to be deleted

    Returns
    -------
    str :
        The name of the removed group
    """
    METHOD = 'POST'
    API_PATH = '/groups/delete'

    data = {'group_name': group_name}

    # Make REST call
    resp = self._rest_call[METHOD](API_PATH, data=data)

    if resp.status_code == 200:
        return group_name
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        raise APIError("Response code {0}: {1} {2}".format(
            resp.status_code,
            resp.json().get('error_code'),
            resp.json().get('message')))
def all_cluster_statuses(self):
    """
    Returns library status for all clusters.

    Returns
    -------
    dict :
        The libraries installed on all clusters. Format here:
        https://docs.azuredatabricks.net/dev-tools/api/latest/libraries.html#all-cluster-statuses
    """
    METHOD = 'GET'
    API_PATH = '/libraries/all-cluster-statuses'

    # Make REST call
    resp = self._rest_call[METHOD](API_PATH)

    if resp.status_code == 200:
        return resp.json()
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            raise ERROR_CODES[resp.json().get('error_code')](
                resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
def export(self, dbx_path, file_path, file_format='DBC'):
    """
    Exports the Databricks path to a file on the local PC.

    Parameters
    ----------
    dbx_path : str
        The path, in the Databricks workspace, to export
    file_path : str
        The path, on the local PC, where the file should be created
    file_format : str, optional
        The format of the file to be saved. Defaults to DBC.
        Must be one of DBC, SOURCE, HTML, or JUPYTER.

    Returns
    -------
    file_path if successful

    Raises
    ------
    ResourceDoesNotExist:
        If the given Databricks path does not exist
    AuthorizationError:
        If the service returns a 403 status code
    APIError:
        If the status code returned by the service is anything except
        200 and is not captured above
    """
    METHOD = 'GET'
    API_PATH = '/workspace/export'

    if file_format.upper() not in EXPORT_FORMATS:
        raise UnknownFormat(
            '{0} is not a supported format type. Please use DBC, '
            'SOURCE, HTML, or JUPYTER'.format(file_format))

    data = {'path': dbx_path,
            'format': file_format,
            'direct_download': True}

    resp = self._rest_call[METHOD](API_PATH, data=data)

    if resp.status_code == 200:
        # direct_download=True returns the file itself, so write the
        # raw response body to disk
        with open(file_path, 'wb+') as fo:
            fo.write(resp.content)
        return file_path
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    elif resp.json().get("error_code") == "MAX_NOTEBOOK_SIZE_EXCEEDED":
        raise MaxNotebookSizeExceeded(resp.json().get('message'))
    elif resp.json().get("error_code") == "RESOURCE_DOES_NOT_EXIST":
        raise ResourceDoesNotExist(resp.json().get('message'))
    else:
        raise APIError("Response code {0}: {1} {2}".format(
            resp.status_code,
            resp.json().get('error_code'),
            resp.json().get('message')))
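# Usage sketch for export(). The client object below is a hypothetical
# stand-in for however this class is instantiated; adjust to your setup.
#
#   local_copy = client.workspace.export(
#       dbx_path='/Users/someone@example.com/my-notebook',
#       file_path='./my-notebook.dbc',
#       file_format='DBC')
#   # Returns './my-notebook.dbc' on success; raises ResourceDoesNotExist,
#   # AuthorizationError, or APIError otherwise.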
def choose_exception(response: requests.Response) -> Exception:
    """
    Choose the correct exception to raise if the status is not 200

    Parameters
    ----------
    response : requests.Response
        The requests.Response object returned from the API call

    Returns
    -------
    Exception :
        The appropriate exception to raise
    """
    if response.status_code == 403:  # pragma: no cover
        return_error = AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:  # pragma: no cover
        if response.json().get("error_code") in ERROR_CODES:
            return_error = ERROR_CODES[response.json().get('error_code')](
                response.json().get('message'))
        else:
            return_error = APIError("Response code {0}: {1} {2}".format(
                response.status_code,
                response.json().get('error_code'),
                response.json().get('message')))

    return return_error
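# Sketch of the call-site pattern choose_exception() enables, collapsing
# the repeated error branches in the endpoint methods above (hypothetical
# usage, not taken verbatim from this module):
#
#   resp = self._rest_call[METHOD](API_PATH, data=data)
#   if resp.status_code == 200:
#       return resp.json()
#   raise choose_exception(resp)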
def delete(self, path, recursive=False, not_exists_ok=False):
    """
    Deletes the path in the given workspace.

    Parameters
    ----------
    path : str
        The path, in the Databricks workspace, to delete
    recursive : bool, optional
        Recursively delete the given path
    not_exists_ok : bool, optional
        If the given path is not found, avoid raising an error

    Returns
    -------
    path if successfully deleted

    Raises
    ------
    ResourceDoesNotExist:
        If not_exists_ok is set to False and the given path does not
        exist
    AuthorizationError:
        If the service returns a 403 status code
    APIError:
        If the status code returned by the service is anything except
        200 and is not captured above
    """
    METHOD = 'POST'
    API_PATH = '/workspace/delete'

    data = {'path': path, 'recursive': recursive}

    resp = self._rest_call[METHOD](API_PATH, data=data)

    # Process response
    if resp.status_code == 200:
        return path
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            if resp.json().get("error_code") == "RESOURCE_DOES_NOT_EXIST" \
                    and not_exists_ok:
                return path
            else:
                raise ERROR_CODES[resp.json().get('error_code')](
                    resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
def mkdirs(self, path, exists_ok=False):
    """
    Creates the given directory and necessary parent directories if
    they do not exist. If there is an object (not a directory) at any
    prefix of the input path, this call raises the error
    RESOURCE_ALREADY_EXISTS. Note that if this operation fails it may
    have succeeded in creating some of the necessary parent directories.

    Parameters
    ----------
    path : str
        The path, in the Databricks workspace, where a directory should
        be made
    exists_ok : bool, optional
        Suppress the error if a resource already exists at the given
        path

    Returns
    -------
    path : str
        The path that was created

    Raises
    ------
    ResourceAlreadyExists
        If you are trying to create a path that already exists and the
        exists_ok flag is false.
    APIError
        If the Databricks API returned an error
    """
    METHOD = 'POST'
    API_PATH = '/workspace/mkdirs'

    data = {'path': path}

    resp = self._rest_call[METHOD](API_PATH, data=data)

    # Process response
    if resp.status_code == 200:
        return path
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            if resp.json().get("error_code") == "RESOURCE_ALREADY_EXISTS" \
                    and exists_ok:
                return path
            else:
                raise ERROR_CODES[resp.json().get('error_code')](
                    resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
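# Usage sketch (the client instance is hypothetical): exists_ok=True makes
# the call idempotent, mirroring `mkdir -p` semantics:
#
#   client.workspace.mkdirs('/Users/someone@example.com/etl/staging',
#                           exists_ok=True)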
def __send_cluster_id_to_endpoint(self, method, api_path, cluster_name,
                                  cluster_id):
    """
    Private method to send only a cluster id to a given endpoint

    Parameters
    ----------
    method : str
        HTTP POST or GET method
    api_path : str
        API path that the request is sent to
    cluster_name : str, optional
        The name of the cluster.
    cluster_id : str, optional
        The id of the cluster.

    Returns
    -------
    The JSON response for GET requests, otherwise the cluster id

    Raises
    ------
    ValueError
        When neither cluster_name nor cluster_id is passed
    ResourceDoesNotExist
        When a cluster with the given name or id isn't found
    """
    if not (cluster_name or cluster_id):
        raise ValueError(
            "Either cluster_id or cluster_name must be specified")

    if cluster_name and not cluster_id:
        cluster_id = self.get_cluster_id(cluster_name)

    data = {"cluster_id": cluster_id}

    resp = self._rest_call[method](api_path, data=data)

    if resp.status_code == 200 and method == 'GET':
        return resp.json()
    elif resp.status_code == 200:
        return cluster_id
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    elif resp.status_code == 400 \
            and resp.json()['message'] == "Cluster {id} does not exist":
        raise ResourceDoesNotExist(resp.json()['message'])
    else:
        raise APIError("Response code {0}: {1} {2}".format(
            resp.status_code,
            resp.json().get('error_code'),
            resp.json().get('message')))
def add_member(self, parent_group, group_name=None, user_name=None):
    """
    Adds a new member (either user or group) to a given parent group

    Parameters
    ----------
    parent_group : str
        The group to which the new user or group should be added
    group_name : str, optional
        A group to be added to the parent group
    user_name : str, optional
        A user to be added to the parent group

    Returns
    -------
    str :
        The group name or user name added

    Raises
    ------
    ValueError
        If both group_name and user_name are defined, or if neither is
        defined
    """
    METHOD = 'POST'
    API_PATH = '/groups/add-member'

    # Process group_name and user_name and add parent name to the
    # resulting dict
    data, target_name = self.__prep_group_or_user(group_name=group_name,
                                                  user_name=user_name)
    data['parent_name'] = parent_group

    # Make REST call
    resp = self._rest_call[METHOD](API_PATH, data=data)

    if resp.status_code == 200:
        return target_name
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            raise ERROR_CODES[resp.json().get('error_code')](
                resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
def list(self, path):
    """
    Lists the contents of the given directory

    Parameters
    ----------
    path : str
        The path, in the Databricks workspace, whose contents should be
        listed

    Returns
    -------
    List of WorkspaceObjectInfo objects

    Raises
    ------
    AuthorizationError:
        If the service returns a 403 status code
    APIError:
        If the status code returned by the service is anything except
        200 and is not captured above
    """
    METHOD = 'GET'
    API_PATH = '/workspace/list'

    data = {'path': path}

    resp = self._rest_call[METHOD](API_PATH, data=data)

    # Process response
    if resp.status_code == 200:
        if resp.json().get('objects'):
            return [WorkspaceObjectInfo(**obj)
                    for obj in resp.json().get('objects')]
        else:
            return []
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            raise ERROR_CODES[resp.json().get('error_code')](
                resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
def spark_versions(self):
    """
    Lists the Spark versions available in the workspace as a dict
    mapping version key to display name.
    """
    METHOD = 'GET'
    API_PATH = 'clusters/spark-versions'

    resp = self._rest_call[METHOD](API_PATH)

    if resp.status_code == 200:
        return {item['key']: item['name']
                for item in resp.json()['versions']}
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        raise APIError("Response code {0}: {1} {2}".format(
            resp.status_code,
            resp.json().get('error_code'),
            resp.json().get('message')))
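# Illustrative shape of the mapping spark_versions() returns; the actual
# keys and names depend on the workspace:
#
#   {'5.5.x-scala2.11': '5.5 LTS (includes Apache Spark 2.4.3, Scala 2.11)',
#    '6.4.x-scala2.11': '6.4 (includes Apache Spark 2.4.5, Scala 2.11)'}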
def install(self, cluster_id, libraries, wait_for_completion=False,
            timeout=120):
    """
    Installs new libraries on the cluster.

    This is an async call. You can check the status of library
    installation using the 'cluster_status' method.

    Parameters
    ----------
    cluster_id : str
        The ID of the cluster on which to install libraries
    libraries : list
        An array of library specs - see
        https://docs.azuredatabricks.net/dev-tools/api/latest/libraries.html#install

    Returns
    -------
    Cluster library status for the given cluster
    """
    METHOD = 'POST'
    API_PATH = '/libraries/install'

    # Create payload to add libraries
    data = {'cluster_id': cluster_id, 'libraries': libraries}

    # Make REST call
    resp = self._rest_call[METHOD](API_PATH, data=data)

    if resp.status_code == 200:
        return self.cluster_status(cluster_id)
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            raise ERROR_CODES[resp.json().get('error_code')](
                resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
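# Usage sketch: `libraries` follows the Libraries API payload format linked
# in the docstring. The package, jar path, and cluster id are illustrative:
#
#   libraries = [
#       {'pypi': {'package': 'simplejson==3.8.0'}},
#       {'jar': 'dbfs:/mnt/libraries/library.jar'},
#   ]
#   status = client.libraries.install('1234-567890-abcde123', libraries)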
def get_status(self, path):
    """
    Gets the status of a given Databricks path

    Parameters
    ----------
    path : str
        The path, in the Databricks workspace, to get the status of

    Returns
    -------
    WorkspaceObjectInfo with details of the item at the given path

    Raises
    ------
    AuthorizationError:
        If the service returns a 403 status code
    APIError:
        If the status code returned by the service is anything except
        200 and is not captured above
    """
    METHOD = 'GET'
    API_PATH = '/workspace/get-status'

    data = {'path': path}

    resp = self._rest_call[METHOD](API_PATH, data=data)

    # Process response
    if resp.status_code == 200:
        return WorkspaceObjectInfo(**resp.json())
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            raise ERROR_CODES[resp.json().get('error_code')](
                resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
def list_parents(self, group_name=None, user_name=None):
    """
    Lists all parent groups of a given user or group

    Parameters
    ----------
    group_name : str, optional
        The name of a group
    user_name : str, optional
        The name of a user

    Returns
    -------
    list :
        A list of group_names
    """
    METHOD = 'GET'
    API_PATH = '/groups/list-parents'

    # Process group_name and user_name
    data, target_name = self.__prep_group_or_user(group_name=group_name,
                                                  user_name=user_name)

    # Make REST call
    resp = self._rest_call[METHOD](API_PATH, data=data)

    if resp.status_code == 200:
        return resp.json().get('group_names')
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            raise ERROR_CODES[resp.json().get('error_code')](
                resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
def list(self):
    """
    Lists all clusters in the workspace, as returned by the API.
    """
    METHOD = 'GET'
    API_PATH = 'clusters/list'

    resp = self._rest_call[METHOD](API_PATH)

    if resp.status_code == 200:
        return resp.json().get('clusters')
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            raise ERROR_CODES[resp.json().get('error_code')](
                resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
def cluster_status(self, cluster_id):
    """
    Returns library status for a specific cluster.

    Parameters
    ----------
    cluster_id : str
        The cluster ID to query

    Returns
    -------
    dict :
        The libraries installed on this cluster_id. Format here:
        https://docs.azuredatabricks.net/dev-tools/api/latest/libraries.html#cluster-status
    """
    METHOD = 'GET'
    API_PATH = '/libraries/cluster-status'

    data = {'cluster_id': cluster_id}

    # Make REST call
    resp = self._rest_call[METHOD](API_PATH, data=data)

    if resp.status_code == 200:
        return resp.json()
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            raise ERROR_CODES[resp.json().get('error_code')](
                resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
def import_file(self, dbx_path, file_format, language="",
                overwrite=False, url=None, filepath=None):
    """
    Imports a file to the Databricks workspace from a given URL or
    file path

    Parameters
    ----------
    dbx_path : str
        The path, in the Databricks workspace, where the object should
        be created
    file_format : str
        The format of the file imported. Options are SOURCE, HTML,
        JUPYTER, DBC
    language : str, optional
        Required if file_format is set to SOURCE.
        The computer language that the source code is written in.
        Options are SCALA, PYTHON, SQL or R
    overwrite : bool, optional
        Overwrite the Databricks path (not currently supported for DBC)
    url : str, optional
        The url for the file to be imported. Often this is a GitHub
        raw URL.
    filepath : str, optional
        The path on the local PC of the file to be uploaded

    Returns
    -------
    dbx_path if successful

    Raises
    ------
    AttributeError:
        If the requirements for attributes are not met
    MaxNotebookSizeExceeded:
        If imported file size is greater than 10 MB
    ResourceAlreadyExists:
        If overwrite is set to False and there is already an object at
        the given dbx_path
    AuthorizationError:
        If the service returns a 403 status code
    APIError:
        If the status code returned by the service is anything except
        200 and is not captured above
    """
    METHOD = 'POST'
    API_PATH = '/workspace/import'

    # Require url or filepath (url takes precedence if both are given)
    if not (url or filepath):
        raise AttributeError(
            "Must pass either URL or filepath to Workspace Import")
    elif file_format.upper() == 'SOURCE' \
            and language.upper() not in LANGUAGES:
        raise AttributeError(
            "If file_format=SOURCE, language must be SCALA, PYTHON, "
            "SQL or R")
    elif file_format.upper() not in EXPORT_FORMATS:
        raise AttributeError(
            "File format must be SOURCE, DBC, JUPYTER or HTML")

    if url:
        content = url_content_to_b64(url)
    else:
        content = file_content_to_b64(filepath)

    data = {
        "content": content.decode('utf-8'),
        "format": file_format.upper(),
        "overwrite": overwrite,
        "path": dbx_path
    }

    if file_format.upper() == 'SOURCE':
        data['language'] = language.upper()

    resp = self._rest_call[METHOD](API_PATH, data=data)

    if resp.status_code == 200:
        return dbx_path
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        if resp.json().get("error_code") in ERROR_CODES:
            raise ERROR_CODES[resp.json().get('error_code')](
                resp.json().get('message'))
        else:
            raise APIError("Response code {0}: {1} {2}".format(
                resp.status_code,
                resp.json().get('error_code'),
                resp.json().get('message')))
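# Usage sketch importing a Python notebook from a GitHub raw URL (the URL
# and paths are illustrative):
#
#   client.workspace.import_file(
#       dbx_path='/Users/someone@example.com/imported-notebook',
#       file_format='SOURCE',
#       language='PYTHON',
#       url='https://raw.githubusercontent.com/org/repo/main/notebook.py',
#       overwrite=True)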
def create(self, cluster_name, num_workers, spark_version, node_type_id,
           python_version=3, autotermination_minutes=60,
           custom_spark_version=False, **kwargs):
    """
    Creates a new cluster in the given workspace

    Parameters
    ----------
    cluster_name : str
        The display name of the cluster being created
    num_workers : int or dict
        The number of worker nodes in the cluster, or an autoscale
        specification (a dict with min_workers and max_workers)
    spark_version : str
        The Spark version for the cluster - see the spark_versions()
        method for available versions
    node_type_id : str
        The VM type for the cluster nodes - see the
        list_available_node_type_names() method for available types
    python_version : int, optional, default=3
    autotermination_minutes : int, optional, default=60
        Automatically terminates the cluster after it is inactive for
        this time in minutes. If not set, this cluster will not be
        automatically terminated. If specified, the threshold must be
        between 10 and 10000 minutes. You can also set this value to 0
        to explicitly disable automatic termination.
    custom_spark_version : bool, optional, default=False
        If a custom Spark version is passed, this prevents error
        checking for supported Spark versions
    kwargs : optional
        Other keyword arguments are passed to the API in the JSON
        payload. See supported arguments here:
        https://docs.azuredatabricks.net/api/latest/clusters.html#create

    Returns
    -------
    str :
        The cluster_id of the newly created cluster
    """
    METHOD = 'POST'
    API_PATH = 'clusters/create'

    # Check if spark_version is supported
    if spark_version not in self.spark_versions() \
            and not custom_spark_version:
        raise ValueError(
            "'{0}' is not a recognized spark_version. Please see the "
            "spark_versions() method for available Spark Versions."
            .format(spark_version))

    available_vms = self.list_available_node_type_names()
    driver_vm_id = kwargs.get('driver_node_type_id')

    # Check if node_type is available
    if node_type_id not in available_vms \
            or (driver_vm_id and driver_vm_id not in available_vms):
        raise ValueError(
            "'{0}' is not an available VM type. Please see the "
            "list_available_node_type_names() method for available "
            "node types".format(node_type_id))

    cluster_config = {
        'cluster_name': cluster_name,
        'spark_version': spark_version,
        'node_type_id': node_type_id
    }

    # If python_version is set to Python 3, then overwrite the
    # PYSPARK_PYTHON environment variable
    if python_version == 3:
        if kwargs.get('spark_env_vars'):
            kwargs['spark_env_vars'][
                'PYSPARK_PYTHON'] = '/databricks/python3/bin/python3'
        else:
            kwargs['spark_env_vars'] = {
                'PYSPARK_PYTHON': '/databricks/python3/bin/python3'
            }

    # Set default value of autotermination minutes - this defaults to
    # 60 minutes.
    if autotermination_minutes:
        kwargs['autotermination_minutes'] = autotermination_minutes

    # Specify the size of the cluster: a dict selects autoscaling,
    # an int a fixed number of workers
    if isinstance(num_workers, dict):
        cluster_config['autoscale'] = num_workers
    else:
        cluster_config['num_workers'] = int(num_workers)

    # Merge kwargs and cluster_config
    cluster_config = dict_update(kwargs, cluster_config)

    resp = self._rest_call[METHOD](API_PATH, data=cluster_config)

    if resp.status_code == 200:
        return resp.json()['cluster_id']
    elif resp.status_code == 403:
        raise AuthorizationError(
            "User is not authorized or token is incorrect.")
    else:
        raise APIError("Response code {0}: {1} {2}".format(
            resp.status_code,
            resp.json().get('error_code'),
            resp.json().get('message')))
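# Usage sketch: passing a dict as num_workers selects an autoscaling
# cluster, while an int gives a fixed size. Values are illustrative:
#
#   cluster_id = client.clusters.create(
#       cluster_name='etl-cluster',
#       num_workers={'min_workers': 2, 'max_workers': 8},
#       spark_version='5.5.x-scala2.11',
#       node_type_id='Standard_DS3_v2',
#       autotermination_minutes=30)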