def scrape_workspace(workspace, session, instance_types):
    log.info(f"Scraping workspace {workspace.name}, {workspace.url}")
    result = ScraperRun.empty()
    result.start()
    session.merge(workspace)
    result.num_workspaces += 1
    api = DatabricksAPI(host=workspace.url, token=workspace.token)

    # CLUSTERS
    log.info(f"Started scraping clusters in workspace {workspace.name}.")
    clusters = query_paginated(api.cluster.list_clusters, {}, 'clusters')
    for cluster in clusters:
        scrape_cluster(workspace, cluster, instance_types, session, api, result)
    log.info(f"Finished scraping clusters in workspace {workspace.name}.")

    # JOBS
    log.info(f"Started scraping jobs in workspace {workspace.name}.")
    jobs = query_paginated(api.jobs.list_jobs, {}, 'jobs')
    for job in jobs:
        scrape_jobs(workspace, job, session, api, result)
    log.info(f"Finished scraping jobs in workspace {workspace.name}. "
             f"Jobs scraped: {len(jobs)}")

    # USERS
    log.info(f"Started scraping users in workspace {workspace.name}.")
    scrape_users(workspace, session, result)
    log.info(f"Finished scraping users in workspace {workspace.name}. "
             f"Users scraped: {result.num_users}")

    result.finish(ScraperRun.SUCCESSFUL)
    return result
def upload_artifacts(workspace_url: str, oauth_access_token: str,
                     local_artifacts_path: str, dbfs_dir_path):
    adb_client = DatabricksAPI(host=workspace_url, token=oauth_access_token)
    files_to_upload = []
    if isdir(local_artifacts_path):
        files_to_upload = [
            join(local_artifacts_path, f) for f in listdir(local_artifacts_path)
            if isfile(join(local_artifacts_path, f))
        ]
    else:
        files_to_upload = [local_artifacts_path]

    dbfs_folder_exists = False
    try:
        status_rsp = adb_client.dbfs.get_status(path=dbfs_dir_path)
        dbfs_folder_exists = status_rsp["is_dir"]
    except HTTPError as er:
        if er.response.status_code == 404:
            dbfs_folder_exists = False
        else:
            raise er

    if not dbfs_folder_exists:
        print("Creating destination directory on DBFS %s " % dbfs_dir_path)
        adb_client.dbfs.mkdirs(path=dbfs_dir_path)

    for local_file in files_to_upload:
        print("Uploading %s ..." % local_file)
        _upload_multipart(adb_client, local_file, dbfs_dir_path)
def upload_mount_storage_file(databricks_host: str, token: str):
    adb_client = DatabricksAPI(host=databricks_host, token=token)
    adb_client.dbfs.mkdirs(path="/mnt/provision")
    res = adb_client.dbfs.list("/mnt/provision")
    print("Successfully created the provision folder: /mnt/provision")
    try:
        res = adb_client.dbfs.delete("/mnt/provision/mount_dbfs.py")
    except Exception as e:
        pass

    print("Preparing to upload mount_dbfs.py to: /mnt/provision/mount_dbfs.py")
    handle = adb_client.dbfs.create(
        path="/mnt/provision/mount_dbfs.py")['handle']

    # TODO: get the current path
    config_dir = pathlib.Path(__file__).parent.absolute()
    local_file = os.path.join(config_dir, "mount_dbfs.py")
    print("Path for mount_dbfs.py is: ", local_file)

    with open(local_file, "rb") as f:
        while True:
            # A block can be at most 1MB
            block = f.read(1 << 20)
            if not block:
                break
            data = base64.standard_b64encode(block)
            adb_client.dbfs.add_block(handle=handle, data=data.decode("utf-8"))

    # close the handle to finish uploading
    adb_client.dbfs.close(handle=handle)
    print("Upload succeeded: ", local_file)
def test__execute__raises_403_http_exception__no_retries_and_raises(mocker):
    retrier = HTTPRetrier(2, 1)
    db = DatabricksAPI(host='HOST', token='TOKEN')
    mock_request = mocker.patch.object(db.client.session, 'request')
    mock_resp = requests.models.Response()
    mock_resp.status_code = 403
    mock_request.return_value = mock_resp

    with pytest.raises(HTTPError):
        return_value = retrier.execute(db.jobs.get_run_output, 1)

    assert retrier._tries == 0
def __init__(self):
    config = cfg.get_auth_config()
    self.min_timeout = MIN_TIMEOUT

    if config is None:
        raise InvalidConfigurationException

    # TODO: remove the dependency on this API, and instead use httpclient/requests
    db = DatabricksAPI(host=config.host, token=config.token)
    self.inner_dbclient = db

    # The retrier uses the recommended defaults
    # https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/jobs
    self._retrier = HTTPRetrier()
def test__execute__raises_invalid_state_http_exception__retries_twice_and_raises(
        mocker):
    retrier = HTTPRetrier(2, 1)
    db = DatabricksAPI(host='HOST', token='TOKEN')
    mock_request = mocker.patch.object(db.client.session, 'request')

    response_body = " { 'error_code': 'INVALID_STATE', 'message': 'Run result is empty. " + \
                    " There may have been issues while saving or reading results.'} "
    mock_resp = requests.models.Response()
    mock_resp.status_code = 400
    mock_resp.raw = io.BytesIO(bytes(response_body, 'utf-8'))
    mock_request.return_value = mock_resp

    with pytest.raises(HTTPError):
        return_value = retrier.execute(db.jobs.get_run_output, 1)

    assert retrier._tries == 2
import inspect

from databricks_api import DatabricksAPI
import databricks_cli

db = DatabricksAPI(host="localhost", token="token")

intro = """databricks-api
==============

|pypi| |pyversions|

.. |pypi| image:: https://img.shields.io/pypi/v/databricks-api.svg
    :target: https://pypi.python.org/pypi/databricks-api

.. |pyversions| image:: https://img.shields.io/pypi/pyversions/databricks-api.svg
    :target: https://pypi.python.org/pypi/databricks-api

*[This documentation is auto-generated]*

This package provides a simplified interface for the Databricks REST API.
The interface is autogenerated on instantiation using the underlying client
library used in the official ``databricks-cli`` python package.

Install using

.. code-block:: bash

    pip install databricks-api
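# A minimal usage sketch of the autogenerated interface described in the README
# text above. The host, token, and job_id values are placeholders; the service
# groups shown here (db.cluster, db.jobs) are the same ones exercised by the
# snippets in this collection.
from databricks_api import DatabricksAPI

db = DatabricksAPI(host="example.cloud.databricks.com",
                   token="<personal-access-token>")

# List clusters and jobs in the workspace.
clusters = db.cluster.list_clusters().get("clusters", [])
jobs = db.jobs.list_jobs().get("jobs", [])

# Trigger an existing job by id, as several snippets below do.
run = db.jobs.run_now(job_id=1)
print(run["run_id"])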
def __init__(self, host, token, workspace_id=None):
    self.host = host
    self.workspace_id = workspace_id
    self.client = DatabricksAPI(host=host, token=token)
import sys
import time
from databricks_api import DatabricksAPI
from datetime import datetime, timedelta

print(sys.argv)
token = sys.argv[1]

# Provide a host and token
db = DatabricksAPI(host="eastus.azuredatabricks.net", token=token)

job = db.jobs.run_now(job_id=1)
def jobRunner(self):
    client = DatabricksAPI(host=self.dbParams['instance'],
                           token=self.dbParams['token'])
    resp = client.jobs.run_now(job_id=self.dbParams['job_id'])
    return resp
import sys
import base64
from databricks_api import DatabricksAPI
import re

prod_token = sys.argv[1]
prod_host = sys.argv[2]
notebook_name = sys.argv[3]

db = DatabricksAPI(host=prod_host, token=prod_token)


def import_notebook(file_data, deployment_reference, notebook_full_name):
    # adding disclaimer
    disclaimer = "# Databricks notebook source\n# MAGIC %md\n# MAGIC\n# MAGIC # {0}\n# MAGIC\n# MAGIC > Deployed version as {1}\n# MAGIC\n# MAGIC <em>Please only edit using proper [git flow](https://dev.azure.com/Teck/_git/RACE21%20-%20Trail) as this document will be overwritten during the next deployment. Checkout [dev branch](https://dev.azure.com/Teck/_git/RACE21%20-%20Trail?path=%2F&version=GBdev&_a=contents).</em>\n\n# COMMAND ----------\n".format(
        deployment_reference, notebook_full_name)
    databricks_note = "# Databricks notebook source\n"
    file_data = file_data.replace(databricks_note, disclaimer)

    # encoding for databricks import
    encodedBytes = base64.b64encode(file_data.encode("utf-8"))
    encodedStr = str(encodedBytes, "utf-8")

    db.workspace.import_workspace(notebook_full_name,
                                  format="SOURCE",
                                  language="PYTHON",
                                  content=encodedStr,
                                  overwrite="true")
    print("{} deployed!".format(notebook_full_name))


with open("{}".format(notebook_name)) as file:
def provision_databricks_cluster(install_config: InstallConfiguration,
                                 workspace_url: str,
                                 oauth_access_token: str,
                                 gdc_sp_secret_value: str,
                                 managed_libraries: list = None,
                                 gdc_sp_secret_name: str = "gdc-service-principal-secret",
                                 gdc_graph_api_sp_secret_name="graph-api-service-principal-secret",
                                 secret_scope_name: str = "gdc",
                                 adb_cluster_name: str = "default-gdc-cluster",
                                 max_worker: int = 2,
                                 node_type_id: str = "Standard_DS3_v2",
                                 autotermination_minutes: int = 60):
    """
    :param managed_libraries: list of json objects in the format described at
        https://docs.databricks.com/dev-tools/api/latest/libraries.html#example-request
    :param workspace_url:
    :param oauth_access_token:
    :param gdc_sp_secret_value:
    :param gdc_sp_secret_name:
    :param secret_scope_name:
    :param adb_cluster_name:
    :param max_worker:
    :param node_type_id:
    :param autotermination_minutes:
    :return: dict {"cluster_id": cluster_id, "api_token": adb_api_token}
    """
    print("Provisioning ADB cluster ...")
    assert oauth_access_token is not None
    adb_client = DatabricksAPI(host=workspace_url, token=oauth_access_token)
    scopes = adb_client.secret.list_scopes().get("scopes", [])
    if not any(x for x in scopes if x.get("name") == secret_scope_name):
        adb_client.secret.create_scope(scope=secret_scope_name,
                                       initial_manage_principal="users")
    adb_client.secret.put_secret(scope=secret_scope_name,
                                 key=gdc_sp_secret_name,
                                 string_value=gdc_sp_secret_value)
    # Both databricks jobs use the gdc-service service principal to access the Graph API
    # and other components, but we've introduced two secrets for flexibility even though
    # they have the same value for now.
    adb_client.secret.put_secret(scope=secret_scope_name,
                                 key=gdc_graph_api_sp_secret_name,
                                 string_value=gdc_sp_secret_value)
    adb_api_token = adb_client.token.create_token(comment="GDC Pipeline API token")

    cluster_id = None
    clusters = adb_client.cluster.list_clusters().get("clusters", [])
    cluster_rsp = list([x for x in clusters if x.get("cluster_name") == adb_cluster_name])
    if not cluster_rsp:
        print("Creating a new cluster %s" % adb_cluster_name)
        cluster_rsp = adb_client.cluster.create_cluster(
            cluster_name=adb_cluster_name,
            autoscale={
                "min_workers": 1,
                "max_workers": max_worker
            },
            node_type_id=node_type_id,
            driver_node_type_id=node_type_id,
            autotermination_minutes=autotermination_minutes,
            enable_elastic_disk=True,
            spark_version="6.6.x-scala2.11")
    else:
        print("Cluster %s exists at %s" % (adb_cluster_name, workspace_url))
        cluster_rsp = cluster_rsp[0]

    # capture cluster details as soon as they're available
    install_config.adb_cluster_details = {
        "cluster_id": cluster_rsp['cluster_id'],
        "api_token": adb_api_token
    }
    cluster_id = cluster_rsp['cluster_id']

    if managed_libraries:
        cluster_info = adb_client.cluster.get_cluster(cluster_id=cluster_id)
        cluster_state = cluster_info['state']  # possible values: PENDING, TERMINATED and RUNNING
        if cluster_state == "TERMINATED":
            print("Starting cluster %s " % cluster_id)
            adb_client.cluster.start_cluster(cluster_id=cluster_id)
            cluster_state = "PENDING"

        while cluster_state in ("PENDING", "RESTARTING", "RESIZING"):
            print("Waiting cluster %s " % cluster_id)
            sleep(5)
            cluster_info = adb_client.cluster.get_cluster(cluster_id=cluster_id)
            cluster_state = cluster_info['state']
            print("Cluster is now in state %s " % cluster_state)

        if cluster_state in ("TERMINATING", "TERMINATED", "ERROR"):
            print("Can't install managed libraries, cluster %s is not running" % cluster_id)
            raise RuntimeError(
                "Can't install managed libraries, cluster %s is not running. "
                "Check Databricks Workspace Portal for details and try again later" % cluster_id)
        else:
            try:
                print("Installing managed libraries on cluster %s " % cluster_id)
                install_managed_libraries(adb_client, cluster_id, managed_libraries)
            except BaseException as e:
                print("Failed to install libraries into cluster %s " % cluster_id)
                print(e)

    return {
        "cluster_id": cluster_id,
        "api_token": adb_api_token
    }
def getDbxApi(self) -> DatabricksAPI:
    return DatabricksAPI(host=self.__config.dbx.host,
                         user=self.__config.dbx.user,
                         token=self.__config.dbx.token)
def execute_script_mount_storage_script(databricks_host: str, token: str,
                                        cluster_id: str,
                                        storage_account_name: str,
                                        container_name: str, secret_key: str):
    adb_client = DatabricksAPI(host=databricks_host, token=token)
    res = adb_client.dbfs.list("/mnt/")
    print("Waiting 30 seconds before proceeding with the deployment")
    time.sleep(30)
    deployment_succesfull = False
    for i in range(0, 3):
        submit_run_res = adb_client.jobs.submit_run(
            run_name="mount_storage",
            existing_cluster_id=cluster_id,
            spark_python_task={
                "python_file": "dbfs:/mnt/provision/mount_dbfs.py",
                "parameters": [
                    "--account_name####" + storage_account_name,
                    "--container_name####" + container_name,
                    "--secret_key_name####" + secret_key,
                    "--mount_point####/mnt/watercooler"
                ]
            },
            timeout_seconds=3600)
        run_id = submit_run_res["run_id"]
        while True:
            res = adb_client.jobs.get_run(run_id=run_id)
            if "state" in res:
                print("Cluster mount job status is: " + str(res["state"]))
            if res is None:
                print("Cluster mount job completed")
                deployment_succesfull = True
                break
            if "state" in res and "life_cycle_state" in res["state"] and res[
                    "state"]["life_cycle_state"] in ["PENDING", "RUNNING"]:
                time.sleep(5)
                continue
            if "state" in res and "life_cycle_state" in res["state"] and res[
                    "state"]["life_cycle_state"] in [
                        "INTERNAL_ERROR", "FAILED", "TIMED_OUT"
                    ]:
                deployment_succesfull = False
                break
            if "state" in res and "life_cycle_state" in res["state"] and res[
                    "state"]["life_cycle_state"] in [
                        "SUCCESSFUL", "TERMINATED"
                    ]:
                deployment_succesfull = True
                break
            time.sleep(5)
        if deployment_succesfull:
            break
        else:
            print("Retrying: ", str(i), " time. Waiting 10 seconds")
            time.sleep(10)
    time.sleep(30)
    print("Cluster mount job successfully completed:", str(deployment_succesfull))
def __init__(self):
    self.db = DatabricksAPI(
        host=settings.DATABRICKS_HOST,
        token=settings.DATABRICKS_TOKEN)
def getSecretToken(spark, dbUserId):
    # get the clusterID and host db-connect is configured for
    clusterId = spark.conf.get("spark.databricks.service.clusterId")
    # service.address has the org in it too, which the API doesn't like
    dbHost = spark.conf.get("spark.databricks.service.address").split("?")[0]
    pat = spark.conf.get('spark.databricks.service.token')

    # get our client for running API requests
    db = DatabricksAPI(host=dbHost, token=pat)

    # upload the notebook code required to fetch the token to our workspace
    # create a "tmp" folder under our user folder
    # import the notebook there
    notebookDir = "/Users/{}/tmp".format(dbUserId)
    notebookPath = notebookDir + "/fetch_apiToken"
    try:
        # Create a tmp folder under the user dir in workspace, if not exists
        db.workspace.mkdirs(notebookDir)

        # Import a one line python notebook to get and return the apiToken
        notebookData = b'# Databricks notebook source\n' + \
            b'dbutils.notebook.exit' + \
            b'((".".join(list(dbutils.notebook.entry_point.getDbutils()' + \
            b'.notebook().getContext().apiToken().get()))))'
        encodedNotebookData = base64.b64encode(notebookData)
        encodedNotebookDataStr = encodedNotebookData.decode("utf-8")
        db.workspace.import_workspace(content=encodedNotebookDataStr,
                                      path=notebookPath,
                                      language='PYTHON',
                                      overwrite=True,
                                      format='SOURCE')

        # Submit a "runs/submit" call for the fetch_apiToken notebook
        notebookTask = {"notebook_path": notebookPath}
        submitResponse = db.jobs.submit_run(
            run_name='fetch_apiToken',
            existing_cluster_id=clusterId,
            notebook_task=notebookTask,
            timeout_seconds=120,
        )

        # Wait for the runs/submit job to finish, so we can get the result
        runId = submitResponse['run_id']
        jobRunning = True
        jobOutput = None
        # check for results every 1 second until it's done
        while jobRunning:
            time.sleep(1)
            output = db.jobs.get_run_output(runId)
            state = output['metadata']['state']['life_cycle_state']
            # print(state)
            if state not in ['RUNNING', 'PENDING', 'TERMINATING']:
                jobRunning = False
                result_state = output['metadata']['state']['result_state']
                if result_state == 'SUCCESS':
                    jobOutput = output['notebook_output']['result']
                else:
                    jobOutput = "FAILED"

        # return our token
        # it comes with . between each char, remove them
        return jobOutput.replace('.', '')
    except Exception as e:
        print(
            "Exception while importing and executing tmp/fetch_apiToken notebook"
        )
        print(e)
        result = dbricks_client.get_run(db=DATABRICKS_API, run_id=run_id)
        return {'result': result}

    @runs_ns.doc('delete run')
    # @runs_ns.response(204, 'run deleted')
    def delete(self, run_id):
        '''Delete a run given its identifier'''
        print(run_id)
        cancel_run_result, delete_run_result = dbricks_client.cancel_and_delete_run(
            db=DATABRICKS_API, run_id=run_id)
        return {'run_id': run_id, 'result': delete_run_result}


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    JOB_CONFIG_PATH = os.path.join(os.path.dirname(__file__), "config",
                                   "job.config.json")

    # Provide a host and token for connecting to DataBricks
    DATABRICKS_HOST = os.getenv("DATABRICKS_HOST")
    DATABRICKS_TOKEN = os.getenv("DATABRICKS_TOKEN")
    PYPI_INDEX_URL = os.getenv("PYPI_INDEX_URL")

    DATABRICKS_API = DatabricksAPI(host=DATABRICKS_HOST, token=DATABRICKS_TOKEN)

    app.run(host='0.0.0.0')
def __init__(self, host, token):
    self.client = DatabricksAPI(host=host, token=token)
def get_db():
    with open("../config.json", "r") as config:
        return DatabricksAPI(**json.load(config))