def _get_hook(databricks_conn_secret, databricks_retry_limit, databricks_retry_delay):
    """
    Build a `DatabricksHook` from explicitly supplied connection and retry settings.

    Args:
        - databricks_conn_secret: Connection info; handed to `DatabricksHook` as its
            first positional argument.
        - databricks_retry_limit: Forwarded to the hook as `retry_limit`.
        - databricks_retry_delay: Forwarded to the hook as `retry_delay`.

    Returns:
        - DatabricksHook: the newly constructed hook
    """
    retry_settings = {
        "retry_limit": databricks_retry_limit,
        "retry_delay": databricks_retry_delay,
    }
    return DatabricksHook(databricks_conn_secret, **retry_settings)
def get_hook(self):
    """
    Build a `DatabricksHook` from the connection and retry settings stored on
    this instance.

    Reads `self.databricks_conn_secret`, `self.databricks_retry_limit` and
    `self.databricks_retry_delay`.

    Returns:
        - DatabricksHook: the newly constructed hook
    """
    conn_secret = self.databricks_conn_secret
    retry_limit = self.databricks_retry_limit
    retry_delay = self.databricks_retry_delay
    return DatabricksHook(
        conn_secret,
        retry_limit=retry_limit,
        retry_delay=retry_delay,
    )
def run(
    self,
    databricks_conn_secret: dict = None,
    tasks: List[JobTaskSettings] = None,
    run_name: str = None,
    timeout_seconds: int = None,
    idempotency_token: str = None,
    access_control_list: List[AccessControlRequest] = None,
    polling_period_seconds: int = None,
    databricks_retry_limit: int = None,
    databricks_retry_delay: float = None,
):
    """
    Task run method. Any values passed here will overwrite the values used when
    initializing the task.

    Submits a multi-task Databricks run, then hands the submitted run over to
    `_handle_databricks_task_execution`.

    Args:
        - databricks_conn_secret (dict, optional): Dictionary representation of the
            Databricks Connection String. It must be a `dict` when it reaches this
            method; when stored as a secret, the value is a string of valid JSON.
            To use token based authentication, provide the key `token` in the
            string for the connection and create the key `host`.
            `PREFECT__CONTEXT__SECRETS__DATABRICKS_CONNECTION_STRING=
            '{"host": "abcdef.xyz", "login": "******", "password": "******"}'`
            OR
            `PREFECT__CONTEXT__SECRETS__DATABRICKS_CONNECTION_STRING=
            '{"host": "abcdef.xyz", "token": "ghijklmn"}'`
        - tasks (List[JobTaskSettings]): A list containing the Databricks task
            configuration. Should contain configuration for at least one task.
        - run_name (str, optional): An optional name for the run.
            The default value is "Job run created by Prefect flow run {flow_run_name}".
        - timeout_seconds (int, optional): An optional timeout applied to each run
            of this job. The default behavior is to have no timeout.
        - idempotency_token (str, optional): An optional token that can be used to
            guarantee the idempotency of job run requests.
            Defaults to the flow run ID.
        - access_control_list (List[AccessControlRequest]): List of permissions to
            set on the job.
        - polling_period_seconds (int, optional): Controls the rate which we poll
            for the result of this run. By default the task will poll every
            30 seconds.
        - databricks_retry_limit (int, optional): Number of times to retry if the
            Databricks backend is unreachable. Its value must be greater than or
            equal to 1.
        - databricks_retry_delay (float, optional): Number of seconds to wait
            between retries (it might be a floating point number).

    Returns:
        - run_id (str): Run id of the submitted run, as returned by
            `submit_multi_task_run`

    Raises:
        - ValueError: if `databricks_conn_secret` is not a dictionary, or if
            `tasks` is `None` or empty
    """
    # isinstance() is already False for None, so one check covers both cases.
    if not isinstance(databricks_conn_secret, dict):
        raise ValueError(
            "Databricks connection info must be supplied as a dictionary."
        )

    if tasks is None or len(tasks) == 0:
        raise ValueError("Please supply at least one Databricks task to be run.")

    if not run_name:
        run_name = (
            f"Job run created by Prefect flow run {prefect.context.flow_run_name}"
        )

    # Ensures that multiple job runs are not created on retries
    if not idempotency_token:
        idempotency_token = prefect.context.flow_run_id

    # _handle_databricks_task_execution reads the polling period off the task,
    # so a supplied (truthy) value is stored on the instance.
    if polling_period_seconds:
        self.polling_period_seconds = polling_period_seconds

    hook = DatabricksHook(
        databricks_conn_secret,
        retry_limit=databricks_retry_limit,
        retry_delay=databricks_retry_delay,
    )

    task_settings = [task.dict() for task in tasks]
    # NOTE(review): tasks are serialized with .dict() but ACL entries with .json();
    # presumably .json() yields a JSON string per entry -- confirm this is intended.
    acl_entries = [entry.json() for entry in (access_control_list or [])]

    payload = {
        "tasks": task_settings,
        "run_name": run_name,
        "timeout_seconds": timeout_seconds,
        "idempotency_token": idempotency_token,
        "access_control_list": acl_entries,
    }

    # _handle_databricks_task_execution also expects the request body on the task.
    self.json = _deep_string_coerce(payload)

    run_id = hook.submit_multi_task_run(self.json)
    _handle_databricks_task_execution(self, hook, self.logger, run_id)

    return run_id