예제 #1
0
def test_list_jobs():
    """Verify that ``JobsApi.list_jobs`` issues ``GET /jobs/list``.

    The call is exercised twice: once without arguments (the version is
    forwarded as ``None``) and once with an explicit ``version``.
    """
    # (kwargs handed to list_jobs, version expected in the outgoing query)
    scenarios = (
        ({}, None),
        ({'version': '3.0'}, '3.0'),
    )
    with mock.patch('databricks_cli.sdk.ApiClient') as api_client_mock:
        jobs_api = JobsApi(api_client_mock)
        for call_kwargs, expected_version in scenarios:
            jobs_api.list_jobs(**call_kwargs)
            api_client_mock.perform_query.assert_called_with(
                'GET',
                '/jobs/list',
                data={},
                headers=None,
                version=expected_version,
            )
예제 #2
0
class SdkClient:
    """Small facade bundling the cluster and jobs API wrappers.

    Both wrappers are built on the same API client, which is obtained from
    ``utils.get_api_client`` for the given profile.
    """

    def __init__(self, profile=None):
        """Create the API client for ``profile`` and wrap it for clusters and jobs."""
        api_client = utils.get_api_client(profile)
        self.cluster_client = ClusterApi(api_client)
        self.jobs_client = JobsApi(api_client)

    def list_clusters(self):
        """Return whatever the cluster API's ``list_clusters`` returns."""
        clusters = self.cluster_client.list_clusters()
        return clusters

    def get_cluster(self, cluster_id):
        """Return the cluster API's ``get_cluster`` result for ``cluster_id``."""
        cluster = self.cluster_client.get_cluster(cluster_id)
        return cluster

    def list_jobs(self):
        """Return whatever the jobs API's ``list_jobs`` returns."""
        jobs = self.jobs_client.list_jobs()
        return jobs
예제 #3
0
def export_cli(dry_run, tag, delete, git_ssh_url, api_client: ApiClient, hcl, pattern_matches):
    """Export Databricks jobs as ``databricks_job`` resources through a git export handler.

    Nothing is done unless ``hcl`` is truthy. Otherwise every job returned by
    ``JobsApi.list_jobs`` whose name satisfies ``pattern_matches`` is converted
    with ``prep_json`` / ``create_resource_from_dict`` and added to the
    ``GitExportHandler`` as ``databricks_job-<normalized name>.tf``.

    :param dry_run: forwarded to ``GitExportHandler`` as ``dry_run``
    :param tag: forwarded to ``GitExportHandler`` as ``tag``
    :param delete: forwarded to ``GitExportHandler`` as ``delete_not_found``
    :param git_ssh_url: first argument of ``GitExportHandler``
    :param api_client: client used to build the ``JobsApi``
    :param hcl: export only happens when this is truthy
    :param pattern_matches: callable taking a job name; falsy result skips the job
    """
    if not hcl:
        return

    # Settings keys that get a dedicated handler when passed to prep_json.
    handlers_by_key = {
        "new_cluster": handle_block,
        "notebook_task": handle_block,
        "aws_attributes": handle_block,
        "spark_env_vars": handle_block,
        "autoscale": handle_block,
        "spark_submit_task": handle_block,
        "libraries": handle_libraries,
        "email_notifications": handle_map,
        "custom_tags": handle_map
    }
    # Passed to prep_json as the "ignore" and "required" attribute sets.
    ignored_keys = {"created_time", "creator_user_name", "job_id"}
    required_keys = {"max_concurrent_runs", "name"}

    all_jobs = JobsApi(api_client).list_jobs()["jobs"]
    log.info(all_jobs)

    with GitExportHandler(git_ssh_url, "jobs", delete_not_found=delete, dry_run=dry_run, tag=tag) as gh:
        for job in all_jobs:
            settings = job["settings"]
            if pattern_matches(settings["name"]):
                log.debug(f"{job['settings']['name']} matched the pattern function {pattern_matches}")
            else:
                log.debug(f"{job['settings']['name']} did not match pattern function {pattern_matches}")
                continue

            job_resource_data = prep_json(handlers_by_key, ignored_keys, settings, required_keys)

            name = "databricks_job"
            base_name = normalize_identifier(settings["name"])
            identifier = f"databricks_job-{base_name}"

            # Double quotes inside the job name must be escaped before rendering.
            escaped_name = job_resource_data['name'].replace('"', '\\"')
            job_resource_data['name'] = escaped_name

            instance_job_hcl = create_resource_from_dict(name, identifier, job_resource_data, False)
            gh.add_file(f"{identifier}.tf", instance_job_hcl)
            log.debug(instance_job_hcl)
예제 #4
0
파일: cli.py 프로젝트: Men0x/aobd_project
def list_cli(api_client, output):
    """
    Lists the jobs in the Databricks Job Service.

    By default the output format will be a human readable table with the following fields

      - Job ID

      - Job name

    A JSON formatted output can also be requested by setting the --output parameter to "JSON"

    In table mode, the jobs are sorted by their name.
    """
    # NOTE(review): the docstring above is kept verbatim because it presumably
    # doubles as the command's --help text; confirm before rewording it.
    listing = JobsApi(api_client).list_jobs()
    if not OutputClickType.is_json(output):
        # Table mode: rows come from _jobs_to_table, rendered without number parsing.
        rendered = tabulate(_jobs_to_table(listing), tablefmt='plain', disable_numparse=True)
    else:
        rendered = pretty_format(listing)
    click.echo(rendered)
예제 #5
0
def list_cli(api_client, output, job_type, version, expand_tasks, offset, limit, _all):
    """
    Lists the jobs in the Databricks Job Service.

    By default the output format will be a human readable table with the following fields

      - Job ID

      - Job name

    A JSON formatted output can also be requested by setting the --output parameter to "JSON"

    In table mode, the jobs are sorted by their name.
    """
    # NOTE(review): the docstring above is kept verbatim because it presumably
    # doubles as the command's --help text; confirm before rewording it.
    check_version(api_client, version)

    # An explicit version wins; otherwise use the version configured on the client.
    effective_version = version or api_client.jobs_api_version
    if effective_version != '2.1':
        if expand_tasks or offset or limit or _all:
            click.echo(click.style('ERROR', fg='red') + ': the options --expand-tasks, ' +
                       '--offset, --limit, and --all are only available in API 2.1', err=True)
            return

    jobs_api = JobsApi(api_client)
    if _all:
        # Fetching everything always starts at the first page, 20 jobs at a time.
        offset, limit = 0, 20

    collected = []
    while True:
        page = jobs_api.list_jobs(job_type=job_type, expand_tasks=expand_tasks,
                                  offset=offset, limit=limit, version=version)
        page_has_jobs = 'jobs' in page
        if page_has_jobs:
            collected.extend(page['jobs'])

        # Only keep paging when --all was requested and the service reports more jobs.
        if not (page.get('has_more', False) and _all):
            break
        offset = offset + (len(page['jobs']) if page_has_jobs else 20)

    out = {'jobs': collected}
    if OutputClickType.is_json(output):
        rendered = pretty_format(out)
    else:
        rendered = tabulate(_jobs_to_table(out),
                            tablefmt='plain', disable_numparse=True)
    click.echo(rendered)
예제 #6
0
class DeployToDatabricks(Step):
    """Step that (re)deploys the jobs listed in ``config["jobs"]`` to Databricks.

    For every configured job the previously deployed job(s) are removed, a new
    job is created from the rendered job configuration and, when the job has no
    schedule (i.e. it is treated as streaming), a run is started immediately.
    """

    def __init__(self, env: ApplicationVersion, config: dict):
        """Set up the key-vault client, the Databricks API client and the jobs/runs wrappers."""
        super().__init__(env, config)
        self.vault_name, self.vault_client = KeyVaultClient.vault_and_client(self.config, self.env)
        self.databricks_client = Databricks(self.vault_name, self.vault_client).api_client(self.config)
        self.jobs_api = JobsApi(self.databricks_client)
        self.runs_api = RunsApi(self.databricks_client)

    def schema(self) -> vol.Schema:
        """Return the module-level ``SCHEMA`` for this step."""
        return SCHEMA

    def run(self):
        """Entry point of the step; delegates to :meth:`deploy_to_databricks`."""
        self.deploy_to_databricks()

    @staticmethod
    def _job_is_streaming(job_config: dict) -> bool:
        """
        If there is no schedule, the job would not run periodically, therefore we assume that is a
        streaming job
        :param job_config: the configuration of the Databricks job
        :return: (bool) if it is a streaming job
        """
        return "schedule" not in job_config

    def deploy_to_databricks(self):
        """
        The application parameters (cosmos and eventhub) will be removed from this file as they
        will be set as databricks secrets eventually
        If the job is a streaming job this will directly start the new job_run given the new
        configuration. If the job is batch this will not start it manually, assuming the schedule
        has been set correctly.
        """
        for job in self.config["jobs"]:
            app_name = self._construct_name(job["name"])
            job_name = f"{app_name}-{self.env.artifact_tag}"
            job_config = self.create_config(job_name, job)
            is_streaming = self._job_is_streaming(job_config)

            logger.info("Removing old job")
            self.remove_job(self.env.artifact_tag, is_streaming=is_streaming)

            logger.info("Submitting new job with configuration:")
            logger.info(pprint.pformat(job_config))
            self._submit_job(job_config, is_streaming)

    def create_config(self, job_name: str, job_config: dict):
        """Render the Databricks job configuration for a single configured job.

        :param job_name: name used as application name and log destination
        :param job_config: one entry of ``config["jobs"]``; ``config_file``,
            ``arguments`` and ``lang`` are read, plus ``main_name`` for
            non-python jobs and optionally ``schedule``
        :return: the result of rendering ``config_file`` and parsing it as JSON
        """
        common_arguments = dict(
            config_file=job_config["config_file"],
            application_name=job_name,
            log_destination=job_name,
            parameters=self._construct_arguments(job_config["arguments"]),
            schedule=self._get_schedule(job_config),
            environment=self.env.environment_formatted,
        )

        root_library_folder = self.config["common"]["databricks_fs_libraries_mount_path"]
        storage_base_path = f"{root_library_folder}/{self.application_name}"
        artifact_path = f"{storage_base_path}/{self.application_name}-{self.env.artifact_tag}"

        if job_config["lang"] == "python":
            wheel_name = get_whl_name(self.application_name, self.env.artifact_tag, ".whl")
            py_main_name = get_main_py_name(self.application_name, self.env.artifact_tag, ".py")
            run_config = DeployToDatabricks._construct_job_config(
                **common_arguments,
                whl_file=f"{root_library_folder}/{wheel_name}",
                python_file=f"{root_library_folder}/{py_main_name}",
            )
        else:  # java/scala jobs
            run_config = DeployToDatabricks._construct_job_config(
                **common_arguments, class_name=job_config["main_name"], jar_file=f"{artifact_path}.jar"
            )
        return run_config

    def _get_schedule(self, job_config: dict) -> Optional[dict]:
        """Pick the schedule to use for the current environment.

        A schedule that directly contains ``quartz_cron_expression`` is used as
        is; otherwise the schedule is treated as a mapping keyed by the
        lower-cased environment name. A missing or empty schedule is returned
        unchanged.
        """
        schedule = job_config.get("schedule", None)
        if not schedule:
            return schedule
        if "quartz_cron_expression" in schedule:
            return schedule
        return schedule.get(self.env.environment.lower(), None)

    def _construct_name(self, name: str) -> str:
        """Return the application name, suffixed with ``-<name>`` when ``name`` is non-empty."""
        postfix = f"-{name}" if name else ""
        return f"{self.application_name}{postfix}"

    @staticmethod
    def _construct_arguments(args: List[dict]) -> list:
        """Flatten a list of ``{key: value}`` dicts into ``["--key", value, ...]``."""
        params = []
        for named_arguments_pair in args:
            for key, value in named_arguments_pair.items():
                params.extend([f"--{key}", value])

        return params

    @staticmethod
    def _construct_job_config(config_file: str, **kwargs) -> dict:
        """Render ``config_file`` with jinja using ``kwargs`` and parse the result as JSON."""
        return util.render_file_with_jinja(config_file, kwargs, json.loads)

    def remove_job(self, branch: str, is_streaming: bool):
        """
        Removes the existing job and cancels any running job_run if the application is streaming.
        If the application is batch, it'll let the batch job finish but it will remove the job,
        making sure no other job_runs can start for that old job.
        """
        # The list response may omit the "jobs" key (presumably when the workspace
        # holds no jobs yet, e.g. on a first deployment); treat that as "no jobs"
        # instead of failing with a KeyError.
        existing_jobs = self.jobs_api.list_jobs().get("jobs", [])
        job_configs = [JobConfig(job["settings"]["name"], job["job_id"]) for job in existing_jobs]
        job_ids = self._application_job_id(self.application_name, branch, job_configs)

        if not job_ids:
            logger.info(f"Could not find jobs in list of {pprint.pformat(job_configs)}")

        for job_id in job_ids:
            logger.info(f"Found Job with ID {job_id}")
            if is_streaming:
                self._kill_it_with_fire(job_id)
            logger.info(f"Deleting Job with ID {job_id}")
            self.jobs_api.delete_job(job_id)

    @staticmethod
    def _application_job_id(application_name: str, branch: str, jobs: List[JobConfig]) -> List[int]:
        """Return the ids of the jobs whose name belongs to this application/branch.

        A job name is matched against ``<application_name>-<suffix>`` where the
        suffix is ``SNAPSHOT``, a ``x.y.z`` version tag or ``branch``.
        """
        snapshot = "SNAPSHOT"
        # Raw string: "\d" in a normal string literal is an invalid escape sequence.
        tag = r"\d+\.\d+\.\d+"
        # application_name and branch are literal text, so escape any regex
        # metacharacters they may contain (e.g. "." or "+").
        pattern = re.compile(
            rf"^({re.escape(application_name)})-({snapshot}|{tag}|{re.escape(branch)})$"
        )

        return [job.job_id for job in jobs if has_prefix_match(job.name, application_name, pattern)]

    def _kill_it_with_fire(self, job_id):
        """Cancel the active runs reported for ``job_id`` (first page of results only)."""
        logger.info(f"Finding runs for job_id {job_id}")
        runs = self.runs_api.list_runs(job_id, active_only=True, completed_only=None, offset=None, limit=None)
        # If the runs is empty, there are no jobs at all
        # TODO: Check if the has_more flag is true, this means we need to go over the pages
        if "runs" in runs:
            active_run_ids = [run["run_id"] for run in runs["runs"]]
            logger.info(f"Canceling active runs {active_run_ids}")
            for run_id in active_run_ids:
                self.runs_api.cancel_run(run_id)

    def _submit_job(self, job_config: dict, is_streaming: bool):
        """Create the job from ``job_config`` and, for streaming jobs, start a run right away."""
        job_resp = self.jobs_api.create_job(job_config)
        logger.info(f"Created Job with ID {job_resp['job_id']}")

        if is_streaming:
            resp = self.jobs_api.run_now(
                job_id=job_resp["job_id"],
                jar_params=None,
                notebook_params=None,
                python_params=None,
                spark_submit_params=None,
            )
            logger.info(f"Created run with ID {resp['run_id']}")