Example #1
def delete(core_job_operations,
           spark_job_operations,
           job_id: str,
           keep_logs: bool = False):
    try:
        return _delete(core_job_operations, spark_job_operations, job_id,
                       keep_logs)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
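Most snippets in this listing share one idiom: a Batch SDK call is wrapped in try/except, and any BatchErrorException is re-raised as a single aztk.error.AztkError carrying a formatted message. A minimal caller-side sketch of that contract (the spark_client object and its delete method are hypothetical illustrations, not taken from the snippets):

from aztk import error

try:
    spark_client.job.delete(job_id="my-spark-job", keep_logs=True)  # hypothetical client call
except error.AztkError as e:
    # every Batch failure surfaces as one exception type with a readable message
    print("Operation failed:", e)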
Example #2
def create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str:
    try:
        cluster = self.get_cluster(cluster_id)
        master_node_id = cluster.master_node_id
        if not master_node_id:
            raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.")
        self.__create_user(cluster.id, master_node_id, username, password, ssh_key)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #3
def list_applications(self, job_id):
    try:
        applications = job_submit_helper.list_applications(self, job_id)
        for item in applications:
            if applications[item]:
                applications[item] = models.Application(applications[item])
        return applications
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #4
def cluster_run(self, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None):
    try:
        return self.__cluster_run(cluster_id,
                                  command,
                                  internal,
                                  container_name='spark' if not host else None,
                                  timeout=timeout)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #5
def submit_job(self, job_configuration: models.JobConfiguration):
    try:
        job_configuration = _apply_default_for_job_config(job_configuration)
        job_configuration.validate()
        cluster_data = self._get_cluster_data(job_configuration.id)
        node_data = NodeData(
            job_configuration.to_cluster_config()).add_core().done()
        zip_resource_files = cluster_data.upload_node_data(
            node_data).to_resource_file()

        start_task = create_cluster_helper.generate_cluster_start_task(
            self,
            zip_resource_files,
            job_configuration.id,
            job_configuration.gpu_enabled,
            job_configuration.get_docker_repo(),
            mixed_mode=job_configuration.mixed_mode(),
            worker_on_master=job_configuration.worker_on_master)

        application_tasks = []
        for application in job_configuration.applications:
            application_tasks.append((application,
                                      cluster_submit_helper.generate_task(
                                          self, job_configuration.id,
                                          application)))

        job_manager_task = job_submit_helper.generate_task(
            self, job_configuration, application_tasks)

        software_metadata_key = "spark"

        vm_image = models.VmImage(publisher='Canonical',
                                  offer='UbuntuServer',
                                  sku='16.04')

        autoscale_formula = "$TargetDedicatedNodes = {0}; " \
                            "$TargetLowPriorityNodes = {1}".format(
                                job_configuration.max_dedicated_nodes,
                                job_configuration.max_low_pri_nodes)

        job = self.__submit_job(
            job_configuration=job_configuration,
            start_task=start_task,
            job_manager_task=job_manager_task,
            autoscale_formula=autoscale_formula,
            software_metadata_key=software_metadata_key,
            vm_image_model=vm_image,
            application_metadata='\n'.join(
                application.name
                for application in (job_configuration.applications or [])))

        return models.Job(job)

    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
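A hedged usage sketch for submit_job; the JobConfiguration keyword arguments below are assumptions inferred from the validate() example later in this listing (id, vm_size, max_dedicated_nodes), not a documented constructor signature:

job_config = models.JobConfiguration(
    id="wordcount-job",        # required: validate() rejects a missing id
    vm_size="Standard_D2_v2",  # required: validate() rejects a missing vm_size
    max_dedicated_nodes=2,     # at least one node count must be greater than 0
    applications=[],           # ApplicationConfiguration objects would go here
)
job = spark_client.submit_job(job_config)  # spark_client: the client owning this method (assumed)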
Example #6
def create_cluster(core_cluster_operations,
                   spark_cluster_operations,
                   cluster_conf: models.ClusterConfiguration,
                   vm_image: base_models.VmImage,
                   wait: bool = False):
    """
    Create a new aztk spark cluster

    Args:
        cluster_conf(aztk.spark.models.models.ClusterConfiguration): Configuration for the cluster to be created
        wait(bool): Whether to wait for the cluster to be ready before returning
        vm_image(base_models.VmImage): The VM image to use for the cluster's nodes

    Returns:
        :obj:`aztk.spark.models.Cluster`
    """
    cluster_conf = _apply_default_for_cluster_config(cluster_conf)
    cluster_conf.validate()

    cluster_data = core_cluster_operations.get_cluster_data(
        cluster_conf.cluster_id)
    try:
        node_data = NodeData(cluster_conf).add_core().done()
        zip_resource_files = cluster_data.upload_node_data(
            node_data).to_resource_file()

        start_task = spark_cluster_operations._generate_cluster_start_task(
            core_cluster_operations,
            zip_resource_files,
            cluster_conf.cluster_id,
            cluster_conf.gpu_enabled(),
            cluster_conf.get_docker_repo(),
            cluster_conf.get_docker_run_options(),
            cluster_conf.file_shares,
            cluster_conf.mixed_mode(),
            cluster_conf.worker_on_master,
        )

        software_metadata_key = base_models.Software.spark

        cluster = core_cluster_operations.create(cluster_conf,
                                                 software_metadata_key,
                                                 start_task, vm_image)

        # Wait for the master to be ready
        if wait:
            util.wait_for_master_to_be_ready(core_cluster_operations,
                                             spark_cluster_operations,
                                             cluster.id)
            cluster = spark_cluster_operations.get(cluster.id)

        return cluster

    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
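A hedged usage sketch for create_cluster; the ClusterConfiguration field names are assumptions inferred from the validate() example later in this listing (cluster_id, vm_size, vm_count), and the VmImage literal mirrors the one used elsewhere in these snippets:

cluster_config = models.ClusterConfiguration(
    cluster_id="my-cluster",   # required: validate() rejects a missing id
    vm_size="Standard_D2_v2",  # required: validate() rejects a missing vm_size
    vm_count=2,                # dedicated nodes; cannot be 0 together with 0 low-pri nodes
)
vm_image = base_models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04")
cluster = create_cluster(core_cluster_operations, spark_cluster_operations,
                         cluster_config, vm_image, wait=True)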
Example #7
def list_applications(core_operations, cluster_id):
    try:
        scheduling_target = core_operations.get_cluster_configuration(cluster_id).scheduling_target
        if scheduling_target is not SchedulingTarget.Any:
            tasks = core_operations.list_task_table_entries(cluster_id)
        else:
            tasks = core_operations.list_batch_tasks(cluster_id)
        return [Application(task) for task in tasks]
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #8
def _download_resource_file(task_id, resource_file):
    response = http_request_wrapper(requests.get, url=resource_file.blob_source, timeout=None, stream=True)
    if resource_file.file_path:
        write_path = os.path.join(os.environ.get("AZ_BATCH_TASK_WORKING_DIR"), resource_file.file_path)
        with open(write_path, 'wb') as stream:
            for chunk in response.iter_content(chunk_size=16777216):  # stream in 16 MiB chunks
                stream.write(chunk)
        return None

    raise error.AztkError("ResourceFile file_path not set.")
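The streamed, chunked write above is the standard requests pattern for downloading large blobs without loading them into memory. The same idea in a self-contained sketch (placeholder URL and path; plain requests in place of the http_request_wrapper helper):

import requests

response = requests.get("https://example.com/large-file", stream=True, timeout=30)
response.raise_for_status()
with open("large-file", "wb") as out:
    for chunk in response.iter_content(chunk_size=16 * 1024 * 1024):  # 16 MiB, matching the snippet above
        out.write(chunk)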
Example #9
def get_application(spark_client, job_id, application_name):
    # look up the task for this application in the most recent run of the job
    recent_run_job = __get_recent_job(spark_client, job_id)
    try:
        return spark_client.batch_client.task.get(job_id=recent_run_job.id,
                                                  task_id=application_name)
    except batch_models.batch_error.BatchErrorException:
        raise error.AztkError(
            "The Spark application {0} is still being provisioned or does not exist."
            .format(application_name))
Example #10
def run_cluster_diagnostics(spark_cluster_operations,
                            cluster_id,
                            output_directory=None,
                            brief=False):
    try:
        output = _run(spark_cluster_operations, cluster_id, output_directory,
                      brief)
        return output
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #11
def submit(core_cluster_operations,
           spark_cluster_operations,
           cluster_id: str,
           application: models.ApplicationConfiguration,
           remote: bool = False,
           wait: bool = False):
    try:
        submit_application(core_cluster_operations, spark_cluster_operations, cluster_id, application, remote, wait)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #12
def get_application_log(base_operations,
                        cluster_id: str,
                        application_name: str,
                        tail=False,
                        current_bytes: int = 0):
    try:
        return get_log(base_operations, cluster_id, application_name, tail,
                       current_bytes)
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #13
def submit_job(core_job_operations,
               spark_job_operations,
               job_configuration: models.JobConfiguration,
               wait: bool = False):
    try:
        job_configuration = _apply_default_for_job_config(job_configuration)
        job_configuration.validate()
        cluster_data = core_job_operations.get_cluster_data(job_configuration.id)
        node_data = NodeData(job_configuration.to_cluster_config()).add_core().done()
        zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()

        start_task = spark_job_operations._generate_cluster_start_task(
            core_job_operations,
            zip_resource_files,
            job_configuration.id,
            job_configuration.gpu_enabled,
            job_configuration.get_docker_repo(),
            job_configuration.get_docker_run_options(),
            mixed_mode=job_configuration.mixed_mode(),
            worker_on_master=job_configuration.worker_on_master,
        )

        application_tasks = []
        for application in job_configuration.applications:
            application_tasks.append((
                application,
                spark_job_operations._generate_application_task(core_job_operations, job_configuration.id, application),
            ))

        job_manager_task = generate_job_manager_task(core_job_operations, job_configuration, application_tasks)

        software_metadata_key = base_models.Software.spark

        vm_image = models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04")

        autoscale_formula = "$TargetDedicatedNodes = {0}; " "$TargetLowPriorityNodes = {1}".format(
            job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes)

        job = core_job_operations.submit(
            job_configuration=job_configuration,
            start_task=start_task,
            job_manager_task=job_manager_task,
            autoscale_formula=autoscale_formula,
            software_metadata_key=software_metadata_key,
            vm_image_model=vm_image,
            application_metadata="\n".join(application.name for application in (job_configuration.applications or [])),
        )

        if wait:
            spark_job_operations.wait(id=job_configuration.id)

        return models.Job(job)

    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #14
def cluster_run(core_cluster_operations,
                cluster_id: str,
                command: str,
                host=False,
                internal: bool = False,
                timeout=None):
    try:
        return core_cluster_operations.run(
            cluster_id, command, internal, container_name="spark" if not host else None, timeout=timeout)
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #15
def get_application_log(self,
                        cluster_id: str,
                        application_name: str,
                        tail=False,
                        current_bytes: int = 0):
    try:
        return get_log_helper.get_log(self.batch_client, self.blob_client,
                                      cluster_id, application_name, tail,
                                      current_bytes)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #16
def create_user(self,
                cluster_id: str,
                username: str,
                password: str = None,
                ssh_key: str = None) -> str:
    try:
        cluster = self.get_cluster(cluster_id)
        master_node_id = cluster.master_node_id
        self.__create_user(cluster.id, master_node_id, username, password,
                           ssh_key)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #17
def _get_application_log(core_job_operations, spark_job_operations, job_id, application_name):
    # TODO: change where the logs are uploaded so they aren't overwritten on scheduled runs
    #           current: job_id, application_name/output.log
    #           new: job_id, recent_run_job.id/application_name/output.log
    recent_run_job = get_recent_job(core_job_operations, job_id)
    try:
        task = core_job_operations.batch_client.task.get(job_id=recent_run_job.id, task_id=application_name)
    except batch_models.batch_error.BatchErrorException as e:
        # see if the application is written to metadata of pool
        applications = spark_job_operations.list_applications(job_id)

        for application in applications:
            if applications[application] is None and application == application_name:
                raise error.AztkError("The application {0} has not yet been created.".format(application))
        raise error.AztkError("The application {0} does not exist".format(application_name))
    else:
        if task.state in (batch_models.TaskState.active, batch_models.TaskState.running,
                          batch_models.TaskState.preparing):
            raise error.AztkError("The application {0} has not yet finished executing.".format(application_name))

        return core_job_operations.get_application_log(job_id, application_name)
Example #18
def cluster_download(self,
                     cluster_id: str,
                     source_path: str,
                     destination_path: str,
                     host: bool = False,
                     internal: bool = False,
                     timeout=None):
    try:
        container_name = None if host else 'spark'
        return self.__cluster_copy(cluster_id,
                                   source_path,
                                   destination_path,
                                   container_name=container_name,
                                   get=True,
                                   internal=internal,
                                   timeout=timeout)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #19
def _get_application_log(core_job_operations, spark_job_operations, job_id,
                         application_name):
    scheduling_target = core_job_operations.get_cluster_configuration(
        job_id).scheduling_target
    if scheduling_target is not models.SchedulingTarget.Any:
        return core_job_operations.get_application_log(job_id,
                                                       application_name)

    # TODO: change where the logs are uploaded so they aren't overwritten on scheduled runs
    #           current: job_id, application_name/output.log
    #           new: job_id, recent_run_job.id/application_name/output.log
    recent_run_job = core_job_operations.get_recent_job(job_id)
    try:
        task = core_job_operations.get_batch_task(id=recent_run_job.id,
                                                  task_id=application_name)
    except batch_models.BatchErrorException as e:
        # task may not exist since it may not yet be scheduled
        # see if the task is written to metadata of pool
        applications = spark_job_operations.list_applications(job_id)

        for application in applications:
            if applications[application] is None and application == application_name:
                raise error.AztkError(
                    "The application {0} has not yet been created.".format(application))
        raise error.AztkError(
            "The application {0} does not exist".format(application_name))
    else:
        if task.state in (
                batch_models.TaskState.active,
                batch_models.TaskState.running,
                batch_models.TaskState.preparing,
        ):
            raise error.AztkError(
                "The application {0} has not yet finished executing.".format(
                    application_name))

        return core_job_operations.get_application_log(job_id,
                                                       application_name)
Example #20
def cluster_ssh_into_master(self,
                            cluster_id,
                            node_id,
                            username,
                            ssh_key=None,
                            password=None,
                            port_forward_list=None,
                            internal=False):
    try:
        self.__ssh_into_node(cluster_id, node_id, username, ssh_key,
                             password, port_forward_list, internal)
    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #21
def validate(self) -> bool:
    """
    Validate the config at its current state.
    Raises: Error if invalid
    """
    if self.id is None:
        raise error.AztkError(
            "Please supply an ID for the Job in your configuration.")

    if self.max_dedicated_nodes == 0 and self.max_low_pri_nodes == 0:
        raise error.AztkError(
            "Please supply a valid (greater than 0) value for either max_dedicated_nodes or max_low_pri_nodes in your configuration."
        )

    if self.vm_size is None:
        raise error.AztkError(
            "Please supply a vm_size in your configuration.")

    if self.mixed_mode() and not self.subnet_id:
        raise error.AztkError(
            "You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes) and pass the subnet_id in your configuration."
        )
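A hypothetical sketch of the failure mode this validator guards against; constructing a JobConfiguration with no arguments is assumed to be possible for illustration:

from aztk import error
from aztk.spark import models

config = models.JobConfiguration()  # no id supplied
try:
    config.validate()
except error.AztkError as e:
    print(e)  # "Please supply an ID for the Job in your configuration."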
Example #22
def get_task_state(core_cluster_operations, cluster_id: str, task_id: str):
    try:
        scheduling_target = core_cluster_operations.get_cluster_configuration(
            cluster_id).scheduling_target
        if scheduling_target is not SchedulingTarget.Any:
            task = core_cluster_operations.get_task_from_table(
                cluster_id, task_id)
        else:
            task = core_cluster_operations.get_batch_task(cluster_id, task_id)
        return task.state
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #23
def _get_application(core_operations, job_id, application_name):
    # look up the task for this application in the most recent run of the job
    recent_run_job = core_operations.get_recent_job(job_id)
    scheduling_target = core_operations.get_cluster_configuration(
        job_id).scheduling_target
    if scheduling_target is not models.SchedulingTarget.Any:
        return core_operations.get_task_from_table(job_id, application_name)
    try:
        return core_operations.get_batch_task(id=recent_run_job.id,
                                              task_id=application_name)
    except batch_models.BatchErrorException:
        raise error.AztkError(
            "The Spark application {0} is still being provisioned or does not exist."
            .format(application_name))
Example #24
def create_cluster(self, configuration: models.ClusterConfiguration, wait: bool = False):
    """
    Create a new aztk spark cluster

    Args:
        cluster_conf(aztk.spark.models.models.ClusterConfiguration): Configuration for the cluster to be created
        wait(bool): Whether to wait for the cluster to be ready before returning

    Returns:
        aztk.spark.models.Cluster
    """
    cluster_conf = models.ClusterConfiguration()
    cluster_conf.merge(DEFAULT_CLUSTER_CONFIG)
    cluster_conf.merge(configuration)
    cluster_conf.validate()
    cluster_data = self._get_cluster_data(cluster_conf.cluster_id)
    try:
        node_data = NodeData(cluster_conf).add_core().done()
        zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()

        start_task = create_cluster_helper.generate_cluster_start_task(self,
                                                                       zip_resource_files,
                                                                       cluster_conf.cluster_id,
                                                                       cluster_conf.gpu_enabled(),
                                                                       cluster_conf.get_docker_repo(),
                                                                       cluster_conf.file_shares,
                                                                       cluster_conf.plugins,
                                                                       cluster_conf.mixed_mode(),
                                                                       cluster_conf.worker_on_master)

        software_metadata_key = "spark"

        vm_image = models.VmImage(
            publisher='Canonical',
            offer='UbuntuServer',
            sku='16.04')

        cluster = self.__create_pool_and_job(
            cluster_conf, software_metadata_key, start_task, vm_image)

        # Wait for the master to be ready
        if wait:
            util.wait_for_master_to_be_ready(self, cluster.id)
            cluster = self.get_cluster(cluster.id)

        return cluster

    except batch_error.BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #25
def validate(self) -> bool:
    """
    Validate the config at its current state.
    Raises: Error if invalid
    """

    if self.cluster_id is None:
        raise error.AztkError(
            "Please supply an id for the cluster with a parameter (--id)")

    if self.vm_count == 0 and self.vm_low_pri_count == 0:
        raise error.AztkError(
            "Please supply a valid (greater than 0) size or size_low_pri value either in the cluster.yaml configuration file or with a parameter (--size or --size-low-pri)"
        )

    if self.vm_size is None:
        raise error.AztkError(
            "Please supply a vm_size in either the cluster.yaml configuration file or with a parameter (--vm-size)"
        )

    if self.mixed_mode() and not self.subnet_id:
        raise error.AztkError(
            "You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). Set the VNET's subnet_id in your cluster.yaml."
        )
Example #26
def ssh_into_master(
        spark_cluster_operations,
        core_cluster_operations,
        cluster_id,
        username,
        ssh_key=None,
        password=None,
        port_forward_list=None,
        internal=False,
):
    try:
        master_node_id = spark_cluster_operations.get(cluster_id).master_node_id
        core_cluster_operations.ssh_into_node(cluster_id, master_node_id, username, ssh_key, password,
                                              port_forward_list, internal)
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))
Example #27
def get_log_from_storage(blob_client, container_name, application_name, task):
    try:
        blob = blob_client.get_blob_to_text(
            container_name,
            application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE)
    except azure.common.AzureMissingResourceHttpError:
        raise error.AztkError(
            "Logs not found in your storage account. They were either deleted or never existed."
        )

    return models.ApplicationLog(name=application_name,
                                 cluster_id=container_name,
                                 application_state=task.state._value_,
                                 log=blob.content,
                                 total_bytes=blob.properties.content_length,
                                 exit_code=task.execution_info.exit_code)
Example #28
def http_request_wrapper(func, *args, timeout=None, max_execution_time=300, **kwargs):
    start_time = time.monotonic()  # monotonic clock for elapsed time; time.clock() was removed in Python 3.8
    while True:
        try:
            response = func(*args, timeout=timeout, **kwargs)
            response.raise_for_status()
            return response
        except requests.Timeout:
            pass

        if time.monotonic() - start_time > max_execution_time:
            raise error.AztkError("Waited {} seconds for request {}, exceeded max_execution_time={}".format(
                time.monotonic() - start_time,
                func.__name__,
                max_execution_time,
            ))
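Usage sketch for http_request_wrapper: per-attempt timeouts are retried silently, and an AztkError is raised once the overall budget is exhausted (the URL is a placeholder):

import requests

response = http_request_wrapper(
    requests.get,
    url="https://example.com/artifact",
    timeout=30,              # per-attempt timeout, passed through to requests.get
    max_execution_time=300)  # total retry budget in seconds before AztkError
print(response.status_code)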
Example #29
def cluster_copy(
    cluster_operations,
    cluster_id,
    source_path,
    destination_path=None,
    container_name=None,
    internal=False,
    get=False,
    timeout=None,
):
    cluster = cluster_operations.get(cluster_id)
    pool, nodes = cluster.pool, list(cluster.nodes)
    if internal:
        cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22"))
                         for node in nodes]
    else:
        cluster_nodes = [(node, cluster_operations.get_remote_login_settings(pool.id, node.id))
                         for node in nodes]

    try:
        generated_username, ssh_key = cluster_operations.generate_user_on_cluster(
            pool.id, nodes)
    except BatchErrorException as e:
        raise error.AztkError(helpers.format_batch_exception(e))

    try:
        output = asyncio.get_event_loop().run_until_complete(
            ssh_lib.clus_copy(
                container_name=container_name,
                username=generated_username,
                nodes=cluster_nodes,
                source_path=source_path,
                destination_path=destination_path,
                ssh_key=ssh_key.exportKey().decode("utf-8"),
                get=get,
                timeout=timeout,
            ))
        return output
    except (OSError, BatchErrorException) as exc:
        raise exc
    finally:
        cluster_operations.delete_user_on_cluster(pool.id, nodes,
                                                  generated_username)
Example #30
def create_pool_if_not_exist(pool, batch_client):
    """
    Creates the specified pool if it doesn't already exist
    :param pool: The pool to create.
    :type pool: `batchserviceclient.models.PoolAddParameter`
    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    """
    try:
        batch_client.pool.add(pool)
    except batch_models.BatchErrorException as e:
        if e.error.code == "PoolExists":
            raise error.AztkError(
                "A cluster with the same id already exists. Use a different id or delete the existing cluster")
        else:
            raise
    return True
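A hypothetical call site; batch_client is assumed to be an authenticated azure.batch BatchServiceClient, and the pool spec below is illustrative rather than AZTK's actual pool layout:

pool = batch_models.PoolAddParameter(
    id="my-cluster",
    vm_size="Standard_D2_v2",
    target_dedicated_nodes=2,
    virtual_machine_configuration=batch_models.VirtualMachineConfiguration(
        image_reference=batch_models.ImageReference(
            publisher="Canonical", offer="UbuntuServer", sku="16.04"),
        node_agent_sku_id="batch.node.ubuntu 16.04"))
create_pool_if_not_exist(pool, batch_client)  # raises AztkError if the pool id is already taken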