Пример #1
0
    def run(self, wait=True, **kwargs) -> None:
        r"""
        Submit ``self.tasks`` to Azure Batch as a single job.

        When the configuration specifies a VM size together with a dedicated
        or low-priority node count, the pool is created first; otherwise the
        pool named by ``POOL_ID`` is used as-is. The job ``JOB_ID`` is then
        added to that pool and all tasks are added to the job.

        :param boolean wait: If true, call :meth:`load_results` after the
                tasks have been submitted (per the original documentation
                this waits for the batch to complete and downloads the
                results to file)
        :param kwargs: Forwarded unchanged to :meth:`load_results`

        :raises ValueError: If this client has no ``tasks`` attribute
        :raises BatchErrorException: If raised by the Azure Batch Python SDK
        """
        if not hasattr(self, "tasks"):
            raise ValueError(
                "Client restored from data cannot be used to run the job")

        config = self.config
        try:
            # A pool is only created when its size is fully specified;
            # otherwise POOL_ID is expected to name an existing pool.
            if config.POOL_VM_SIZE and (config.POOL_NODE_COUNT or
                                        config.POOL_LOW_PRIORITY_NODE_COUNT):
                try:
                    self._create_pool()
                    print("Created pool: ", config.POOL_ID)
                except models.BatchErrorException as err:
                    # Only an already-existing pool is safe to reuse. Any
                    # other failure must not be reported as "Using pool";
                    # it is re-raised and reported by the outer handler.
                    code = err.error.code if err.error else None
                    if code != "PoolExists":
                        raise
                    print("Using pool: ", config.POOL_ID)
            else:
                print("Using existing pool: ", config.POOL_ID)

            # Create the job that will run the tasks.
            job_description = models.JobAddParameter(
                id=config.JOB_ID,
                pool_info=models.PoolInformation(pool_id=config.POOL_ID),
            )
            self.batch_client.job.add(job_description)

            # Add the tasks to the job.
            self.batch_client.task.add_collection(config.JOB_ID, self.tasks)

        except models.BatchErrorException as err:
            print_batch_exception(err)
            raise

        if wait:
            self.load_results(**kwargs)
Пример #2
0
    def create_job(self, job_id, pool_id, total_nodes, is_linux_pool):
        """
        Create a job on a pool and add one ``azure-batch-ses`` task per node.

        An already-existing job (error code ``JobExists``) is reused. Each
        task downloads the platform-specific ``azure-batch-ses`` script as a
        resource file and runs it as a pool-scoped admin auto-user, with up
        to three retries.

        :param str job_id: The id of the job to create (or reuse).
        :param str pool_id: The id of the pool the job runs on.
        :param int total_nodes: Number of tasks to add; every task gets a
            fresh UUID as its id.
        :param bool is_linux_pool: Selects the bash script when true, the
            PowerShell script otherwise.
        :raises BatchErrorException: Re-raised after its code, message and
            detail values (when available) have been printed.
        """
        client = self._get_batch_client()
        try:
            pool_info = batchmodels.PoolInformation(pool_id=pool_id)
            job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)

            try:
                client.job.add(job)
            except batchmodels.BatchErrorException as be:
                # Reuse an existing job. Everything else propagates to the
                # outer handler, which reports it exactly once and copes
                # with `be.error` being None.
                if not (be.error and be.error.code == 'JobExists'):
                    raise

            if is_linux_pool:
                cmd_line = '/bin/bash -c azure-batch-ses.sh'
                script = 'azure-batch-ses.sh'
                script_url = 'https://raw.githubusercontent.com/Azure/azure-deadline/master/CloudProviderPlugin/Scripts/azure-batch-ses.sh'
            else:
                cmd_line = 'powershell.exe -file azure-batch-ses.ps1'
                script = 'azure-batch-ses.ps1'
                script_url = 'https://raw.githubusercontent.com/Azure/azure-deadline/master/CloudProviderPlugin/Scripts/azure-batch-ses.ps1'

            # One template task; only its id changes between submissions.
            task = batchmodels.TaskAddParameter(
                id='',
                command_line=cmd_line,
                resource_files=[batchmodels.ResourceFile(script_url, script)],
                constraints=batchmodels.TaskConstraints(
                    max_task_retry_count=3),
                user_identity=batchmodels.UserIdentity(
                    auto_user=batchmodels.AutoUserSpecification(
                        scope=batchmodels.AutoUserScope.pool,
                        elevation_level=batchmodels.ElevationLevel.admin)))

            for _ in range(total_nodes):
                task.id = str(uuid.uuid4())
                client.task.add(job_id=job.id, task=task)

        except batchmodels.BatchErrorException as be:
            if be.error:
                print('Error creating job, code={}, message={}'.format(
                    be.error.code, be.error.message))
                if be.error.values:
                    for e in be.error.values:
                        print('Key={}, Value={}'.format(e.key, e.value))
            raise
Пример #3
0
def submit_job_and_add_tasks(batch_client, block_blob_client, job_id, pool_id, in_files, out_container_name, app_files, storage_account_name, out_sas_token):
  """Create a job and add one python task per input file.

  Tasks are submitted in chunks of 100. A chunk whose submission raises
  is retried every 5 seconds until it succeeds; there is no retry limit.

  :param batch_client: The batch client to use.
  :type batch_client: `batchserviceclient.BatchServiceClient`
  :param block_blob_client: The storage block blob client (not used by
  this function).
  :type block_blob_client: `azure.storage.blob.BlockBlobService`
  :param str job_id: The id of the job to create.
  :param str pool_id: The id of the pool to use.
  :param list in_files: Resource files of the inputs; each one becomes a
  task and its `file_path` is passed as `--filepath`.
  :param str out_container_name: The name of the output container.
  :param list app_files: Resource files attached to every task in
  addition to its input file.
  :param str storage_account_name: The name of the storage account.
  :param str out_sas_token: A SAS token for the output container; it is
  embedded in each task's command line.
  """
  job_started = time.time()
  pool_info = batchmodels.PoolInformation(pool_id=pool_id)
  job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)
  batch_client.job.add(job)
  logging.info('job created in seconds {}'.format(time.time() - job_started))

  tasks_started = time.time()

  command_template = 'python {} --filepath {} --storageaccount {} --storagecontainer {} --sastoken "{}"'
  tasks = []
  for index, in_file in enumerate(in_files):
    command_line = command_template.format(
      _TASK_FILE,
      in_file.file_path,
      storage_account_name,
      out_container_name,
      out_sas_token)
    tasks.append(batchmodels.TaskAddParameter(
      id="EBOTask-{}".format(index),
      command_line=command_line,
      resource_files=[in_file] + app_files))

  chunk_size = 100
  for offset in range(0, len(tasks), chunk_size):
    chunk = tasks[offset:offset + chunk_size]
    # Keep resubmitting the same chunk until the service accepts it.
    while True:
      try:
        batch_client.task.add_collection(job.id, chunk)
      except Exception as e:
        print("Adding task failed... Going to try again in 5 seconds")
        logging.error(e)
        time.sleep(5)
      else:
        break
  logging.info('task created in seconds {}'.format(time.time() - tasks_started))
Пример #4
0
    def __create_pool_and_job(self, cluster_conf, software_metadata_key: str, start_task, VmImageModel):
        """
            Create the pool for a cluster and a job bound to that pool.

            The cluster id is used as both the pool id and the job id.

            :param cluster_conf: the configuration object used to create the cluster
            :type cluster_conf: aztk.models.ClusterConfiguration
            :param software_metadata_key: the id of the software being used on the
                cluster; stored as pool metadata
            :param start_task: the start task for the cluster
            :param VmImageModel: the type of image to provision for the cluster
                (its publisher, offer and sku are read)
            :return: the result of ``helpers.get_cluster`` for the cluster id
        """
        # One identifier serves as pool id and job id.
        cluster_id = cluster_conf.cluster_id

        # Resolve a verified node agent sku and image for the requested image.
        node_agent_sku, image_reference = \
            helpers.select_latest_verified_vm_image_with_node_agent_sku(
                VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, self.batch_client)

        vm_configuration = batch_models.VirtualMachineConfiguration(
            image_reference=image_reference,
            node_agent_sku_id=node_agent_sku)
        software_metadata = batch_models.MetadataItem(
            name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key)

        pool = batch_models.PoolAddParameter(
            id=cluster_id,
            virtual_machine_configuration=vm_configuration,
            vm_size=cluster_conf.vm_size,
            target_dedicated_nodes=cluster_conf.vm_count,
            target_low_priority_nodes=cluster_conf.vm_low_pri_count,
            start_task=start_task,
            enable_inter_node_communication=True,
            max_tasks_per_node=1,
            metadata=[software_metadata])

        # Create the pool (the helper's name suggests it skips an existing one).
        helpers.create_pool_if_not_exist(pool, self.batch_client)

        # Bind a job with the same id to the pool and submit it.
        pool_info = batch_models.PoolInformation(pool_id=cluster_id)
        self.batch_client.job.add(
            batch_models.JobAddParameter(id=cluster_id, pool_info=pool_info))

        return helpers.get_cluster(cluster_id, self.batch_client)
Пример #5
0
def create_job(batch_service_client, job_id, pool_id):
    """
    Add a job with the given ID to the given pool.

    Prints a progress line before submitting. Errors from the service are
    not handled here and propagate to the caller.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    """
    print('Creating job [{}]...'.format(job_id))

    target_pool = batchmodels.PoolInformation(pool_id=pool_id)
    batch_service_client.job.add(
        batchmodels.JobAddParameter(id=job_id, pool_info=target_pool))
Пример #6
0
def newBatchJobSchedule(account, key, URL, job_id, pool_id):
    """
    Interactively create a recurring job schedule on an existing pool.

    Prompts on stdin for the number of days after which the schedule stops
    and for the recurrence interval in hours, then submits the schedule.

    :param account: Batch account name for shared-key authentication.
    :param key: Batch account key.
    :param URL: Batch service URL passed to the client.
    :param job_id: The id of the job schedule to create.
    :param pool_id: The id of the pool the scheduled jobs run on.
    :raises ValueError: If the number of days is not an integer. A bad
        hours value, like any submission error, is only logged.
    """
    credentials = batchAuth.SharedKeyCredentials(account_name=account, key=key)
    batch_client = clientAuth.BatchServiceClient(credentials, URL)

    days_answer = input('How many days do you have this job to run for?: ')
    hours_answer = input('How many hours would you like the recurrence interval to be for your schedule?: ')
    # Converted outside the try block: an invalid value raises to the caller.
    stop_at = datetime.datetime.utcnow() + datetime.timedelta(days=int(days_answer))

    try:
        recurrence = datetime.timedelta(hours=int(hours_answer))
        schedule = batchmodels.Schedule(do_not_run_after=stop_at, recurrence_interval=recurrence)
        job_spec = batchmodels.JobSpecification(
            pool_info=batchmodels.PoolInformation(pool_id=pool_id))
        job_schedule = batchmodels.JobScheduleAddParameter(
            id=job_id, schedule=schedule, job_specification=job_spec)

        batch_client.job_schedule.add(cloud_job_schedule=job_schedule)

    except Exception as e:
        logging.error(msg=e)
Пример #7
0
    def create_job(self, pool_id: str):
        """
        Create the queue job for a pool, tolerating an existing one.

        The job id and display name are both ``<pool_id>-queue``.

        :param pool_id: The id of the pool the job is bound to.
        :return: The ``JobAddParameter`` that was submitted (also returned
            when the job already existed).
        :raises BatchErrorException: For any error other than ``JobExists``.
        """
        from azure.batch import models as batchmodels

        queue_id = pool_id + '-queue'
        pool_info = batchmodels.PoolInformation(pool_id=pool_id)
        job = batchmodels.JobAddParameter(
            id=queue_id,
            display_name=queue_id,
            pool_info=pool_info)

        try:
            self.batch_client.job.add(job)
        except batchmodels.BatchErrorException as err:
            if err.error.code == "JobExists":
                logging.info("Job {!r} already exists".format(queue_id))
            else:
                raise

        return job
Пример #8
0
def submit_job_and_add_task(batch_client, block_blob_client, job_id, pool_id):
    """Create a job and add a single task that runs the uploaded script.

    The script at ``TASK_PATH`` is uploaded to ``CONTAINER_NAME`` as
    ``TASK_NAME`` and handed to the task through a SAS URL whose expiry is
    one hour from now.

    :param batch_client: The batch client to use.
    :param block_blob_client: The storage block blob client to use.
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    """
    pool_info = batchmodels.PoolInformation(pool_id=pool_id)
    job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)
    batch_client.job.add(job)

    # The container may already exist from an earlier run.
    block_blob_client.create_container(CONTAINER_NAME, fail_on_exist=False)

    sas_expiry = datetime.datetime.utcnow() + datetime.timedelta(hours=1)
    sas_url = upload_blob_and_create_sas(
        block_blob_client, CONTAINER_NAME, TASK_NAME, TASK_PATH, sas_expiry)

    script_file = batchmodels.ResourceFile(
        file_path=TASK_NAME,
        blob_source=sas_url)
    task = batchmodels.TaskAddParameter(
        id="SliceTask",
        command_line="python3 " + TASK_NAME,
        resource_files=[script_file])

    batch_client.task.add(job_id=job.id, task=task)
Пример #9
0
def create_job(batch_service_client, job_id, pool_id):
    """
    Add a job to the given pool, tolerating a job that already exists.

    Every Batch error is printed via ``print_batch_exception``; only errors
    other than ``JobExists`` are re-raised.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    """
    print('Creating job [{}]...'.format(job_id))

    target_pool = batchmodels.PoolInformation(pool_id=pool_id)
    job = batchmodels.JobAddParameter(id=job_id, pool_info=target_pool)

    try:
        batch_service_client.job.add(job)
    except batchmodels.batch_error.BatchErrorException as err:
        print_batch_exception(err)
        if err.error.code == "JobExists":
            print("Job {!r} already exists".format(job_id))
        else:
            raise
Пример #10
0
    def configure_job(
        self,
        job_id: str,
        pool_id: str,
        display_name: Optional[str] = None,
        **kwargs,
    ) -> JobAddParameter:
        """
        Build (but do not submit) the job definition for a pool

        :param job_id: A string that uniquely identifies the job within the account
        :param pool_id: A string that identifies the pool
        :param display_name: The display name for the job
        :param kwargs: Extra keyword arguments passed straight to
            ``JobAddParameter``
        :return: The configured ``JobAddParameter``
        """
        target_pool = batch_models.PoolInformation(pool_id=pool_id)
        return batch_models.JobAddParameter(
            id=job_id,
            pool_info=target_pool,
            display_name=display_name,
            **kwargs,
        )
Пример #11
0
def create_job(batch_service_client, job_id, pool_id):
    """
    Creates a job with the specified ID, associated with the specified pool.

    The job is created with task dependencies enabled. A job that already
    exists is tolerated and only logged.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    :raises BatchErrorException: For any Batch error other than
        ``JobExists``.
    """
    LOGGER.info('Creating job [{}]...'.format(job_id))

    job = batch_models.JobAddParameter(
        id=job_id,
        pool_info=batch_models.PoolInformation(pool_id=pool_id),
        uses_task_dependencies=True)
    try:
        batch_service_client.job.add(job)
    except batch_models.BatchErrorException as err:
        # Decide on the stable error code rather than on the wording of the
        # human-readable message, and cope with a missing error body.
        error_code = err.error.code if err.error else None
        if error_code != 'JobExists':
            raise
        LOGGER.info("Job already exists...")
    else:
        LOGGER.info("Job Created")
Пример #12
0
def add_tasks(batch_service_client, pool_id, task_id, docker_image,
              storage_account, storage_key, container_name, file_name,
              output_container):
    """
    Ensure the job ``batchjob`` exists on the pool and add one container task.

    The task runs ``/opt/azureblobworker.sh`` inside ``docker_image`` with
    the storage account, storage key, task id, container name, file name
    and output container as positional arguments.

    :param batch_service_client: A Batch service client.
    :param str pool_id: The id of the pool the job is bound to.
    :param str task_id: The id of the task; also passed to the script.
    :param str docker_image: The container image the task runs in.
    :param str storage_account: Passed to the script.
    :param str storage_key: Passed to the script.
    :param str container_name: Passed to the script.
    :param str file_name: Passed to the script.
    :param str output_container: Passed to the script.
    :raises BatchErrorException: If creating the job fails for any reason
        other than ``JobExists``, or if adding the task fails.
    """
    job_id = "batchjob"
    try:
        job = batchmodels.JobAddParameter(
            id=job_id, pool_info=batchmodels.PoolInformation(pool_id=pool_id))
        batch_service_client.job.add(job)
        logging.info('Adding job {} to pool...'.format(job_id))
    except batchmodels.BatchErrorException as err:
        # Only a genuine "already exists" answer may be ignored; any other
        # failure must not be reported as an existing job.
        if not (err.error and err.error.code == 'JobExists'):
            raise
        logging.info(
            'Job ID: {} already exists and associated with pool...'.format(
                job_id))

    logging.info('Adding tasks to job [{}]...'.format(job_id))

    # The command runs as a task-scoped auto-user with admin elevation.
    user = batchmodels.AutoUserSpecification(
        scope=batchmodels.AutoUserScope.task,
        elevation_level=batchmodels.ElevationLevel.admin)

    # This is the docker image we want to run
    task_container_settings = batchmodels.TaskContainerSettings(
        image_name=docker_image,
        container_run_options='--rm -v /scratch:/scratch')

    # NOTE(review): the storage key is embedded in the task command line,
    # so it is stored with the task definition - confirm this is acceptable.
    task = batchmodels.TaskAddParameter(
        id=task_id,
        command_line='/opt/azureblobworker.sh %s %s %s %s %s %s' %
        (storage_account, storage_key, task_id, container_name, file_name,
         output_container),
        container_settings=task_container_settings,
        user_identity=batchmodels.UserIdentity(auto_user=user))
    batch_service_client.task.add(job_id, task)
Пример #13
0
def submit_job_and_add_task(batch_client, block_blob_client, job_id, pool_id):
    """Create a job and add one task that runs the uploaded python script.

    The script at ``_SIMPLE_TASK_PATH`` is uploaded to ``_CONTAINER_NAME``
    as ``_SIMPLE_TASK_NAME`` and handed to the task through a SAS URL whose
    expiry is one hour from now.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    """
    pool_info = batchmodels.PoolInformation(pool_id=pool_id)
    job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)
    batch_client.job.add(job)

    # The container may already exist from an earlier run.
    block_blob_client.create_container(_CONTAINER_NAME, fail_on_exist=False)

    sas_expiry = datetime.datetime.utcnow() + datetime.timedelta(hours=1)
    sas_url = common.helpers.upload_blob_and_create_sas(
        block_blob_client,
        _CONTAINER_NAME,
        _SIMPLE_TASK_NAME,
        _SIMPLE_TASK_PATH,
        sas_expiry)

    script_file = batchmodels.ResourceFile(
        file_path=_SIMPLE_TASK_NAME,
        blob_source=sas_url)
    task = batchmodels.TaskAddParameter(
        id="MyPythonTask",
        command_line="python " + _SIMPLE_TASK_NAME,
        resource_files=[script_file])

    batch_client.task.add(job_id=job.id, task=task)
Пример #14
0
            start_task=batchmodels.StartTask(
                user_identity=batchmodels.UserIdentity(
                    auto_user=batchmodels.AutoUserSpecification(
                        elevation_level=batchmodels.ElevationLevel.admin,
                        scope=batchmodels.AutoUserScope.pool)),
                command_line=common.helpers.wrap_commands_in_shell(
                    "linux", pool_start_commands),
                resource_files=[]),
        )
        common.helpers.create_pool_if_not_exist(batch_client, pool)

        # Create job to assign tasks
        job_id = "{0:}-job{1:}".format(pool_id, job_n)
        job_ids.append(job_id)
        job = batchmodels.JobAddParameter(
            id=job_id, pool_info=batchmodels.PoolInformation(pool_id=pool_id))
        batch_client.job.add(job)
        print("Job created: {0:}".format(job_id))

        # TODO: SOMETHING WRONG WITH THE OUTPUT FILE GENERATION!!!!

        # Create a task per analysis grid
        for n in job_chunk:

            # CHECKING OUTPUT METHOD
            output_file = analysis_grid_names[n].replace(
                ".json", "_result.json")
            output_file_node = os.path.join("Results", output_file)
            container_sas_url = "https://{0:}.blob.core.windows.net/{1:}?{3:}".format(
                storage_account_name, project_id, output_file_node,
                container_sas_token)
Пример #15
0
def create_job_schedule(batch_client, job_schedule_id, vm_size, vm_count,
                        block_blob_client):
    """Create a job schedule whose jobs run on a per-job auto pool.

    The auto pool uses a cloud service configuration (OS family '6') and a
    start task that runs ``_PYTHON_INSTALL`` after downloading
    ``_PYTHON_DOWNLOAD``. Each scheduled job runs the uploaded script as
    its job manager task and terminates once all its tasks are complete.
    The schedule recurs every 10 minutes and does not run after 30 minutes
    from now.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param str job_schedule_id: The id of the job schedule to create
    :param str vm_size: vm size (sku)
    :param int vm_count: number of vms to allocate
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    """
    # --- Pool: start task that installs python on every node ---
    install_python = batchmodels.StartTask(
        command_line=common.helpers.wrap_commands_in_shell(
            'windows', ['{}'.format(_PYTHON_INSTALL)]),
        resource_files=[
            batchmodels.ResourceFile(http_url=_PYTHON_DOWNLOAD,
                                     file_path='python373.exe')
        ],
        wait_for_success=True,
        user_identity=batchmodels.UserIdentity(
            auto_user=batchmodels.AutoUserSpecification(
                elevation_level=_USER_ELEVATION_LEVEL)))

    pool_spec = batchmodels.PoolSpecification(
        vm_size=vm_size,
        target_dedicated_nodes=vm_count,
        cloud_service_configuration=batchmodels.CloudServiceConfiguration(
            os_family='6'),
        start_task=install_python)

    auto_pool = batchmodels.AutoPoolSpecification(
        auto_pool_id_prefix="JobScheduler",
        pool=pool_spec,
        keep_alive=False,
        pool_lifetime_option=batchmodels.PoolLifetimeOption.job)
    pool_info = batchmodels.PoolInformation(auto_pool_specification=auto_pool)

    # --- Job: manager task runs the uploaded script ---
    script_expiry = datetime.datetime.utcnow() \
        + datetime.timedelta(minutes=30)
    sas_url = common.helpers.upload_blob_and_create_sas(
        block_blob_client, _CONTAINER_NAME, _SIMPLE_TASK_NAME,
        _SIMPLE_TASK_PATH, script_expiry)

    manager_task = batchmodels.JobManagerTask(
        id="JobManagerTask",
        command_line=common.helpers.wrap_commands_in_shell(
            'windows', ['python {}'.format(_SIMPLE_TASK_NAME)]),
        resource_files=[
            batchmodels.ResourceFile(file_path=_SIMPLE_TASK_NAME,
                                     http_url=sas_url)
        ])

    job_spec = batchmodels.JobSpecification(
        pool_info=pool_info,
        # Terminating the job when its tasks finish lets the schedule
        # create the next job.
        on_all_tasks_complete=batchmodels.OnAllTasksComplete.terminate_job,
        job_manager_task=manager_task)

    # --- Schedule ---
    schedule = batchmodels.Schedule(
        do_not_run_after=(datetime.datetime.utcnow()
                          + datetime.timedelta(minutes=30)),
        recurrence_interval=datetime.timedelta(minutes=10))

    batch_client.job_schedule.add(
        cloud_job_schedule=batchmodels.JobScheduleAddParameter(
            id=job_schedule_id,
            schedule=schedule,
            job_specification=job_spec))
Пример #16
0
    def __create_pool_and_job(self, cluster_conf: models.ClusterConfiguration,
                              software_metadata_key: str, start_task,
                              VmImageModel):
        """
            Save the cluster configuration, then create its pool and job.

            The cluster id is used as both the pool id and the job id. The
            pool is auto-scaled with a fixed formula that targets the
            configured dedicated and low-priority node counts.

            :param cluster_conf: the configuration object used to create the cluster
            :type cluster_conf: aztk.models.ClusterConfiguration
            :param software_metadata_key: the id of the software being used on the
                cluster; stored as pool metadata
            :param start_task: the start task for the cluster
            :param VmImageModel: the type of image to provision for the cluster
                (its publisher, offer and sku are read)
            :return: the result of ``helpers.get_cluster`` for the cluster id
        """
        # One identifier serves as pool id and job id.
        cluster_id = cluster_conf.cluster_id
        self._get_cluster_data(cluster_id).save_cluster_config(cluster_conf)

        # Resolve a verified node agent sku and image for the requested image.
        node_agent_sku, image_reference = \
            helpers.select_latest_verified_vm_image_with_node_agent_sku(
                VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, self.batch_client)

        subnet_id = cluster_conf.subnet_id
        if subnet_id is None:
            network_conf = None
        else:
            network_conf = batch_models.NetworkConfiguration(
                subnet_id=subnet_id)

        auto_scale_formula = "$TargetDedicatedNodes={0}; $TargetLowPriorityNodes={1}".format(
            cluster_conf.vm_count, cluster_conf.vm_low_pri_count)

        pool_metadata = [
            batch_models.MetadataItem(
                name=constants.AZTK_SOFTWARE_METADATA_KEY,
                value=software_metadata_key),
            batch_models.MetadataItem(
                name=constants.AZTK_MODE_METADATA_KEY,
                value=constants.AZTK_CLUSTER_MODE_METADATA),
        ]
        vm_configuration = batch_models.VirtualMachineConfiguration(
            image_reference=image_reference,
            node_agent_sku_id=node_agent_sku)

        pool = batch_models.PoolAddParameter(
            id=cluster_id,
            virtual_machine_configuration=vm_configuration,
            vm_size=cluster_conf.vm_size,
            enable_auto_scale=True,
            auto_scale_formula=auto_scale_formula,
            auto_scale_evaluation_interval=timedelta(minutes=5),
            start_task=start_task,
            # Inter-node communication is switched off whenever a (truthy)
            # subnet id is configured.
            enable_inter_node_communication=not subnet_id,
            max_tasks_per_node=1,
            network_configuration=network_conf,
            metadata=pool_metadata)

        # Create the pool (the helper's name suggests it skips an existing one).
        helpers.create_pool_if_not_exist(pool, self.batch_client)

        # Bind a job with the same id to the pool and submit it.
        pool_info = batch_models.PoolInformation(pool_id=cluster_id)
        self.batch_client.job.add(
            batch_models.JobAddParameter(id=cluster_id, pool_info=pool_info))

        return helpers.get_cluster(cluster_id, self.batch_client)
def submit_job_and_add_task(batch_client, block_blob_client, job_id, pool_id,
                            storage_account_name):
    """Create a job and add one task per blob in the resource container.

    Every task runs ``_APP_EXE_NAME`` with ``_APP_EXTRA_ARGS``, the blob
    name and an output file name derived from it (``<base>_out<ext>``).
    The output file is uploaded on task completion to the container whose
    URL is built from the storage account name and the job id.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to create; also used as the
        output container name.
    :param str pool_id: The id of the pool to use.
    :param str storage_account_name: The storage account that hosts the
        output container.
    """
    pool_info = batchmodels.PoolInformation(pool_id=pool_id)
    job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)
    batch_client.job.add(job)

    # Write-permission SAS for the output container named after the job.
    container_sas = common.helpers.create_container_and_create_sas(
        block_blob_client,
        job_id,
        azureblob.BlobPermissions.WRITE,
        expiry=None,
        timeout=120)
    container_url = 'https://{}.blob.core.windows.net/{}?{}'.format(
        storage_account_name, job_id, container_sas)

    app_files = get_resource_file_list_from_container(
        block_blob_client, _APP_CONTAINER_NAME)

    for blob in block_blob_client.list_blobs(_RESOURCE_CONTAINER_NAME):
        stem, extension = os.path.splitext(blob.name)
        result_name = f"{stem}_out{extension}"
        command_line = f"{_APP_EXE_NAME} {_APP_EXTRA_ARGS} {blob.name} {result_name}"
        task_id = f"{_APP_EXE_NAME}_{stem}_Task"

        # Read-permission SAS URL for this input blob, expiry one hour from now.
        input_sas_url = common.helpers.create_sas_url(
            block_blob_client, _RESOURCE_CONTAINER_NAME, blob.name,
            azureblob.BlobPermissions.READ,
            datetime.datetime.utcnow() + datetime.timedelta(hours=1))
        input_file = batchmodels.ResourceFile(file_path=blob.name,
                                              http_url=input_sas_url)
        # NOTE(review): this prints a SAS URL to stdout - confirm intended.
        print(input_sas_url)
        print(app_files)

        print(f"Creating task ({task_id}): {command_line}")
        destination = batchmodels.OutputFileDestination(
            container=batchmodels.OutputFileBlobContainerDestination(
                container_url=container_url))
        upload_options = batchmodels.OutputFileUploadOptions(
            upload_condition=batchmodels.OutputFileUploadCondition.
            task_completion)
        result_file = batchmodels.OutputFile(
            file_pattern=result_name,
            destination=destination,
            upload_options=upload_options)

        task = batchmodels.TaskAddParameter(
            id=task_id,
            command_line=command_line,
            resource_files=app_files + [input_file],
            output_files=[result_file])

        batch_client.task.add(job_id=job.id, task=task)
Пример #18
0
def create_job_schedule(batch_client, job_schedule_id, vm_size, vm_count):
    """Create a job schedule whose jobs run on a per-job auto pool.

    The auto pool uses an Ubuntu Server 18.04-LTS image, references
    version 1 of the ``azure_batch`` application package and runs that
    package's node start-up script as a pool-scoped admin user. Each
    scheduled job runs the package's ``azure_batch_main.py`` as its job
    manager task and terminates when all its tasks are complete. The
    schedule recurs every 15 days.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param str job_schedule_id: The id of the job schedule to create
    :param str vm_size: vm size (sku)
    :param int vm_count: number of vms to allocate
    """
    # --- Pool definition ---
    vm_configuration = batchmodels.VirtualMachineConfiguration(
        image_reference=batchmodels.ImageReference(
            publisher="Canonical",
            offer="UbuntuServer",
            sku="18.04-LTS",
            version="latest"),
        node_agent_sku_id="batch.node.ubuntu 18.04")

    admin_identity = batchmodels.UserIdentity(
        auto_user=batchmodels.AutoUserSpecification(
            scope=batchmodels.AutoUserScope.pool,
            elevation_level=batchmodels.ElevationLevel.admin))
    node_startup = batchmodels.StartTask(
        command_line='/bin/bash -c "$AZ_BATCH_APP_PACKAGE_azure_batch_1/azure_batch/job_schedular_node_startup_tasks.sh"',
        wait_for_success=True,
        user_identity=admin_identity)

    app_package = batchmodels.ApplicationPackageReference(
        application_id="azure_batch", version="1")

    pool_spec = batchmodels.PoolSpecification(
        vm_size=vm_size,
        target_dedicated_nodes=vm_count,
        virtual_machine_configuration=vm_configuration,
        start_task=node_startup,
        application_package_references=[app_package])

    pool_info = batchmodels.PoolInformation(
        auto_pool_specification=batchmodels.AutoPoolSpecification(
            auto_pool_id_prefix="JobScheduler",
            pool=pool_spec,
            keep_alive=False,
            pool_lifetime_option=batchmodels.PoolLifetimeOption.job))

    # --- Job definition: the manager task is what runs on each recurrence ---
    manager_task = batchmodels.JobManagerTask(
        id="JobManagerTask",
        command_line='/bin/bash -c " python3 $AZ_BATCH_APP_PACKAGE_azure_batch_1/azure_batch/azure_batch_main.py"')

    job_spec = batchmodels.JobSpecification(
        pool_info=pool_info,
        # Terminating the job when its tasks finish lets the schedule
        # create the next job.
        on_all_tasks_complete=batchmodels.OnAllTasksComplete.terminate_job,
        job_manager_task=manager_task)

    # --- Schedule: recurrence interval of the job schedule ---
    schedule = batchmodels.Schedule(
        recurrence_interval=datetime.timedelta(days=15))

    batch_client.job_schedule.add(
        cloud_job_schedule=batchmodels.JobScheduleAddParameter(
            id=job_schedule_id,
            schedule=schedule,
            job_specification=job_spec))
Пример #19
0
    def __submit_job(self,
                     job_configuration,
                     start_task,
                     job_manager_task,
                     autoscale_formula,
                     software_metadata_key: str,
                     vm_image_model,
                     application_metadata):
        """
            Submit a job as a Batch job schedule backed by an auto pool.

            The job's cluster configuration is saved first. The schedule is
            created with no timing constraints (all schedule fields None),
            and the job terminates once all of its tasks are complete.

            :param job_configuration -> aztk_sdk.spark.models.JobConfiguration
            :param start_task -> batch_models.StartTask
            :param job_manager_task -> batch_models.TaskAddParameter
            :param autoscale_formula -> str
            :param software_metadata_key -> str
            :param vm_image_model -> aztk_sdk.models.VmImage
            :param application_metadata -> value stored under the job's
                'applications' metadata item
            :returns the job schedule fetched back from the Batch service
        """
        job_id = job_configuration.id
        self._get_cluster_data(job_id).save_cluster_config(job_configuration.to_cluster_config())

        # Resolve a verified node agent sku and image for the requested image.
        node_agent_sku, image_reference = \
            helpers.select_latest_verified_vm_image_with_node_agent_sku(
                vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, self.batch_client)

        # A network configuration is only needed when a subnet is given.
        if job_configuration.subnet_id:
            network_conf = batch_models.NetworkConfiguration(
                subnet_id=job_configuration.subnet_id)
        else:
            network_conf = None

        # Pool that lives as long as the job schedule does.
        pool_metadata = [
            batch_models.MetadataItem(
                name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key),
            batch_models.MetadataItem(
                name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA),
        ]
        pool_spec = batch_models.PoolSpecification(
            display_name=job_id,
            virtual_machine_configuration=batch_models.VirtualMachineConfiguration(
                image_reference=image_reference,
                node_agent_sku_id=node_agent_sku),
            vm_size=job_configuration.vm_size,
            enable_auto_scale=True,
            auto_scale_formula=autoscale_formula,
            auto_scale_evaluation_interval=timedelta(minutes=5),
            start_task=start_task,
            enable_inter_node_communication=not job_configuration.mixed_mode(),
            network_configuration=network_conf,
            max_tasks_per_node=4,
            metadata=pool_metadata)
        auto_pool = batch_models.AutoPoolSpecification(
            pool_lifetime_option=batch_models.PoolLifetimeOption.job_schedule,
            auto_pool_id_prefix=job_id,
            keep_alive=False,
            pool=pool_spec)

        # Job specification.
        job_spec = batch_models.JobSpecification(
            pool_info=batch_models.PoolInformation(auto_pool_specification=auto_pool),
            display_name=job_id,
            on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job,
            job_manager_task=job_manager_task,
            metadata=[
                batch_models.MetadataItem(
                    name='applications', value=application_metadata)
            ])

        # Schedule without any timing constraints.
        unconstrained_schedule = batch_models.Schedule(
            do_not_run_until=None,
            do_not_run_after=None,
            start_window=None,
            recurrence_interval=None)

        # Create the job schedule, then return it as stored by the service.
        self.batch_client.job_schedule.add(
            batch_models.JobScheduleAddParameter(
                id=job_id,
                schedule=unconstrained_schedule,
                job_specification=job_spec))

        return self.batch_client.job_schedule.get(job_schedule_id=job_id)
Пример #20
0
def job_create():
    """Add a new job to the Batch service.

    The job id is read from ``config_azure['job_id']`` and the job is bound
    to the existing pool named by ``config_azure['batch_pool_name']``.
    """
    job_id = config_azure['job_id']
    target_pool = batchmodel.PoolInformation(
        pool_id=config_azure['batch_pool_name'])
    job_request = batchmodel.JobAddParameter(id=job_id, pool_info=target_pool)
    batch_service.job.add(job=job_request)
Пример #21
0
def retarget_job_to_new_pool(batch_service_client: batch.BatchExtensionsClient,
                             job_id: str, new_pool_id: str):
    """ Disables a job with task requeue, then patches it to target a new pool.

    After the disable call the function sleeps for
    ``service_state_transition_seconds`` and then patches the job's pool
    information. The patch is retried (with a 5 second pause) while the
    service reports that the operation is not valid for the current state of
    the resource; any other Batch error is re-raised immediately.

    :param batch_service_client: The batch client used for making batch operations
    :type batch_service_client: `azure.batch.BatchExtensionsClient`
    :param job_id: The job to retarget
    :type job_id: str
    :param new_pool_id: The id of the new pool
    :type new_pool_id: str
    :raises ex.JobAlreadyCompleteException: If the disable call fails because
        the job does not exist or is already in a completed state.
    :raises batchmodels.BatchErrorException: If disabling fails for any other
        reason, if the patch fails with an unexpected error, or if the patch
        retries are exhausted.
    """
    logger.info("Retargeting job [{}] to new pool [{}]".format(
        job_id, new_pool_id))

    try:

        batch_service_client.job.disable(job_id, "requeue")

    except batchmodels.BatchErrorException as batch_exception:
        # potential race condition where the nodes have gone idle and the job has 'Completed' between our internal
        # node-idle-timeout check and the call to disable the job. Surface this to the caller as a
        # JobAlreadyCompleteException rather than a raw Batch error.
        if expected_exception(batch_exception,
                              "The specified job does not exist"):
            logger.info(
                "The specified Job [{}] did not exist when we tried to delete it."
                .format(job_id))
            raise ex.JobAlreadyCompleteException(
                job_id, "Job already complete and deleted.")

        if expected_exception(
                batch_exception,
                "The specified job is already in a completed state"):
            logger.info(
                "The specified Job [{}] was already in completed state when we tried to delete it."
                .format(job_id))
            raise ex.JobAlreadyCompleteException(job_id,
                                                 "Job already complete.")
        raise

    # give the job time to move to disabled state before we try Patch it
    time.sleep(service_state_transition_seconds)

    job_patch_retry_count = 0
    while True:
        try:
            batch_service_client.job.patch(
                job_id,
                batchmodels.JobPatchParameter(
                    pool_info=batchmodels.PoolInformation(
                        pool_id=new_pool_id)))
            break
        except batchmodels.BatchErrorException as batch_exception:
            if not expected_exception(
                    batch_exception,
                    "The specified operation is not valid for the current state of the resource"
            ):
                # Only the "not valid for the current state" error is worth
                # retrying. Anything else used to be swallowed here, which
                # made the loop re-issue the patch forever with no delay and
                # no retry cap; propagate it to the caller instead.
                raise

            if job_patch_retry_count > 10:
                logger.error(
                    "Exhausted retries and Failed to patch job [{}] due to the current state of the resource"
                    .format(job_id))
                raise
            logger.info(
                "Failed to patch job [{}] due to the current state of the resource, retrying...."
                .format(job_id))
            time.sleep(5)
            job_patch_retry_count = job_patch_retry_count + 1

    logger.info("Successfully retargeted job [{}] to pool [{}]".format(
        job_id, new_pool_id))
    def test_batch_job_schedules(self, **kwargs):
        """Walk a single job schedule through its whole lifecycle.

        Covers, in order: add, list, get, exists, listing the jobs created
        from the schedule, disable, enable, update, patch, terminate and
        delete. Every mutating call is expected to return ``None``.
        """
        client = self.create_aad_client(**kwargs)

        # --- create ---------------------------------------------------------
        schedule_id = self.get_resource_name('batch_schedule_')
        target_pool = models.PoolInformation("pool_id")
        retry_limits = models.JobConstraints(max_task_retry_count=2)
        initial_spec = models.JobSpecification(
            pool_info=target_pool,
            constraints=retry_limits,
            on_all_tasks_complete=models.OnAllTasksComplete.terminate_job
        )
        daily_schedule = models.Schedule(
            start_window=datetime.timedelta(hours=1),
            recurrence_interval=datetime.timedelta(days=1)
        )
        add_request = models.JobScheduleAddParameter(
            schedule_id, daily_schedule, initial_spec)
        self.assertIsNone(client.job_schedule.add(add_request))

        # --- list -----------------------------------------------------------
        all_schedules = list(client.job_schedule.list())
        self.assertTrue(len(all_schedules) > 0)

        # --- get ------------------------------------------------------------
        fetched = client.job_schedule.get(schedule_id)
        self.assertIsInstance(fetched, models.CloudJobSchedule)
        self.assertEqual(fetched.id, schedule_id)
        self.assertEqual(fetched.state, models.JobScheduleState.active)

        # --- exists ---------------------------------------------------------
        self.assertTrue(client.job_schedule.exists(schedule_id))

        # --- jobs spawned by the schedule -----------------------------------
        spawned_jobs = list(client.job.list_from_job_schedule(schedule_id))
        self.assertTrue(len(spawned_jobs) > 0)

        # --- disable, then re-enable ----------------------------------------
        self.assertIsNone(client.job_schedule.disable(schedule_id))
        self.assertIsNone(client.job_schedule.enable(schedule_id))

        # --- update (full replacement of schedule and job specification) ----
        replacement_spec = models.JobSpecification(
            pool_info=models.PoolInformation('pool_id')
        )
        ten_hourly = models.Schedule(
            recurrence_interval=datetime.timedelta(hours=10)
        )
        update_request = models.JobScheduleUpdateParameter(
            ten_hourly, replacement_spec)
        self.assertIsNone(
            client.job_schedule.update(schedule_id, update_request))

        # --- patch (schedule only) ------------------------------------------
        five_hourly = models.Schedule(
            recurrence_interval=datetime.timedelta(hours=5)
        )
        patch_request = models.JobSchedulePatchParameter(five_hourly)
        self.assertIsNone(
            client.job_schedule.patch(schedule_id, patch_request))

        # --- terminate, then delete -----------------------------------------
        self.assertIsNone(client.job_schedule.terminate(schedule_id))
        self.assertIsNone(client.job_schedule.delete(schedule_id))
    def test_batch_jobs(self, **kwargs):
        """Exercise the job operations against two auto-pool jobs.

        The first job (with preparation and release tasks) is added, updated,
        patched, fetched, disabled, enabled, queried for prep/release task
        status and finally terminated. The second job is added with
        auto-complete options, fetched and later deleted. The test also lists
        all jobs and reads the job lifetime statistics.
        """
        client = self.create_sharedkey_client(**kwargs)

        # Auto-pool specification shared by every job request below.
        cloud_service = models.CloudServiceConfiguration(os_family='5')
        pool_spec = models.PoolSpecification(
            vm_size='small',
            cloud_service_configuration=cloud_service
        )
        auto_pool = models.AutoPoolSpecification(
            pool_lifetime_option=models.PoolLifetimeOption.job,
            pool=pool_spec
        )

        def auto_pool_info():
            # Each request gets its own PoolInformation wrapping auto_pool.
            return models.PoolInformation(auto_pool_specification=auto_pool)

        # --- create the first job -------------------------------------------
        prep_task = models.JobPreparationTask(command_line="cmd /c \"echo hello world\"")
        release_task = models.JobReleaseTask(command_line="cmd /c \"echo goodbye world\"")
        first_job = models.JobAddParameter(
            id=self.get_resource_name('batch_job1_'),
            pool_info=auto_pool_info(),
            job_preparation_task=prep_task,
            job_release_task=release_task
        )
        self.assertIsNone(client.job.add(first_job))

        # --- update ---------------------------------------------------------
        update_request = models.JobUpdateParameter(
            priority=500,
            constraints=models.JobConstraints(max_task_retry_count=3),
            pool_info=auto_pool_info()
        )
        self.assertIsNone(client.job.update(first_job.id, update_request))

        # --- patch ----------------------------------------------------------
        patch_request = models.JobPatchParameter(priority=900)
        self.assertIsNone(client.job.patch(first_job.id, patch_request))

        # The fetched job should reflect both the update and the patch.
        fetched = client.job.get(first_job.id)
        self.assertIsInstance(fetched, models.CloudJob)
        self.assertEqual(fetched.id, first_job.id)
        self.assertEqual(fetched.constraints.max_task_retry_count, 3)
        self.assertEqual(fetched.priority, 900)

        # --- create the second job, with auto-complete options --------------
        second_job = models.JobAddParameter(
            id=self.get_resource_name('batch_job2_'),
            on_all_tasks_complete=models.OnAllTasksComplete.terminate_job,
            on_task_failure=models.OnTaskFailure.perform_exit_options_job_action,
            pool_info=auto_pool_info()
        )
        self.assertIsNone(client.job.add(second_job))
        fetched_auto = client.job.get(second_job.id)
        self.assertIsInstance(fetched_auto, models.CloudJob)
        self.assertEqual(fetched_auto.on_all_tasks_complete,
                         models.OnAllTasksComplete.terminate_job)
        self.assertEqual(fetched_auto.on_task_failure,
                         models.OnTaskFailure.perform_exit_options_job_action)

        # --- list -----------------------------------------------------------
        job_page = client.job.list()
        self.assertIsInstance(job_page, models.CloudJobPaged)
        self.assertEqual(len(list(job_page)), 2)

        # --- disable, then re-enable the first job --------------------------
        self.assertIsNone(
            client.job.disable(first_job.id, models.DisableJobOption.requeue))
        self.assertIsNone(client.job.enable(first_job.id))

        # --- preparation / release task status ------------------------------
        prep_release_status = client.job.list_preparation_and_release_task_status(first_job.id)
        self.assertIsInstance(
            prep_release_status,
            models.JobPreparationAndReleaseTaskExecutionInformationPaged)
        self.assertEqual(list(prep_release_status), [])

        # --- terminate the first job, delete the second ---------------------
        self.assertIsNone(client.job.terminate(first_job.id))
        self.assertIsNone(client.job.delete(second_job.id))

        # --- lifetime statistics --------------------------------------------
        lifetime_stats = client.job.get_all_lifetime_statistics()
        self.assertIsInstance(lifetime_stats, models.JobStatistics)
        self.assertEqual(lifetime_stats.num_succeeded_tasks, 0)
        self.assertEqual(lifetime_stats.num_failed_tasks, 0)
Пример #24
0
def submit_job_and_add_task(batch_client, block_blob_client, job_id, pool_id,
                            block_indices, input_files,
                            output_container_sas_url):
    """Submit a job to the Azure Batch service and add one task per block.

    The task script is uploaded to blob storage, then every
    ``(block, input_file)`` pair becomes a task that runs the script with
    ``-b <block>``. Tasks are sent to the service in groups of at most 100.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    :param block_indices: Block identifiers, paired positionally with
        ``input_files``; each is formatted with ``%d`` into the command line.
    :param input_files: Per-task input resources; each must expose a
        ``file_path`` attribute and is passed through as a task resource file
        (presumably ``batchmodels.ResourceFile`` objects -- confirm with the
        caller).
    :param output_container_sas_url: Container URL that successful tasks
        upload their output file to.
    """
    print(block_indices)

    job = batchmodels.JobAddParameter(
        id=job_id, pool_info=batchmodels.PoolInformation(pool_id=pool_id))
    batch_client.job.add(job)

    block_blob_client.create_container(_CONTAINER_NAME, fail_on_exist=False)
    sas_url = common.helpers.upload_blob_and_create_sas(
        block_blob_client, _CONTAINER_NAME, _SIMPLE_TASK_NAME,
        _SIMPLE_TASK_PATH, _EXPIRY_TIME)

    # Tasks collected so far that have not yet been sent to the service.
    pending = []

    for block, input_file in zip(block_indices, input_files):
        # Output name: the input path up to its last '.', with any remaining
        # dots removed, plus '_model.dat'. A path without a dot yields just
        # '_model.dat'.
        # NOTE(review): stripping the inner dots looks unintentional, but the
        # task script must produce a matching file name -- confirm before
        # changing.
        stem, _, _ = input_file.file_path.rpartition('.')
        output_file_path = stem.replace('.', '') + '_model.dat'

        task_file = batchmodels.ResourceFile(file_path=_SIMPLE_TASK_NAME,
                                             http_url=sas_url)
        print(type(input_file), type(task_file))

        destination = batchmodels.OutputFileDestination(
            container=batchmodels.OutputFileBlobContainerDestination(
                container_url=output_container_sas_url))
        upload_options = batchmodels.OutputFileUploadOptions(
            upload_condition=batchmodels.OutputFileUploadCondition.task_success)
        model_output = batchmodels.OutputFile(
            file_pattern=output_file_path,
            destination=destination,
            upload_options=upload_options)

        pending.append(
            batchmodels.TaskAddParameter(
                id='Task{}'.format(block),
                command_line="python3 %s -b %d" % (_SIMPLE_TASK_NAME, block),
                resource_files=[task_file, input_file],
                output_files=[model_output]))

        # We can only send batches with up to 100 records
        if len(pending) > 99:
            batch_client.task.add_collection(job.id, pending)
            pending = []

    # Send whatever is left over from the last, partially filled group.
    if pending:
        batch_client.task.add_collection(job.id, pending)