def run(self, wait=True, **kwargs) -> None:
    r"""Run the batch job described by ``self.config`` (a :class:`BatchConfig`
    instance with the Azure Batch run parameters).

    :param bool wait: If true, wait for the batch to complete and then
        download the results to file
    :raises BatchErrorException: If raised by the Azure Batch Python SDK
    """
    if not hasattr(self, "tasks"):
        raise ValueError(
            "Client restored from data cannot be used to run the job")
    try:
        # Create the pool that will contain the compute nodes that will
        # execute the tasks.
        if not (self.config.POOL_VM_SIZE
                and (self.config.POOL_NODE_COUNT
                     or self.config.POOL_LOW_PRIORITY_NODE_COUNT)):
            print("Using existing pool: ", self.config.POOL_ID)
        else:
            try:
                self._create_pool()
                print("Created pool: ", self.config.POOL_ID)
            except models.BatchErrorException:
                print("Using pool: ", self.config.POOL_ID)

        # Create the job that will run the tasks.
        job_description = models.JobAddParameter(
            id=self.config.JOB_ID,
            pool_info=models.PoolInformation(pool_id=self.config.POOL_ID),
        )
        self.batch_client.job.add(job_description)

        # Add the tasks to the job.
        self.batch_client.task.add_collection(self.config.JOB_ID, self.tasks)
    except models.BatchErrorException as err:
        print_batch_exception(err)
        raise err

    if wait:
        self.load_results(**kwargs)
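# Usage sketch for run() above (not from the source): the wrapper class and
# BatchConfig fields are assumptions based on the attributes the method reads
# (self.config, self.tasks, self.batch_client).
config = BatchConfig(POOL_ID="mypool", JOB_ID="myjob",
                     POOL_VM_SIZE="STANDARD_A1_V2", POOL_NODE_COUNT=2)
runner = BatchRun(config)  # hypothetical class exposing run()
runner.run(wait=True)      # submits the job, waits, then calls load_results()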
def create_job(self, job_id, pool_id, total_nodes, is_linux_pool):
    client = self._get_batch_client()
    try:
        pool_info = batchmodels.PoolInformation(pool_id=pool_id)
        job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)
        try:
            client.job.add(job)
        except batchmodels.BatchErrorException as be:
            if be.error and be.error.code == 'JobExists':
                pass  # idempotent: reuse the existing job
            else:
                if be.error:
                    print('Error creating job, code={}, message={}'.format(
                        be.error.code, be.error.message))
                raise

        if is_linux_pool:
            cmd_line = '/bin/bash -c azure-batch-ses.sh'
            script = 'azure-batch-ses.sh'
            script_url = 'https://raw.githubusercontent.com/Azure/azure-deadline/master/CloudProviderPlugin/Scripts/azure-batch-ses.sh'
        else:
            cmd_line = 'powershell.exe -file azure-batch-ses.ps1'
            script = 'azure-batch-ses.ps1'
            script_url = 'https://raw.githubusercontent.com/Azure/azure-deadline/master/CloudProviderPlugin/Scripts/azure-batch-ses.ps1'

        task = batchmodels.TaskAddParameter(
            id='',
            command_line=cmd_line,
            resource_files=[batchmodels.ResourceFile(script_url, script)],
            constraints=batchmodels.TaskConstraints(max_task_retry_count=3),
            user_identity=batchmodels.UserIdentity(
                auto_user=batchmodels.AutoUserSpecification(
                    scope=batchmodels.AutoUserScope.pool,
                    elevation_level=batchmodels.ElevationLevel.admin)))

        # One copy of the setup task per node, each with a unique id.
        for i in range(total_nodes):
            task.id = str(uuid.uuid4())
            client.task.add(job_id=job.id, task=task)
    except batchmodels.BatchErrorException as be:
        if be.error:
            print('Error creating job, code={}, message={}'.format(
                be.error.code, be.error.message))
            if be.error.values:
                for e in be.error.values:
                    print('Key={}, Value={}'.format(e.key, e.value))
        raise
def submit_job_and_add_tasks(batch_client, block_blob_client, job_id, pool_id,
                             in_files, out_container_name, app_files,
                             storage_account_name, out_sas_token):
    """Submits a job to the Azure Batch service and adds tasks that run a
    python script.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    :param list in_files: The list of the file paths of the inputs.
    :param str out_container_name: The name of the output container.
    :param list app_files: The list of all the other scripts to upload.
    :param str storage_account_name: The name of the storage account.
    :param str out_sas_token: A SAS token granting the specified
        permissions to the output container.
    """
    start = time.time()
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))
    batch_client.job.add(job)
    logging.info('job created in seconds {}'.format(time.time() - start))

    start = time.time()
    tasks = [batchmodels.TaskAddParameter(
        id="EBOTask-{}".format(i),
        command_line='python {} --filepath {} --storageaccount {} '
                     '--storagecontainer {} --sastoken "{}"'.format(
                         _TASK_FILE, in_file.file_path, storage_account_name,
                         out_container_name, out_sas_token),
        resource_files=[in_file] + app_files)
        for i, in_file in enumerate(in_files)]

    # add_collection accepts at most 100 tasks per call, so submit in chunks
    # and retry a failed chunk after a short pause.
    cnt = 0
    tot_tasks = len(tasks)
    while cnt < tot_tasks:
        try:
            batch_client.task.add_collection(job.id, tasks[cnt:cnt + 100])
            cnt += 100
        except Exception as e:
            print("Adding task failed... Going to try again in 5 seconds")
            logging.error(e)
            time.sleep(5)
    logging.info('task created in seconds {}'.format(time.time() - start))
def __create_pool_and_job(self, cluster_conf, software_metadata_key: str,
                          start_task, VmImageModel):
    """
    Create a pool and job
    :param cluster_conf: the configuration object used to create the cluster
    :type cluster_conf: aztk.models.ClusterConfiguration
    :param software_metadata_key: the id of the software being used on the cluster
    :param start_task: the start task for the cluster
    :param VmImageModel: the type of image to provision for the cluster
    """
    # reuse pool_id as job_id
    pool_id = cluster_conf.cluster_id
    job_id = cluster_conf.cluster_id

    # Get a verified node agent sku
    sku_to_use, image_ref_to_use = \
        helpers.select_latest_verified_vm_image_with_node_agent_sku(
            VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku,
            self.batch_client)

    # Configure the pool
    pool = batch_models.PoolAddParameter(
        id=pool_id,
        virtual_machine_configuration=batch_models.VirtualMachineConfiguration(
            image_reference=image_ref_to_use,
            node_agent_sku_id=sku_to_use),
        vm_size=cluster_conf.vm_size,
        target_dedicated_nodes=cluster_conf.vm_count,
        target_low_priority_nodes=cluster_conf.vm_low_pri_count,
        start_task=start_task,
        enable_inter_node_communication=True,
        max_tasks_per_node=1,
        metadata=[
            batch_models.MetadataItem(
                name=constants.AZTK_SOFTWARE_METADATA_KEY,
                value=software_metadata_key),
        ])

    # Create the pool + create user for the pool
    helpers.create_pool_if_not_exist(pool, self.batch_client)

    # Create job
    job = batch_models.JobAddParameter(
        id=job_id,
        pool_info=batch_models.PoolInformation(pool_id=pool_id))

    # Add job to batch
    self.batch_client.job.add(job)

    return helpers.get_cluster(cluster_conf.cluster_id, self.batch_client)
def create_job(batch_service_client, job_id, pool_id):
    """
    Creates a job with the specified ID, associated with the specified pool.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    """
    print('Creating job [{}]...'.format(job_id))

    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))

    batch_service_client.job.add(job)
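# For context, a sketch of wiring up the BatchServiceClient that these
# create_job helpers expect; the account name, key and endpoint are
# placeholders, not values from the source.
from azure.batch import BatchServiceClient
from azure.batch.batch_auth import SharedKeyCredentials

credentials = SharedKeyCredentials("mybatchaccount", "<account-key>")
batch_client = BatchServiceClient(
    credentials, "https://mybatchaccount.westus.batch.azure.com")
create_job(batch_client, "myjob", "mypool")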
def newBatchJobSchedule(account, key, URL, job_id, pool_id):
    creds = batchAuth.SharedKeyCredentials(account_name=account, key=key)
    client_creds = clientAuth.BatchServiceClient(creds, URL)
    stop_running = input('How many days would you like this job to run for?: ')
    hours = input('How many hours would you like the recurrence interval of your schedule to be?: ')
    time_to_run = datetime.datetime.utcnow() + datetime.timedelta(days=int(stop_running))
    try:
        pool = batchmodels.PoolInformation(pool_id=pool_id)
        jobSpec = batchmodels.JobSpecification(pool_info=pool)
        schedule = batchmodels.Schedule(
            do_not_run_after=time_to_run,
            recurrence_interval=datetime.timedelta(hours=int(hours)))
        job = batchmodels.JobScheduleAddParameter(
            id=job_id, schedule=schedule, job_specification=jobSpec)
        client_creds.job_schedule.add(cloud_job_schedule=job)
    except Exception as e:
        logging.error(msg=e)
def create_job(self, pool_id: str):
    from azure.batch import models as batchmodels

    job_queue_name = pool_id + '-queue'
    job = batchmodels.JobAddParameter(
        id=job_queue_name,
        display_name=job_queue_name,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))

    try:
        self.batch_client.job.add(job)
    except batchmodels.BatchErrorException as err:
        if err.error.code != "JobExists":
            raise
        else:
            logging.info("Job {!r} already exists".format(job_queue_name))

    return job
def submit_job_and_add_task(batch_client, block_blob_client, job_id, pool_id):
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))

    batch_client.job.add(job)

    block_blob_client.create_container(CONTAINER_NAME, fail_on_exist=False)

    sas_url = upload_blob_and_create_sas(
        block_blob_client,
        CONTAINER_NAME,
        TASK_NAME,
        TASK_PATH,
        datetime.datetime.utcnow() + datetime.timedelta(hours=1))

    task = batchmodels.TaskAddParameter(
        id="SliceTask",
        command_line="python3 " + TASK_NAME,
        resource_files=[
            batchmodels.ResourceFile(file_path=TASK_NAME,
                                     blob_source=sas_url)
        ])

    batch_client.task.add(job_id=job.id, task=task)
def create_job(batch_service_client, job_id, pool_id):
    """
    Creates a job with the specified ID, associated with the specified pool.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    """
    print('Creating job [{}]...'.format(job_id))

    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))

    try:
        batch_service_client.job.add(job)
    except batchmodels.batch_error.BatchErrorException as err:
        print_batch_exception(err)
        if err.error.code != "JobExists":
            raise
        else:
            print("Job {!r} already exists".format(job_id))
def configure_job(
    self,
    job_id: str,
    pool_id: str,
    display_name: Optional[str] = None,
    **kwargs,
) -> JobAddParameter:
    """
    Configures a job for use in the pool

    :param job_id: A string that uniquely identifies the job within the account
    :param pool_id: A string that identifies the pool
    :param display_name: The display name for the job
    """
    job = batch_models.JobAddParameter(
        id=job_id,
        pool_info=batch_models.PoolInformation(pool_id=pool_id),
        display_name=display_name,
        **kwargs,
    )
    return job
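# configure_job() only builds the JobAddParameter; nothing is submitted until
# the result is passed to the service. A sketch, where `manager` is a
# hypothetical instance of the surrounding class with a batch_client attribute.
job = manager.configure_job("myjob", "mypool", display_name="nightly run",
                            uses_task_dependencies=True)
manager.batch_client.job.add(job)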
def create_job(batch_service_client, job_id, pool_id):
    """
    Creates a job with the specified ID, associated with the specified pool.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    """
    LOGGER.info('Creating job [{}]...'.format(job_id))
    job = batch_models.JobAddParameter(
        id=job_id,
        pool_info=batch_models.PoolInformation(pool_id=pool_id),
        uses_task_dependencies=True)
    try:
        batch_service_client.job.add(job)
        LOGGER.info("Job Created")
    except batch_models.BatchErrorException as err:
        if 'The specified job already exists.' in err.error.message.value:
            LOGGER.info("Job already exists...")
        else:
            raise
def add_tasks(batch_service_client, pool_id, task_id, docker_image,
              storage_account, storage_key, container_name, file_name,
              output_container):
    job_id = "batchjob"
    try:
        job = batchmodels.JobAddParameter(
            id=job_id,
            pool_info=batchmodels.PoolInformation(pool_id=pool_id))
        batch_service_client.job.add(job)
        logging.info('Adding job {} to pool...'.format(job_id))
    except Exception:
        logging.info(
            'Job ID: {} already exists and associated with pool...'.format(
                job_id))

    logging.info('Adding tasks to job [{}]...'.format(job_id))

    # This is the user who runs the command inside the container:
    # an auto-user elevated to admin, scoped to the task.
    user = batchmodels.AutoUserSpecification(
        scope=batchmodels.AutoUserScope.task,
        elevation_level=batchmodels.ElevationLevel.admin)

    # This is the docker image we want to run
    task_container_settings = batchmodels.TaskContainerSettings(
        image_name=docker_image,
        container_run_options='--rm -v /scratch:/scratch')

    # The container needs these arguments to be executed
    task = batchmodels.TaskAddParameter(
        id=task_id,
        command_line='/opt/azureblobworker.sh %s %s %s %s %s %s' % (
            storage_account, storage_key, task_id, container_name,
            file_name, output_container),
        container_settings=task_container_settings,
        user_identity=batchmodels.UserIdentity(auto_user=user))

    batch_service_client.task.add(job_id, task)
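# Container tasks like the one above only run on pools whose
# VirtualMachineConfiguration carries a container_configuration. A minimal
# sketch of such a pool; the image reference, VM size and node count are
# placeholders, not values from the source.
pool = batchmodels.PoolAddParameter(
    id='container-pool',
    vm_size='STANDARD_D2_V3',
    target_dedicated_nodes=1,
    virtual_machine_configuration=batchmodels.VirtualMachineConfiguration(
        image_reference=batchmodels.ImageReference(
            publisher='microsoft-azure-batch',
            offer='ubuntu-server-container',
            sku='16-04-lts',
            version='latest'),
        node_agent_sku_id='batch.node.ubuntu 16.04',
        # Pre-pull the task image so tasks start without a pull delay.
        container_configuration=batchmodels.ContainerConfiguration(
            container_image_names=[docker_image])))
batch_service_client.pool.add(pool)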
def submit_job_and_add_task(batch_client, block_blob_client, job_id, pool_id):
    """Submits a job to the Azure Batch service and adds a task that runs a
    python script.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    """
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))

    batch_client.job.add(job)

    block_blob_client.create_container(
        _CONTAINER_NAME,
        fail_on_exist=False)

    sas_url = common.helpers.upload_blob_and_create_sas(
        block_blob_client,
        _CONTAINER_NAME,
        _SIMPLE_TASK_NAME,
        _SIMPLE_TASK_PATH,
        datetime.datetime.utcnow() + datetime.timedelta(hours=1))

    task = batchmodels.TaskAddParameter(
        id="MyPythonTask",
        command_line="python " + _SIMPLE_TASK_NAME,
        resource_files=[batchmodels.ResourceFile(
            file_path=_SIMPLE_TASK_NAME,
            blob_source=sas_url)])

    batch_client.task.add(job_id=job.id, task=task)
    start_task=batchmodels.StartTask(
        user_identity=batchmodels.UserIdentity(
            auto_user=batchmodels.AutoUserSpecification(
                elevation_level=batchmodels.ElevationLevel.admin,
                scope=batchmodels.AutoUserScope.pool)),
        command_line=common.helpers.wrap_commands_in_shell(
            "linux", pool_start_commands),
        resource_files=[]),
)
common.helpers.create_pool_if_not_exist(batch_client, pool)

# Create job to assign tasks
job_id = "{0:}-job{1:}".format(pool_id, job_n)
job_ids.append(job_id)
job = batchmodels.JobAddParameter(
    id=job_id,
    pool_info=batchmodels.PoolInformation(pool_id=pool_id))
batch_client.job.add(job)
print("Job created: {0:}".format(job_id))

# TODO: something wrong with the output file generation
# Create a task per analysis grid
for n in job_chunk:
    output_file = analysis_grid_names[n].replace(".json", "_result.json")
    output_file_node = os.path.join("Results", output_file)
    # NOTE: output_file_node (positional argument 2) is not referenced by the
    # format string, so this produces a container-level SAS URL.
    container_sas_url = "https://{0:}.blob.core.windows.net/{1:}?{3:}".format(
        storage_account_name, project_id, output_file_node,
        container_sas_token)
def create_job_schedule(batch_client, job_schedule_id, vm_size, vm_count,
                        block_blob_client):
    """Creates an Azure Batch pool and job schedule with the specified ids.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param str job_schedule_id: The id of the job schedule to create
    :param str vm_size: vm size (sku)
    :param int vm_count: number of vms to allocate
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    """
    cloud_service_config = batchmodels.CloudServiceConfiguration(os_family='6')

    user_id = batchmodels.UserIdentity(
        auto_user=batchmodels.AutoUserSpecification(
            elevation_level=_USER_ELEVATION_LEVEL))

    python_download = batchmodels.ResourceFile(
        http_url=_PYTHON_DOWNLOAD,
        file_path='python373.exe')

    pool_info = batchmodels.PoolInformation(
        auto_pool_specification=batchmodels.AutoPoolSpecification(
            auto_pool_id_prefix="JobScheduler",
            pool=batchmodels.PoolSpecification(
                vm_size=vm_size,
                target_dedicated_nodes=vm_count,
                cloud_service_configuration=cloud_service_config,
                start_task=batchmodels.StartTask(
                    command_line=common.helpers.wrap_commands_in_shell(
                        'windows', ['{}'.format(_PYTHON_INSTALL)]),
                    resource_files=[python_download],
                    wait_for_success=True,
                    user_identity=user_id)),
            keep_alive=False,
            pool_lifetime_option=batchmodels.PoolLifetimeOption.job))

    sas_url = common.helpers.upload_blob_and_create_sas(
        block_blob_client,
        _CONTAINER_NAME,
        _SIMPLE_TASK_NAME,
        _SIMPLE_TASK_PATH,
        datetime.datetime.utcnow() + datetime.timedelta(minutes=30))

    job_spec = batchmodels.JobSpecification(
        pool_info=pool_info,
        # Terminate job once all tasks under it are complete to allow for a
        # new job to be created under the schedule
        on_all_tasks_complete=batchmodels.OnAllTasksComplete.terminate_job,
        job_manager_task=batchmodels.JobManagerTask(
            id="JobManagerTask",
            command_line=common.helpers.wrap_commands_in_shell(
                'windows', ['python {}'.format(_SIMPLE_TASK_NAME)]),
            resource_files=[
                batchmodels.ResourceFile(file_path=_SIMPLE_TASK_NAME,
                                         http_url=sas_url)
            ]))

    do_not_run_after = datetime.datetime.utcnow() \
        + datetime.timedelta(minutes=30)

    schedule = batchmodels.Schedule(
        do_not_run_after=do_not_run_after,
        recurrence_interval=datetime.timedelta(minutes=10))

    scheduled_job = batchmodels.JobScheduleAddParameter(
        id=job_schedule_id,
        schedule=schedule,
        job_specification=job_spec)

    batch_client.job_schedule.add(cloud_job_schedule=scheduled_job)
def __create_pool_and_job(self, cluster_conf: models.ClusterConfiguration,
                          software_metadata_key: str, start_task, VmImageModel):
    """
    Create a pool and job
    :param cluster_conf: the configuration object used to create the cluster
    :type cluster_conf: aztk.models.ClusterConfiguration
    :param software_metadata_key: the id of the software being used on the cluster
    :param start_task: the start task for the cluster
    :param VmImageModel: the type of image to provision for the cluster
    """
    self._get_cluster_data(
        cluster_conf.cluster_id).save_cluster_config(cluster_conf)

    # reuse pool_id as job_id
    pool_id = cluster_conf.cluster_id
    job_id = cluster_conf.cluster_id

    # Get a verified node agent sku
    sku_to_use, image_ref_to_use = \
        helpers.select_latest_verified_vm_image_with_node_agent_sku(
            VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku,
            self.batch_client)

    network_conf = None
    if cluster_conf.subnet_id is not None:
        network_conf = batch_models.NetworkConfiguration(
            subnet_id=cluster_conf.subnet_id)

    auto_scale_formula = (
        "$TargetDedicatedNodes={0}; $TargetLowPriorityNodes={1}".format(
            cluster_conf.vm_count, cluster_conf.vm_low_pri_count))

    # Configure the pool
    pool = batch_models.PoolAddParameter(
        id=pool_id,
        virtual_machine_configuration=batch_models.VirtualMachineConfiguration(
            image_reference=image_ref_to_use,
            node_agent_sku_id=sku_to_use),
        vm_size=cluster_conf.vm_size,
        enable_auto_scale=True,
        auto_scale_formula=auto_scale_formula,
        auto_scale_evaluation_interval=timedelta(minutes=5),
        start_task=start_task,
        enable_inter_node_communication=not cluster_conf.subnet_id,
        max_tasks_per_node=1,
        network_configuration=network_conf,
        metadata=[
            batch_models.MetadataItem(
                name=constants.AZTK_SOFTWARE_METADATA_KEY,
                value=software_metadata_key),
            batch_models.MetadataItem(
                name=constants.AZTK_MODE_METADATA_KEY,
                value=constants.AZTK_CLUSTER_MODE_METADATA)
        ])

    # Create the pool + create user for the pool
    helpers.create_pool_if_not_exist(pool, self.batch_client)

    # Create job
    job = batch_models.JobAddParameter(
        id=job_id,
        pool_info=batch_models.PoolInformation(pool_id=pool_id))

    # Add job to batch
    self.batch_client.job.add(job)

    return helpers.get_cluster(cluster_conf.cluster_id, self.batch_client)
def submit_job_and_add_task(batch_client, block_blob_client, job_id, pool_id,
                            storage_account_name):
    """Submits a job to the Azure Batch service and adds a task that runs a
    python script.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    :param str storage_account_name: The name of the storage account.
    """
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))

    batch_client.job.add(job)

    output_container_sas = common.helpers.create_container_and_create_sas(
        block_blob_client,
        job_id,
        azureblob.BlobPermissions.WRITE,
        expiry=None,
        timeout=120)

    output_container_sas_url = 'https://{}.blob.core.windows.net/{}?{}'.format(
        storage_account_name, job_id, output_container_sas)

    app_file_list = get_resource_file_list_from_container(
        block_blob_client, _APP_CONTAINER_NAME)

    blob_list = block_blob_client.list_blobs(_RESOURCE_CONTAINER_NAME)
    for blob in blob_list:
        (blob_base_name, blob_extension) = os.path.splitext(blob.name)
        output_file_name = f"{blob_base_name}_out{blob_extension}"
        command_line = f"{_APP_EXE_NAME} {_APP_EXTRA_ARGS} {blob.name} {output_file_name}"
        task_id = f"{_APP_EXE_NAME}_{blob_base_name}_Task"

        resource_sas_url = common.helpers.create_sas_url(
            block_blob_client,
            _RESOURCE_CONTAINER_NAME,
            blob.name,
            azureblob.BlobPermissions.READ,
            datetime.datetime.utcnow() + datetime.timedelta(hours=1))
        resource_file = batchmodels.ResourceFile(
            file_path=blob.name,
            http_url=resource_sas_url)

        print(resource_sas_url)
        print(app_file_list)
        print(f"Creating task ({task_id}): " + command_line)

        output_file = batchmodels.OutputFile(
            file_pattern=output_file_name,
            destination=batchmodels.OutputFileDestination(
                container=batchmodels.OutputFileBlobContainerDestination(
                    container_url=output_container_sas_url)),
            upload_options=batchmodels.OutputFileUploadOptions(
                upload_condition=batchmodels.OutputFileUploadCondition.task_completion))

        task = batchmodels.TaskAddParameter(
            id=task_id,
            command_line=command_line,
            resource_files=app_file_list + [resource_file],
            output_files=[output_file])

        batch_client.task.add(job_id=job.id, task=task)
def create_job_schedule(batch_client, job_schedule_id, vm_size, vm_count):
    """Creates an Azure Batch pool and job schedule with the specified ids.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param str job_schedule_id: The id of the job schedule to create
    :param str vm_size: vm size (sku)
    :param int vm_count: number of vms to allocate
    """
    pool_info = batchmodels.PoolInformation(
        auto_pool_specification=batchmodels.AutoPoolSpecification(
            auto_pool_id_prefix="JobScheduler",
            pool=batchmodels.PoolSpecification(
                vm_size=vm_size,
                target_dedicated_nodes=vm_count,
                virtual_machine_configuration=batchmodels.VirtualMachineConfiguration(
                    image_reference=batchmodels.ImageReference(
                        publisher="Canonical",
                        offer="UbuntuServer",
                        sku="18.04-LTS",
                        version="latest"),
                    node_agent_sku_id="batch.node.ubuntu 18.04"),
                start_task=batchmodels.StartTask(
                    command_line="/bin/bash -c "
                                 "\"$AZ_BATCH_APP_PACKAGE_azure_batch_1/azure_batch/job_schedular_node_startup_tasks.sh\"",
                    wait_for_success=True,
                    user_identity=batchmodels.UserIdentity(
                        auto_user=batchmodels.AutoUserSpecification(
                            scope=batchmodels.AutoUserScope.pool,
                            elevation_level=batchmodels.ElevationLevel.admin)),
                ),
                application_package_references=[
                    batchmodels.ApplicationPackageReference(
                        application_id="azure_batch", version="1")
                ],
            ),
            keep_alive=False,
            pool_lifetime_option=batchmodels.PoolLifetimeOption.job))

    job_spec = batchmodels.JobSpecification(
        pool_info=pool_info,
        # Terminate job once all tasks under it are complete to allow for a
        # new job to be created under the schedule
        on_all_tasks_complete=batchmodels.OnAllTasksComplete.terminate_job,
        # The command that the job manager task runs on each recurrence
        job_manager_task=batchmodels.JobManagerTask(
            id="JobManagerTask",
            command_line="/bin/bash -c \" python3 "
                         "$AZ_BATCH_APP_PACKAGE_azure_batch_1/azure_batch/azure_batch_main.py\""))

    # The recurrence interval of the job schedule
    schedule = batchmodels.Schedule(
        recurrence_interval=datetime.timedelta(days=15))

    scheduled_job = batchmodels.JobScheduleAddParameter(
        id=job_schedule_id,
        schedule=schedule,
        job_specification=job_spec)

    batch_client.job_schedule.add(cloud_job_schedule=scheduled_job)
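# Once added, a schedule can be inspected and torn down through the same
# client; a short sketch, assuming batch_client was constructed as in the
# earlier create_job example and the schedule id is a placeholder.
create_job_schedule(batch_client, "myschedule", "standard_a1_v2", 2)
schedule = batch_client.job_schedule.get("myschedule")
print(schedule.id, schedule.state)  # state is a JobScheduleState enum value
batch_client.job_schedule.delete("myschedule")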
def __submit_job(self, job_configuration, start_task, job_manager_task,
                 autoscale_formula, software_metadata_key: str,
                 vm_image_model, application_metadata):
    """
    Job Submission
    :param job_configuration -> aztk_sdk.spark.models.JobConfiguration
    :param start_task -> batch_models.StartTask
    :param job_manager_task -> batch_models.TaskAddParameter
    :param autoscale_formula -> str
    :param software_metadata_key -> str
    :param vm_image_model -> aztk_sdk.models.VmImage
    :returns: the newly created job schedule
    """
    self._get_cluster_data(job_configuration.id).save_cluster_config(
        job_configuration.to_cluster_config())

    # get a verified node agent sku
    sku_to_use, image_ref_to_use = \
        helpers.select_latest_verified_vm_image_with_node_agent_sku(
            vm_image_model.publisher, vm_image_model.offer,
            vm_image_model.sku, self.batch_client)

    # set up subnet if necessary
    network_conf = None
    if job_configuration.subnet_id:
        network_conf = batch_models.NetworkConfiguration(
            subnet_id=job_configuration.subnet_id)

    # set up a schedule for a recurring job
    auto_pool_specification = batch_models.AutoPoolSpecification(
        pool_lifetime_option=batch_models.PoolLifetimeOption.job_schedule,
        auto_pool_id_prefix=job_configuration.id,
        keep_alive=False,
        pool=batch_models.PoolSpecification(
            display_name=job_configuration.id,
            virtual_machine_configuration=batch_models.VirtualMachineConfiguration(
                image_reference=image_ref_to_use,
                node_agent_sku_id=sku_to_use),
            vm_size=job_configuration.vm_size,
            enable_auto_scale=True,
            auto_scale_formula=autoscale_formula,
            auto_scale_evaluation_interval=timedelta(minutes=5),
            start_task=start_task,
            enable_inter_node_communication=not job_configuration.mixed_mode(),
            network_configuration=network_conf,
            max_tasks_per_node=4,
            metadata=[
                batch_models.MetadataItem(
                    name=constants.AZTK_SOFTWARE_METADATA_KEY,
                    value=software_metadata_key),
                batch_models.MetadataItem(
                    name=constants.AZTK_MODE_METADATA_KEY,
                    value=constants.AZTK_JOB_MODE_METADATA)
            ]))

    # define job specification
    job_spec = batch_models.JobSpecification(
        pool_info=batch_models.PoolInformation(
            auto_pool_specification=auto_pool_specification),
        display_name=job_configuration.id,
        on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job,
        job_manager_task=job_manager_task,
        metadata=[
            batch_models.MetadataItem(
                name='applications', value=application_metadata)
        ])

    # define schedule
    schedule = batch_models.Schedule(
        do_not_run_until=None,
        do_not_run_after=None,
        start_window=None,
        recurrence_interval=None)

    # create job schedule and add task
    setup = batch_models.JobScheduleAddParameter(
        id=job_configuration.id,
        schedule=schedule,
        job_specification=job_spec)

    self.batch_client.job_schedule.add(setup)

    return self.batch_client.job_schedule.get(
        job_schedule_id=job_configuration.id)
def job_create():
    batch_service.job.add(job=batchmodel.JobAddParameter(
        id=config_azure['job_id'],
        pool_info=batchmodel.PoolInformation(
            pool_id=config_azure['batch_pool_name'])))
def retarget_job_to_new_pool(batch_service_client: batch.BatchExtensionsClient,
                             job_id: str, new_pool_id: str):
    """
    Disables a job with task requeue, then patches it to target a new pool.

    :param batch_service_client: The batch client used for making batch operations
    :type batch_service_client: `azure.batch.BatchExtensionsClient`
    :param job_id: The job to retarget
    :type job_id: str
    :param new_pool_id: The id of the new pool
    :type new_pool_id: str
    """
    logger.info("Retargeting job [{}] to new pool [{}]".format(
        job_id, new_pool_id))
    try:
        batch_service_client.job.disable(job_id, "requeue")
    except batchmodels.BatchErrorException as batch_exception:
        # Potential race condition where the nodes have gone idle and the job
        # has 'Completed' between our internal node-idle-timeout check and the
        # call to disable the job. Raise JobAlreadyCompleteException in that
        # case.
        if expected_exception(batch_exception,
                              "The specified job does not exist"):
            logger.info(
                "The specified Job [{}] did not exist when we tried to delete it."
                .format(job_id))
            raise ex.JobAlreadyCompleteException(
                job_id, "Job already complete and deleted.")

        if expected_exception(
                batch_exception,
                "The specified job is already in a completed state"):
            logger.info(
                "The specified Job [{}] was already in completed state when we tried to delete it."
                .format(job_id))
            raise ex.JobAlreadyCompleteException(job_id,
                                                 "Job already complete.")
        raise

    # give the job time to move to disabled state before we try to patch it
    time.sleep(service_state_transition_seconds)

    looping_job_patch = True
    job_patch_retry_count = 0
    while looping_job_patch:
        try:
            batch_service_client.job.patch(
                job_id,
                batchmodels.JobPatchParameter(
                    pool_info=batchmodels.PoolInformation(
                        pool_id=new_pool_id)))
            looping_job_patch = False
        except batchmodels.BatchErrorException as batch_exception:
            if expected_exception(
                    batch_exception,
                    "The specified operation is not valid for the current state of the resource"):
                if job_patch_retry_count > 10:
                    logger.error(
                        "Exhausted retries and failed to patch job [{}] due to the current state of the resource"
                        .format(job_id))
                    raise
                logger.info(
                    "Failed to patch job [{}] due to the current state of the resource, retrying...."
                    .format(job_id))
                time.sleep(5)
                job_patch_retry_count = job_patch_retry_count + 1
            else:
                # Unexpected errors should not be swallowed by the retry loop.
                raise
    logger.info("Successfully retargeted job [{}] to pool [{}]".format(
        job_id, new_pool_id))
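# expected_exception() is referenced above but not defined in this snippet;
# a plausible sketch matching how it is called. The substring match against
# the service error message is an assumption, not the confirmed helper.
def expected_exception(batch_exception, message):
    return (batch_exception.error is not None
            and batch_exception.error.message is not None
            and message in batch_exception.error.message.value)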
def test_batch_job_schedules(self, **kwargs):
    client = self.create_aad_client(**kwargs)
    # Test Create Job Schedule
    schedule_id = self.get_resource_name('batch_schedule_')
    job_spec = models.JobSpecification(
        pool_info=models.PoolInformation("pool_id"),
        constraints=models.JobConstraints(max_task_retry_count=2),
        on_all_tasks_complete=models.OnAllTasksComplete.terminate_job)
    schedule = models.Schedule(
        start_window=datetime.timedelta(hours=1),
        recurrence_interval=datetime.timedelta(days=1))
    params = models.JobScheduleAddParameter(schedule_id, schedule, job_spec)
    response = client.job_schedule.add(params)
    self.assertIsNone(response)

    # Test List Job Schedules
    schedules = list(client.job_schedule.list())
    self.assertTrue(len(schedules) > 0)

    # Test Get Job Schedule
    schedule = client.job_schedule.get(schedule_id)
    self.assertIsInstance(schedule, models.CloudJobSchedule)
    self.assertEqual(schedule.id, schedule_id)
    self.assertEqual(schedule.state, models.JobScheduleState.active)

    # Test Job Schedule Exists
    exists = client.job_schedule.exists(schedule_id)
    self.assertTrue(exists)

    # Test List Jobs from Schedule
    jobs = list(client.job.list_from_job_schedule(schedule_id))
    self.assertTrue(len(jobs) > 0)

    # Test Disable Job Schedule
    response = client.job_schedule.disable(schedule_id)
    self.assertIsNone(response)

    # Test Enable Job Schedule
    response = client.job_schedule.enable(schedule_id)
    self.assertIsNone(response)

    # Test Update Job Schedule
    job_spec = models.JobSpecification(
        pool_info=models.PoolInformation('pool_id'))
    schedule = models.Schedule(
        recurrence_interval=datetime.timedelta(hours=10))
    params = models.JobScheduleUpdateParameter(schedule, job_spec)
    response = client.job_schedule.update(schedule_id, params)
    self.assertIsNone(response)

    # Test Patch Job Schedule
    schedule = models.Schedule(
        recurrence_interval=datetime.timedelta(hours=5))
    params = models.JobSchedulePatchParameter(schedule)
    response = client.job_schedule.patch(schedule_id, params)
    self.assertIsNone(response)

    # Test Terminate Job Schedule
    response = client.job_schedule.terminate(schedule_id)
    self.assertIsNone(response)

    # Test Delete Job Schedule
    response = client.job_schedule.delete(schedule_id)
    self.assertIsNone(response)
def test_batch_jobs(self, **kwargs):
    client = self.create_sharedkey_client(**kwargs)
    # Test Create Job
    auto_pool = models.AutoPoolSpecification(
        pool_lifetime_option=models.PoolLifetimeOption.job,
        pool=models.PoolSpecification(
            vm_size='small',
            cloud_service_configuration=models.CloudServiceConfiguration(
                os_family='5')))
    job_prep = models.JobPreparationTask(
        command_line="cmd /c \"echo hello world\"")
    job_release = models.JobReleaseTask(
        command_line="cmd /c \"echo goodbye world\"")
    job_param = models.JobAddParameter(
        id=self.get_resource_name('batch_job1_'),
        pool_info=models.PoolInformation(
            auto_pool_specification=auto_pool),
        job_preparation_task=job_prep,
        job_release_task=job_release)
    response = client.job.add(job_param)
    self.assertIsNone(response)

    # Test Update Job
    constraints = models.JobConstraints(max_task_retry_count=3)
    options = models.JobUpdateParameter(
        priority=500,
        constraints=constraints,
        pool_info=models.PoolInformation(
            auto_pool_specification=auto_pool))
    response = client.job.update(job_param.id, options)
    self.assertIsNone(response)

    # Test Patch Job
    options = models.JobPatchParameter(priority=900)
    response = client.job.patch(job_param.id, options)
    self.assertIsNone(response)

    job = client.job.get(job_param.id)
    self.assertIsInstance(job, models.CloudJob)
    self.assertEqual(job.id, job_param.id)
    self.assertEqual(job.constraints.max_task_retry_count, 3)
    self.assertEqual(job.priority, 900)

    # Test Create Job with Auto Complete
    job_auto_param = models.JobAddParameter(
        id=self.get_resource_name('batch_job2_'),
        on_all_tasks_complete=models.OnAllTasksComplete.terminate_job,
        on_task_failure=models.OnTaskFailure.perform_exit_options_job_action,
        pool_info=models.PoolInformation(
            auto_pool_specification=auto_pool))
    response = client.job.add(job_auto_param)
    self.assertIsNone(response)
    job = client.job.get(job_auto_param.id)
    self.assertIsInstance(job, models.CloudJob)
    self.assertEqual(job.on_all_tasks_complete,
                     models.OnAllTasksComplete.terminate_job)
    self.assertEqual(job.on_task_failure,
                     models.OnTaskFailure.perform_exit_options_job_action)

    # Test List Jobs
    jobs = client.job.list()
    self.assertIsInstance(jobs, models.CloudJobPaged)
    self.assertEqual(len(list(jobs)), 2)

    # Test Disable Job
    response = client.job.disable(job_param.id, models.DisableJobOption.requeue)
    self.assertIsNone(response)

    # Test Enable Job
    response = client.job.enable(job_param.id)
    self.assertIsNone(response)

    # Prep and release task status
    task_status = client.job.list_preparation_and_release_task_status(
        job_param.id)
    self.assertIsInstance(
        task_status,
        models.JobPreparationAndReleaseTaskExecutionInformationPaged)
    self.assertEqual(list(task_status), [])

    # Test Terminate Job
    response = client.job.terminate(job_param.id)
    self.assertIsNone(response)

    # Test Delete Job
    response = client.job.delete(job_auto_param.id)
    self.assertIsNone(response)

    # Test Job Lifetime Statistics
    stats = client.job.get_all_lifetime_statistics()
    self.assertIsInstance(stats, models.JobStatistics)
    self.assertEqual(stats.num_succeeded_tasks, 0)
    self.assertEqual(stats.num_failed_tasks, 0)
def submit_job_and_add_task(batch_client, block_blob_client, job_id, pool_id,
                            block_indices, input_files,
                            output_container_sas_url):
    """Submits a job to the Azure Batch service and adds a task that runs a
    python script.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    :param list block_indices: The block index for each task.
    :param list input_files: The input resource file for each task.
    :param str output_container_sas_url: A SAS URL granting write access
        to the output container.
    """
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))

    batch_client.job.add(job)

    block_blob_client.create_container(_CONTAINER_NAME, fail_on_exist=False)

    sas_url = common.helpers.upload_blob_and_create_sas(
        block_blob_client,
        _CONTAINER_NAME,
        _SIMPLE_TASK_NAME,
        _SIMPLE_TASK_PATH,
        _EXPIRY_TIME)

    tasks = list()
    # Count how many items are stored in the batch
    inBatch = 0
    for block, input_file in zip(block_indices, input_files):
        input_file_path = input_file.file_path
        # Strip the extension and name the result file after the input.
        output_file_path = "".join(
            input_file_path.split('.')[:-1]) + '_model.dat'
        task_file = batchmodels.ResourceFile(
            file_path=_SIMPLE_TASK_NAME,
            http_url=sas_url)

        tasks.append(
            batchmodels.TaskAddParameter(
                id='Task{}'.format(block),
                command_line="python3 %s -b %d" % (_SIMPLE_TASK_NAME, block),
                resource_files=[task_file, input_file],
                output_files=[
                    batchmodels.OutputFile(
                        file_pattern=output_file_path,
                        destination=batchmodels.OutputFileDestination(
                            container=batchmodels.OutputFileBlobContainerDestination(
                                container_url=output_container_sas_url)),
                        upload_options=batchmodels.OutputFileUploadOptions(
                            upload_condition=batchmodels.OutputFileUploadCondition.task_success))
                ]))
        inBatch += 1

        # We can only send batches with up to 100 records
        if inBatch > 99:
            batch_client.task.add_collection(job.id, tasks)
            tasks = []
            inBatch = 0

    if inBatch > 0:
        batch_client.task.add_collection(job.id, tasks)
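# Callers of these submit helpers typically poll until every task finishes
# before downloading results. A minimal sketch; the function name and timeout
# are illustrative, and batchmodels is assumed imported as in the snippets
# above.
import time

def wait_for_tasks(batch_client, job_id, timeout_s=3600):
    """Poll a job's tasks until all reach the 'completed' state (sketch)."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        tasks = list(batch_client.task.list(job_id))
        if tasks and all(t.state == batchmodels.TaskState.completed
                         for t in tasks):
            return
        time.sleep(10)
    raise TimeoutError(
        "Tasks in job {} did not complete in {}s".format(job_id, timeout_s))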