def start_mc_server_job_pool(self, maxNodes=None):
    """Create a new "MC_server" job on this object's pool and queue server tasks.

    The id of the created job is stored in ``self.job_id``; afterwards
    ``self.add_task_to_start_server()`` is invoked ``maxNodes`` times.

    :param maxNodes: how many server tasks to add. When ``None`` the
        ``mincount`` option of the ``POOL`` config section is used.
    """
    if maxNodes is None:
        maxNodes = int(self.config.get('POOL', 'mincount'))

    unique_job_id = helpers.generate_unique_resource_name("MC_server")
    target_pool = batchmodels.PoolInformation(pool_id=self.pool_id)
    new_job = batchmodels.JobAddParameter(
        id=unique_job_id,
        pool_info=target_pool,
        on_all_tasks_complete=batchmodels.OnAllTasksComplete.no_action,
        on_task_failure=batchmodels.OnTaskFailure.perform_exit_options_job_action)

    self.client.job.add(new_job)
    self.job_id = new_job.id

    # One server task per requested node.
    for _ in range(maxNodes):
        self.add_task_to_start_server()
def create_job(batch_client, job_id, pool_id):
    """Submit a job with task dependencies enabled, bound to an existing pool.

    :param batch_client: client whose ``job.add`` is called.
    :param job_id: id for the new job.
    :param pool_id: id of the pool the job is associated with.
    :return: the ``JobAddParameter`` that was submitted.
    """
    target_pool = batchmodels.PoolInformation(pool_id=pool_id)
    job_spec = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=target_pool,
        uses_task_dependencies=True)
    batch_client.job.add(job_spec)
    return job_spec
def __create_pool_and_job(self, cluster_conf: models.ClusterConfiguration, software_metadata_key: str, start_task, VmImageModel):
    """
    Persist the cluster configuration, then create the cluster's pool and job.

    The pool and the job share the cluster id. The pool is created with
    auto-scaling enabled, using a formula that pins the dedicated and
    low-priority node targets to the counts in ``cluster_conf``.

    :param cluster_conf: the configuration object used to create the cluster
    :type cluster_conf: aztk.models.ClusterConfiguration
    :param software_metadata_key: the id of the software being used on the cluster
    :param start_task: the start task for the cluster
    :param VmImageModel: the type of image to provision for the cluster
        (its ``publisher``, ``offer`` and ``sku`` attributes are read)
    :return: the result of ``helpers.get_cluster`` for the cluster id
    """
    self._get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf)

    # The cluster id doubles as both the pool id and the job id.
    pool_id = cluster_conf.cluster_id
    job_id = cluster_conf.cluster_id

    # Resolve a verified node agent sku / image reference for the requested image.
    sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
        VmImageModel.publisher,
        VmImageModel.offer,
        VmImageModel.sku,
        self.batch_client)

    # Only attach a network configuration when a subnet was requested.
    if cluster_conf.subnet_id is not None:
        network_conf = batch_models.NetworkConfiguration(
            subnet_id=cluster_conf.subnet_id)
    else:
        network_conf = None

    auto_scale_formula = "$TargetDedicatedNodes={0}; $TargetLowPriorityNodes={1}".format(
        cluster_conf.vm_count, cluster_conf.vm_low_pri_count)

    vm_configuration = batch_models.VirtualMachineConfiguration(
        image_reference=image_ref_to_use,
        node_agent_sku_id=sku_to_use)
    pool_metadata = [
        batch_models.MetadataItem(
            name=constants.AZTK_SOFTWARE_METADATA_KEY,
            value=software_metadata_key),
        batch_models.MetadataItem(
            name=constants.AZTK_MODE_METADATA_KEY,
            value=constants.AZTK_CLUSTER_MODE_METADATA),
    ]

    pool = batch_models.PoolAddParameter(
        id=pool_id,
        virtual_machine_configuration=vm_configuration,
        vm_size=cluster_conf.vm_size,
        enable_auto_scale=True,
        auto_scale_formula=auto_scale_formula,
        auto_scale_evaluation_interval=timedelta(minutes=5),
        start_task=start_task,
        # Inter-node communication is switched off whenever a subnet is set.
        enable_inter_node_communication=not cluster_conf.subnet_id,
        max_tasks_per_node=4,
        network_configuration=network_conf,
        metadata=pool_metadata)

    helpers.create_pool_if_not_exist(pool, self.batch_client)

    job = batch_models.JobAddParameter(
        id=job_id,
        pool_info=batch_models.PoolInformation(pool_id=pool_id))
    self.batch_client.job.add(job)

    return helpers.get_cluster(cluster_conf.cluster_id, self.batch_client)
def submit_job_and_add_task(batch_client, block_blob_client, job_id, pool_id):
    """Create a job and add one task that runs the sample python script.

    The script is uploaded to the ``_CONTAINER_NAME`` container (created if
    missing) and handed to the task as a resource file through a SAS URL
    that expires one hour after submission.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    """
    pool_info = batchmodels.PoolInformation(pool_id=pool_id)
    job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)
    batch_client.job.add(job)

    block_blob_client.create_container(_CONTAINER_NAME, fail_on_exist=False)

    sas_expiry = datetime.datetime.utcnow() + datetime.timedelta(hours=1)
    sas_url = common.helpers.upload_blob_and_create_sas(
        block_blob_client,
        _CONTAINER_NAME,
        _SIMPLE_TASK_NAME,
        _SIMPLE_TASK_PATH,
        sas_expiry)

    script_file = batchmodels.ResourceFile(
        file_path=_SIMPLE_TASK_NAME,
        http_url=sas_url)
    task = batchmodels.TaskAddParameter(
        id="MyPythonTask",
        command_line="python " + _SIMPLE_TASK_NAME,
        resource_files=[script_file])

    batch_client.task.add(job_id=job.id, task=task)
def create_job(self, pool_id, job_id):
    """
    Create a job on the given pool unless it is already known.

    Nothing happens when ``job_id`` is in ``self.active_jobs`` or in the
    list returned by ``self.get_job_list()``. Otherwise the job is added,
    every task reported by ``self.get_task_list(job_id)`` is deleted, and
    the id is recorded in ``self.active_jobs``.

    :param str pool_id: The ID for the pool.
    :param str job_id: The ID for the job.
    :raises batchmodels.BatchErrorException: re-raised after being printed
        when adding the job fails.
    """
    already_known = job_id in self.active_jobs or job_id in self.get_job_list()
    if already_known:
        return

    self.logger.info("creating job {}".format(job_id))

    pool_info = batchmodels.PoolInformation(pool_id=pool_id)
    job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)

    try:
        self.batch_client.job.add(job)
    except batchmodels.BatchErrorException as err:
        self.__print_batch_exception(err)
        raise

    # Clear out any tasks listed for this job id before marking it active.
    for existing_task in self.get_task_list(job_id):
        self.batch_client.task.delete(job_id, existing_task.id)

    self.active_jobs.add(job_id)
def submit_job_and_add_task(batch_client, job_id, vm_size, vm_count):
    """Create a job on an auto pool with a preparation task, then add one task.

    The auto pool lives only as long as the job (``keep_alive=False``,
    lifetime option ``job``). The job preparation task runs the module-level
    ``prep_task_command`` and must succeed before tasks are scheduled.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param str job_id: The id of the job to create.
    :param vm_size: VM size for the auto pool's nodes.
    :param vm_count: number of dedicated nodes for the auto pool.
    """
    pool_spec = batchmodels.PoolSpecification(
        vm_size=vm_size,
        target_dedicated_nodes=vm_count,
        cloud_service_configuration={'os_family': "4"})
    auto_pool = batchmodels.AutoPoolSpecification(
        auto_pool_id_prefix="Helloworld_jobprep",
        pool=pool_spec,
        keep_alive=False,
        pool_lifetime_option=batchmodels.PoolLifetimeOption.job)
    pool_info = batchmodels.PoolInformation(auto_pool_specification=auto_pool)

    prep_task = batch.models.JobPreparationTask(
        command_line=prep_task_command,
        wait_for_success=True)
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=pool_info,
        job_preparation_task=prep_task)
    batch_client.job.add(job)

    hello_command = common.helpers.wrap_commands_in_shell(
        'windows',
        ['echo Hello world from the Batch Hello world sample!'])
    task = batchmodels.TaskAddParameter(
        id="HelloWorld_Task",
        command_line=hello_command)
    batch_client.task.add(job_id=job.id, task=task)
def run_commands(batch_client, block_blob_client, job_id, pool_id):
    """Run the commands listed in "configs/start_commands" as Batch tasks.

    A job is created on the pool and as many identical tasks are added as
    the pool currently has compute nodes. Each task runs all commands in a
    Linux shell as a pool-scoped admin auto-user. Where the tasks are
    scheduled is left to the Batch service.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use
        (not used by this function; kept for interface compatibility).
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    """
    task_commands = get_list_from_file('configs/start_commands')
    logging.info(task_commands)

    user = batchmodels.AutoUserSpecification(
        scope=batchmodels.AutoUserScope.pool,
        elevation_level=batchmodels.ElevationLevel.admin)

    start = time.time()
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))
    batch_client.job.add(job)
    logging.info('job created in seconds {}'.format(time.time() - start))

    start = time.time()
    nodes = list(batch_client.compute_node.list(pool_id))

    # The command line is the same for every task, so build it once.
    command_line = common.helpers.wrap_commands_in_shell('linux', task_commands)

    # `range` instead of the Python-2-only `xrange`, so this runs on both.
    tasks = [
        batchmodels.TaskAddParameter(
            id="EBOTask-{}".format(i),
            command_line=command_line,
            user_identity=batchmodels.UserIdentity(auto_user=user))
        for i in range(len(nodes))
    ]

    batch_client.task.add_collection(job.id, tasks)
    logging.info('task created in seconds {}'.format(time.time() - start))
def submit_job_and_add_task(batch_client, block_blob_client,
                            storage_account_name, storage_account_key,
                            container, resourcefile, job_id, pool_id,
                            sha1_cert_tp):
    """Create a job and add a task that downloads and decrypts a blob.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str storage_account_name: storage account name
    :param str storage_account_key: storage account key
        (not referenced by this function)
    :param str container: blob storage container
    :param str resourcefile: resource file to add to task
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    :param str sha1_cert_tp: sha1 cert thumbprint for cert ref
    """
    pool_info = batchmodels.PoolInformation(pool_id=pool_id)
    job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)
    batch_client.job.add(job)

    # Read-only SAS token that blobxfer uses to fetch the blob.
    sastoken = common.helpers.create_sas_token(
        block_blob_client,
        container,
        _RESOURCE_NAME,
        azureblob.BlobPermissions.READ)

    # The task performs three steps:
    #   1. convert the pfx installed by the Azure Batch service to pem
    #   2. download the encrypted blob and decrypt it with the private key
    #   3. print the decrypted file
    # Certificates on Linux compute nodes live in $AZ_BATCH_CERTIFICATES_DIR,
    # named <thumbprint algorithm>-<lowercase thumbprint>.<certificate format>.
    pfx_to_pem = (
        'openssl pkcs12 -in $AZ_BATCH_CERTIFICATES_DIR/sha1-{tp}.pfx -out '
        '$AZ_BATCH_CERTIFICATES_DIR/privatekey.pem -nodes -password '
        'file:$AZ_BATCH_CERTIFICATES_DIR/sha1-{tp}.pfx.pw'
    ).format(tp=sha1_cert_tp)
    download_and_decrypt = (
        'blobxfer {sa} {cont} . --saskey "{sas}" --download '
        '--remoteresource {rf} --rsaprivatekey '
        '$AZ_BATCH_CERTIFICATES_DIR/privatekey.pem'
    ).format(
        sa=storage_account_name,
        cont=container,
        sas=sastoken,
        rf=resourcefile)
    task_commands = [
        pfx_to_pem,
        download_and_decrypt,
        'echo secret.txt contents:',
        'cat {}'.format(resourcefile),
    ]

    task = batchmodels.TaskAddParameter(
        id="MyEncryptedResourceTask",
        command_line=common.helpers.wrap_commands_in_shell(
            'linux', task_commands))
    batch_client.task.add(job_id=job.id, task=task)
def designate_master_docker_swarm_node(batch_client, pool_id, nodes, job_id):
    """Designate a master docker swarm node by selecting a node in the pool
    to be the swarm manager. This is accomplished via IP selection in the
    pool of nodes and running the swarm init command via an affinitized
    task. This is for Docker 1.12+.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param str pool_id: The id of the pool.
    :param list nodes: list of `batchserviceclient.models.ComputeNode`
    :param str job_id: The id of the job to create.
    :rtype: tuple
    :return: ((master ipaddress, master node id), swarm token)
    """
    # Designate the node with the numerically lowest IPv4 address as master.
    # Comparing the raw strings would rank "10.0.0.10" below "10.0.0.4".
    nodes = sorted(
        nodes,
        key=lambda node: tuple(int(part) for part in node.ip_address.split('.')))
    master_node_ip_address = nodes[0].ip_address
    master_node_id = nodes[0].id
    master_node_affinity_id = nodes[0].affinity_id
    master_node = (master_node_ip_address, master_node_id)
    print('master node is: {}'.format(master_node))

    # create job
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))
    batch_client.job.add(job)

    # Add the swarm init commands as an affinitized task aimed at the master.
    # NOTE: task affinity is weak. If the node has no available scheduling
    # slots, the task may be executed on a different node. For this example
    # it is not an issue since this node should be available for scheduling.
    task_commands = [
        'docker swarm init --advertise-addr {}'.format(master_node_ip_address),
        'docker swarm join-token -q worker',
    ]
    print('initializing docker swarm cluster via Azure Batch task...')
    task = batchmodels.TaskAddParameter(
        id='swarm-manager',
        affinity_info=batchmodels.AffinityInformation(
            affinity_id=master_node_affinity_id),
        command_line=common.helpers.wrap_commands_in_shell(
            'linux', task_commands),
        run_elevated=True,
    )
    batch_client.task.add(job_id=job.id, task=task)

    # wait for task to complete
    common.helpers.wait_for_tasks_to_complete(
        batch_client, job_id, datetime.timedelta(minutes=5))

    # The worker join token is the last line the task wrote to stdout.
    stdout = common.helpers.read_task_file_as_string(
        batch_client, job.id, task.id,
        common.helpers._STANDARD_OUT_FILE_NAME)
    token = stdout.splitlines()[-1].strip()
    print('swarm token: {}'.format(token))

    return master_node, token
def designate_master_docker_swarm_node(batch_client, pool_id, nodes, job_id):
    """Designate a master docker swarm node by selecting a node in the pool
    to be the swarm manager. This is accomplished via IP selection in the
    pool of nodes and running the swarm manage command via an affinitized
    task.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param str pool_id: The id of the pool the job runs on.
    :param list nodes: list of `batchserviceclient.models.ComputeNode`
    :param str job_id: The id of the job to create.
    :rtype: tuple
    :return: (ipaddress, master node id)
    """
    # Designate the node with the numerically lowest IPv4 address as master.
    # A plain string sort would rank "10.0.0.10" below "10.0.0.4" and make
    # the node range computed below start at the wrong octet.
    nodes = sorted(
        nodes,
        key=lambda node: tuple(int(part) for part in node.ip_address.split('.')))
    master_node_ip_address = nodes[0].ip_address
    master_node_id = nodes[0].id
    master_node_affinity_id = nodes[0].affinity_id
    master_node = (master_node_ip_address, master_node_id)

    # Build the node range from the master's last octet and the node count.
    # NOTE(review): assumes the nodes have contiguous addresses in 10.0.0.x.
    lastoctet = int(master_node_ip_address.split('.')[-1])
    nodelist = '10.0.0.[{}:{}]'.format(lastoctet, lastoctet + len(nodes) - 1)
    print('master node is: {} nodelist is: {}'.format(master_node, nodelist))

    # create job
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))
    batch_client.job.add(job)

    # add docker swarm manage as an affinitized task to run on the master node
    task_commands = [
        ('docker run -d -p 3375:3375 -t swarm manage -H tcp://0.0.0.0:3375 '
         '"nodes://{}:2375"').format(nodelist)
    ]
    print('creating docker swarm cluster via Azure Batch task...')
    task = batchmodels.TaskAddParameter(
        id="swarm-master",
        # Affinity must be expressed with the node's affinity id, not its
        # node id, for the hint to target the master node.
        affinity_info=batchmodels.AffinityInformation(
            affinity_id=master_node_affinity_id),
        command_line=common.helpers.wrap_commands_in_shell(
            'linux', task_commands),
        run_elevated=True,
    )
    batch_client.task.add(job_id=job.id, task=task)

    # wait for task to complete
    common.helpers.wait_for_tasks_to_complete(
        batch_client, job_id, datetime.timedelta(minutes=5))
    print('docker swarm cluster created.')

    return master_node
def add_docker_batch_task(batch_client, block_blob_client, job_id, pool_id):
    """Create a job and add one docker task per input URL.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to use.
    :param str pool_id: The id of the pool to use.
    :rtype: list
    :return: the ids of the tasks that were added, in submission order.
    """
    # Upload the task script; the SAS URL is valid for one hour.
    task_resource_sas_url = common.helpers.upload_blob_and_create_sas(
        block_blob_client,
        _CONTAINER_NAME,
        _TASK_RESOURCE_FILE,
        _TASK_RESOURCE_FILE_PATH,
        datetime.datetime.utcnow() + datetime.timedelta(hours=1))

    # Write/list SAS for the output container, also valid for one hour.
    output_container_sas_key = common.helpers.create_container_and_create_sas(
        block_blob_client=block_blob_client,
        container_name=_OUTPUT_CONTAINER_NAME,
        permission=azureblob.ContainerPermissions.WRITE |
        azureblob.ContainerPermissions.LIST,
        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))

    # The job preparation task runs _JOB_STARTTASK_CLI elevated
    # (the original comment says it pulls docker image yidingz/ffmpeg:v3).
    prep_task = batchmodels.JobPreparationTask(
        command_line=_JOB_STARTTASK_CLI,
        run_elevated=True)
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id),
        job_preparation_task=prep_task)
    batch_client.job.add(job)

    task_id_list = []
    for index, url in enumerate(_INPUT_FILE_URLS):
        # Last path segment of the URL is used as the file name.
        filename = urllib.parse.urlsplit(url).path.split('/')[-1]
        parameters = "'{0}' '{1}' '{2}' '{3}'".format(
            url,
            filename,
            output_container_sas_key,
            block_blob_client.account_name)

        # Task id: zero-padded index plus the file name up to its first dot.
        task_id = str(index).zfill(4) + '_' + filename.split('.')[0]
        script_file = batchmodels.ResourceFile(
            file_path=_TASK_RESOURCE_FILE,
            blob_source=task_resource_sas_url)
        task = batchmodels.TaskAddParameter(
            id=task_id,
            command_line=_TASK_CLI.format(
                _TASK_RESOURCE_FILE, _FFMPEG_IMAGE, parameters),
            run_elevated=True,
            resource_files=[script_file])

        task_id_list.append(task.id)
        batch_client.task.add(job_id=job_id, task=task)

    return task_id_list
def create_job(self, job_preparation_commands=None):
    """Create the job ``self.job_id`` on the pool ``self.pool_id``.

    Args:
        job_preparation_commands: value passed as the ``command_line`` of a
            job preparation task. When ``None`` no preparation task is
            attached. When given, the preparation task must succeed and is
            rerun after a node reboot.

    Returns:
        True if the job was created, False if a job with this id already
        exists.

    Raises:
        BatchErrorException: re-raised (after logging) for any Batch error
            other than "JobExists".
    """
    job_kwargs = {
        'id': self.job_id,
        'pool_info': batch_models.PoolInformation(pool_id=self.pool_id),
    }
    if job_preparation_commands is not None:
        job_kwargs['job_preparation_task'] = batch_models.JobPreparationTask(
            command_line=job_preparation_commands,
            wait_for_success=True,
            rerun_on_node_reboot_after_success=True)
    job = batch_models.JobAddParameter(**job_kwargs)

    try:
        logging.info('Attempting to create job [{}]...'.format(self.job_id))
        self.batch_client.job.add(job)
        logging.info('Job [{}] created successfully...'.format(self.job_id))
        return True
    except batch_models.batch_error.BatchErrorException as err:
        if err.error.code != "JobExists":
            logging.exception(
                "Unknown error occurred while trying to create job [{}]".format(
                    self.job_id))
            raise
        logging.info("Job [{}] already exists".format(self.job_id))
        return False
def create_job(batch_service_client, job_id, pool_id):
    """Create a job on the given pool that terminates once all tasks complete.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    """
    # Blank line first, to separate this step's console output.
    print()
    print('Creating job [{}]'.format(job_id))

    pool_info = batchmodels.PoolInformation(pool_id=pool_id)
    job_spec = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=pool_info,
        on_all_tasks_complete='terminateJob')

    batch_service_client.job.add(job_spec)
def submit_job_and_add_tasks(batch_client, block_blob_client, job_id, pool_id,
                             in_files, out_container_name, app_files,
                             storage_account_name, out_sas_token):
    """Create a job and add one python-script task per input file.

    Tasks are submitted in slices of 100. A slice whose submission raises
    is retried after a 5 second pause, with no limit on the number of
    retries.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use
        (not referenced by this function).
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    :param list in_files: Resource files of the inputs; each one's
        ``file_path`` is passed to the script.
    :param str out_container_name: The name of the output container.
    :param list app_files: Resource files for all the other scripts.
    :param str storage_account_name: The name of the storage account.
    :param str out_sas_token: A SAS token granting the specified
        permissions to the output container.
    """
    start = time.time()
    pool_info = batchmodels.PoolInformation(pool_id=pool_id)
    job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)
    batch_client.job.add(job)
    logging.info('job created in seconds {}'.format(time.time() - start))

    start = time.time()
    tasks = []
    for i, in_file in enumerate(in_files):
        command_line = 'python {} --filepath {} --storageaccount {} --storagecontainer {} --sastoken "{}"'.format(
            _TASK_FILE,
            in_file.file_path,
            storage_account_name,
            out_container_name,
            out_sas_token)
        tasks.append(batchmodels.TaskAddParameter(
            id="EBOTask-{}".format(i),
            command_line=command_line,
            resource_files=[in_file] + app_files))

    slice_size = 100
    offset = 0
    total = len(tasks)
    while offset < total:
        try:
            batch_client.task.add_collection(
                job.id, tasks[offset:offset + slice_size])
        except Exception as e:
            print("Adding task failed... Going to try again in 5 seconds")
            logging.error(e)
            time.sleep(5)
        else:
            # Only advance once the current slice was accepted.
            offset += slice_size

    logging.info('task created in seconds {}'.format(time.time() - start))
def create_job(batch_client, name_job, name_pool, cmd_prep_task=None):
    """Create a job on a pool, optionally with a job preparation task.

    :param batch_client: client whose ``job.add`` is called.
    :param name_job: id for the new job.
    :param name_pool: id of the pool the job runs on.
    :param cmd_prep_task: command line for a job preparation task, run as a
        task-scoped admin auto-user. When ``None`` (the default) the job is
        created without a preparation task.
    """
    prepare_task = None
    if cmd_prep_task is not None:
        # A preparation task needs a command line, so only build one when a
        # command was actually supplied.
        user = models.UserIdentity(
            auto_user=models.AutoUserSpecification(
                elevation_level=models.ElevationLevel.admin,
                scope=models.AutoUserScope.task))
        prepare_task = models.JobPreparationTask(
            command_line=cmd_prep_task,
            id=None,
            user_identity=user)

    job = models.JobAddParameter(
        id=name_job,
        pool_info=models.PoolInformation(pool_id=name_pool),
        job_preparation_task=prepare_task)
    batch_client.job.add(job)
def __create_pool_and_job(self, cluster_conf, software_metadata_key: str, start_task, VmImageModel):
    """
    Create the cluster's fixed-size pool and its job.

    The pool and the job share the cluster id. Node targets come straight
    from ``cluster_conf`` (no auto-scaling).

    :param cluster_conf: the configuration object used to create the cluster
    :type cluster_conf: aztk.models.ClusterConfiguration
    :param software_metadata_key: the id of the software being used on the cluster
    :param start_task: the start task for the cluster
    :param VmImageModel: the type of image to provision for the cluster
        (its ``publisher``, ``offer`` and ``sku`` attributes are read)
    :return: the result of ``helpers.get_cluster`` for the cluster id
    """
    # The cluster id doubles as both the pool id and the job id.
    pool_id = cluster_conf.cluster_id
    job_id = cluster_conf.cluster_id

    # Resolve a verified node agent sku / image reference for the requested image.
    sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
        VmImageModel.publisher,
        VmImageModel.offer,
        VmImageModel.sku,
        self.batch_client)

    vm_configuration = batch_models.VirtualMachineConfiguration(
        image_reference=image_ref_to_use,
        node_agent_sku_id=sku_to_use)
    pool_metadata = [
        batch_models.MetadataItem(
            name=constants.AZTK_SOFTWARE_METADATA_KEY,
            value=software_metadata_key),
    ]

    pool = batch_models.PoolAddParameter(
        id=pool_id,
        virtual_machine_configuration=vm_configuration,
        vm_size=cluster_conf.vm_size,
        target_dedicated_nodes=cluster_conf.vm_count,
        target_low_priority_nodes=cluster_conf.vm_low_pri_count,
        start_task=start_task,
        enable_inter_node_communication=True,
        max_tasks_per_node=1,
        metadata=pool_metadata)

    helpers.create_pool_if_not_exist(pool, self.batch_client)

    job = batch_models.JobAddParameter(
        id=job_id,
        pool_info=batch_models.PoolInformation(pool_id=pool_id))
    self.batch_client.job.add(job)

    return helpers.get_cluster(cluster_conf.cluster_id, self.batch_client)
def create_job(batch_service_client, job_id, pool_id):
    """
    Announce and submit a new job bound to an existing pool.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    """
    print("Creating job [{}]...".format(job_id))

    target_pool = models.PoolInformation(pool_id=pool_id)
    batch_service_client.job.add(
        models.JobAddParameter(id=job_id, pool_info=target_pool))
def create_job(batch_service_client: BatchServiceClient, job_id: str, pool_id: str):
    """
    Announce and submit a new job bound to an existing pool.

    :param batch_service_client: A Batch service client.
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    """
    print(f'Creating job [{job_id}]...')

    target_pool = batchmodels.PoolInformation(pool_id=pool_id)
    job_spec = batchmodels.JobAddParameter(id=job_id, pool_info=target_pool)

    batch_service_client.job.add(job_spec)
def create_job(batch_service_client, job_id, pool_id):
    """Create a job on the given pool, tolerating a job that already exists.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    :raises BatchErrorException: for any Batch error other than "JobExists"
        (the error is printed first in every case).
    """
    print('Creating job [{}]...'.format(job_id))

    # Keyword arguments: the model's parameters are not guaranteed to be
    # accepted positionally across SDK versions.
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))

    try:
        batch_service_client.job.add(job)
    except batchmodels.batch_error.BatchErrorException as err:
        print_batch_exception(err)
        if err.error.code != "JobExists":
            raise
        else:
            print("Job {!r} already exists".format(job_id))
def run(self, wait=True, **kwargs) -> None:
    """
    Create the pool if one is configured, submit the job and its tasks.

    A pool is created only when the config has a ``POOL_VM_SIZE`` and at
    least one of ``POOL_NODE_COUNT`` / ``POOL_LOW_PRIORITY_NODE_COUNT``.
    A Batch error during pool creation is treated as "use the pool with
    this id". Otherwise the pool named by ``POOL_ID`` is used as is.

    :param boolean wait: If true, call ``self.load_results(**kwargs)`` after
        the tasks have been submitted
    :param kwargs: forwarded to ``self.load_results``
    :raises ValueError: If this client has no ``tasks`` attribute
    :raises BatchErrorException: If raised by the Azure Batch Python SDK
        while adding the job or tasks (printed before being re-raised)
    """
    if not hasattr(self, "tasks"):
        raise ValueError(
            "Client restored from data cannot be used to run the job")

    try:
        pool_is_configured = self.config.POOL_VM_SIZE and (
            self.config.POOL_NODE_COUNT
            or self.config.POOL_LOW_PRIORITY_NODE_COUNT)

        if pool_is_configured:
            try:
                self._create_pool()
                print("Created pool: ", self.config.POOL_ID)
            except models.BatchErrorException:
                print("Using pool: ", self.config.POOL_ID)
        else:
            print("Using existing pool: ", self.config.POOL_ID)

        # Create the job that will run the tasks.
        pool_info = models.PoolInformation(pool_id=self.config.POOL_ID)
        job_description = models.JobAddParameter(
            id=self.config.JOB_ID,
            pool_info=pool_info,
        )
        self.batch_client.job.add(job_description)

        # Add the tasks to the job.
        self.batch_client.task.add_collection(self.config.JOB_ID, self.tasks)
    except models.BatchErrorException as err:
        print_batch_exception(err)
        raise err

    if wait:
        self.load_results(**kwargs)
def create_job(self, job_id, pool_id, total_nodes, is_linux_pool):
    """Create (or reuse) a job and add ``total_nodes`` setup-script tasks.

    Each task downloads the azure-batch-ses script for the pool's OS as a
    resource file and runs it as a pool-scoped admin auto-user, with up to
    3 retries. Every task gets a fresh UUID as its id.

    :param job_id: id of the job to create; an existing job with this id is
        reused.
    :param pool_id: id of the pool the job runs on.
    :param total_nodes: number of tasks to add.
    :param is_linux_pool: selects the bash script when true, the PowerShell
        script otherwise.
    :raises batchmodels.BatchErrorException: re-raised after its details
        have been printed once.
    """
    client = self._get_batch_client()
    try:
        pool_info = batchmodels.PoolInformation(pool_id=pool_id)
        job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)

        try:
            client.job.add(job)
        except batchmodels.BatchErrorException as be:
            # An existing job is fine. Anything else propagates to the outer
            # handler, which prints the details exactly once and copes with
            # a missing `error` payload.
            if not (be.error and be.error.code == 'JobExists'):
                raise

        if is_linux_pool:
            cmd_line = '/bin/bash -c azure-batch-ses.sh'
            script = 'azure-batch-ses.sh'
            script_url = 'https://raw.githubusercontent.com/Azure/azure-deadline/master/CloudProviderPlugin/Scripts/azure-batch-ses.sh'
        else:
            cmd_line = 'powershell.exe -file azure-batch-ses.ps1'
            script = 'azure-batch-ses.ps1'
            script_url = 'https://raw.githubusercontent.com/Azure/azure-deadline/master/CloudProviderPlugin/Scripts/azure-batch-ses.ps1'

        task = batchmodels.TaskAddParameter(
            id='',
            command_line=cmd_line,
            resource_files=[batchmodels.ResourceFile(script_url, script)],
            constraints=batchmodels.TaskConstraints(
                max_task_retry_count=3),
            user_identity=batchmodels.UserIdentity(
                auto_user=batchmodels.AutoUserSpecification(
                    scope=batchmodels.AutoUserScope.pool,
                    elevation_level=batchmodels.ElevationLevel.admin)))

        # The same task definition is resubmitted under a new id each time.
        for _ in range(total_nodes):
            task.id = str(uuid.uuid4())
            client.task.add(job_id=job.id, task=task)

    except batchmodels.BatchErrorException as be:
        if be.error:
            print('Error creating job, code={}, message={}'.format(
                be.error.code, be.error.message))
            if be.error.values:
                for e in be.error.values:
                    print('Key={}, Value={}'.format(e.key, e.value))
        raise
def create_job(self, pool_id: str):
    """Create the "<pool_id>-queue" job on the pool, tolerating an existing one.

    :param pool_id: id of the pool; the job id and display name are derived
        from it by appending ``-queue``.
    :return: the ``JobAddParameter`` describing the job, whether it was
        newly created or already existed.
    :raises BatchErrorException: for any Batch error other than "JobExists".
    """
    from azure.batch import models as batchmodels

    job_queue_name = pool_id + '-queue'
    job_spec = batchmodels.JobAddParameter(
        id=job_queue_name,
        display_name=job_queue_name,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))

    try:
        self.batch_client.job.add(job_spec)
    except batchmodels.BatchErrorException as err:
        if err.error.code == "JobExists":
            logging.info("Job {!r} already exists".format(job_queue_name))
        else:
            raise

    return job_spec
def create_job(batch_service_client, job_id, pool_id):
    """
    Create a job on the given pool, tolerating a job that already exists.

    Every Batch error is printed. Only "JobExists" is swallowed.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    """
    print('Creating job [{}]...'.format(job_id))

    target_pool = batchmodels.PoolInformation(pool_id=pool_id)
    job_spec = batchmodels.JobAddParameter(id=job_id, pool_info=target_pool)

    try:
        batch_service_client.job.add(job_spec)
    except batchmodels.batch_error.BatchErrorException as err:
        print_batch_exception(err)
        if err.error.code == "JobExists":
            print("Job {!r} already exists".format(job_id))
        else:
            raise
def submit_job_and_add_task(batch_client, block_blob_client, job_id, pool_id):
    """Create a job and add the single "SliceTask" that runs ``TASK_NAME``.

    The script at ``TASK_PATH`` is uploaded to ``CONTAINER_NAME`` (created
    if missing) and given to the task through a SAS URL that expires one
    hour after submission.

    :param batch_client: The batch client to use.
    :param block_blob_client: The storage block blob client to use.
    :param str job_id: The id of the job to create.
    :param str pool_id: The id of the pool to use.
    """
    pool_info = batchmodels.PoolInformation(pool_id=pool_id)
    job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)
    batch_client.job.add(job)

    block_blob_client.create_container(CONTAINER_NAME, fail_on_exist=False)

    sas_expiry = datetime.datetime.utcnow() + datetime.timedelta(hours=1)
    sas_url = upload_blob_and_create_sas(
        block_blob_client,
        CONTAINER_NAME,
        TASK_NAME,
        TASK_PATH,
        sas_expiry)

    script_file = batchmodels.ResourceFile(
        file_path=TASK_NAME,
        blob_source=sas_url)
    task = batchmodels.TaskAddParameter(
        id="SliceTask",
        command_line="python3 " + TASK_NAME,
        resource_files=[script_file])

    batch_client.task.add(job_id=job.id, task=task)
def configure_job(
    self,
    job_id: str,
    pool_id: str,
    display_name: Optional[str] = None,
    **kwargs,
) -> JobAddParameter:
    """
    Build (but do not submit) a job definition bound to a pool.

    :param job_id: A string that uniquely identifies the job within the account
    :param pool_id: A string that identifies the pool
    :param display_name: The display name for the job
    :param kwargs: extra keyword arguments forwarded to ``JobAddParameter``
    :return: the configured ``JobAddParameter``
    """
    target_pool = batch_models.PoolInformation(pool_id=pool_id)
    return batch_models.JobAddParameter(
        id=job_id,
        pool_info=target_pool,
        display_name=display_name,
        **kwargs,
    )
def create_job(batch_service_client, job_id, pool_id):
    """
    Create a job with task dependencies enabled, tolerating an existing job.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param str pool_id: The ID for the pool.
    :raises BatchErrorException: for any Batch error that does not indicate
        that the job already exists.
    """
    LOGGER.info('Creating job [{}]...'.format(job_id))

    job = batch_models.JobAddParameter(
        id=job_id,
        pool_info=batch_models.PoolInformation(pool_id=pool_id),
        uses_task_dependencies=True)

    try:
        batch_service_client.job.add(job)
        LOGGER.info("Job Created")
    except batch_models.BatchErrorException as err:
        # Prefer the stable error code. The human-readable message is kept
        # only as a fallback and is read defensively, since either the error
        # payload or its message may be absent.
        error = getattr(err, 'error', None)
        code = getattr(error, 'code', None)
        message = getattr(getattr(error, 'message', None), 'value', None) or ''
        if code == 'JobExists' or 'The specified job already exists.' in message:
            LOGGER.info("Job already exists...")
        else:
            raise
def add_tasks(batch_service_client, pool_id, task_id, docker_image,
              storage_account, storage_key, container_name, file_name,
              output_container):
    """Ensure the "batchjob" job exists on the pool and add one container task.

    :param batch_service_client: A Batch service client.
    :param pool_id: id of the pool the job runs on.
    :param task_id: id for the new task; also passed to the worker script.
    :param docker_image: container image the task runs in.
    :param storage_account: storage account name passed to the worker script.
    :param storage_key: storage account key passed to the worker script.
    :param container_name: input container name passed to the worker script.
    :param file_name: input file name passed to the worker script.
    :param output_container: output container name passed to the worker script.
    :raises batchmodels.BatchErrorException: when creating the job fails for
        any reason other than the job already existing, or when adding the
        task fails.
    """
    job_id = "batchjob"
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))

    try:
        batch_service_client.job.add(job)
        logging.info('Adding job {} to pool...'.format(job_id))
    except batchmodels.BatchErrorException as err:
        # Only "job already exists" is expected here. Other failures must
        # surface instead of being reported as an existing job.
        if err.error is None or err.error.code != 'JobExists':
            raise
        logging.info(
            'Job ID: {} already exists and associated with pool...'.format(
                job_id))

    logging.info('Adding tasks to job [{}]...'.format(job_id))

    # Identity the command runs under: a task-scoped auto-user with admin
    # elevation.
    user = batchmodels.AutoUserSpecification(
        scope=batchmodels.AutoUserScope.task,
        elevation_level=batchmodels.ElevationLevel.admin)

    # The docker image to run, with /scratch mounted into the container.
    task_container_settings = batchmodels.TaskContainerSettings(
        image_name=docker_image,
        container_run_options='--rm -v /scratch:/scratch')

    # NOTE(review): the storage key is embedded in the task command line, so
    # it is visible wherever the command line is; consider a less exposed
    # channel.
    task = batchmodels.TaskAddParameter(
        id=task_id,
        command_line='/opt/azureblobworker.sh %s %s %s %s %s %s' %
        (storage_account, storage_key, task_id, container_name, file_name,
         output_container),
        container_settings=task_container_settings,
        user_identity=batchmodels.UserIdentity(auto_user=user))

    batch_service_client.task.add(job_id, task)
def test_batch_jobs(self, **kwargs):
    """Walk a pair of jobs through the job operations of the Batch client.

    Covers add, update, patch, get, list, disable, enable, preparation /
    release task status, terminate, delete and lifetime statistics, in
    that order. Both jobs run on the same auto-pool specification.
    """
    client = self.create_sharedkey_client(**kwargs)

    # --- Create Job ---------------------------------------------------
    cloud_service = models.CloudServiceConfiguration(os_family='5')
    pool_spec = models.PoolSpecification(
        vm_size='small',
        cloud_service_configuration=cloud_service)
    auto_pool = models.AutoPoolSpecification(
        pool_lifetime_option=models.PoolLifetimeOption.job,
        pool=pool_spec)
    job_prep = models.JobPreparationTask(command_line="cmd /c \"echo hello world\"")
    job_release = models.JobReleaseTask(command_line="cmd /c \"echo goodbye world\"")
    job_param = models.JobAddParameter(
        id=self.get_resource_name('batch_job1_'),
        pool_info=models.PoolInformation(auto_pool_specification=auto_pool),
        job_preparation_task=job_prep,
        job_release_task=job_release)
    self.assertIsNone(client.job.add(job_param))

    # --- Update Job ---------------------------------------------------
    update_options = models.JobUpdateParameter(
        priority=500,
        constraints=models.JobConstraints(max_task_retry_count=3),
        pool_info=models.PoolInformation(auto_pool_specification=auto_pool))
    self.assertIsNone(client.job.update(job_param.id, update_options))

    # --- Patch Job ----------------------------------------------------
    patch_options = models.JobPatchParameter(priority=900)
    self.assertIsNone(client.job.patch(job_param.id, patch_options))

    # The fetched job must reflect both the update and the later patch.
    fetched = client.job.get(job_param.id)
    self.assertIsInstance(fetched, models.CloudJob)
    self.assertEqual(fetched.id, job_param.id)
    self.assertEqual(fetched.constraints.max_task_retry_count, 3)
    self.assertEqual(fetched.priority, 900)

    # --- Create Job with Auto Complete ---------------------------------
    job_auto_param = models.JobAddParameter(
        id=self.get_resource_name('batch_job2_'),
        on_all_tasks_complete=models.OnAllTasksComplete.terminate_job,
        on_task_failure=models.OnTaskFailure.perform_exit_options_job_action,
        pool_info=models.PoolInformation(auto_pool_specification=auto_pool))
    self.assertIsNone(client.job.add(job_auto_param))

    fetched = client.job.get(job_auto_param.id)
    self.assertIsInstance(fetched, models.CloudJob)
    self.assertEqual(fetched.on_all_tasks_complete,
                     models.OnAllTasksComplete.terminate_job)
    self.assertEqual(fetched.on_task_failure,
                     models.OnTaskFailure.perform_exit_options_job_action)

    # --- List Jobs ----------------------------------------------------
    listed = client.job.list()
    self.assertIsInstance(listed, models.CloudJobPaged)
    self.assertEqual(len(list(listed)), 2)

    # --- Disable Job --------------------------------------------------
    self.assertIsNone(
        client.job.disable(job_param.id, models.DisableJobOption.requeue))

    # --- Enable Job ---------------------------------------------------
    self.assertIsNone(client.job.enable(job_param.id))

    # --- Prep and release task status ---------------------------------
    prep_release_status = client.job.list_preparation_and_release_task_status(job_param.id)
    self.assertIsInstance(
        prep_release_status,
        models.JobPreparationAndReleaseTaskExecutionInformationPaged)
    self.assertEqual(list(prep_release_status), [])

    # --- Terminate Job ------------------------------------------------
    self.assertIsNone(client.job.terminate(job_param.id))

    # --- Delete Job ---------------------------------------------------
    self.assertIsNone(client.job.delete(job_auto_param.id))

    # --- Job Lifetime Statistics --------------------------------------
    lifetime_stats = client.job.get_all_lifetime_statistics()
    self.assertIsInstance(lifetime_stats, models.JobStatistics)
    self.assertEqual(lifetime_stats.num_succeeded_tasks, 0)
    self.assertEqual(lifetime_stats.num_failed_tasks, 0)
def _setup_job(self, distributable_list, pool_id, name, log_writer=None):
    '''
    This is the main method for submitting to AzureBatch.

    For the given distributables it, in order: pickles each one under
    "runs/[job_id]", registers input/output files with the copier objects,
    writes jobprep.bat and one map/reduce .bat pair per distributable,
    uploads those files (plus blobxfer.py) to blob storage, creates a Batch
    job with a job preparation task, and finally adds the map tasks and
    one dependent reduce task per distributable.

    :param distributable_list: the things to run; each is pickled and gets
        its own map/reduce scripts.
    :param pool_id: ID of the Batch pool the job is created on.
    :param name: label used in log messages and, after character
        substitution, as the suffix of the generated job ID.
    :param log_writer: optional callable taking one message string; called
        with a progress message at the start of each phase.
    :return: list with one tuple per distributable:
        (job_id, inputOutputCopier, output_blobfn, run_dir_rel).
    :raises Exception: when adding the job fails with a BatchErrorException
        (re-wrapped), or when adding the tasks fails (re-raised after being
        printed).
    '''
    # Job ID is a UTC timestamp plus a sanitized copy of `name`.
    job_id = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S") + "-" + name.replace("_","-").replace("/","-").replace(".","-").replace("+","-").replace("(","").replace(")","")
    job_id_etc_list = []

    if True: # Pickle the things-to-run - put them in a local directory under the current directory called "runs/[jobid]" where the jobid is based on the date.
        if log_writer is not None: log_writer("{0}: Pickle the thing to run".format(name))
        run_dir_rel = os.path.join("runs",job_id)
        pstutil.create_directory_if_necessary(run_dir_rel, isfile=False)
        for index, distributable in enumerate(distributable_list):
            distributablep_filename = os.path.join(run_dir_rel, "distributable{0}.p".format(index))
            with open(distributablep_filename, mode='wb') as f:
                pickle.dump(distributable, f, pickle.HIGHEST_PROTOCOL)

    if True: # Copy (update) any (small) input files to the blob
        if log_writer is not None: log_writer("{0}: Upload small input files".format(name))
        data_blob_fn = "{0}-data-v{1}".format(self.container,self.data_version)
        inputOutputCopier = AzureBatchCopier(data_blob_fn, self.storage_key, self.storage_account_name)
        script_list = ["",""] #These will be scripts for copying to and from AzureStorage and the cluster nodes.
        # presumably AzureBatchCopierNodeLocal fills script_list in place as inputs/outputs are registered -- verify against that class
        inputOutputCopier2 = AzureBatchCopierNodeLocal(data_blob_fn, self.container, self.data_version, self.storage_key, self.storage_account_name, script_list)
        for index, distributable in enumerate(distributable_list):
            inputOutputCopier.input(distributable)
            inputOutputCopier2.input(distributable)
            inputOutputCopier2.output(distributable)
            output_blobfn = "{0}/output{1}".format(run_dir_rel.replace("\\","/"),index) #The name of the directory of return values in Azure Storage.
            job_id_etc_list.append((job_id, inputOutputCopier, output_blobfn, run_dir_rel))

    if True: # Create the jobprep program -- sets the python path and downloads the pythonpath code. Also create node-local folder for return values.
        if log_writer is not None: log_writer("{0}: Create jobprep.bat script".format(name))
        # NOTE(review): os.environ.get returns None when PYTHONPATH is unset, in which case the .split(';') calls below raise AttributeError.
        localpythonpath = os.environ.get("PYTHONPATH") #!!should it be able to work without pythonpath being set (e.g. if there was just one file)? Also, is None really the return or is it an exception.
        jobprep_filename = os.path.join(run_dir_rel, "jobprep.bat")
        # It only copies down files that are needed, but with some probability (about 1 in 50, say) fails, so we repeat three times.
        # NOTE(review): the outer "for /l %%t in (0,1,3)" range looks inclusive, i.e. four passes rather than three -- confirm.
        # NOTE(review): `index` (#8) is not a loop variable here; it appears to be the value left over from the
        # preceding loop, so only the last distributable's output folder is created by jobprep.bat (the map/reduce
        # scripts also mkdir their own folder). It would be undefined for an empty distributable_list -- confirm intent.
        with open(jobprep_filename, mode='w') as f2:
            f2.write(r"""set
set path=%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2;%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\scripts\;%path%
for /l %%t in (0,1,3) do FOR /L %%i IN (0,1,{7}) DO python.exe %AZ_BATCH_TASK_WORKING_DIR%\blobxfer.py --skipskip --delete --storageaccountkey {2} --download {3} {4}-pp-v{5}-%%i %AZ_BATCH_NODE_SHARED_DIR%\{4}\pp\v{5}\%%i --remoteresource .
{6}
mkdir %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{8}
exit /b 0
"""
            .format(
                None,                                   #0 - not used
                None,                                   #1 - not used
                self.storage_key,                       #2
                self.storage_account_name,              #3
                self.container,                         #4
                self.pp_version,                        #5
                script_list[0],                         #6
                len(localpythonpath.split(';'))-1,      #7
                index,                                  #8
            ))

    if True: #Split the taskcount roughly evenly among the distributables
        subtaskcount_list = deal(len(distributable_list),self.taskcount)

    if True: # Create the map.bat and reduce.bat programs to run.
        if log_writer is not None: log_writer("{0}: Create map.bat and reduce.bat script".format(name))
        # One node-local folder per PYTHONPATH entry, joined into a single "set pythonpath=" line.
        pythonpath_string = "set pythonpath=" + ";".join(r"%AZ_BATCH_NODE_SHARED_DIR%\{0}\pp\v{1}\{2}".format(self.container,self.pp_version,i) for i in range(len(localpythonpath.split(';'))))
        for index in range(len(distributable_list)):
            subtaskcount = subtaskcount_list[index]
            output_blobfn = job_id_etc_list[index][2]
            # i==0 writes the map script, i==1 the reduce script. Both come from the same template;
            # {5} and {6} prefix lines with "@rem " to disable them in the script they do not apply to.
            for i, bat_filename in enumerate(["map{0}.bat".format(index),"reduce{0}.bat".format(index)]):
                bat_filename = os.path.join(run_dir_rel, bat_filename)
                with open(bat_filename, mode='w') as f1:
                    #note that it's getting distributable.py from site-packages and never from the pythonpath
                    f1.write(r"""set path=%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2;%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\scripts\;%path%
mkdir %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{6}cd %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{6}FOR /L %%i IN (0,1,{11}) DO python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --download {3} {8}/{10} . --remoteresource %%i.{0}.p
cd %AZ_BATCH_NODE_SHARED_DIR%\{8}\data\v{9}
{13}
python.exe %AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\Lib\site-packages\fastlmm\util\distributable.py %AZ_BATCH_JOB_PREP_WORKING_DIR%\distributable{14}.p LocalInParts(%1,{0},result_file=r\"{4}/result.p\",mkl_num_threads={1},temp_dir=r\"{4}\")
IF %ERRORLEVEL% NEQ 0 (EXIT /B %ERRORLEVEL%)
{6}{7}
cd %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{5}for /l %%t in (0,1,3) do python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --upload {3} {8} %1.{0}.p --remoteresource {10}/%1.{0}.p
{6}for /l %%t in (0,1,3) do python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --upload {3} {8} result.p --remoteresource {10}/result.p
"""
                    .format(
                        subtaskcount,                           #0
                        self.mkl_num_threads,                   #1
                        self.storage_key,                       #2
                        self.storage_account_name,              #3
                        "%AZ_BATCH_TASK_WORKING_DIR%/../../output{0}".format(index), #4
                        "" if i==0 else "@rem ",                #5
                        "" if i==1 else "@rem ",                #6
                        script_list[1],                         #7
                        self.container,                         #8
                        self.data_version,                      #9
                        output_blobfn,                          #10
                        subtaskcount-1,                         #11
                        self.pp_version,                        #12
                        pythonpath_string,                      #13
                        index,                                  #14
                    ))

    if True: # Upload the thing-to-run to a blob and the blobxfer program
        if log_writer is not None: log_writer("{0}: Upload the thing to run".format(name))
        block_blob_client = azureblob.BlockBlobService(account_name=self.storage_account_name,account_key=self.storage_key)
        block_blob_client.create_container(self.container, fail_on_exist=False)

        # Every upload below is requested with an expiry 30 days from now.
        blobxfer_blobfn = "utils/v{}/blobxfer.py".format(self.utils_version)
        blobxfer_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, blobxfer_blobfn, os.path.join(os.path.dirname(__file__),"blobxfer.py"), datetime.datetime.utcnow() + datetime.timedelta(days=30))

        jobprep_blobfn = "{}/jobprep.bat".format(run_dir_rel.replace("\\","/"))
        jobprepbat_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, jobprep_blobfn, os.path.join(run_dir_rel, "jobprep.bat"), datetime.datetime.utcnow() + datetime.timedelta(days=30))

        # Collected as (map_url, reduce_url, distributablep_url), one tuple per distributable.
        map_reduce_url_list = []
        for index in range(len(distributable_list)):
            distributablep_blobfn = "{0}/distributable{1}.p".format(run_dir_rel.replace("\\","/"),index)
            distributablep_filename = os.path.join(run_dir_rel, "distributable{0}.p".format(index))
            distributablep_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, distributablep_blobfn, distributablep_filename, datetime.datetime.utcnow() + datetime.timedelta(days=30)) #!!!should there be an expiry?

            map_blobfn = "{0}/map{1}.bat".format(run_dir_rel.replace("\\","/"),index)
            map_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, map_blobfn, os.path.join(run_dir_rel, "map{0}.bat".format(index)), datetime.datetime.utcnow() + datetime.timedelta(days=30))

            reduce_blobfn = "{0}/reduce{1}.bat".format(run_dir_rel.replace("\\","/"),index)
            reduce_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, reduce_blobfn, os.path.join(run_dir_rel, "reduce{0}.bat".format(index)), datetime.datetime.utcnow() + datetime.timedelta(days=30))
            map_reduce_url_list.append((map_url,reduce_url,distributablep_url))

    if True: # Copy everything on PYTHONPATH to a blob
        if log_writer is not None: log_writer("{0}: Upload items on pythonpath as requested".format(name))
        if self.update_python_path == 'every_time':
            self._update_python_path_function()

    if True: # Create a job with a job prep task
        if log_writer is not None: log_writer("{0}: Create jobprep.bat".format(name))
        # The job preparation task receives blobxfer.py, jobprep.bat and every pickled distributable.
        resource_files=[
            batchmodels.ResourceFile(blob_source=blobxfer_url, file_path="blobxfer.py"),
            batchmodels.ResourceFile(blob_source=jobprepbat_url, file_path="jobprep.bat")]
        for index in range(len(distributable_list)):
            _, _, distributablep_url = map_reduce_url_list[index]
            resource_files.append(batchmodels.ResourceFile(blob_source=distributablep_url, file_path="distributable{0}.p".format(index)))

        job_preparation_task = batchmodels.JobPreparationTask(
                id="jobprep",
                #run_elevated=True,
                user_identity=batchmodels.UserIdentity(auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                resource_files=resource_files,
                command_line="jobprep.bat",
                )

        job = batchmodels.JobAddParameter(
            id=job_id,
            job_preparation_task=job_preparation_task,
            pool_info=batch.models.PoolInformation(pool_id=pool_id),
            uses_task_dependencies=True,
            on_task_failure='performExitOptionsJobAction',
            )
        try:
            self.batch_client.job.add(job)
        except batchmodels.BatchErrorException as e:
            # Re-raise as a plain Exception carrying the last detail value when one is available.
            if e.inner_exception.values is not None:
                raise Exception(e.inner_exception.values[-1].value)
            else:
                raise Exception(e.inner_exception)

    if True: # Add regular tasks to the job
        if log_writer is not None: log_writer("{0}: Add tasks to job".format(name))
        task_factor = int(10**math.ceil(math.log(max(subtaskcount_list),10))) #When we have multiple distributables, this helps us number them e.g. 0,1,2,10,11,12,20,21,22
        task_list = []
        for index in range(len(distributable_list)):
            # start/end bracket this distributable's map tasks inside task_list; the reduce task depends on that ID range.
            start = len(task_list)
            map_url, reduce_url, _ = map_reduce_url_list[index]
            subtaskcount = subtaskcount_list[index]
            for taskindex in range(subtaskcount):
                map_task = batchmodels.TaskAddParameter(
                    id=index * task_factor + taskindex,
                    #run_elevated=True,
                    user_identity=batchmodels.UserIdentity(auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                    #!!! seems to exit without needing a failure exit_conditions = batchmodels.ExitConditions(default=batchmodels.ExitOptions(job_action='terminate')),
                    resource_files=[batchmodels.ResourceFile(blob_source=map_url, file_path="map{0}.bat".format(index))],
                    command_line=r"map{0}.bat {1}".format(index, taskindex),
                )
                task_list.append(map_task)
            end = len(task_list)-1
            reduce_task = batchmodels.TaskAddParameter(
                id="reduce{0}".format(index),
                #run_elevated=True,
                user_identity=batchmodels.UserIdentity(auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                resource_files=[batchmodels.ResourceFile(blob_source=reduce_url, file_path="reduce{0}.bat".format(index))],
                command_line=r"reduce{0}.bat {1}".format(index, subtaskcount),
                depends_on = batchmodels.TaskDependencies(task_id_ranges=[batchmodels.TaskIdRange(task_list[start].id,task_list[end].id)])
                )
            task_list.append(reduce_task)

        try:
            for i in range(0,len(task_list),100): #The Python API only lets us add 100 at a time.
                self.batch_client.task.add_collection(job_id, task_list[i:i+100])
        except Exception as exception:
            print(exception)
            raise exception
    return job_id_etc_list
def submit_job_and_add_task(batch_client, block_blob_client, job_id, pool_id, storage_account_name):
    """Create a job on the pool and add one task per blob in the resource container.

    A container named after the job is created (with a write SAS) to
    receive task output. For every blob listed in the resource container a
    task is added whose command line runs the configured application on
    that blob, and whose output file is uploaded to the output container
    on task completion.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to create; also used as the name
        of the output container.
    :param str pool_id: The id of the pool to use.
    :param str storage_account_name: Storage account name used to build
        the output container URL.
    """
    pool_info = batchmodels.PoolInformation(pool_id=pool_id)
    job = batchmodels.JobAddParameter(id=job_id, pool_info=pool_info)
    batch_client.job.add(job)

    # Output container shares the job's id; tasks write to it through a SAS URL.
    container_sas_token = common.helpers.create_container_and_create_sas(
        block_blob_client,
        job_id,
        azureblob.BlobPermissions.WRITE,
        expiry=None,
        timeout=120)
    output_container_sas_url = 'https://{}.blob.core.windows.net/{}?{}'.format(
        storage_account_name, job_id, container_sas_token)

    # Application files are shared by every task.
    app_file_list = get_resource_file_list_from_container(
        block_blob_client, _APP_CONTAINER_NAME)

    for blob in block_blob_client.list_blobs(_RESOURCE_CONTAINER_NAME):
        blob_base_name, blob_extension = os.path.splitext(blob.name)
        output_file_name = f"{blob_base_name}_out{blob_extension}"
        command_line = f"{_APP_EXE_NAME} {_APP_EXTRA_ARGS} {blob.name} {output_file_name}"
        task_id = f"{_APP_EXE_NAME}_{blob_base_name}_Task"

        # Read-only SAS for this input blob, requested with a one-hour expiry.
        sas_expiry = datetime.datetime.utcnow() + datetime.timedelta(hours=1)
        resource_sas_url = common.helpers.create_sas_url(
            block_blob_client,
            _RESOURCE_CONTAINER_NAME,
            blob.name,
            azureblob.BlobPermissions.READ,
            sas_expiry)
        input_file = batchmodels.ResourceFile(
            file_path=blob.name, http_url=resource_sas_url)

        # NOTE(review): the first print writes the SAS URL to stdout.
        print(resource_sas_url)
        print(app_file_list)
        print(f"Creating task ({task_id}): " + command_line)

        upload_options = batchmodels.OutputFileUploadOptions(
            upload_condition=batchmodels.OutputFileUploadCondition.task_completion)
        container_destination = batchmodels.OutputFileBlobContainerDestination(
            container_url=output_container_sas_url)
        output_file = batchmodels.OutputFile(
            file_pattern=output_file_name,
            destination=batchmodels.OutputFileDestination(
                container=container_destination),
            upload_options=upload_options)

        task = batchmodels.TaskAddParameter(
            id=task_id,
            command_line=command_line,
            resource_files=app_file_list + [input_file],
            output_files=[output_file])
        batch_client.task.add(job_id=job.id, task=task)