def __get_docker_credentials(spark_client):
    """
    Build the environment settings that carry the Docker credentials.

    Each of ``docker_endpoint``, ``docker_username`` and ``docker_password``
    is read from ``spark_client.secrets_config`` and only emitted when its
    value is truthy.

    :param spark_client: object exposing a ``secrets_config`` attribute
    :return: list of ``batch_models.EnvironmentSetting`` (possibly empty),
        ordered endpoint, username, password
    """
    # (environment variable name, attribute on secrets_config)
    credential_fields = (
        ("DOCKER_ENDPOINT", "docker_endpoint"),
        ("DOCKER_USERNAME", "docker_username"),
        ("DOCKER_PASSWORD", "docker_password"),
    )

    settings = []
    for env_name, attribute in credential_fields:
        secret_value = getattr(spark_client.secrets_config, attribute)
        if not secret_value:
            continue
        settings.append(
            batch_models.EnvironmentSetting(name=env_name, value=secret_value))
    return settings
def __get_secrets_env(core_base_operations):
    """
    Build the environment settings that expose the account secrets.

    When ``secrets_configuration.shared_key`` is truthy the shared key values
    (batch service url/key and storage account name/key/suffix) are exported;
    otherwise the service principal values are exported.

    :param core_base_operations: object exposing ``secrets_configuration``
        with ``shared_key`` and ``service_principal`` attributes
    :return: list of five ``batch_models.EnvironmentSetting``
    """
    secrets = core_base_operations.secrets_configuration
    shared_key = secrets.shared_key
    service_principal = secrets.service_principal

    if shared_key:
        name_value_pairs = [
            ("BATCH_SERVICE_URL", shared_key.batch_service_url),
            ("BATCH_ACCOUNT_KEY", shared_key.batch_account_key),
            ("STORAGE_ACCOUNT_NAME", shared_key.storage_account_name),
            ("STORAGE_ACCOUNT_KEY", shared_key.storage_account_key),
            ("STORAGE_ACCOUNT_SUFFIX", shared_key.storage_account_suffix),
        ]
    else:
        name_value_pairs = [
            ("SP_TENANT_ID", service_principal.tenant_id),
            ("SP_CLIENT_ID", service_principal.client_id),
            ("SP_CREDENTIAL", service_principal.credential),
            ("SP_BATCH_RESOURCE_ID", service_principal.batch_account_resource_id),
            ("SP_STORAGE_RESOURCE_ID", service_principal.storage_account_resource_id),
        ]

    return [
        batch_models.EnvironmentSetting(name=env_name, value=env_value)
        for env_name, env_value in name_value_pairs
    ]
def generate_cluster_start_task(
        core_base_operations,
        zip_resource_file: batch_models.ResourceFile,
        cluster_id: str,
        gpu_enabled: bool,
        docker_repo: str = None,
        docker_run_options: str = None,
        file_shares: List[models.FileShare] = None,
        mixed_mode: bool = False,
        worker_on_master: bool = True,
):
    """
    Build the start task object for the pool to be created.

    :param core_base_operations: passed to the helpers that produce the
        secrets and Docker credential environment settings
    :param zip_resource_file: Resource file object pointing to the zip file
        containing scripts to run on the node
    :param cluster_id str: Id of the cluster, forwarded to
        ``_get_aztk_environment``
    :param gpu_enabled: exported as ``AZTK_GPU_ENABLED`` and forwarded to the
        install command
    :param docker_repo: forwarded to the install command
    :param docker_run_options: forwarded to the install command
    :param file_shares: forwarded to the install command
    :param mixed_mode: forwarded to ``_get_aztk_environment``
    :param worker_on_master: forwarded to ``_get_aztk_environment``
    :return: ``batch_models.StartTask`` that runs as
        ``POOL_ADMIN_USER_IDENTITY``, waits for success and retries up to
        2 times
    """
    # TODO use certificate
    secrets_env = __get_secrets_env(core_base_operations)
    spark_env = [
        batch_models.EnvironmentSetting(
            name="SPARK_WEB_UI_PORT", value=constants.DOCKER_SPARK_WEB_UI_PORT),
        batch_models.EnvironmentSetting(
            name="SPARK_WORKER_UI_PORT", value=constants.DOCKER_SPARK_WORKER_UI_PORT),
        batch_models.EnvironmentSetting(
            name="SPARK_JOB_UI_PORT", value=constants.DOCKER_SPARK_JOB_UI_PORT),
        batch_models.EnvironmentSetting(
            name="SPARK_CONTAINER_NAME", value=constants.DOCKER_SPARK_CONTAINER_NAME),
        batch_models.EnvironmentSetting(
            name="SPARK_SUBMIT_LOGS_FILE", value=constants.SPARK_SUBMIT_LOGS_FILE),
        batch_models.EnvironmentSetting(
            name="AZTK_GPU_ENABLED", value=helpers.bool_env(gpu_enabled)),
    ]
    docker_env = __get_docker_credentials(core_base_operations)
    aztk_env = _get_aztk_environment(cluster_id, worker_on_master, mixed_mode)
    all_settings = secrets_env + spark_env + docker_env + aztk_env

    # start task command
    install_commands = __cluster_install_cmd(
        zip_resource_file, gpu_enabled, docker_repo, docker_run_options, file_shares)

    return batch_models.StartTask(
        command_line=helpers.wrap_commands_in_shell(install_commands),
        resource_files=[zip_resource_file],
        environment_settings=all_settings,
        user_identity=POOL_ADMIN_USER_IDENTITY,
        wait_for_success=True,
        max_task_retry_count=2,
    )
def generate_cluster_start_task(
        spark_client,
        zip_resource_file: batch_models.ResourceFile,
        gpu_enabled: bool,
        docker_repo: str = None,
        file_shares: List[aztk_models.FileShare] = None,
        mixed_mode: bool = False):
    """
    Build the start task object for the pool to be created.

    :param spark_client: passed to the helpers that produce the secrets and
        Docker credential environment settings
    :param zip_resource_file: Resource file object pointing to the zip file
        containing scripts to run on the node
    :param gpu_enabled: forwarded to the install command
    :param docker_repo: forwarded to the install command
    :param file_shares: forwarded to the install command
    :param mixed_mode: forwarded to the install command
    :return: ``batch_models.StartTask`` that runs as
        ``POOL_ADMIN_USER_IDENTITY`` and waits for success
    """
    # TODO use certificate
    secrets_env = __get_secrets_env(spark_client)
    spark_env = [
        batch_models.EnvironmentSetting(
            name="SPARK_WEB_UI_PORT", value=constants.DOCKER_SPARK_WEB_UI_PORT),
        batch_models.EnvironmentSetting(
            name="SPARK_WORKER_UI_PORT", value=constants.DOCKER_SPARK_WORKER_UI_PORT),
        batch_models.EnvironmentSetting(
            name="SPARK_JUPYTER_PORT", value=constants.DOCKER_SPARK_JUPYTER_PORT),
        batch_models.EnvironmentSetting(
            name="SPARK_JOB_UI_PORT", value=constants.DOCKER_SPARK_JOB_UI_PORT),
        batch_models.EnvironmentSetting(
            name="SPARK_CONTAINER_NAME", value=constants.DOCKER_SPARK_CONTAINER_NAME),
        batch_models.EnvironmentSetting(
            name="SPARK_SUBMIT_LOGS_FILE", value=constants.SPARK_SUBMIT_LOGS_FILE),
        batch_models.EnvironmentSetting(
            name="SPARK_RSTUDIO_SERVER_PORT",
            value=constants.DOCKER_SPARK_RSTUDIO_SERVER_PORT),
    ]
    docker_env = __get_docker_credentials(spark_client)
    all_settings = secrets_env + spark_env + docker_env

    # start task command
    install_commands = __cluster_install_cmd(
        zip_resource_file, gpu_enabled, docker_repo, file_shares, mixed_mode)

    return batch_models.StartTask(
        command_line=helpers.wrap_commands_in_shell(install_commands),
        resource_files=[zip_resource_file],
        environment_settings=all_settings,
        user_identity=POOL_ADMIN_USER_IDENTITY,
        wait_for_success=True)
def test_batch_update_pools(self, **kwargs):
    """Create a PaaS pool, then exercise upgrade, update, patch, exists, get and delete on it."""
    client = self.create_sharedkey_client(**kwargs)

    # Test Create Paas Pool
    pool_id = self.get_resource_name('batch_paas_')
    cloud_config = models.CloudServiceConfiguration(os_family='5')
    admin_identity = models.UserIdentity(
        auto_user=models.AutoUserSpecification(
            elevation_level=models.ElevationLevel.admin))
    start_task = models.StartTask(
        command_line="cmd.exe /c \"echo hello world\"",
        resource_files=[models.ResourceFile('https://blobsource.com', 'filename.txt')],
        environment_settings=[models.EnvironmentSetting('ENV_VAR', 'env_value')],
        user_identity=admin_identity)
    test_paas_pool = models.PoolAddParameter(
        id=pool_id,
        vm_size='small',
        cloud_service_configuration=cloud_config,
        start_task=start_task)
    self.assertIsNone(client.pool.add(test_paas_pool))

    # Test Upgrade Pool OS
    self.assertBatchError(
        "PoolVersionEqualsUpgradeVersion",
        client.pool.upgrade_os,
        test_paas_pool.id,
        "*")

    # Test Update Pool Parameters
    update_params = models.PoolUpdatePropertiesParameter(
        [], [], [models.MetadataItem('foo', 'bar')])
    self.assertIsNone(client.pool.update_properties(test_paas_pool.id, update_params))

    # Test Patch Pool Parameters
    patch_params = models.PoolPatchParameter(
        metadata=[models.MetadataItem('foo2', 'bar2')])
    self.assertIsNone(client.pool.patch(test_paas_pool.id, patch_params))

    # Test Pool Exists
    self.assertTrue(client.pool.exists(test_paas_pool.id))

    # Test Get Pool
    fetched = client.pool.get(test_paas_pool.id)
    self.assertIsInstance(fetched, models.CloudPool)
    self.assertEqual(fetched.id, test_paas_pool.id)
    self.assertEqual(fetched.state, models.PoolState.active)
    self.assertEqual(fetched.allocation_state, models.AllocationState.steady)
    self.assertEqual(fetched.cloud_service_configuration.os_family, '5')
    self.assertEqual(fetched.vm_size, 'small')
    self.assertIsNone(fetched.start_task)
    self.assertEqual(fetched.metadata[0].name, 'foo2')
    self.assertEqual(fetched.metadata[0].value, 'bar2')

    # Test Get Pool with OData Clauses
    get_options = models.PoolGetOptions(select='id,state', expand='stats')
    filtered = client.pool.get(test_paas_pool.id, get_options)
    self.assertIsInstance(filtered, models.CloudPool)
    self.assertEqual(filtered.id, test_paas_pool.id)
    self.assertEqual(filtered.state, models.PoolState.active)
    self.assertIsNone(filtered.allocation_state)
    self.assertIsNone(filtered.vm_size)

    # Test Delete Pool
    self.assertIsNone(client.pool.delete(test_paas_pool.id))
def create_merge_task(frame, task_id, job_id, depend_start, depend_end):
    """
    Azure Batch task that executes the ImageMagick `convert` command
    line to combine all of the output tiles into a single output image.
    This task uses the task dependency model to make sure it
    doesn't execute before its dependent tasks have completed. This way
    we know all of the output image tiles will exist.

    The tile counts, output container SAS, working directory, output format
    and crop mode are read from the ``X_TILES``, ``Y_TILES``,
    ``OUTPUT_CONTAINER_SAS``, ``AZ_BATCH_TASK_WORKING_DIR``, ``OUTPUT_FORMAT``
    and ``CROP_TO_BORDER`` environment variables.

    :param frame: Frame number of the scene that this merge task is
     processing.
    :type frame: int
    :param task_id: Identifier of the task.
    :type task_id: str
    :param job_id: Unique identifier of the job. Job identifiers are unique
     within a single Azure Batch account.
    :type job_id: str
    :param depend_start: First task id of the dependency sequence. If each
     frame is split into 16 tiles, then every 17th task will be a merge task
     and that merge task will be dependent on the preceding 16 tasks.
     tile tasks 1 - 16, then merge, then tiles 18 - 33, then merge, etc.
    :type depend_start: int
    :param depend_end: Final task id of the dependency sequence. Explanation
     for param `depend_start` applies here as well.
    :type depend_end: int
    :return: the task definition to add to the job
    :rtype: models.TaskAddParameter
    :raises KeyError: if one of the required environment variables is missing
    """
    x_tiles = int(os.environ["X_TILES"])
    # Fixed: the vertical tile count must come from Y_TILES, not X_TILES,
    # otherwise non-square tilings are merged with the wrong geometry.
    y_tiles = int(os.environ["Y_TILES"])
    output_sas = os.environ["OUTPUT_CONTAINER_SAS"]
    working_dir = os.environ["AZ_BATCH_TASK_WORKING_DIR"]
    output_format = os.environ["OUTPUT_FORMAT"]
    print("working_dir: {}".format(working_dir))

    # crop to border means we need to use montage to tile the images. false means
    # we can use convert -flatten to layer the images with transparent backgrounds
    # convert is faster but needs RGBA
    crop = os.environ["CROP_TO_BORDER"].lower()
    if crop == "true":
        command_line = montage_command(frame, x_tiles, y_tiles, output_format)
    else:
        command_line = convert_command(frame, output_format)

    print("merge task command line: {}".format(command_line))
    return models.TaskAddParameter(
        id=pad_number(task_id, PAD_LEN_ID),
        display_name="frame: {} - merge task".format(frame),
        command_line=os_specific_command_line(command_line),
        constraints=models.TaskConstraints(max_task_retry_count=2),
        environment_settings=[
            models.EnvironmentSetting("X_TILES", str(x_tiles)),
            models.EnvironmentSetting("Y_TILES", str(y_tiles))
        ],
        depends_on=models.TaskDependencies(
            task_ids=get_dependent_tasks(depend_start, depend_end)),
        resource_files=get_resource_files(x_tiles, y_tiles, frame),
        output_files=[
            # stdout/stderr are uploaded whenever the task completes
            models.OutputFile(
                file_pattern="../stdout.txt",
                destination=models.OutputFileDestination(
                    container=models.OutputFileBlobContainerDestination(
                        container_url=output_sas,
                        path="{}/logs/frame-{}/merge.stdout.log".format(
                            job_id, pad_number(frame, PAD_LEN_FRAME)))),
                upload_options=models.OutputFileUploadOptions(
                    models.OutputFileUploadCondition.task_completion)),
            models.OutputFile(
                file_pattern="../stderr.txt",
                destination=models.OutputFileDestination(
                    container=models.OutputFileBlobContainerDestination(
                        container_url=output_sas,
                        path="{}/logs/frame-{}/merge.stderr.log".format(
                            job_id, pad_number(frame, PAD_LEN_FRAME)))),
                upload_options=models.OutputFileUploadOptions(
                    models.OutputFileUploadCondition.task_completion)),
            # the merged frame is only uploaded when the task succeeds
            models.OutputFile(
                file_pattern="frame_*",
                destination=models.OutputFileDestination(
                    container=models.OutputFileBlobContainerDestination(
                        container_url=output_sas,
                        path="{}/outputs/final".format(job_id))),
                upload_options=models.OutputFileUploadOptions(
                    models.OutputFileUploadCondition.task_success))
        ])
def create_task(frame, task_id, job_id, tile_num, current_x, current_y):
    """
    Azure Batch task that renders the given tile. Run Blender from the command
    line and pass in the job manager script and the blend file.

    The blend file, output container SAS and optional parameters are read
    from the ``BLEND_FILE``, ``OUTPUT_CONTAINER_SAS`` and ``OPTIONAL_PARAMS``
    environment variables.

    :param frame: Frame number of the scene that this tile task is
     processing.
    :type frame: int
    :param task_id: Identifier of the task.
    :type task_id: str
    :param job_id: Unique identifier of the job. Job identifiers are unique
     within a single Azure Batch account.
    :type job_id: str
    :param tile_num: Number of the current tile.
    :type tile_num: int
    :param current_x: X value of the current tile, used to generate the render
     border.
    :type current_x: int
    :param current_y: Y value of the current tile, used to generate the render
     border.
    :type current_y: int
    """
    blend_file = os.environ["BLEND_FILE"]
    output_sas = os.environ["OUTPUT_CONTAINER_SAS"]
    optional_params = os.environ["OPTIONAL_PARAMS"]
    command_line = blender_command(blend_file, optional_params)

    # only print this once
    if task_id == 1:
        print("tile task command line: {}".format(command_line))

    def blob_destination(blob_path):
        # Destination inside the output container for one uploaded file.
        return models.OutputFileDestination(
            container=models.OutputFileBlobContainerDestination(
                container_url=output_sas, path=blob_path))

    def tile_log(file_pattern, path_template):
        # Log file for this tile, uploaded whenever the task completes.
        return models.OutputFile(
            file_pattern=file_pattern,
            destination=blob_destination(
                path_template.format(
                    job_id,
                    pad_number(frame, PAD_LEN_FRAME),
                    pad_number(tile_num, PAD_LEN_TILE))),
            upload_options=models.OutputFileUploadOptions(
                models.OutputFileUploadCondition.task_completion))

    padded_task_id = pad_number(task_id, PAD_LEN_ID)
    title = "frame: {}, tile: {}".format(frame, tile_num)
    shell_command = os_specific_command_line(command_line)
    retry_policy = models.TaskConstraints(max_task_retry_count=2)

    # values copied straight from this process' environment ...
    passthrough_names = (
        "X_TILES", "Y_TILES", "CROP_TO_BORDER", "OUTPUT_FORMAT", "BLEND_FILE")
    task_env = [
        models.EnvironmentSetting(env_name, os.environ[env_name])
        for env_name in passthrough_names
    ]
    # ... followed by the values specific to this tile
    task_env += [
        models.EnvironmentSetting("CURRENT_FRAME", str(frame)),
        models.EnvironmentSetting("CURRENT_TILE", str(tile_num)),
        models.EnvironmentSetting("CURRENT_X", str(current_x)),
        models.EnvironmentSetting("CURRENT_Y", str(current_y)),
    ]

    task_resources = [
        models.ResourceFile(
            "https://raw.githubusercontent.com/Azure/BatchExplorer-data/master/ncj/blender/scripts/python-task-manager.py",
            "scripts/python-task-manager.py")
    ]

    task_outputs = [
        tile_log("../stdout.txt", "{}/logs/frame-{}/tile-{}.stdout.log"),
        tile_log("../stderr.txt", "{}/logs/frame-{}/tile-{}.stderr.log"),
        tile_log("../fileuploaderr.txt",
                 "{}/logs/frame-{}/tile-{}.file_upload_stderr.log"),
        tile_log("../fileuploadout.txt",
                 "{}/logs/frame-{}/tile-{}.file_upload_stdout.log"),
        # rendered tiles are only uploaded when the task succeeds
        models.OutputFile(
            file_pattern="tile_*",
            destination=blob_destination(
                "{}/outputs/frame-{}".format(
                    job_id, pad_number(frame, PAD_LEN_FRAME))),
            upload_options=models.OutputFileUploadOptions(
                models.OutputFileUploadCondition.task_success)),
    ]

    return models.TaskAddParameter(
        id=padded_task_id,
        display_name=title,
        command_line=shell_command,
        constraints=retry_policy,
        environment_settings=task_env,
        resource_files=task_resources,
        output_files=task_outputs)
def add_task(self, job_id: str, default_max_tries=None):
    """
    Builds a single task from ``self.task_definition`` and adds it to the
    specified job.

    :param str job_id: The ID of the job to which to add the task. Also used
        as the task id when the task definition has no ``id``.
    :param int default_max_tries: Fallback for ``maxTaskRetryCount`` when the
        task definition has a ``constraints`` section that does not set it.
    :return: The Azure Batch task that was submitted.
    """
    from azure.batch import models as batchmodels

    definition = self.task_definition
    task_id = definition.get('id', job_id)
    display_name = definition.get('displayName', task_id)

    logging.info('Adding {} tasks to job [{}]...'.format(task_id, job_id))

    container_settings = batchmodels.TaskContainerSettings(
        image_name=self.image, container_run_options='--rm')

    platform = self.conf[utils.PLATFORM]
    environment_settings = [
        batchmodels.EnvironmentSetting(name='AZURE_SUBSCRIPTION_ID',
                                       value=platform['subscription']),
        batchmodels.EnvironmentSetting(name='AZURE_STORAGE_ACCOUNT',
                                       value=platform['storage_account']),
        batchmodels.EnvironmentSetting(name='AZURE_STORAGE_CONTAINER',
                                       value=platform['storage_container']),
        batchmodels.EnvironmentSetting(name='AZURE_STORAGE_CONNECTION_STRING',
                                       value=platform['storage_connection_string']),
    ]

    # Settings from the task definition are appended after the platform ones.
    extra_settings = definition.get('environmentSettings')
    if extra_settings is not None:
        environment_settings.extend(
            batchmodels.EnvironmentSetting(**setting)
            for setting in extra_settings)

    # Fixed: the original built a 1-tuple here (stray trailing comma) and
    # then indexed ``constraints[0]``, which raised TypeError whenever the
    # definition had no constraints and ``constraints`` was still None.
    constraints = None
    constraint_definition = definition.get('constraints')
    if constraint_definition:
        constraints = batchmodels.TaskConstraints(
            max_wall_clock_time=constraint_definition.get(
                'maxWallClockTime', "P1D"),
            max_task_retry_count=constraint_definition.get(
                'maxTaskRetryCount', default_max_tries),
            retention_time=constraint_definition.get('retentionTime', "P1D"),
        )

    user_identity = batchmodels.UserIdentity(
        auto_user=batchmodels.AutoUserSpecification(
            scope=batchmodels.AutoUserScope.pool,
            elevation_level=batchmodels.ElevationLevel.admin))

    task = batchmodels.TaskAddParameter(
        id=task_id,
        display_name=display_name,
        command_line=definition['commandLine'],
        constraints=constraints,
        container_settings=container_settings,
        environment_settings=environment_settings,
        user_identity=user_identity,
    )

    # Log whatever the model validation reports before submitting.
    for validation in task.validate():
        logging.info(validation)

    self.batch_client.task.add(job_id=job_id, task=task)

    return task
def create_pool(self,
                pool_id,
                vm_size,
                target_dedicated,
                target_low_priority,
                batch_image_spec,
                starttask_cmd,
                starttask_url,
                starttask_script,
                sp_cert_thumb,
                app_licenses=None,
                disable_remote_access=True,
                app_pkgs=None,
                subnet_id=None,
                app_insights_app_key=None,
                app_insights_instrumentation_key=None):
    """
    Build a pool definition and submit it with the batch client.

    :param pool_id: used as both the id and the display name of the pool
    :param vm_size: VM size of the pool nodes
    :param target_dedicated: target number of dedicated nodes
    :param target_low_priority: target number of low priority nodes
    :param batch_image_spec: object whose
        ``get_virtual_machine_configuration()`` supplies the VM configuration
    :param starttask_cmd: command line of the start task
    :param starttask_url: first argument of the start task's resource file
    :param starttask_script: second argument of the start task's resource file
    :param sp_cert_thumb: thumbprint of the referenced certificate ('sha1')
    :param app_licenses: application licenses, set only when truthy
    :param disable_remote_access: when truthy, adds the 'DisableRDP'
        (port 3389) and 'DisableSSH' (port 22) inbound NAT pools, each with
        a deny rule for '*'
    :param app_pkgs: application package references
    :param subnet_id: when truthy, the pool gets a network configuration
        with this subnet
    :param app_insights_app_key: exported as ``APP_INSIGHTS_APP_ID``; only
        used when the instrumentation key is given too
    :param app_insights_instrumentation_key: exported as
        ``APP_INSIGHTS_INSTRUMENTATION_KEY``; only used when the app key is
        given too
    :raises batchmodels.BatchErrorException: re-raised after its details
        have been printed
    """
    vm_configuration = batch_image_spec.get_virtual_machine_configuration()
    pool = batchmodels.PoolAddParameter(
        id=pool_id,
        display_name=pool_id,
        vm_size=vm_size,
        target_dedicated_nodes=target_dedicated,
        target_low_priority_nodes=target_low_priority,
        virtual_machine_configuration=vm_configuration,
        application_package_references=app_pkgs,
        certificate_references=[
            batchmodels.CertificateReference(sp_cert_thumb, 'sha1')
        ])

    if app_licenses:
        pool.application_licenses = app_licenses

    # The start task runs as a pool-scoped admin auto user.
    admin_identity = batchmodels.UserIdentity(
        auto_user=batchmodels.AutoUserSpecification(
            scope=batchmodels.AutoUserScope.pool,
            elevation_level=batchmodels.ElevationLevel.admin))
    start_task = batchmodels.StartTask(
        command_line=starttask_cmd,
        max_task_retry_count=3,
        user_identity=admin_identity,
        wait_for_success=True,
        resource_files=[
            batchmodels.ResourceFile(starttask_url, starttask_script)
        ])
    pool.start_task = start_task

    if app_insights_app_key and app_insights_instrumentation_key:
        start_task.environment_settings = [
            batchmodels.EnvironmentSetting('APP_INSIGHTS_APP_ID',
                                           app_insights_app_key),
            batchmodels.EnvironmentSetting('APP_INSIGHTS_INSTRUMENTATION_KEY',
                                           app_insights_instrumentation_key),
        ]

    if subnet_id:
        pool.network_configuration = batchmodels.NetworkConfiguration(
            subnet_id=subnet_id)

    if disable_remote_access:
        if pool.network_configuration is None:
            pool.network_configuration = batchmodels.NetworkConfiguration()
        deny = batchmodels.NetworkSecurityGroupRuleAccess.deny
        tcp = batchmodels.InboundEndpointProtocol.tcp
        rdp_pool = batchmodels.InboundNATPool(
            'DisableRDP',
            tcp,
            3389,
            60000,
            60099,
            network_security_group_rules=[
                batchmodels.NetworkSecurityGroupRule(150, deny, '*')
            ])
        ssh_pool = batchmodels.InboundNATPool(
            'DisableSSH',
            tcp,
            22,
            61000,
            61099,
            network_security_group_rules=[
                batchmodels.NetworkSecurityGroupRule(151, deny, '*')
            ])
        pool.network_configuration.endpoint_configuration = (
            batchmodels.PoolEndpointConfiguration(
                inbound_nat_pools=[rdp_pool, ssh_pool]))

    try:
        client = self._get_batch_client()
        client.pool.add(pool)
    except batchmodels.BatchErrorException as be:
        # Print the service-side details, then let the caller handle it.
        error = be.error
        if error:
            print('Error creating pool, code={}, message={}'.format(
                error.code, error.message))
            if error.values:
                for detail in error.values:
                    print('Key={}, Value={}'.format(detail.key, detail.value))
        raise
def _get_aztk_environment(cluster_id, worker_on_master, mixed_mode):
    """
    Build the AZTK specific environment settings.

    :param cluster_id: exported as ``AZTK_CLUSTER_ID``
    :param worker_on_master: exported as ``AZTK_WORKER_ON_MASTER`` after
        conversion with ``helpers.bool_env``
    :param mixed_mode: exported as ``AZTK_MIXED_MODE`` after conversion with
        ``helpers.bool_env``
    :return: list of three ``batch_models.EnvironmentSetting`` ordered
        mixed mode, worker on master, cluster id
    """
    return [
        batch_models.EnvironmentSetting(
            name="AZTK_MIXED_MODE", value=helpers.bool_env(mixed_mode)),
        batch_models.EnvironmentSetting(
            name="AZTK_WORKER_ON_MASTER", value=helpers.bool_env(worker_on_master)),
        batch_models.EnvironmentSetting(
            name="AZTK_CLUSTER_ID", value=cluster_id),
    ]