def create_job(client, resource_group, job_name, json_file, location=None, cluster_name=None,
               cluster_resource_group=None, raw=False):
    """Create a Batch AI job from a JSON configuration file.

    :param client: Batch AI management client.
    :param str resource_group: name of the resource group to create the job in.
    :param str job_name: name of the job.
    :param str json_file: path to a JSON file containing JobCreateParameters.
    :param str or None location: location for the job; overrides the config file value.
    :param str or None cluster_name: name of the cluster to run the job on; overrides
        any cluster given in the config file.
    :param str or None cluster_resource_group: resource group of the cluster; defaults
        to the job's resource group.
    :param bool raw: if True, return the raw client response.
    :raise CLIError: if no location or no cluster information was provided.
    """
    with open(json_file) as f:
        json_obj = json.load(f)
        params = _get_deserializer()('JobCreateParameters', json_obj)
    if location:
        params.location = location
    if not params.location:
        raise CLIError('Please provide location for job creation.')
    # If cluster name is specified, find the cluster and use its resource id for the new job.
    if cluster_name is not None:
        if cluster_resource_group is None:
            # The job must be created in the cluster's resource group.
            cluster_resource_group = resource_group
        cluster = client.clusters.get(cluster_resource_group, cluster_name)
        # Fix: pass the id as a keyword argument for consistency with the other
        # ResourceId(id=...) call sites in this file and to stay compatible with
        # SDK model versions that require keyword construction.
        params.cluster = models.ResourceId(id=cluster.id)
    if params.cluster is None:
        raise CLIError(
            'Please provide cluster information via command line or configuration file.'
        )
    return client.jobs.create(resource_group, job_name, params, raw=raw)
def create_cluster(cmd, client,  # pylint: disable=too-many-locals
                   resource_group, cluster_name, json_file=None, location=None, user_name=None,
                   ssh_key=None, password=None, generate_ssh_keys=None, image=None, custom_image=None,
                   use_auto_storage=False, vm_size=None, vm_priority='dedicated', target=None,
                   min_nodes=None, max_nodes=None, subnet=None, nfs_name=None, nfs_resource_group=None,
                   nfs_mount_path='nfs', azure_file_share=None, afs_mount_path='afs', container_name=None,
                   container_mount_path='bfs', account_name=None, account_key=None, setup_task=None,
                   setup_task_output=None):
    """Create a Batch AI cluster.

    Parameters may come from a JSON configuration file (``json_file``) and/or the
    command line; command-line values are applied on top of the file's values.
    File systems given on the command line (NFS, Azure File share, blob container,
    auto-storage) are merged into the cluster's node setup mount volumes.

    :raise CLIError: propagated from the validation helpers (e.g. resource already
        exists, bad subnet, missing mount path).
    """
    if generate_ssh_keys:
        # Generate a key pair on demand and fall back to the default public key path.
        _generate_ssh_keys()
        if ssh_key is None:
            ssh_key = _get_default_ssh_public_key_location()
    _ensure_resource_not_exist(client.clusters, resource_group, cluster_name)
    _verify_subnet(client, subnet, nfs_name, nfs_resource_group or resource_group)
    if json_file:
        # Base parameters come from the user-supplied configuration file.
        with open(json_file) as f:
            json_obj = json.load(f)
            params = _get_deserializer()('ClusterCreateParameters', json_obj)
    else:
        # noinspection PyTypeChecker
        params = models.ClusterCreateParameters()
    if params.node_setup:
        # Fill in storage credentials for mount volumes declared in the config file.
        params.node_setup.mount_volumes = _patch_mount_volumes(
            cmd.cli_ctx, params.node_setup.mount_volumes, account_name, account_key)
    params = _update_user_account_settings(params, user_name, ssh_key, password)
    params.location = location or _get_resource_group_location(cmd.cli_ctx, resource_group)
    params = _update_nodes_information(params, image, custom_image, vm_size, vm_priority,
                                       target, min_nodes, max_nodes)
    # Command-line file systems are merged into (possibly pre-existing) mount volumes.
    if nfs_name or azure_file_share or container_name:
        params.node_setup = params.node_setup or models.NodeSetup()
    mount_volumes = params.node_setup.mount_volumes if params.node_setup else None
    if nfs_name:
        file_server = client.file_servers.get(nfs_resource_group or resource_group, nfs_name)
        mount_volumes = _add_nfs_to_mount_volumes(mount_volumes, file_server.id, nfs_mount_path)
    if azure_file_share:
        mount_volumes = _add_azure_file_share_to_mount_volumes(cmd.cli_ctx, mount_volumes, azure_file_share,
                                                               afs_mount_path, account_name, account_key)
    if container_name:
        mount_volumes = _add_azure_container_to_mount_volumes(cmd.cli_ctx, mount_volumes, container_name,
                                                              container_mount_path, account_name, account_key)
    if use_auto_storage:
        # Auto-storage provisions an account and mounts a well-known share and container.
        auto_storage_account, auto_storage_key = _configure_auto_storage(cmd.cli_ctx, params.location)
        mount_volumes = _add_azure_file_share_to_mount_volumes(
            cmd.cli_ctx, mount_volumes, AUTO_STORAGE_SHARE_NAME, AUTO_STORAGE_SHARE_PATH,
            auto_storage_account, auto_storage_key)
        mount_volumes = _add_azure_container_to_mount_volumes(
            cmd.cli_ctx, mount_volumes, AUTO_STORAGE_CONTAINER_NAME, AUTO_STORAGE_CONTAINER_PATH,
            auto_storage_account, auto_storage_key)
    if mount_volumes:
        if params.node_setup is None:
            params.node_setup = models.NodeSetup()
        params.node_setup.mount_volumes = mount_volumes
    if subnet:
        params.subnet = models.ResourceId(id=subnet)
    if setup_task:
        params = _add_setup_task(setup_task, setup_task_output, params)
    return client.clusters.create(resource_group, cluster_name, params)
def test_experiments_isolation(self, resource_group, location):
    """Verify that deleting an experiment does not affect other experiments,
    workspaces, or clusters."""
    self.client.workspaces.create(resource_group.name, 'first', location).result()
    self.client.workspaces.create(resource_group.name, 'second', location).result()
    # Create a cluster, two experiments and a job in each experiment
    for workspace in ['first', 'second']:
        # target_node_count=0: no nodes are allocated, so submitted jobs stay queued.
        cluster = self.client.clusters.create(
            resource_group.name, workspace, 'cluster',
            parameters=models.ClusterCreateParameters(
                vm_size='STANDARD_D1',
                scale_settings=models.ScaleSettings(
                    manual=models.ManualScaleSettings(
                        target_node_count=0)),
                user_account_settings=models.UserAccountSettings(
                    admin_user_name=helpers.ADMIN_USER_NAME,
                    admin_user_password=helpers.ADMIN_USER_PASSWORD),
                vm_priority='lowpriority')).result()
        for experiment in ['exp1', 'exp2']:
            self.client.experiments.create(resource_group.name, workspace, experiment).result()
            self.client.jobs.create(
                resource_group.name, workspace, experiment, 'job',
                parameters=models.JobCreateParameters(
                    cluster=models.ResourceId(id=cluster.id),
                    node_count=1,
                    std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT',
                    custom_toolkit_settings=models.CustomToolkitSettings(
                        command_line='true'))).result()
    # Delete exp1 in the first workspace
    self.client.experiments.delete(resource_group.name, 'first', 'exp1').result()
    # Ensure the experiment was actually deleted
    self.assertRaises(
        CloudError,
        lambda: self.client.experiments.get(
            resource_group.name, 'first', 'exp1'))
    for workspace in ['first', 'second']:
        # Ensure the clusters are not affected
        self.client.clusters.get(resource_group.name, workspace, 'cluster')
        # Ensure the other experiments are not affected
        for experiment in ['exp1', 'exp2']:
            if workspace == 'first' and experiment == 'exp1':
                continue
            self.client.experiments.get(resource_group.name, workspace, experiment)
            job = self.client.jobs.get(resource_group.name, workspace, experiment, 'job')
            # And check the job are not terminated (still queued, since the
            # cluster has zero nodes).
            self.assertEqual(job.execution_state, models.ExecutionState.queued)
def create_custom_job(client, resource_group, cluster_id, job_name, nodes, cmd,
                      job_preparation_cmd=None, container=None):
    """Submit a custom toolkit job into the default workspace and experiment.

    :param BatchAIManagementClient client: client instance.
    :param str resource_group: resource group name.
    :param str cluster_id: resource Id of the cluster.
    :param str job_name: job name.
    :param int nodes: number of nodes to execute the job.
    :param str cmd: command line to run.
    :param str or None job_preparation_cmd: Job preparation command line.
    :param models.ContainerSettings or None container: container settings to run the job.
    :return models.Job: the created job.
    """
    # Optional job preparation step.
    prep = models.JobPreparation(command_line=job_preparation_cmd) if job_preparation_cmd else None
    # The default experiment must exist before a job can be submitted into it.
    client.experiments.create(resource_group,
                              Helpers.DEFAULT_WORKSPACE_NAME,
                              Helpers.DEFAULT_EXPERIMENT_NAME).result()
    job_params = models.JobCreateParameters(
        cluster=models.ResourceId(id=cluster_id),
        node_count=nodes,
        std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
            Helpers.AZURE_FILES_MOUNTING_PATH),
        output_directories=[
            models.OutputDirectory(
                id=Helpers.JOB_OUTPUT_DIRECTORY_ID,
                path_prefix=Helpers.JOB_OUTPUT_DIRECTORY_PATH,
                path_suffix="files")
        ],
        input_directories=[
            models.InputDirectory(
                id='INPUT',
                path='$AZ_BATCHAI_MOUNT_ROOT/{0}/input'.format(
                    Helpers.AZURE_FILES_MOUNTING_PATH))
        ],
        container_settings=container,
        job_preparation=prep,
        custom_toolkit_settings=models.CustomToolkitSettings(command_line=cmd))
    return client.jobs.create(
        resource_group,
        Helpers.DEFAULT_WORKSPACE_NAME,
        Helpers.DEFAULT_EXPERIMENT_NAME,
        job_name,
        parameters=job_params).result()
def test_job_environment_variables_and_secrets(self, resource_group, location, cluster):
    """Tests that environment variables and secrets are passed to the job and its
    preparation task, and that secret values are not reported back by the server."""
    job_name = 'job'
    job = self.client.jobs.create(
        resource_group.name,
        helpers.DEFAULT_WORKSPACE_NAME,
        helpers.DEFAULT_EXPERIMENT_NAME,
        job_name,
        parameters=models.JobCreateParameters(
            cluster=models.ResourceId(id=cluster.id),
            node_count=1,
            std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                helpers.AZURE_FILES_MOUNTING_PATH),
            environment_variables=[
                models.EnvironmentVariable(name='VARIABLE', value='VALUE')
            ],
            secrets=[
                models.EnvironmentVariableWithSecretValue(
                    name='SECRET_VARIABLE', value='SECRET')
            ],
            # Check that the job preparation has access to env variables and secrets.
            job_preparation=models.JobPreparation(
                command_line='echo $VARIABLE $SECRET_VARIABLE'),
            # Check that the job has access to env variables and secrets.
            custom_toolkit_settings=models.CustomToolkitSettings(
                command_line='echo $VARIABLE $SECRET_VARIABLE'))).result(
    )  # type: models.Job
    self.assertEqual(
        helpers.wait_for_job_completion(self.is_live, self.client,
                                        resource_group.name, job.name,
                                        helpers.MINUTE),
        models.ExecutionState.succeeded)
    # Check that environment variables are reported by the server.
    self.assertEqual(len(job.environment_variables), 1)
    self.assertEqual(job.environment_variables[0].name, 'VARIABLE')
    self.assertEqual(job.environment_variables[0].value, 'VALUE')
    # Check that secrets are reported back by server, but value is not reported.
    self.assertEqual(len(job.secrets), 1)
    self.assertEqual(job.secrets[0].name, 'SECRET_VARIABLE')
    self.assertIsNone(job.secrets[0].value)
    # Check that job and job prep had access to the env variables and secrets.
    helpers.assert_job_files_are(
        self, self.client, resource_group.name, job.name,
        helpers.STANDARD_OUTPUT_DIRECTORY_ID, {
            u'stdout.txt': u'VALUE SECRET\n',
            u'stderr.txt': u'',
            u'stdout-job_prep.txt': u'VALUE SECRET\n',
            u'stderr-job_prep.txt': u''
        })
def create_job(config, cluster_id, workspace, experiment, job_name, image_name, command, number_of_vms=1):
    """Create and submit a containerized custom toolkit job.

    The job reads its script and dataset from the configured file share and
    writes models and notebooks back to it.
    """
    # All paths live under the configured file share mount point.
    share_root = '$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(config.fileshare_mount_point)
    script_dir = models.InputDirectory(
        id='SCRIPT',
        path='{0}/{1}'.format(share_root, job_name))
    dataset_dir = models.InputDirectory(
        id='DATASET',
        path='{0}/{1}'.format(share_root, 'data'))
    model_dir = models.OutputDirectory(
        id='MODEL', path_prefix=share_root, path_suffix="models")
    notebooks_dir = models.OutputDirectory(
        id='NOTEBOOKS', path_prefix=share_root, path_suffix="notebooks")
    job_params = models.JobCreateParameters(
        location=config.location,
        cluster=models.ResourceId(id=cluster_id),
        node_count=number_of_vms,
        input_directories=[script_dir, dataset_dir],
        std_out_err_path_prefix=share_root,
        output_directories=[model_dir, notebooks_dir],
        container_settings=models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(image=image_name)),
        custom_toolkit_settings=models.CustomToolkitSettings(command_line=command))
    batchai_client = client_from(config)
    batchai_client.jobs.create(config.group_name, workspace, experiment, job_name, job_params)
def create_resource(self, name, **kwargs):
    """Create a Batch AI cluster for the test (or a fake stub in playback mode)."""
    if not self.is_live:
        # Playback mode: a stub cluster with a fake id is sufficient.
        self.resource = models.Cluster()
        self.resource.id = models.ResourceId(id='fake')
        return {self.parameter_name: self.resource}
    self.client = create_batchai_client(self)
    group = self._get_resource_group(**kwargs)
    storage_name = self._get_storage_account(**kwargs).name
    storage_key = self._get_storage_account_key(**kwargs)
    self.resource = create_cluster(
        self.client, self.location, group.name, name,
        self.vm_size, self.target_nodes, storage_name, storage_key)
    if self.wait:
        # Block until the requested number of nodes is up.
        wait_for_nodes(self.is_live, self.client, group.name, name,
                       self.target_nodes, NODE_STARTUP_TIMEOUT_SEC)
    return {self.parameter_name: self.resource}
def create_file_server(cmd, client, resource_group, file_server_name, json_file=None, vm_size=None,
                       location=None, user_name=None, ssh_key=None, password=None, generate_ssh_keys=None,
                       disk_count=None, disk_size=None, caching_type=None, storage_sku=None, subnet=None,
                       raw=False):
    """Create a Batch AI file server.

    Parameters may come from a JSON configuration file (``json_file``) and/or the
    command line; command-line values are applied on top of the file's values and
    the merged result is validated (disk size, disk count, storage sku, VM size
    are all required one way or the other).

    :raise CLIError: if the resource already exists or a required parameter is
        missing after merging, or if the subnet resource id is malformed.
    """
    if generate_ssh_keys:
        # Generate a key pair on demand and fall back to the default public key path.
        _generate_ssh_keys()
        if ssh_key is None:
            ssh_key = _get_default_ssh_public_key_location()
    _ensure_resource_not_exist(client, resource_group, file_server_name)
    if json_file:
        with open(json_file) as f:
            json_obj = json.load(f)
            params = _get_deserializer()('FileServerCreateParameters', json_obj)
    else:
        # noinspection PyTypeChecker
        params = models.FileServerCreateParameters()
    params = _update_user_account_settings(params, user_name, ssh_key, password)
    params.location = location or _get_resource_group_location(cmd.cli_ctx, resource_group)
    if not params.data_disks:
        # noinspection PyTypeChecker
        params.data_disks = models.DataDisks()
    # Each setting below: command-line value overrides the config file, and the
    # merged result must be present.
    if disk_size:
        params.data_disks.disk_size_in_gb = disk_size
    if not params.data_disks.disk_size_in_gb:
        raise CLIError('Please provide disk size in Gb.')
    if disk_count:
        params.data_disks.disk_count = disk_count
    if not params.data_disks.disk_count:
        raise CLIError('Please provide number of data disks (at least one disk is required).')
    if caching_type:
        params.data_disks.caching_type = caching_type
    if storage_sku:
        params.data_disks.storage_account_type = storage_sku
    if not params.data_disks.storage_account_type:
        raise CLIError('Please provide storage account type (storage sku).')
    if vm_size:
        params.vm_size = vm_size
    if not params.vm_size:
        raise CLIError('Please provide VM size.')
    if subnet:
        if not is_valid_resource_id(subnet):
            raise CLIError('Ill-formed subnet resource id')
        params.subnet = models.ResourceId(id=subnet)
    return client.create(resource_group, file_server_name, params, raw=raw)
def _add_nfs_to_mount_volumes(volumes, file_server_id, mount_path):
    """Adds NFS to the mount volumes.

    :param models.MountVolumes or None volumes: existing mount volumes.
    :param str file_server_id: resource id of the file server.
    :param str mount_path: relative mount path for the file server.
    :return models.MountVolumes: updated mount volumes (the input is not mutated).
    :raise CLIError: if mount_path is empty.
    """
    # Validate before doing any work (previously the deepcopy ran first).
    if not mount_path:
        raise CLIError('File server relative mount path cannot be empty.')
    # Work on a copy so the caller's object is left untouched.
    result = copy.deepcopy(volumes) if volumes else models.MountVolumes()
    if result.file_servers is None:
        result.file_servers = []
    result.file_servers.append(models.FileServerReference(
        relative_mount_path=mount_path,
        file_server=models.ResourceId(id=file_server_id),
        mount_options="rw"))
    return result
def add_nfs_to_cluster_create_parameters(params, file_server_id, mount_path):
    """Adds NFS to the cluster create parameters.

    Mutates ``params`` in place, creating the node setup / mount volumes
    containers if they do not exist yet.

    :param model.ClusterCreateParameters params: cluster create parameters.
    :param str file_server_id: resource id of the file server.
    :param str mount_path: relative mount path for the file server.
    :raise CLIError: if mount_path is empty.
    """
    if not mount_path:
        raise CLIError('File server relative mount path cannot be empty.')
    if params.node_setup is None:
        params.node_setup = models.NodeSetup()
    if params.node_setup.mount_volumes is None:
        params.node_setup.mount_volumes = models.MountVolumes()
    if params.node_setup.mount_volumes.file_servers is None:
        params.node_setup.mount_volumes.file_servers = []
    params.node_setup.mount_volumes.file_servers.append(
        models.FileServerReference(
            relative_mount_path=mount_path,
            # Fix: pass the id as a keyword argument, matching the sibling helper
            # _add_nfs_to_mount_volumes and SDK models requiring keyword construction.
            file_server=models.ResourceId(id=file_server_id),
            mount_options="rw"))
def create_resource(self, name, **kwargs):
    """Create a Batch AI cluster plus the default workspace/experiment for the
    test (or a fake stub when not running live)."""
    if not self.is_live:
        # Playback mode: a stub cluster with a fake id is sufficient.
        self.resource = models.Cluster()
        self.resource.id = models.ResourceId(id='fake')
        return {self.parameter_name: self.resource}
    self.client = Helpers.create_batchai_client(self)
    group = self._get_resource_group(**kwargs)
    self.resource = Helpers.create_cluster(
        self.client, self.location, group.name, name,
        self.vm_size, self.target_nodes,
        self._get_storage_account(**kwargs).name,
        self._get_storage_account_key(**kwargs))
    # Jobs are submitted into the default workspace/experiment, so create it now.
    self.client.experiments.create(
        group.name, Helpers.DEFAULT_WORKSPACE_NAME,
        Helpers.DEFAULT_EXPERIMENT_NAME).result()
    if self.wait:
        # Block until the requested number of nodes is up.
        Helpers.wait_for_nodes(self.is_live, self.client, group.name, name,
                               self.target_nodes, Helpers.NODE_STARTUP_TIMEOUT_SEC)
    return {self.parameter_name: self.resource}
def create_job(cmd, client, resource_group, job_name, json_file, location=None, cluster_name=None,
               cluster_resource_group=None, nfs_name=None, nfs_resource_group=None, nfs_mount_path='nfs',
               azure_file_share=None, afs_mount_path='afs', container_name=None, container_mount_path='bfs',
               account_name=None, account_key=None):
    """Create a Batch AI job from a JSON configuration file.

    Cluster and job-level file systems (NFS, Azure File share, blob container)
    given on the command line override / extend the values from the config file.

    :raise CLIError: if the job already exists or no cluster information was
        provided either on the command line or in the config file.
    """
    _ensure_resource_not_exist(client.jobs, resource_group, job_name)
    with open(json_file) as f:
        json_obj = json.load(f)
        params = _get_deserializer()('JobCreateParameters', json_obj)  # type: models.JobCreateParameters
    params.location = location or _get_resource_group_location(cmd.cli_ctx, resource_group)
    # If cluster name is specified, find the cluster and use its resource id for the new job.
    if cluster_name is not None:
        if cluster_resource_group is None:
            # The job must be created in the cluster's resource group.
            cluster_resource_group = resource_group
        cluster = client.clusters.get(cluster_resource_group, cluster_name)
        params.cluster = models.ResourceId(id=cluster.id)
    if params.cluster is None:
        raise CLIError('Please provide cluster information via command line or configuration file.')
    if params.mount_volumes:
        # Fill in storage credentials for mount volumes declared in the config file.
        params.mount_volumes = _patch_mount_volumes(
            cmd.cli_ctx, params.mount_volumes, account_name, account_key)
    # Add file systems specified via command line into mount volumes
    if nfs_name or azure_file_share or container_name:
        params.mount_volumes = params.mount_volumes or models.MountVolumes()
    mount_volumes = params.mount_volumes
    if nfs_name:
        file_server = client.file_servers.get(nfs_resource_group or resource_group, nfs_name)
        mount_volumes = _add_nfs_to_mount_volumes(mount_volumes, file_server.id, nfs_mount_path)
    if azure_file_share:
        mount_volumes = _add_azure_file_share_to_mount_volumes(cmd.cli_ctx, mount_volumes, azure_file_share,
                                                               afs_mount_path, account_name, account_key)
    if container_name:
        mount_volumes = _add_azure_container_to_mount_volumes(cmd.cli_ctx, mount_volumes, container_name,
                                                              container_mount_path, account_name, account_key)
    if mount_volumes:
        params.mount_volumes = mount_volumes
    return client.jobs.create(resource_group, job_name, params)
def submit_job(config, pretrained_model_type, retraining_type, output_model_name, num_epochs):
    """Defines and submits a retraining job. Does not check for completion.

    :param config: configuration object holding the bait_* cluster/region settings.
    :param str pretrained_model_type: 'alexnet' or 'resnet18'.
    :param str retraining_type: retraining type forwarded to the training script.
    :param output_model_name: unused; kept for backward compatibility with callers.
    :param num_epochs: number of training epochs forwarded to the training script.
    :return str: the generated job name.
    :raise ValueError: if pretrained_model_type is not supported.
    """
    client = get_client(config)
    # Timestamped job name keeps repeated submissions unique.
    job_name = 'job{}'.format(
        datetime.datetime.utcnow().strftime('%m_%d_%H_%M_%S'))
    cluster = client.clusters.get(config.bait_resource_group_name,
                                  config.bait_cluster_name)
    # Define the command line arguments to the retraining script
    command_line_args = '--input_dir $AZ_BATCHAI_INPUT_TRAININGDATA ' + \
        '--validation_dir $AZ_BATCHAI_INPUT_VALIDATIONDATA ' + \
        '--output_dir $AZ_BATCHAI_OUTPUT_MODEL ' + \
        '--num_epochs {} '.format(num_epochs) + \
        '--retraining_type {} '.format(retraining_type) + \
        '--model_type {} '.format(pretrained_model_type) + \
        '--model_filename $AZ_BATCHAI_INPUT_PRETRAINEDMODELS/'
    if pretrained_model_type == 'alexnet':
        command_line_args += 'AlexNet.model'
    elif pretrained_model_type == 'resnet18':
        command_line_args += 'ResNet_18.model'
    else:
        # Fix: previously an unknown model type silently submitted a job with a
        # dangling --model_filename argument; fail fast instead.
        raise ValueError(
            'Unsupported pretrained model type: {}'.format(pretrained_model_type))
    # Define the job
    cntk_settings = tm.CNTKsettings(
        language_type='python',
        python_script_file_path='$AZ_BATCHAI_INPUT_SCRIPT/' +
        'retrain_model_distributed.py',
        command_line_args=command_line_args,
        process_count=config.bait_vms_per_job)  # NC6s -- one GPU per VM
    job_create_params = tm.job_create_parameters.JobCreateParameters(
        location=config.bait_region,
        # Fix: pass the id as a keyword argument; newer SDK model constructors
        # require keyword construction for ResourceId.
        cluster=tm.ResourceId(id=cluster.id),
        node_count=config.bait_vms_per_job,
        std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/afs',
        output_directories=[
            tm.OutputDirectory(id='MODEL',
                               path_prefix='$AZ_BATCHAI_MOUNT_ROOT/afs')
        ],
        input_directories=[
            tm.InputDirectory(id='SCRIPT',
                              path='$AZ_BATCHAI_MOUNT_ROOT/afs/scripts'),
            tm.InputDirectory(
                id='PRETRAINEDMODELS',
                path='$AZ_BATCHAI_MOUNT_ROOT/afs/pretrainedmodels'),
            tm.InputDirectory(
                id='TRAININGDATA',
                path='$AZ_BATCHAI_MOUNT_ROOT/nfs/training_images'),
            tm.InputDirectory(
                id='VALIDATIONDATA',
                path='$AZ_BATCHAI_MOUNT_ROOT/nfs/validation_images')
        ],
        cntk_settings=cntk_settings)
    # Submit the job
    client.jobs.create(
        resource_group_name=config.bait_resource_group_name,
        job_name=job_name,
        parameters=job_create_params)
    return job_name
# Time range to process, taken from the command line.
ts_from = sys.argv[1]
ts_to = sys.argv[2]
# Job fan-out configuration comes from the parsed config object `j`.
device_ids = j['device_ids']
tags = j['tags']
job_name_template = j['job_name']
credentials = ServicePrincipalCredentials(client_id=CLIENT, secret=KEY, tenant=TENANT_ID)
batchai_client = batchai.BatchAIManagementClient(
    credentials=credentials, subscription_id=subscription_id)
cluster = batchai_client.clusters.get(resource_group_name, cluster_name)
# run an async job for each sensor
for device_id in device_ids:
    for tag in tags:
        job_name = job_name_template.format(device_id, tag)
        custom_settings = baimodels.CustomToolkitSettings(
            command_line=command_line.format(device_id, tag, ts_from, ts_to, config_file_path))
        print('command line: ' + custom_settings.command_line)
        params = baimodels.job_create_parameters.JobCreateParameters(
            location=location,
            # Fix: pass the id as a keyword argument; newer SDK model constructors
            # require keyword construction for ResourceId.
            cluster=baimodels.ResourceId(id=cluster.id),
            node_count=node_count,
            std_out_err_path_prefix=std_out_err_path_prefix,
            custom_toolkit_settings=custom_settings)
        batchai_client.jobs.create(resource_group_name, job_name, params)
# define grid of tuned hyperparameters param_specs = [ DiscreteParameter(parameter_name="LATENT_DIM", values=[5, 10, 15]), DiscreteParameter(parameter_name="HIDDEN_LAYERS", values=[1, 2, 3]), DiscreteParameter(parameter_name="BATCH_SIZE", values=[8, 16, 32]), DiscreteParameter(parameter_name="T", values=[72, 168, 336]), DiscreteParameter(parameter_name="LEARNING_RATE", values=[0.01, 0.001, 0.0001]), DiscreteParameter(parameter_name="ALPHA", values=[0.1, 0.001, 0]) ] parameters = ParameterSweep(param_specs) # create a template for Batch AI job jcp = models.JobCreateParameters( cluster=models.ResourceId(id=cluster.id), node_count=1, std_out_err_path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/logs', output_directories=[ models.OutputDirectory(id='ALL', path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/output') ], custom_toolkit_settings=models.CustomToolkitSettings( command_line= 'python $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts/FF_multi_step_multivariate.py \ --scriptdir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts \ --datadir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/data \ --outdir $AZ_BATCHAI_OUTPUT_ALL \ -l {0} -n {1} -b {2} -T {3} -r {4} -a {5}'.format( parameters['LATENT_DIM'], parameters['HIDDEN_LAYERS'], parameters['BATCH_SIZE'], parameters['T'],
def test_job_level_mounting(self, resource_group, location, cluster, storage_account, storage_account_key):
    """Tests if it's possible to mount external file systems for a job."""
    job_name = 'job'
    # Create file share and container to mount on the job level
    if storage_account.name != FAKE_STORAGE.name:
        files = FileService(storage_account.name, storage_account_key)
        files.create_share('jobshare', fail_on_exist=False)
        blobs = BlockBlobService(storage_account.name, storage_account_key)
        blobs.create_container('jobcontainer', fail_on_exist=False)
    job = self.client.jobs.create(
        resource_group.name,
        job_name,
        parameters=models.JobCreateParameters(
            location=location,
            cluster=models.ResourceId(id=cluster.id),
            node_count=1,
            # Mount an Azure File share and a blobfuse container at job level.
            mount_volumes=models.MountVolumes(
                azure_file_shares=[
                    models.AzureFileShareReference(
                        account_name=storage_account.name,
                        azure_file_url='https://{0}.file.core.windows.net/{1}'.format(
                            storage_account.name, 'jobshare'),
                        relative_mount_path='job_afs',
                        credentials=models.AzureStorageCredentialsInfo(
                            account_key=storage_account_key),
                    )
                ],
                azure_blob_file_systems=[
                    models.AzureBlobFileSystemReference(
                        account_name=storage_account.name,
                        container_name='jobcontainer',
                        relative_mount_path='job_bfs',
                        credentials=models.AzureStorageCredentialsInfo(
                            account_key=storage_account_key),
                    )
                ]),
            # Put standard output on cluster level AFS to check that the job has access to it.
            std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                AZURE_FILES_MOUNTING_PATH),
            # Create two output directories on job level AFS and blobfuse.
            output_directories=[
                models.OutputDirectory(
                    id='OUTPUT1',
                    path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/job_afs'),
                models.OutputDirectory(
                    id='OUTPUT2',
                    path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/job_bfs')
            ],
            # Check that the job preparation has access to job level file systems.
            job_preparation=models.JobPreparation(
                command_line='echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/prep_afs.txt; '
                             'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/prep_bfs.txt; '
                             'echo done'),
            # Check that the job has access to job level file systems.
            custom_toolkit_settings=models.CustomToolkitSettings(
                command_line='echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/job_afs.txt; '
                             'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/job_bfs.txt; '
                             'mkdir $AZ_BATCHAI_OUTPUT_OUTPUT1/afs; '
                             'echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/afs/job_afs.txt; '
                             'mkdir $AZ_BATCHAI_OUTPUT_OUTPUT2/bfs; '
                             'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/bfs/job_bfs.txt; '
                             'echo done'))).result()
    self.assertEqual(
        wait_for_job_completion(self.is_live, self.client, resource_group.name,
                                job.name, MINUTE),
        models.ExecutionState.succeeded)
    job = self.client.jobs.get(resource_group.name, job.name)
    # Assert job and job prep standard output is populated on cluster level filesystem
    assert_job_files_are(
        self, self.client, resource_group.name, job.name,
        STANDARD_OUTPUT_DIRECTORY_ID, {
            u'stdout.txt': u'done\n',
            u'stderr.txt': u'',
            u'stdout-job_prep.txt': u'done\n',
            u'stderr-job_prep.txt': u''
        })
    # Assert files are generated on job level AFS
    assert_job_files_are(self, self.client, resource_group.name, job.name,
                         'OUTPUT1', {
                             u'job_afs.txt': u'afs\n',
                             u'prep_afs.txt': u'afs\n',
                             u'afs': None
                         })
    # Assert files are generated on job level blobfuse
    assert_job_files_are(self, self.client, resource_group.name, job.name,
                         'OUTPUT2', {
                             u'job_bfs.txt': u'bfs\n',
                             u'prep_bfs.txt': u'bfs\n',
                             u'bfs': None
                         })
    # Assert subfolders are available via API
    assert_job_files_in_path_are(self, self.client, resource_group.name,
                                 job.name, 'OUTPUT1', 'afs',
                                 {u'job_afs.txt': u'afs\n'})
    assert_job_files_in_path_are(self, self.client, resource_group.name,
                                 job.name, 'OUTPUT2', 'bfs',
                                 {u'job_bfs.txt': u'bfs\n'})
    # Assert that we can access the output files created on job level mount volumes directly in storage using path
    # segment returned by the server.
    if storage_account.name != FAKE_STORAGE.name:
        files = FileService(storage_account.name, storage_account_key)
        self.assertTrue(
            files.exists(
                'jobshare',
                job.job_output_directory_path_segment + '/' + OUTPUT_DIRECTORIES_FOLDER_NAME,
                'job_afs.txt'))
        blobs = BlockBlobService(storage_account.name, storage_account_key)
        self.assertTrue(
            blobs.exists(
                'jobcontainer',
                job.job_output_directory_path_segment + '/' +
                OUTPUT_DIRECTORIES_FOLDER_NAME + '/job_bfs.txt'))
    # After the job is done the filesystems should be unmounted automatically, check this by submitting a new job.
    checker = self.client.jobs.create(
        resource_group.name,
        'checker',
        parameters=models.JobCreateParameters(
            location=location,
            cluster=models.ResourceId(id=cluster.id),
            node_count=1,
            std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                AZURE_FILES_MOUNTING_PATH),
            custom_toolkit_settings=models.CustomToolkitSettings(
                command_line='echo job; df | grep -E "job_bfs|job_afs"'))
    ).result()
    # Check the job failed because there are not job level mount volumes anymore
    self.assertEqual(
        wait_for_job_completion(self.is_live, self.client, resource_group.name,
                                checker.name, MINUTE),
        models.ExecutionState.failed)
    # Check that the cluster level AFS was still mounted
    assert_job_files_are(self, self.client, resource_group.name,
                         checker.name, STANDARD_OUTPUT_DIRECTORY_ID, {
                             u'stdout.txt': u'job\n',
                             u'stderr.txt': u''
                         })
def test_file_server(self, resource_group, location, storage_account, storage_account_key):
    """Tests file server functionality

    1. Create file server
    2. Create two clusters with this file server
    3. Check that the file server is mounted:
       a. submit tasks (one from host and another from container) on the first
          cluster to write data to nfs
       b. submit a task on the second cluster to read the data from nfs
    """
    server = create_file_server(
        self.client, location, resource_group.name,
        self.file_server_name)  # type: models.FileServer
    cluster1 = create_cluster(
        self.client, location, resource_group.name, 'cluster1', 'STANDARD_D1', 1,
        storage_account.name, storage_account_key,
        file_servers=[
            models.FileServerReference(
                file_server=models.ResourceId(id=server.id),
                relative_mount_path='nfs',
                mount_options="rw")
        ])
    cluster2 = create_cluster(
        self.client, location, resource_group.name, 'cluster2', 'STANDARD_D1', 1,
        storage_account.name, storage_account_key,
        file_servers=[
            models.FileServerReference(
                file_server=models.ResourceId(id=server.id),
                relative_mount_path='nfs',
                mount_options="rw")
        ])
    # Verify the file server is reported.
    assert_existing_file_servers_are(self, self.client, resource_group.name,
                                     [self.file_server_name])
    # Verify the file server become available in a reasonable time
    self.assertTrue(
        wait_for_file_server(self.is_live, self.client, resource_group.name,
                             self.file_server_name,
                             _FILE_SERVER_CREATION_TIMEOUT_SEC))
    # Verify the remote login information and private ip are reported
    server = self.client.file_servers.get(
        resource_group.name, self.file_server_name)  # type: models.FileServer
    self.assertRegexpMatches(server.mount_settings.file_server_public_ip,
                             RE_ID_ADDRESS)
    self.assertRegexpMatches(server.mount_settings.file_server_internal_ip,
                             RE_ID_ADDRESS)
    # Verify the clusters allocated nodes successfully
    self.assertEqual(
        wait_for_nodes(self.is_live, self.client, resource_group.name,
                       'cluster1', 1, NODE_STARTUP_TIMEOUT_SEC), 1)
    self.assertEqual(
        wait_for_nodes(self.is_live, self.client, resource_group.name,
                       'cluster2', 1, NODE_STARTUP_TIMEOUT_SEC), 1)
    # Execute publishing tasks on the first cluster
    job1 = create_custom_job(
        self.client, resource_group.name, location, cluster1.id,
        'host_publisher', 1,
        'echo hi from host > $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt')
    self.assertEqual(
        wait_for_job_completion(self.is_live, self.client, resource_group.name,
                                job1.name, MINUTE),
        models.ExecutionState.succeeded)
    job2 = create_custom_job(
        self.client, resource_group.name, location, cluster1.id,
        'container_publisher', 1,
        'echo hi from container >> $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt',
        container=models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(
                image="ubuntu")))
    self.assertEqual(
        wait_for_job_completion(self.is_live, self.client, resource_group.name,
                                job2.name, MINUTE),
        models.ExecutionState.succeeded)
    # Execute consumer task on the second cluster
    job3 = create_custom_job(
        self.client, resource_group.name, location, cluster2.id,
        'consumer', 1,
        'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt; '
        'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt')
    self.assertEqual(
        wait_for_job_completion(self.is_live, self.client, resource_group.name,
                                job3.name, MINUTE),
        models.ExecutionState.succeeded)
    # Verify the data
    assert_job_files_are(
        self, self.client, resource_group.name, job3.name,
        STANDARD_OUTPUT_DIRECTORY_ID, {
            u'stdout.txt': u'hi from host\nhi from container\n',
            u'stderr.txt': ''
        })
    # Delete clusters
    self.client.clusters.delete(resource_group.name, 'cluster1').result()
    self.client.clusters.delete(resource_group.name, 'cluster2').result()
    # Test deletion
    self.client.file_servers.delete(resource_group.name,
                                    self.file_server_name).result()
    assert_existing_file_servers_are(self, self.client, resource_group.name, [])