Example #1
 def test_experiments_isolation(self, resource_group, location):
     self.client.workspaces.create(resource_group.name, 'first',
                                   location).result()
     self.client.workspaces.create(resource_group.name, 'second',
                                   location).result()
     # Create a cluster, two experiments and a job in each experiment
     for workspace in ['first', 'second']:
         cluster = self.client.clusters.create(
             resource_group.name,
             workspace,
             'cluster',
             parameters=models.ClusterCreateParameters(
                 vm_size='STANDARD_D1',
                 scale_settings=models.ScaleSettings(
                     manual=models.ManualScaleSettings(
                         target_node_count=0)),
                 user_account_settings=models.UserAccountSettings(
                     admin_user_name=helpers.ADMIN_USER_NAME,
                     admin_user_password=helpers.ADMIN_USER_PASSWORD),
                 vm_priority='lowpriority')).result()
         for experiment in ['exp1', 'exp2']:
             self.client.experiments.create(resource_group.name, workspace,
                                            experiment).result()
             self.client.jobs.create(
                 resource_group.name,
                 workspace,
                 experiment,
                 'job',
                 parameters=models.JobCreateParameters(
                     cluster=models.ResourceId(id=cluster.id),
                     node_count=1,
                     std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT',
                     custom_toolkit_settings=models.CustomToolkitSettings(
                         command_line='true'))).result()
     # Delete exp1 in the first workspace
     self.client.experiments.delete(resource_group.name, 'first',
                                    'exp1').result()
     # Ensure the experiment was actually deleted
     self.assertRaises(
         CloudError, lambda: self.client.experiments.get(
             resource_group.name, 'first', 'exp1'))
     for workspace in ['first', 'second']:
         # Ensure the clusters are not affected
         self.client.clusters.get(resource_group.name, workspace, 'cluster')
         # Ensure the other experiments are not affected
         for experiment in ['exp1', 'exp2']:
             if workspace == 'first' and experiment == 'exp1':
                 continue
             self.client.experiments.get(resource_group.name, workspace,
                                         experiment)
             job = self.client.jobs.get(resource_group.name, workspace,
                                        experiment, 'job')
             # And check that the jobs are not terminated
             self.assertEqual(job.execution_state,
                              models.ExecutionState.queued)
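The test above relies on imports the snippet does not show. A minimal sketch of the surrounding module's imports, assuming the Azure Batch AI management SDK and the suite's helpers module:

import azure.mgmt.batchai.models as models
from msrestazure.azure_exceptions import CloudError  # raised when fetching the deleted experiment

import helpers  # test utilities providing ADMIN_USER_NAME, ADMIN_USER_PASSWORD, etc.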
Example #2
    def create_custom_job(client,
                          resource_group,
                          cluster_id,
                          job_name,
                          nodes,
                          cmd,
                          job_preparation_cmd=None,
                          container=None):
        """Creates custom toolkit job

        :param BatchAIManagementClient client: client instance.
        :param str resource_group: resource group name.
        :param str cluster_id: resource Id of the cluster.
        :param str job_name: job name.
        :param int nodes: number of nodes to execute the job.
        :param str cmd: command line to run.
        :param str or None job_preparation_cmd: Job preparation command line.
        :param models.ContainerSettings or None container: container settings to run the job.
        :return models.Job: the created job.
        """
        job_preparation = None
        if job_preparation_cmd:
            job_preparation = models.JobPreparation(
                command_line=job_preparation_cmd)
        client.experiments.create(resource_group,
                                  Helpers.DEFAULT_WORKSPACE_NAME,
                                  Helpers.DEFAULT_EXPERIMENT_NAME).result()
        return client.jobs.create(
            resource_group,
            Helpers.DEFAULT_WORKSPACE_NAME,
            Helpers.DEFAULT_EXPERIMENT_NAME,
            job_name,
            parameters=models.JobCreateParameters(
                cluster=models.ResourceId(id=cluster_id),
                node_count=nodes,
                std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                    Helpers.AZURE_FILES_MOUNTING_PATH),
                output_directories=[
                    models.OutputDirectory(
                        id=Helpers.JOB_OUTPUT_DIRECTORY_ID,
                        path_prefix=Helpers.JOB_OUTPUT_DIRECTORY_PATH,
                        path_suffix="files")
                ],
                input_directories=[
                    models.InputDirectory(
                        id='INPUT',
                        path='$AZ_BATCHAI_MOUNT_ROOT/{0}/input'.format(
                            Helpers.AZURE_FILES_MOUNTING_PATH))
                ],
                container_settings=container,
                job_preparation=job_preparation,
                custom_toolkit_settings=models.CustomToolkitSettings(
                    command_line=cmd))).result()
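A hypothetical call to this helper (in the original suite it appears to live on a Helpers class); every literal value below is a placeholder, not taken from the original code:

# Usage sketch with illustrative values only.
job = create_custom_job(
    client,
    'my-resource-group',
    cluster.id,
    'hello-job',
    nodes=1,
    cmd='echo hello',
    job_preparation_cmd='echo preparing')
print(job.name, job.execution_state)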
Example #3
 def test_job_environment_variables_and_secrets(self, resource_group,
                                                location, cluster):
     """Tests if it's possible to mount external file systems for a job."""
     job_name = 'job'
     job = self.client.jobs.create(
         resource_group.name,
         helpers.DEFAULT_WORKSPACE_NAME,
         helpers.DEFAULT_EXPERIMENT_NAME,
         job_name,
         parameters=models.JobCreateParameters(
             cluster=models.ResourceId(id=cluster.id),
             node_count=1,
             std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                 helpers.AZURE_FILES_MOUNTING_PATH),
             environment_variables=[
                 models.EnvironmentVariable(name='VARIABLE', value='VALUE')
             ],
             secrets=[
                 models.EnvironmentVariableWithSecretValue(
                     name='SECRET_VARIABLE', value='SECRET')
             ],
             # Check that the job preparation has access to env variables and secrets.
             job_preparation=models.JobPreparation(
                 command_line='echo $VARIABLE $SECRET_VARIABLE'),
             # Check that the job has access to env variables and secrets.
             custom_toolkit_settings=models.CustomToolkitSettings(
                 command_line='echo $VARIABLE $SECRET_VARIABLE'))).result()  # type: models.Job
     self.assertEqual(
         helpers.wait_for_job_completion(self.is_live, self.client,
                                         resource_group.name, job.name,
                                         helpers.MINUTE),
         models.ExecutionState.succeeded)
     # Check that environment variables are reported by the server.
     self.assertEqual(len(job.environment_variables), 1)
     self.assertEqual(job.environment_variables[0].name, 'VARIABLE')
     self.assertEqual(job.environment_variables[0].value, 'VALUE')
     # Check that secrets are reported back by the server, but the value is not.
     self.assertEqual(len(job.secrets), 1)
     self.assertEqual(job.secrets[0].name, 'SECRET_VARIABLE')
     self.assertIsNone(job.secrets[0].value)
     # Check that job and job prep had access to the env variables and secrets.
     helpers.assert_job_files_are(
         self, self.client, resource_group.name, job.name,
         helpers.STANDARD_OUTPUT_DIRECTORY_ID, {
             u'stdout.txt': u'VALUE SECRET\n',
             u'stderr.txt': u'',
             u'stdout-job_prep.txt': u'VALUE SECRET\n',
             u'stderr-job_prep.txt': u''
         })
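helpers.wait_for_job_completion is part of the test harness and is not shown here. A minimal polling sketch built on the public jobs.get API (the timeout and interval values are illustrative):

import time

def wait_for_job_completion_sketch(client, resource_group, job_name,
                                   timeout_sec=15 * 60, poll_sec=10):
    # Poll until the job leaves the queued/running states, then report the final state.
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        job = client.jobs.get(resource_group, helpers.DEFAULT_WORKSPACE_NAME,
                              helpers.DEFAULT_EXPERIMENT_NAME, job_name)
        if job.execution_state not in (models.ExecutionState.queued,
                                       models.ExecutionState.running):
            return job.execution_state
        time.sleep(poll_sec)
    raise TimeoutError('job {0} did not finish within the timeout'.format(job_name))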
Example #4
def create_job(config,
               cluster_id,
               workspace,
               experiment,
               job_name,
               image_name,
               command,
               number_of_vms=1):
    ''' Creates a job.
    '''
    input_directories = [
        models.InputDirectory(id='SCRIPT',
                              path='$AZ_BATCHAI_MOUNT_ROOT/{0}/{1}'.format(
                                  config.fileshare_mount_point, job_name)),
        models.InputDirectory(id='DATASET',
                              path='$AZ_BATCHAI_MOUNT_ROOT/{0}/{1}'.format(
                                  config.fileshare_mount_point, 'data'))
    ]

    std_output_path_prefix = "$AZ_BATCHAI_MOUNT_ROOT/{0}".format(
        config.fileshare_mount_point)

    output_directories = [
        models.OutputDirectory(id='MODEL',
                               path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                                   config.fileshare_mount_point),
                               path_suffix="models"),
        models.OutputDirectory(id='NOTEBOOKS',
                               path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                                   config.fileshare_mount_point),
                               path_suffix="notebooks")
    ]

    parameters = models.JobCreateParameters(
        location=config.location,
        cluster=models.ResourceId(id=cluster_id),
        node_count=number_of_vms,
        input_directories=input_directories,
        std_out_err_path_prefix=std_output_path_prefix,
        output_directories=output_directories,
        container_settings=models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(
                image=image_name)),
        custom_toolkit_settings=models.CustomToolkitSettings(
            command_line=command))

    client = client_from(config)
    client.jobs.create(config.group_name, workspace, experiment, job_name,
                       parameters).result()
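A hedged usage sketch for create_job; every literal below is a placeholder chosen to match the parameter list above, not a value from the original code:

# Usage sketch with illustrative values only.
create_job(config,
           cluster_id=cluster.id,
           workspace='my-workspace',
           experiment='my-experiment',
           job_name='train-run-1',
           image_name='tensorflow/tensorflow:1.8.0-gpu',
           command='python $AZ_BATCHAI_INPUT_SCRIPT/train.py',
           number_of_vms=2)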
Example #5
parameters = ParameterSweep(param_specs)

# create a template for Batch AI job
jcp = models.JobCreateParameters(
    cluster=models.ResourceId(id=cluster.id),
    node_count=1,
    std_out_err_path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/logs',
    output_directories=[
        models.OutputDirectory(id='ALL',
                               path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/output')
    ],
    custom_toolkit_settings=models.CustomToolkitSettings(
        command_line=
        'python $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts/FF_multi_step_multivariate.py \
        --scriptdir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts \
        --datadir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/data \
        --outdir $AZ_BATCHAI_OUTPUT_ALL \
        -l {0} -n {1} -b {2} -T {3} -r {4} -a {5}'.format(
            parameters['LATENT_DIM'], parameters['HIDDEN_LAYERS'],
            parameters['BATCH_SIZE'], parameters['T'],
            parameters['LEARNING_RATE'], parameters['ALPHA'])),
    container_settings=models.ContainerSettings(
        image_source_registry=models.ImageSourceRegistry(
            image=cfg['docker_image'])),
    mount_volumes=models.MountVolumes(azure_file_shares=[
        models.AzureFileShareReference(
            account_name=cfg['storage_account']['name'],
            credentials=models.AzureStorageCredentialsInfo(
                account_key=cfg['storage_account']['key']),
            azure_file_url='https://' + cfg['storage_account']['name'] +
            '.file.core.windows.net/logs',
            relative_mount_path='logs'),
Example #6
ts_from = sys.argv[1]
ts_to = sys.argv[2]
device_ids = j['device_ids']
tags = j['tags']
job_name_template = j['job_name']

credentials = ServicePrincipalCredentials(client_id=CLIENT,
                                          secret=KEY,
                                          tenant=TENANT_ID)

batchai_client = batchai.BatchAIManagementClient(
    credentials=credentials, subscription_id=subscription_id)
cluster = batchai_client.clusters.get(resource_group_name, cluster_name)

# run an async job for each sensor
for device_id in device_ids:
    for tag in tags:
        job_name = job_name_template.format(device_id, tag)
        custom_settings = baimodels.CustomToolkitSettings(
            command_line=command_line.format(device_id, tag, ts_from, ts_to,
                                             config_file_path))
        print('command line: ' + custom_settings.command_line)
        params = baimodels.JobCreateParameters(
            location=location,
            cluster=baimodels.ResourceId(id=cluster.id),
            node_count=node_count,
            std_out_err_path_prefix=std_out_err_path_prefix,
            custom_toolkit_settings=custom_settings)

        batchai_client.jobs.create(resource_group_name, job_name, params)
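The snippet assumes a number of names defined earlier in the script (the parsed job config j, the service principal constants, command_line, and so on). A sketch of that setup, with every value a placeholder:

import json
import sys

import azure.mgmt.batchai as batchai
import azure.mgmt.batchai.models as baimodels
from azure.common.credentials import ServicePrincipalCredentials

# Placeholder configuration; a real script would load these from its environment.
CLIENT = '<service-principal-client-id>'
KEY = '<service-principal-secret>'
TENANT_ID = '<tenant-id>'
subscription_id = '<subscription-id>'
resource_group_name = 'my-resource-group'
cluster_name = 'my-cluster'
location = 'eastus'
node_count = 1
std_out_err_path_prefix = '$AZ_BATCHAI_MOUNT_ROOT/logs'
config_file_path = '/config/scoring.json'
command_line = 'python score.py --device {0} --tag {1} --from {2} --to {3} --config {4}'

with open('job_config.json') as f:
    j = json.load(f)  # expected keys: device_ids, tags, job_name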
Example #7
parameters = ParameterSweep(param_specs)

# create a template for Batch AI job
jcp = models.JobCreateParameters(
    cluster=models.ResourceId(id=cluster.id),
    node_count=1,
    std_out_err_path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/logs',
    output_directories=[
        models.OutputDirectory(id='ALL',
                               path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/output')
    ],
    custom_toolkit_settings=models.CustomToolkitSettings(
        command_line=
        'python $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts/RNN_teacher_forcing.py \
        --scriptdir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts \
        --datadir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/data \
        --outdir $AZ_BATCHAI_OUTPUT_ALL \
        -l {0} -b {1} -T {2} -r {3} -a {4}'.format(
            parameters['LATENT_DIM_1'], parameters['BATCH_SIZE'],
            parameters['T'], parameters['LEARNING_RATE'],
            parameters['ALPHA'])),
    container_settings=models.ContainerSettings(
        image_source_registry=models.ImageSourceRegistry(
            image=cfg['docker_image'])),
    mount_volumes=models.MountVolumes(azure_file_shares=[
        models.AzureFileShareReference(
            account_name=cfg['storage_account']['name'],
            credentials=models.AzureStorageCredentialsInfo(
                account_key=cfg['storage_account']['key']),
            azure_file_url='https://' + cfg['storage_account']['name'] +
            '.file.core.windows.net/logs',
            relative_mount_path='logs'),
Example #8
# create a template for Batch AI job
jcp = models.JobCreateParameters(
    cluster=models.ResourceId(id=cluster.id),
    node_count=1,
    std_out_err_path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/logs',
    output_directories=[
        models.OutputDirectory(
            id='ALL',
            path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/output'
        )
    ],
    custom_toolkit_settings=models.CustomToolkitSettings(
        command_line='python $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts/RNN_multi_step_encoder_decoder_simple.py \
        --scriptdir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts \
        --datadir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/data \
        --outdir $AZ_BATCHAI_OUTPUT_ALL \
        -e1 {0} -e2 {1} -d1 {2} -d2 {3} -b {4} -T {5} -r {6} -a {7}'.format(parameters['ENCODER_DIM_1'], parameters['ENCODER_DIM_2'],
                                                                            parameters['DECODER_DIM_1'], parameters['DECODER_DIM_2'],
                                                                            parameters['BATCH_SIZE'], parameters['T'],
                                                                            parameters['LEARNING_RATE'], parameters['ALPHA'])
    ),
    container_settings=models.ContainerSettings(
        image_source_registry=models.ImageSourceRegistry(image=cfg['docker_image'])
    ),
    mount_volumes=models.MountVolumes(
        azure_file_shares=[
            models.AzureFileShareReference(
                account_name=cfg['storage_account']['name'],
                credentials=models.AzureStorageCredentialsInfo(account_key=cfg['storage_account']['key']),
                azure_file_url='https://'+cfg['storage_account']['name']+'.file.core.windows.net/logs',
                relative_mount_path='logs'),
            models.AzureFileShareReference(
Example #9
    def test_job_level_mounting(self, resource_group, location, cluster,
                                storage_account, storage_account_key):
        """Tests if it's possible to mount external file systems for a job."""
        job_name = 'job'

        # Create file share and container to mount on the job level
        if storage_account.name != FAKE_STORAGE.name:
            files = FileService(storage_account.name, storage_account_key)
            files.create_share('jobshare', fail_on_exist=False)
            blobs = BlockBlobService(storage_account.name, storage_account_key)
            blobs.create_container('jobcontainer', fail_on_exist=False)

        job = self.client.jobs.create(
            resource_group.name,
            job_name,
            parameters=models.JobCreateParameters(
                location=location,
                cluster=models.ResourceId(id=cluster.id),
                node_count=1,
                mount_volumes=models.MountVolumes(
                    azure_file_shares=[
                        models.AzureFileShareReference(
                            account_name=storage_account.name,
                            azure_file_url='https://{0}.file.core.windows.net/{1}'.format(
                                storage_account.name, 'jobshare'),
                            relative_mount_path='job_afs',
                            credentials=models.AzureStorageCredentialsInfo(
                                account_key=storage_account_key))
                    ],
                    azure_blob_file_systems=[
                        models.AzureBlobFileSystemReference(
                            account_name=storage_account.name,
                            container_name='jobcontainer',
                            relative_mount_path='job_bfs',
                            credentials=models.AzureStorageCredentialsInfo(
                                account_key=storage_account_key))
                    ]),
                # Put standard output on cluster level AFS to check that the job has access to it.
                std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                    AZURE_FILES_MOUNTING_PATH),
                # Create two output directories on job level AFS and blobfuse.
                output_directories=[
                    models.OutputDirectory(
                        id='OUTPUT1',
                        path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/job_afs'),
                    models.OutputDirectory(
                        id='OUTPUT2',
                        path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/job_bfs')
                ],
                # Check that the job preparation has access to job level file systems.
                job_preparation=models.JobPreparation(
                    command_line=
                    'echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/prep_afs.txt; '
                    'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/prep_bfs.txt; '
                    'echo done'),
                # Check that the job has access to job level file systems.
                custom_toolkit_settings=models.CustomToolkitSettings(
                    command_line=
                    'echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/job_afs.txt; '
                    'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/job_bfs.txt; '
                    'mkdir $AZ_BATCHAI_OUTPUT_OUTPUT1/afs; '
                    'echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/afs/job_afs.txt; '
                    'mkdir $AZ_BATCHAI_OUTPUT_OUTPUT2/bfs; '
                    'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/bfs/job_bfs.txt; '
                    'echo done'))).result()
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, job.name, MINUTE),
            models.ExecutionState.succeeded)

        job = self.client.jobs.get(resource_group.name, job.name)
        # Assert job and job prep standard output is populated on cluster level filesystem
        assert_job_files_are(
            self, self.client, resource_group.name, job.name,
            STANDARD_OUTPUT_DIRECTORY_ID, {
                u'stdout.txt': u'done\n',
                u'stderr.txt': u'',
                u'stdout-job_prep.txt': u'done\n',
                u'stderr-job_prep.txt': u''
            })
        # Assert files are generated on job level AFS
        assert_job_files_are(self, self.client, resource_group.name, job.name,
                             'OUTPUT1', {
                                 u'job_afs.txt': u'afs\n',
                                 u'prep_afs.txt': u'afs\n',
                                 u'afs': None
                             })
        # Assert files are generated on job level blobfuse
        assert_job_files_are(self, self.client, resource_group.name, job.name,
                             'OUTPUT2', {
                                 u'job_bfs.txt': u'bfs\n',
                                 u'prep_bfs.txt': u'bfs\n',
                                 u'bfs': None
                             })
        # Assert subfolders are available via API
        assert_job_files_in_path_are(self, self.client, resource_group.name,
                                     job.name, 'OUTPUT1', 'afs',
                                     {u'job_afs.txt': u'afs\n'})
        assert_job_files_in_path_are(self, self.client, resource_group.name,
                                     job.name, 'OUTPUT2', 'bfs',
                                     {u'job_bfs.txt': u'bfs\n'})

        # Assert that we can access the output files created on job level mount volumes directly in storage using path
        # segment returned by the server.
        if storage_account.name != FAKE_STORAGE.name:
            files = FileService(storage_account.name, storage_account_key)
            self.assertTrue(
                files.exists(
                    'jobshare', job.job_output_directory_path_segment + '/' +
                    OUTPUT_DIRECTORIES_FOLDER_NAME, 'job_afs.txt'))
            blobs = BlockBlobService(storage_account.name, storage_account_key)
            self.assertTrue(
                blobs.exists(
                    'jobcontainer', job.job_output_directory_path_segment +
                    '/' + OUTPUT_DIRECTORIES_FOLDER_NAME + '/job_bfs.txt'))
        # After the job is done the filesystems should be unmounted automatically, check this by submitting a new job.
        checker = self.client.jobs.create(
            resource_group.name,
            'checker',
            parameters=models.JobCreateParameters(
                location=location,
                cluster=models.ResourceId(id=cluster.id),
                node_count=1,
                std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                    AZURE_FILES_MOUNTING_PATH),
                custom_toolkit_settings=models.CustomToolkitSettings(
                    command_line='echo job; df | grep -E "job_bfs|job_afs"'))
        ).result()
        # Check the job failed because there are no job level mount volumes anymore
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, checker.name, MINUTE),
            models.ExecutionState.failed)
        # Check that the cluster level AFS was still mounted
        assert_job_files_are(self, self.client, resource_group.name,
                             checker.name, STANDARD_OUTPUT_DIRECTORY_ID, {
                                 u'stdout.txt': u'job\n',
                                 u'stderr.txt': u''
                             })
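assert_job_files_are and assert_job_files_in_path_are are test helpers. A sketch of what the first might do against the pre-workspace jobs API used in this test; treating entries without a download URL as directories is an assumption:

import requests

def read_job_files_sketch(client, resource_group, job_name, output_dir_id):
    # Return {file name: content} for a job output directory, with None for directories.
    options = models.JobsListOutputFilesOptions(outputdirectoryid=output_dir_id)
    contents = {}
    for f in client.jobs.list_output_files(resource_group, job_name, options):
        if f.download_url:
            contents[f.name] = requests.get(f.download_url).text
        else:
            contents[f.name] = None  # assumption: directories carry no download URL
    return contents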