Example no. 1
0
    def assertCanRunJob(self, resource_group, location, cluster_id, job_name, container_settings, timeout_sec):
        """Submit a simple echo job and verify it completes and produces the expected files.

        :param resource_group: resource group to create the job in.
        :param location: Azure location (kept for signature parity with callers).
        :param cluster_id: id of the cluster to run the job on.
        :param job_name: name for the submitted job.
        :param container_settings: container settings for the job (None runs on the host).
        :param timeout_sec: how long to wait for the job to complete.
        """
        Helpers.create_custom_job(self.client, resource_group.name, cluster_id, job_name, 1,
                                  'echo hello | tee $AZ_BATCHAI_OUTPUT_OUTPUTS/hi.txt', container=container_settings)

        # The job must complete within the given timeout.
        final_state = Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name, job_name,
                                                      timeout_sec)
        self.assertEqual(final_state, models.ExecutionState.succeeded)

        # The output directory and the standard output must contain the greeting;
        # stderr must be empty.
        expected_outputs = {u'hi.txt': u'hello\n'}
        Helpers.assert_job_files_are(self, self.client, resource_group.name, job_name, 'OUTPUTS', expected_outputs)
        expected_std = {u'stdout.txt': u'hello\n', u'stderr.txt': ''}
        Helpers.assert_job_files_are(self, self.client, resource_group.name, job_name,
                                     Helpers.STANDARD_OUTPUT_DIRECTORY_ID, expected_std)
 def test_job_creation_and_deletion(self, resource_group, location, cluster, storage_account, storage_account_key):
     """Tests simple scenario for a job - submit, check results, delete."""
     # Submit a containerized job that writes a greeting into its output directory.
     job = Helpers.create_custom_job(
         self.client, resource_group.name, cluster.id, 'job', 1,
         'echo hi | tee {0}/hi.txt'.format(Helpers.JOB_OUTPUT_DIRECTORY_PATH_ENV),
         container=models.ContainerSettings(
             image_source_registry=models.ImageSourceRegistry(image='ubuntu')))  # type: models.Job
     final_state = Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name, job.name,
                                                   Helpers.MINUTE)
     self.assertEqual(final_state, models.ExecutionState.succeeded)
     # Standard output must contain the greeting; stderr must be empty.
     Helpers.assert_job_files_are(self, self.client, resource_group.name, job.name,
                                  Helpers.STANDARD_OUTPUT_DIRECTORY_ID,
                                  {u'stdout.txt': u'hi\n', u'stderr.txt': u''})
     # The job's own output directory must contain the file written by the command.
     Helpers.assert_job_files_are(self, self.client, resource_group.name, job.name,
                                  Helpers.JOB_OUTPUT_DIRECTORY_ID,
                                  {u'hi.txt': u'hi\n'})
     # The output files must also be directly reachable in the storage file
     # share via the path segment the server reports for the job.
     share_path = job.job_output_directory_path_segment + '/' + Helpers.STDOUTERR_FOLDER_NAME
     Helpers.assert_file_in_file_share(self, storage_account.name, storage_account_key,
                                       share_path, 'stdout.txt', u'hi\n')
     # After deletion the job must no longer be retrievable.
     self.client.jobs.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                             Helpers.DEFAULT_EXPERIMENT_NAME, job.name).result()
     with self.assertRaises(CloudError):
         self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                              Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
    def test_running_job_deletion(self, resource_group, location, cluster):
        """Tests deletion of a running job."""
        # Start a long-running job and wait until it is actually executing.
        job = Helpers.create_custom_job(self.client, resource_group.name, cluster.id, 'job', 1, 'sleep 600')
        state = Helpers.wait_for_job_start_running(self.is_live, self.client, resource_group.name, job.name,
                                                   Helpers.MINUTE)
        self.assertEqual(state, models.ExecutionState.running)

        # Deleting a running job must succeed and remove the job entirely.
        self.client.jobs.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                Helpers.DEFAULT_EXPERIMENT_NAME, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                 Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
    def test_running_job_termination(self, resource_group, location, cluster):
        """Tests termination of a running job."""
        # Start a long-running job and wait until it is actually executing.
        job = Helpers.create_custom_job(self.client, resource_group.name, cluster.id, 'longrunning', 1, 'sleep 600')
        state = Helpers.wait_for_job_start_running(self.is_live, self.client, resource_group.name, job.name,
                                                   Helpers.MINUTE)
        self.assertEqual(state, models.ExecutionState.running)

        # A terminated running job must end up in the failed state.
        self.client.jobs.terminate(
            resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME, Helpers.DEFAULT_EXPERIMENT_NAME, job.name).result()
        final_state = Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name, job.name,
                                                      Helpers.MINUTE)
        self.assertEqual(final_state, models.ExecutionState.failed)
    def test_queued_job_termination(self, resource_group, location, cluster):
        """Tests termination of a job in queued state."""
        # The cluster has no compute nodes, so the job will stay queued.
        job = Helpers.create_custom_job(self.client, resource_group.name, cluster.id, 'job', 1, 'true')

        # Terminating a queued job must move it into the failed state.
        self.client.jobs.terminate(
            resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME, Helpers.DEFAULT_EXPERIMENT_NAME, job.name).result()
        final_state = Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name, job.name,
                                                      Helpers.MINUTE)
        self.assertEqual(final_state, models.ExecutionState.failed)

        # The terminated job must be deletable and gone afterwards.
        self.client.jobs.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                Helpers.DEFAULT_EXPERIMENT_NAME, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                 Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
    def test_failed_job_reporting(self, resource_group, location, cluster):
        """Tests if job failure is reported correctly."""
        # 'false' exits with code 1, so the job must end in the failed state.
        job = Helpers.create_custom_job(self.client, resource_group.name, cluster.id, 'job', 1, 'false')
        final_state = Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name, job.name,
                                                      Helpers.MINUTE)
        self.assertEqual(final_state, models.ExecutionState.failed)

        # The server must report the exit code and a single 'JobFailed' error.
        job = self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                   Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
        self.assertEqual(job.execution_info.exit_code, 1)
        self.assertEqual(len(job.execution_info.errors), 1)
        self.assertEqual(job.execution_info.errors[0].code, 'JobFailed')
        self.client.jobs.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                Helpers.DEFAULT_EXPERIMENT_NAME, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                 Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
    def test_completed_job_termination(self, resource_group, location, cluster):
        """Tests termination of completed job."""
        job = Helpers.create_custom_job(self.client, resource_group.name, cluster.id, 'job', 1, 'true')
        state = Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name, job.name,
                                                Helpers.MINUTE)
        self.assertEqual(state, models.ExecutionState.succeeded)

        # Terminating an already completed job is a no-op and must not change
        # the execution state.
        self.client.jobs.terminate(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                   Helpers.DEFAULT_EXPERIMENT_NAME, job.name).result()
        state = Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name, job.name,
                                                Helpers.MINUTE)
        self.assertEqual(state, models.ExecutionState.succeeded)

        # The job must be deletable and gone afterwards.
        self.client.jobs.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                Helpers.DEFAULT_EXPERIMENT_NAME, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                 Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
    def test_password_less_ssh(self, resource_group, location, cluster):
        """Tests if password-less ssh is configured on hosts."""
        # Two-node job: each node ssh-es into both private node IPs and echoes.
        job = Helpers.create_custom_job(self.client, resource_group.name, cluster.id, 'job', 2,
                                        'ssh 10.0.0.4 echo done && ssh 10.0.0.5 echo done')
        final_state = Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name, job.name,
                                                      Helpers.MINUTE)
        self.assertEqual(final_state, models.ExecutionState.succeeded)

        job = self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                   Helpers.DEFAULT_EXPERIMENT_NAME, job.name)

        # stdout must contain both echoes; stderr may carry ssh's host-key
        # notices, so it is matched with a regex instead of an exact string.
        Helpers.assert_job_files_are(self, self.client, resource_group.name, job.name,
                                     Helpers.STANDARD_OUTPUT_DIRECTORY_ID,
                                     {u'stdout.txt': u'done\ndone\n',
                                      u'stderr.txt': re.compile('Permanently added.*')})
        self.client.jobs.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                Helpers.DEFAULT_EXPERIMENT_NAME, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                 Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
    def test_job_container_preparation_failure_reporting(self, resource_group, location, cluster):
        """Tests if job preparation failure is reported correctly."""
        # The job command succeeds ('true') but job preparation fails ('false').
        job = Helpers.create_custom_job(self.client, resource_group.name, cluster.id, 'job', 1, 'true', 'false',
                                        container=models.ContainerSettings(
                                            image_source_registry=models.ImageSourceRegistry(image='ubuntu')))
        final_state = Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name, job.name,
                                                      Helpers.MINUTE)
        self.assertEqual(final_state, models.ExecutionState.failed)

        # The failure must be attributed to job preparation.
        job = self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                   Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
        self.assertEqual(job.execution_info.exit_code, 1)
        self.assertEqual(len(job.execution_info.errors), 1)
        self.assertEqual(job.execution_info.errors[0].code, 'JobPreparationFailed')
        self.client.jobs.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                Helpers.DEFAULT_EXPERIMENT_NAME, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                 Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
    def test_job_preparation_host(self, resource_group, location, cluster):
        """Tests job preparation execution for a job running on a host."""
        # Job preparation populates $AZ_BATCHAI_INPUT_INPUT/hi.txt, which the
        # main job command then reads back.
        job = Helpers.create_custom_job(
            self.client, resource_group.name, cluster.id, 'job', 1,
            'cat $AZ_BATCHAI_INPUT_INPUT/hi.txt',
            'mkdir -p $AZ_BATCHAI_INPUT_INPUT && echo hello | tee $AZ_BATCHAI_INPUT_INPUT/hi.txt')
        final_state = Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name, job.name,
                                                      Helpers.MINUTE)
        self.assertEqual(final_state, models.ExecutionState.succeeded)

        # Both the main job and its preparation task must have produced the
        # greeting on stdout with empty stderr.
        expected_files = {u'stdout.txt': u'hello\n',
                          u'stderr.txt': u'',
                          u'stdout-job_prep.txt': u'hello\n',
                          u'stderr-job_prep.txt': u''}
        Helpers.assert_job_files_are(self, self.client, resource_group.name, job.name,
                                     Helpers.STANDARD_OUTPUT_DIRECTORY_ID, expected_files)
        self.client.jobs.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                Helpers.DEFAULT_EXPERIMENT_NAME, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                 Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
    def test_file_server(self, resource_group, location, storage_account,
                         storage_account_key):
        """Tests file server functionality

        1. Create file server
        2. Create two clusters with this file server
        3. Check that the file server is mounted:
            a. submit tasks (one from host and another from container) on the first cluster to write data to nfs
            b. submit a task on the second cluster to read the data from nfs
        """
        server = Helpers.create_file_server(
            self.client, location, resource_group.name,
            self.file_server_name)  # type: models.FileServer

        # Create two identical one-node clusters, both mounting the file
        # server under $AZ_BATCHAI_MOUNT_ROOT/nfs (loop removes the previous
        # copy-pasted duplication).
        cluster_names = ('cluster1', 'cluster2')
        clusters = {}
        for cluster_name in cluster_names:
            clusters[cluster_name] = Helpers.create_cluster(
                self.client,
                location,
                resource_group.name,
                cluster_name,
                'STANDARD_D1',
                1,
                storage_account.name,
                storage_account_key,
                file_servers=[
                    models.FileServerReference(
                        file_server=models.ResourceId(id=server.id),
                        relative_mount_path='nfs',
                        mount_options="rw")
                ])

        # Verify the file server is reported.
        Helpers.assert_existing_file_servers_are(self, self.client,
                                                 resource_group.name,
                                                 [self.file_server_name])

        # Verify the file server become available in a reasonable time
        self.assertTrue(
            Helpers.wait_for_file_server(self.is_live, self.client,
                                         resource_group.name,
                                         self.file_server_name,
                                         _FILE_SERVER_CREATION_TIMEOUT_SEC))

        # Verify the remote login information and private ip are reported.
        # assertRegex replaces the deprecated assertRegexpMatches alias,
        # which was removed in Python 3.12.
        server = self.client.file_servers.get(resource_group.name,
                                              Helpers.DEFAULT_WORKSPACE_NAME,
                                              self.file_server_name)
        self.assertRegex(server.mount_settings.file_server_public_ip,
                         Helpers.RE_ID_ADDRESS)
        self.assertRegex(server.mount_settings.file_server_internal_ip,
                         Helpers.RE_ID_ADDRESS)

        # Verify both clusters allocated their single node successfully.
        for cluster_name in cluster_names:
            self.assertEqual(
                Helpers.wait_for_nodes(self.is_live, self.client,
                                       resource_group.name, cluster_name, 1,
                                       Helpers.NODE_STARTUP_TIMEOUT_SEC), 1)

        # Execute publishing tasks on the first cluster: one on the host and
        # one inside a container, each writing a file onto the nfs mount.
        job1 = Helpers.create_custom_job(
            self.client, resource_group.name, clusters['cluster1'].id,
            'host_publisher', 1,
            'echo hi from host > $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt')
        self.assertEqual(
            Helpers.wait_for_job_completion(self.is_live, self.client,
                                            resource_group.name, job1.name,
                                            Helpers.MINUTE),
            models.ExecutionState.succeeded)
        job2 = Helpers.create_custom_job(
            self.client,
            resource_group.name,
            clusters['cluster1'].id,
            'container_publisher',
            1,
            'echo hi from container >> $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt',
            container=models.ContainerSettings(
                image_source_registry=models.ImageSourceRegistry(
                    image="ubuntu")))
        self.assertEqual(
            Helpers.wait_for_job_completion(self.is_live, self.client,
                                            resource_group.name, job2.name,
                                            Helpers.MINUTE),
            models.ExecutionState.succeeded)

        # Execute consumer task on the second cluster - it must see the data
        # written via the first cluster's mount.
        job3 = Helpers.create_custom_job(
            self.client, resource_group.name, clusters['cluster2'].id,
            'consumer', 1,
            'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt; '
            'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt')
        self.assertEqual(
            Helpers.wait_for_job_completion(self.is_live, self.client,
                                            resource_group.name, job3.name,
                                            Helpers.MINUTE),
            models.ExecutionState.succeeded)

        # Verify the data written by both publishers round-tripped through nfs.
        Helpers.assert_job_files_are(
            self, self.client, resource_group.name, job3.name,
            Helpers.STANDARD_OUTPUT_DIRECTORY_ID, {
                u'stdout.txt': u'hi from host\nhi from container\n',
                u'stderr.txt': ''
            })

        # Delete clusters
        for cluster_name in cluster_names:
            self.client.clusters.delete(resource_group.name,
                                        Helpers.DEFAULT_WORKSPACE_NAME,
                                        cluster_name).result()

        # Test deletion of the file server itself.
        self.client.file_servers.delete(resource_group.name,
                                        Helpers.DEFAULT_WORKSPACE_NAME,
                                        self.file_server_name).result()
        Helpers.assert_existing_file_servers_are(self, self.client,
                                                 resource_group.name, [])