def test_setup_task_execution(self, resource_group, location, storage_account, storage_account_key):
    """Verify that a cluster's setup task runs and its output lands on the file share."""
    cluster = Helpers.create_cluster(
        self.client, location, resource_group.name, self.cluster_name, 'STANDARD_D1', 1,
        storage_account.name, storage_account_key,
        setup_task_cmd='echo $GREETING $SECRET_GREETING',
        setup_task_env={'GREETING': 'setup task'},
        setup_task_secrets={'SECRET_GREETING': 'has a secret'})  # type: models.Cluster
    # The new cluster must show up in the cluster listing.
    Helpers.assert_existing_clusters_are(self, self.client, resource_group.name, [self.cluster_name])
    # Exactly one node must be allocated and become available within the timeout.
    self.assertEqual(
        Helpers.wait_for_nodes(self.is_live, self.client, resource_group.name, self.cluster_name, 1,
                               Helpers.NODE_STARTUP_TIMEOUT_SEC), 1)
    # The server must report the secret's name but never its value.
    reported_secrets = cluster.node_setup.setup_task.secrets
    self.assertEqual(len(reported_secrets), 1)
    self.assertEqual(reported_secrets[0].name, 'SECRET_GREETING')
    self.assertIsNone(reported_secrets[0].value)
    # BatchAI reports an auto-generated path suffix under which setup task logs are
    # stored; inspect the generated stdout/stderr files to confirm the task executed.
    output_suffix = cluster.node_setup.setup_task.std_out_err_path_suffix
    node_ids = Helpers.get_node_ids(self.client, resource_group.name, self.cluster_name)
    self.assertEqual(len(node_ids), 1)
    only_node = node_ids[0]
    Helpers.assert_file_in_file_share(self, storage_account.name, storage_account_key,
                                      output_suffix, 'stdout-{0}.txt'.format(only_node),
                                      u'setup task has a secret\n')
    Helpers.assert_file_in_file_share(self, storage_account.name, storage_account_key,
                                      output_suffix, 'stderr-{0}.txt'.format(only_node), u'')
    self.client.clusters.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                self.cluster_name).result()
def test_creation_and_deletion(self, resource_group, location, storage_account, storage_account_key):
    """Exercise the basic cluster lifecycle.

    1. Create cluster
    2. Execute a task on the host
    3. Execute a task in a docker container
    4. Delete cluster
    """
    cluster = Helpers.create_cluster(
        self.client, location, resource_group.name, self.cluster_name, 'STANDARD_D1', 1,
        storage_account.name, storage_account_key)
    # Sanity-check the returned cluster model.
    self.assertEqual(cluster.name, self.cluster_name)
    self.assertIsNone(cluster.errors)
    self.assertEqual(cluster.vm_size, 'STANDARD_D1')
    # The new cluster must show up in the cluster listing.
    Helpers.assert_existing_clusters_are(self, self.client, resource_group.name, [self.cluster_name])
    # Exactly one node must be allocated and become available within the timeout.
    self.assertEqual(
        Helpers.wait_for_nodes(self.is_live, self.client, resource_group.name, self.cluster_name, 1,
                               Helpers.NODE_STARTUP_TIMEOUT_SEC), 1)
    Helpers.assert_remote_login_info_reported_for_nodes(self, self.client, resource_group.name,
                                                        self.cluster_name, 1)
    # The cluster must be able to run jobs both on the host and in a container.
    self.assertCanRunJobOnHost(resource_group, location, cluster.id)
    self.assertCanRunJobInContainer(resource_group, location, cluster.id)
    # Deleting the cluster must remove it from the listing.
    self.client.clusters.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                self.cluster_name).result()
    Helpers.assert_existing_clusters_are(self, self.client, resource_group.name, [])
def test_cluster_resizing(self, resource_group, location, storage_account, storage_account_key):
    """Verify that a cluster can be manually resized down to zero and back up."""
    cluster = Helpers.create_cluster(
        self.client, location, resource_group.name, self.cluster_name, 'STANDARD_D1', 1,
        storage_account.name, storage_account_key)
    # Exactly one node must be allocated and become available within the timeout.
    self.assertEqual(
        Helpers.wait_for_nodes(self.is_live, self.client, resource_group.name, self.cluster_name, 1,
                               Helpers.NODE_STARTUP_TIMEOUT_SEC), 1)
    Helpers.assert_remote_login_info_reported_for_nodes(self, self.client, resource_group.name,
                                                        self.cluster_name, 1)
    # Shrink to zero nodes, then grow back to one.
    self.assertCanResizeCluster(resource_group, 0)
    self.assertCanResizeCluster(resource_group, 1)
    # After resizing, the cluster must still be able to run jobs.
    self.assertCanRunJobOnHost(resource_group, location, cluster.id)
    self.client.clusters.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                self.cluster_name).result()
def test_auto_scaling(self, resource_group, location, storage_account, storage_account_key):
    """Verify that an auto-scale cluster grows for a job and shrinks back to zero."""
    # Start with an empty (zero-node) cluster.
    cluster = Helpers.create_cluster(
        self.client, location, resource_group.name, self.cluster_name, 'STANDARD_D1', 0,
        storage_account.name, storage_account_key)
    # Switch the cluster into auto-scale mode with a 0..1 node range.
    auto_scale = models.AutoScaleSettings(minimum_node_count=0, maximum_node_count=1)
    self.client.clusters.update(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                self.cluster_name,
                                scale_settings=models.ScaleSettings(auto_scale=auto_scale))
    # Submitting a job must make BatchAI scale up enough nodes to run it.
    self.assertCanRunJobOnHost(resource_group, location, cluster.id,
                               timeout_sec=Helpers.AUTO_SCALE_TIMEOUT_SEC)
    # With no more jobs queued, the cluster must scale back down to zero nodes.
    self.assertEqual(
        Helpers.wait_for_nodes(self.is_live, self.client, resource_group.name, self.cluster_name, 0,
                               Helpers.NODE_STARTUP_TIMEOUT_SEC), 0)
    self.client.clusters.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                self.cluster_name).result()
def test_file_server(self, resource_group, location, storage_account, storage_account_key):
    """Exercise NFS file server functionality.

    1. Create file server
    2. Create two clusters with this file server
    3. Check that the file server is mounted:
       a. submit tasks (one from host and another from container) on the first
          cluster to write data to nfs
       b. submit a task on the second cluster to read the data from nfs
    """
    nfs_server = Helpers.create_file_server(
        self.client, location, resource_group.name, self.file_server_name)  # type: models.FileServer
    # Create two clusters, each mounting the same NFS server at $AZ_BATCHAI_MOUNT_ROOT/nfs.
    first_cluster, second_cluster = [
        Helpers.create_cluster(
            self.client, location, resource_group.name, cluster_name, 'STANDARD_D1', 1,
            storage_account.name, storage_account_key,
            file_servers=[
                models.FileServerReference(
                    file_server=models.ResourceId(id=nfs_server.id),
                    relative_mount_path='nfs',
                    mount_options="rw")
            ])
        for cluster_name in ('cluster1', 'cluster2')]
    # The file server must be reported in the listing.
    Helpers.assert_existing_file_servers_are(self, self.client, resource_group.name,
                                             [self.file_server_name])
    # The file server must become available in a reasonable time.
    self.assertTrue(
        Helpers.wait_for_file_server(self.is_live, self.client, resource_group.name,
                                     self.file_server_name, _FILE_SERVER_CREATION_TIMEOUT_SEC))
    # Re-fetch the server; it must now expose remote login info and private IP.
    nfs_server = self.client.file_servers.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                              self.file_server_name)
    self.assertRegexpMatches(nfs_server.mount_settings.file_server_public_ip, Helpers.RE_ID_ADDRESS)
    self.assertRegexpMatches(nfs_server.mount_settings.file_server_internal_ip, Helpers.RE_ID_ADDRESS)
    # Both clusters must allocate their node successfully.
    for cluster_name in ('cluster1', 'cluster2'):
        self.assertEqual(
            Helpers.wait_for_nodes(self.is_live, self.client, resource_group.name, cluster_name, 1,
                                   Helpers.NODE_STARTUP_TIMEOUT_SEC), 1)
    # Publish data to the NFS mount from the first cluster: once from the host...
    host_writer = Helpers.create_custom_job(
        self.client, resource_group.name, first_cluster.id, 'host_publisher', 1,
        'echo hi from host > $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt')
    self.assertEqual(
        Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name,
                                        host_writer.name, Helpers.MINUTE),
        models.ExecutionState.succeeded)
    # ...and once from inside a docker container.
    container_writer = Helpers.create_custom_job(
        self.client, resource_group.name, first_cluster.id, 'container_publisher', 1,
        'echo hi from container >> $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt',
        container=models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(image="ubuntu")))
    self.assertEqual(
        Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name,
                                        container_writer.name, Helpers.MINUTE),
        models.ExecutionState.succeeded)
    # Consume the data from the second cluster to prove the mount is shared.
    reader = Helpers.create_custom_job(
        self.client, resource_group.name, second_cluster.id, 'consumer', 1,
        'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt; '
        'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt')
    self.assertEqual(
        Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name,
                                        reader.name, Helpers.MINUTE),
        models.ExecutionState.succeeded)
    # The consumer must see exactly what both publishers wrote.
    Helpers.assert_job_files_are(
        self, self.client, resource_group.name, reader.name,
        Helpers.STANDARD_OUTPUT_DIRECTORY_ID,
        {
            u'stdout.txt': u'hi from host\nhi from container\n',
            u'stderr.txt': ''
        })
    # Clean up the clusters, then verify file server deletion is reflected in the listing.
    self.client.clusters.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                'cluster1').result()
    self.client.clusters.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                'cluster2').result()
    self.client.file_servers.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                    self.file_server_name).result()
    Helpers.assert_existing_file_servers_are(self, self.client, resource_group.name, [])