def remote_join_cluster(
    node_username: str,
    node_hostname: str,
    node_ssh_port: int,
    master_private_ip_address: str,
    master_api_server_port: int,
    deployment_path: str,
) -> None:
    """Remote join cluster.

    Install the required runtime env on the node first, then download the join cluster script
    from the master_api_server (``/v1/joinClusterScript``) and pipe it into ``python3`` remotely.

    Args:
        node_username (str): username of the MARO Node VM.
        node_hostname (str): hostname of the MARO Node VM.
        node_ssh_port (int): ssh port of the MARO Node VM.
        master_private_ip_address (str): private ip address of the MARO Master VM,
            (master and nodes must be in the same virtual network).
        master_api_server_port (int): port of the master_api_server.
        deployment_path (str): path of the join_cluster_deployment.

    Returns:
        None.
    """
    join_script_url = f"http://{master_private_ip_address}:{master_api_server_port}/v1/joinClusterScript"
    # curl issues a GET by default; a bare "GET" word would be parsed as an extra URL to fetch,
    # so only the real URL is passed.
    remote_steps = (
        "export DEBIAN_FRONTEND=noninteractive; "
        "sudo -E apt update; "
        "sudo -E apt install -y python3-pip; "
        "pip3 install deepdiff redis pyyaml; "
        f"curl -s {join_script_url} | "
        f"python3 - {deployment_path}"
    )
    # The remote steps are single-quoted so that they are evaluated on the node, not locally.
    command = (
        f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {node_username}@{node_hostname} "
        f"'{remote_steps}'"
    )
    Subprocess.interactive_run(command=command)
def test11_node1(self) -> None:
    """Scale node spec Standard_D2s_v3 to 1.

    Exactly one node should be listed afterwards, and it should be in running state.

    Returns:
        None.
    """
    # Issue the scale request through the CLI.
    scale_command = f"maro grass node scale {self.cluster_name} --debug Standard_D2s_v3 1"
    Subprocess.interactive_run(command=scale_command)
    self._gracefully_wait()

    # Poll for the desired state; the test fails if it is not reached within 120s.
    is_valid = False
    deadline = time.time() + 120
    while not is_valid and deadline >= time.time():
        try:
            nodes_details = self._list_nodes_details()
            self.assertEqual(len(nodes_details), 1)
            for node_details in nodes_details:
                self.assertEqual(node_details["state"]["status"], NodeStatus.RUNNING)
        except AssertionError:
            # Desired state not reached yet: wait before polling again.
            time.sleep(10)
        else:
            is_valid = True
    self.assertTrue(is_valid)
def test14_stop(self) -> None:
    """Stop one Standard_D2s_v3.

    Of the two listed nodes, one should be in running state and the other in stopped state.

    Returns:
        None.
    """
    # Issue the stop request through the CLI.
    stop_command = f"maro grass node stop {self.cluster_name} --debug Standard_D2s_v3 1"
    Subprocess.interactive_run(command=stop_command)
    self._gracefully_wait()

    # Poll for the desired state; the test fails if it is not reached within 120s.
    is_valid = False
    deadline = time.time() + 120
    while not is_valid and deadline >= time.time():
        try:
            nodes_details = self._list_nodes_details()
            self.assertEqual(len(nodes_details), 2)
            # Tally the nodes per status of interest.
            running_count = sum(
                1 for node_details in nodes_details
                if node_details["state"]["status"] == NodeStatus.RUNNING
            )
            stopped_count = sum(
                1 for node_details in nodes_details
                if node_details["state"]["status"] == NodeStatus.STOPPED
            )
            self.assertEqual(running_count, 1)
            self.assertEqual(stopped_count, 1)
        except AssertionError:
            # Desired state not reached yet: wait before polling again.
            time.sleep(10)
        else:
            is_valid = True
    self.assertTrue(is_valid)
def test11_image1(self) -> None:
    """Push image alpine:latest to the cluster.

    The master details should list "alpine_latest" among its image_files.

    Returns:
        None.
    """
    # Issue the image push through the CLI.
    push_command = f"maro grass image push {self.cluster_name} --debug --image-name alpine:latest"
    Subprocess.interactive_run(command=push_command)
    self._gracefully_wait()

    # Poll for the desired state; the test fails if it is not reached within 120s.
    is_valid = False
    deadline = time.time() + 120
    while not is_valid and deadline >= time.time():
        try:
            master_details = self._get_master_details()
            self.assertIn("alpine_latest", master_details["image_files"])
        except AssertionError:
            # Image not reported yet: wait before polling again.
            time.sleep(10)
        else:
            is_valid = True
    self.assertTrue(is_valid)
def test12_join_cluster(self) -> None:
    """Join a node to cluster.

    Exactly one node should be listed afterwards; it should be in running state and
    list "alpine_latest" among its image_files.

    Returns:
        None.
    """
    # Issue the join request through the CLI.
    join_command = f"maro grass node join --debug {self.join_cluster_deployment_path}"
    Subprocess.interactive_run(command=join_command)
    self._gracefully_wait()

    # Poll for the desired state; the test fails if it is not reached within 120s.
    is_valid = False
    deadline = time.time() + 120
    while not is_valid and deadline >= time.time():
        try:
            nodes_details = self._list_nodes_details()
            self.assertEqual(len(nodes_details), 1)
            for node_details in nodes_details:
                self.assertEqual(node_details["state"]["status"], NodeStatus.RUNNING)
                self.assertIn("alpine_latest", node_details["image_files"])
        except AssertionError:
            # Desired state not reached yet: wait before polling again.
            time.sleep(10)
        else:
            is_valid = True
    self.assertTrue(is_valid)
def test13_image2(self) -> None:
    """Push image ubuntu:latest to the cluster.

    Exactly one node should be listed; if it is in running state it should list both
    "alpine_latest" and "ubuntu_latest" among its image_files.

    Returns:
        None.
    """
    # Issue the image push through the CLI.
    push_command = f"maro grass image push {self.cluster_name} --debug --image-name ubuntu:latest"
    Subprocess.interactive_run(command=push_command)
    self._gracefully_wait()

    # Poll for the desired state; the test fails if it is not reached within 120s.
    is_valid = False
    deadline = time.time() + 120
    while not is_valid and deadline >= time.time():
        try:
            nodes_details = self._list_nodes_details()
            self.assertEqual(len(nodes_details), 1)
            for node_details in nodes_details:
                # Only running nodes are checked for the images.
                if node_details["state"]["status"] != NodeStatus.RUNNING:
                    continue
                self.assertIn("alpine_latest", node_details["image_files"])
                self.assertIn("ubuntu_latest", node_details["image_files"])
        except AssertionError:
            # Desired state not reached yet: wait before polling again.
            time.sleep(10)
        else:
            is_valid = True
    self.assertTrue(is_valid)
def test20_train_env_provision(self):
    """Build the maro_runtime_cpu:test image locally and push it to the grass cluster.

    Every listed node should report "maro_runtime_cpu_test" among its image_files within 1000s.
    """
    # Build the docker image from the cpu runtime dockerfile of the MARO package.
    build_command = (
        f"docker build -f {self.maro_pkg_path}/docker_files/cpu.runtime.source.df -t maro_runtime_cpu:test "
        f"{self.maro_pkg_path}"
    )
    Subprocess.run(command=build_command)

    # Push the freshly built image to the cluster.
    push_command = f"maro grass image push {self.cluster_name} --debug --image-name maro_runtime_cpu:test"
    Subprocess.interactive_run(command=push_command)

    # Poll the image status; the test fails if the desired state is not reached within 1000s.
    is_loaded = False
    deadline = time.time() + 1000
    while not is_loaded and deadline >= time.time():
        try:
            for node_details in self._list_nodes_details():
                self.assertIn("maro_runtime_cpu_test", node_details["image_files"])
        except AssertionError:
            # At least one node has not loaded the image yet: wait before polling again.
            time.sleep(10)
        else:
            is_loaded = True
    self.assertTrue(is_loaded)
def test16_start(self) -> None:
    """Start one Standard_D2s_v3.

    Two nodes should be listed afterwards; both should be in running state and list
    "alpine_latest" and "ubuntu_latest" among their image_files.

    Returns:
        None.
    """
    # Issue the start request through the CLI.
    start_command = f"maro grass node start {self.cluster_name} --debug Standard_D2s_v3 1"
    Subprocess.interactive_run(command=start_command)
    self._gracefully_wait()

    # Poll for the desired state; the test fails if it is not reached within 120s.
    is_valid = False
    deadline = time.time() + 120
    while not is_valid and deadline >= time.time():
        try:
            nodes_details = self._list_nodes_details()
            self.assertEqual(len(nodes_details), 2)
            for node_details in nodes_details:
                self.assertEqual(node_details["state"]["status"], NodeStatus.RUNNING)
                self.assertIn("alpine_latest", node_details["image_files"])
                self.assertIn("ubuntu_latest", node_details["image_files"])
        except AssertionError:
            # Desired state not reached yet: wait before polling again.
            time.sleep(10)
        else:
            is_valid = True
    self.assertTrue(is_valid)
def setUpClass(cls, file_path: str = os.path.abspath(__file__)) -> None:
    """Prepare the shared fixtures: test folder, deployment file, test data and the k8s cluster.

    Args:
        file_path (str): defaults to the absolute path of this file.
            NOTE(review): the argument is currently not read; ``cls.file_path`` is always
            recomputed from ``__file__``.

    Raises:
        Exception: If "cloud/subscription" or "user/admin_public_key" is empty in the config.

    Returns:
        None.
    """
    # Get and set params
    GlobalParams.LOG_LEVEL = logging.DEBUG
    cls.test_id = uuid.uuid4().hex[:8]
    os.makedirs(os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}"), exist_ok=True)
    os.makedirs(os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}/tar"), exist_ok=True)
    cls.file_path = os.path.abspath(__file__)
    cls.dir_path = os.path.dirname(cls.file_path)
    cls.deployment_template_path = os.path.normpath(
        os.path.join(cls.dir_path, "../templates/test_k8s_azure_create.yml")
    )
    cls.deployment_path = os.path.expanduser(
        f"{GlobalPaths.MARO_TEST}/{cls.test_id}/test_k8s_azure_create.yml"
    )
    cls.config_path = os.path.normpath(os.path.join(cls.dir_path, "../config.yml"))

    # Load config and save deployment
    with open(cls.deployment_template_path) as fr:
        deployment_details = yaml.safe_load(fr)
    with open(cls.config_path) as fr:
        config_details = yaml.safe_load(fr)
        # Both values are required to fill in the deployment template.
        if not (config_details["cloud/subscription"] and config_details["user/admin_public_key"]):
            raise Exception("Invalid config")
        deployment_details["cloud"]["subscription"] = config_details["cloud/subscription"]
        deployment_details["user"]["admin_public_key"] = config_details["user/admin_public_key"]
    with open(cls.deployment_path, "w") as fw:
        yaml.safe_dump(deployment_details, fw)

    # Get params from deployments
    cls.cluster_name = deployment_details["name"]

    # Init test files
    cls.local_big_file_path = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}/big_file")
    cls.local_small_files_path = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}/small_files")
    # Sparse 1G file: nothing is written (count=0), only the file size is set via seek.
    command = f"dd if=/dev/zero of={cls.local_big_file_path} bs=1 count=0 seek=1G"
    Subprocess.run(command=command)
    # The MARO repository serves as the "many small files" fixture.
    # The remote must be a single scp-like word (user@host:path); an address containing
    # a space would be split into two arguments by the shell.
    command = f"git clone git@github.com:microsoft/maro.git {cls.local_small_files_path}"
    Subprocess.run(command=command)

    # Create cluster
    command = f"maro k8s create --debug {cls.deployment_path}"
    Subprocess.interactive_run(command=command)
    cls.cluster_details = DetailsReader.load_cluster_details(cluster_name=cls.cluster_name)
    cls.cluster_id = cls.cluster_details["id"]
    cls.executor = K8sAksExecutor(cluster_name=cls.cluster_name)
    # Fixed pause before looking up the redis pod name.
    time.sleep(15)
    cls.pod_name = cls._get_redis_pod_name()
def test_1_azcopy_small_files_to_remote(self) -> None:
    """Copy the local small-files folder to the cluster file share with azcopy.

    Returns:
        None.
    """
    sas = self.executor._check_and_get_account_sas()
    # Destination inside the cluster file share, authorized by the SAS query string.
    destination_url = (
        f"https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs"
        f"/{self.test_id}/test_1_azcopy_small_files_to_remote/?{sas}"
    )
    upload_command = f"azcopy copy '{self.local_small_files_path}' '{destination_url}' --recursive=True"
    Subprocess.interactive_run(command=upload_command)
def test10_create(self) -> None:
    """Create the k8s cluster and check that it reports one Standard_D2s_v3 node.

    Returns:
        None.
    """
    # Run the create command against the prepared deployment.
    create_command = f"maro k8s create --debug {self.create_deployment_path}"
    Subprocess.interactive_run(command=create_command)

    # The node details should contain the Standard_D2s_v3 spec with a count of 1.
    node_counts = self._get_node_details()
    self.assertIn("Standard_D2s_v3", node_counts)
    self.assertEqual(node_counts["Standard_D2s_v3"], 1)
def local_leave_cluster() -> None:
    """Local leave cluster.

    Runs ``~/.maro-local/scripts/activate_leave_cluster.py`` with python3 on this machine.

    Returns:
        None.
    """
    Subprocess.interactive_run(command="python3 ~/.maro-local/scripts/activate_leave_cluster.py")
def test_1_rsync_small_files_to_remote(self) -> None:
    """Copy the local small-files folder to the master with rsync over ssh.

    Returns:
        None.
    """
    master = f"{self.admin_username}@{self.master_public_ip_address}"
    remote_dir = f"~/test/{self.test_id}/test_1_rsync_small_files_to_remote"

    # Make sure the target folder exists on the master.
    mkdir_command = f"ssh -o StrictHostKeyChecking=no {master} 'mkdir -p {remote_dir}'"
    Subprocess.run(command=mkdir_command)

    # Push the folder.
    rsync_command = (
        f"rsync -e 'ssh -o StrictHostKeyChecking=no' -az -r "
        f"{self.local_small_files_path} {master}:{remote_dir}"
    )
    Subprocess.interactive_run(command=rsync_command)
def test20_train_env_provision(self):
    """Build the maro_runtime_cpu image locally and push it to the k8s cluster."""
    # Build the docker image from the cpu runtime dockerfile of the MARO package.
    build_command = (
        f"docker build -f {self.maro_pkg_path}/docker_files/cpu.runtime.source.df -t maro_runtime_cpu "
        f"{self.maro_pkg_path}"
    )
    Subprocess.run(command=build_command)

    # Push the freshly built image to the cluster.
    push_command = f"maro k8s image push {self.cluster_name} --debug --image-name maro_runtime_cpu"
    Subprocess.interactive_run(command=push_command)
def test12_image(self) -> None:
    """Push alpine:latest to the k8s cluster and check that "alpine" is listed among its images.

    Returns:
        None.
    """
    # Push the image.
    push_command = f"maro k8s image push {self.cluster_name} --debug --image-name alpine:latest"
    Subprocess.interactive_run(command=push_command)

    # The list command output is parsed as a Python literal before the membership check.
    list_output = Subprocess.run(command=f"maro k8s image list {self.cluster_name}")
    listed_images = ast.literal_eval(list_output)
    self.assertIn("alpine", listed_images)
def test11_node(self) -> None:
    """Scale Standard_D4s_v3 to 1 and check the node counts of both node specs.

    Returns:
        None.
    """
    # Run the scale command.
    scale_command = f"maro k8s node scale {self.cluster_name} --debug Standard_D4s_v3 1"
    Subprocess.interactive_run(command=scale_command)

    # Both specs should be present, each with a count of 1.
    node_counts = self._get_node_details()
    self.assertIn("Standard_D2s_v3", node_counts)
    self.assertIn("Standard_D4s_v3", node_counts)
    self.assertEqual(node_counts["Standard_D2s_v3"], 1)
    self.assertEqual(node_counts["Standard_D4s_v3"], 1)
def test_2_rsync_small_files_to_local(self) -> None:
    """Copy the previously uploaded small-files folder back from the master with rsync.

    The downloaded tree should contain small_files/README.md.

    Returns:
        None.
    """
    local_dir = f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_rsync_small_files_to_local"
    remote_dir = f"~/test/{self.test_id}/test_1_rsync_small_files_to_remote"

    # Make sure the local target folder exists.
    Subprocess.run(command=f"mkdir -p {local_dir}")

    # Pull the folder from the master.
    rsync_command = (
        f"rsync -e 'ssh -o StrictHostKeyChecking=no' -az -r "
        f"{self.admin_username}@{self.master_public_ip_address}:{remote_dir} "
        f"{local_dir}"
    )
    Subprocess.interactive_run(command=rsync_command)

    # A known file of the folder must have arrived.
    expected_file = os.path.expanduser(
        f"{local_dir}/test_1_rsync_small_files_to_remote/small_files/README.md"
    )
    self.assertTrue(os.path.exists(expected_file))
def test_2_tar_ssh_small_files_to_local(self) -> None:
    """Stream the previously uploaded small-files folder back from the master via tar over ssh.

    The downloaded tree should contain small_files/README.md.

    Returns:
        None.
    """
    local_dir = f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_tar_ssh_small_files_to_local"

    # Make sure the local target folder exists.
    Subprocess.run(command=f"mkdir -p {local_dir}")

    # tar is run from the parent folder so the archive holds only the last path component.
    dirname, basename = os.path.split(f"~/test/{self.test_id}/test_1_tar_ssh_small_files_to_remote")
    download_command = (
        f"ssh {self.admin_username}@{self.master_public_ip_address} 'tar cf - -C {dirname} {basename}' | "
        f"tar xf - -C {local_dir}"
    )
    Subprocess.interactive_run(command=download_command)

    # A known file of the folder must have arrived.
    expected_file = os.path.expanduser(
        f"{local_dir}/test_1_tar_ssh_small_files_to_remote/small_files/README.md"
    )
    self.assertTrue(os.path.exists(expected_file))
def tearDownClass(cls) -> None:
    """Delete the k8s cluster, print the collected timings and remove the tmp test folder.

    Returns:
        None.
    """
    # Delete cluster
    Subprocess.interactive_run(command=f"maro k8s delete --debug {cls.cluster_name}")

    # Print result
    timing_report = json.dumps(TEST_TO_TIME, indent=4, sort_keys=True)
    print(timing_report)

    # Delete tmp test folder
    test_folder = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}")
    shutil.rmtree(test_folder)
def test_1_tar_ssh_small_files_to_remote(self) -> None:
    """Stream the local small-files folder to the master via tar over ssh.

    Returns:
        None.
    """
    master = f"{self.admin_username}@{self.master_public_ip_address}"
    remote_dir = f"~/test/{self.test_id}/test_1_tar_ssh_small_files_to_remote"

    # Make sure the target folder exists on the master.
    mkdir_command = f"ssh -o StrictHostKeyChecking=no {master} 'mkdir -p {remote_dir}'"
    Subprocess.run(command=mkdir_command)

    # tar is run from the parent folder so the archive holds only the last path component.
    dirname, basename = os.path.split(self.local_small_files_path)
    upload_command = (
        f"tar cf - -C {dirname} {basename} | "
        f"ssh {master} 'tar xf - -C {remote_dir}'"
    )
    Subprocess.interactive_run(command=upload_command)
def test_1_azcopy_tar_small_files_to_remote(self) -> None:
    """Upload the small-files folder as a single tar archive with azcopy, then unpack it in the pod.

    Returns:
        None.
    """
    remote_dir = f"/mnt/maro/{self.test_id}/test_1_azcopy_tar_small_files_to_remote"

    # Create the remote folder inside the pod.
    mkdir_command = f"kubectl exec -i {self.pod_name} -- mkdir -p {remote_dir}"
    Subprocess.interactive_run(command=mkdir_command)

    # Pack the folder locally under a random archive name.
    dirname, basename = os.path.split(self.local_small_files_path)
    tar_file_name = uuid.uuid4().hex[:8]
    pack_command = f"tar cf {GlobalPaths.MARO_TEST}/{self.test_id}/tar/{tar_file_name} -C {dirname} {basename}"
    Subprocess.interactive_run(command=pack_command)

    # Upload the archive to the cluster file share with azcopy.
    sas = self.executor._check_and_get_account_sas()
    local_path = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{self.test_id}/tar/{tar_file_name}")
    destination_url = (
        f"https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs"
        f"/tar/{tar_file_name}?{sas}"
    )
    upload_command = f"azcopy copy '{local_path}' '{destination_url}' --recursive=True"
    Subprocess.interactive_run(command=upload_command)

    # Unpack the archive inside the pod.
    unpack_command = (
        f"kubectl exec -i {self.pod_name} -- "
        f"tar xf /mnt/maro/tar/{tar_file_name} -C {remote_dir}"
    )
    Subprocess.interactive_run(command=unpack_command)
def test_1_kubectl_exec_small_files_to_remote(self) -> None:
    """Stream the local small-files folder into the pod via tar over kubectl exec.

    Returns:
        None.
    """
    kubectl_prefix = f"kubectl exec -i {self.pod_name} -- "
    remote_dir = f"/mnt/maro/{self.test_id}/test_1_kubectl_exec_small_files_to_remote"

    # Create the remote folder inside the pod.
    Subprocess.interactive_run(command=f"{kubectl_prefix}mkdir -p {remote_dir}")

    # tar is run from the parent folder so the archive holds only the last path component.
    dirname, basename = os.path.split(self.local_small_files_path)
    upload_command = (
        f"tar cf - -C {dirname} {basename} | "
        f"{kubectl_prefix}tar xf - -C {remote_dir}"
    )
    Subprocess.interactive_run(command=upload_command)
def remote_leave_cluster(node_username: str, node_hostname: str, node_ssh_port: int) -> None:
    """Remote leave cluster.

    Runs ``~/.maro-local/scripts/activate_leave_cluster.py`` with python3 on the node over ssh.

    Args:
        node_username (str): username of the MARO Node VM.
        node_hostname (str): hostname of the MARO Node VM.
        node_ssh_port (int): ssh port of the MARO Node VM.

    Returns:
        None.
    """
    ssh_target = f"{node_username}@{node_hostname}"
    remote_command = "python3 ~/.maro-local/scripts/activate_leave_cluster.py"
    Subprocess.interactive_run(
        command=f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {ssh_target} '{remote_command}'"
    )
def remote_init_build_node_image_vm(node_username: str, node_hostname: str, node_ssh_port: int) -> None:
    """Remote init Build Node Image VM.

    Runs ``~/init_build_node_image_vm.py`` with python3 on the vm over ssh.

    Args:
        node_username (str): username of the vm.
        node_hostname (str): hostname of the vm.
        node_ssh_port (int): ssh port of the vm.

    Returns:
        None.
    """
    ssh_target = f"{node_username}@{node_hostname}"
    remote_command = "python3 ~/init_build_node_image_vm.py"
    Subprocess.interactive_run(
        command=f"ssh -o StrictHostKeyChecking=no -p {node_ssh_port} {ssh_target} '{remote_command}'"
    )
def remote_delete_master(master_username: str, master_hostname: str, master_ssh_port: int) -> None:
    """Remote delete MARO Master.

    Runs ``scripts/delete_master.py`` under ``GlobalPaths.MARO_LOCAL`` with python3 on the master over ssh.

    Args:
        master_username (str): username of the MARO Master VM.
        master_hostname (str): hostname of the MARO Master VM.
        master_ssh_port (int): ssh port of the MARO Master VM.

    Returns:
        None.
    """
    ssh_target = f"{master_username}@{master_hostname}"
    remote_command = f"python3 {GlobalPaths.MARO_LOCAL}/scripts/delete_master.py"
    Subprocess.interactive_run(
        command=f"ssh -o StrictHostKeyChecking=no -p {master_ssh_port} {ssh_target} '{remote_command}'"
    )
def test_2_kubectl_exec_big_file_to_local(self) -> None:
    """Stream the previously uploaded big-file folder back from the pod via tar over kubectl exec.

    The downloaded tree should contain big_file.

    Returns:
        None.
    """
    local_dir = f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_kubectl_exec_big_file_to_local"

    # Make sure the local target folder exists.
    Subprocess.run(command=f"mkdir -p {local_dir}")

    # tar is run from the parent folder so the archive holds only the last path component.
    dirname, basename = os.path.split(f"/mnt/maro/{self.test_id}/test_1_kubectl_exec_big_file_to_remote")
    download_command = (
        f"kubectl exec -i {self.pod_name} -- tar cf - -C {dirname} {basename} | "
        f"tar xf - -C {local_dir}"
    )
    Subprocess.interactive_run(command=download_command)

    # The big file must have arrived.
    expected_file = os.path.expanduser(f"{local_dir}/test_1_kubectl_exec_big_file_to_remote/big_file")
    self.assertTrue(os.path.exists(expected_file))
def local_join_cluster(master_hostname: str, master_private_ip_address: int, deployment_path: str) -> None:
    """Local join cluster.

    Download the join cluster script from ``/v1/joinClusterScript`` on the master and pipe it
    into ``python3`` locally, passing the deployment path as its argument.

    Args:
        master_hostname (str): hostname of the MARO Master VM, used as the host of the URL.
        master_private_ip_address (int): value placed in the port position of the URL.
            NOTE(review): the name suggests an ip address, but the value is interpolated
            after the ``:`` of ``http://{master_hostname}:...`` — confirm with the callers.
        deployment_path (str): path of the join_cluster_deployment.

    Returns:
        None.
    """
    # The pipeline is passed unquoted, like the other local commands: wrapping it in single
    # quotes would make the shell treat the whole text as the name of one command.
    # curl issues a GET by default; a bare "GET" word would be parsed as an extra URL to fetch.
    command = (
        f"curl -s http://{master_hostname}:{master_private_ip_address}/v1/joinClusterScript | "
        f"python3 - {deployment_path}"
    )
    Subprocess.interactive_run(command=command)
def remote_init_master(master_username: str, master_hostname: str, master_ssh_port: int, cluster_name: str) -> None:
    """Remote init MARO Master.

    Runs the ``scripts.master.init_master`` module with python3 from ``lib/grass`` under
    ``GlobalPaths.MARO_SHARED`` on the master over ssh.

    Args:
        master_username (str): username of the MARO Master VM.
        master_hostname (str): hostname of the MARO Master VM.
        master_ssh_port (int): ssh port of the MARO Master VM.
        cluster_name (str): name of the MARO Cluster.

    Returns:
        None.
    """
    ssh_target = f"{master_username}@{master_hostname}"
    # The module is run from inside lib/grass, hence the cd before python3 -m.
    remote_command = (
        f"cd {GlobalPaths.MARO_SHARED}/lib/grass; "
        f"python3 -m scripts.master.init_master {cluster_name}"
    )
    Subprocess.interactive_run(
        command=f"ssh -o StrictHostKeyChecking=no -p {master_ssh_port} {ssh_target} '{remote_command}'"
    )
def test_2_azcopy_small_files_to_local(self) -> None:
    """Copy the previously uploaded small-files folder back from the cluster file share with azcopy.

    The downloaded tree should contain the small_files folder.

    Returns:
        None.
    """
    sas = self.executor._check_and_get_account_sas()
    local_dir = f"{GlobalPaths.MARO_TEST}/{self.test_id}/test_2_azcopy_small_files_to_local"

    # Make sure the local target folder exists.
    Subprocess.run(command=f"mkdir -p {local_dir}")

    # Source inside the cluster file share, authorized by the SAS query string.
    local_path = os.path.expanduser(local_dir)
    source_url = (
        f"https://{self.cluster_id}st.file.core.windows.net/{self.cluster_id}-fs"
        f"/{self.test_id}/test_1_azcopy_small_files_to_remote?{sas}"
    )
    download_command = f"azcopy copy '{source_url}' '{local_path}' --recursive=True"
    Subprocess.interactive_run(command=download_command)

    # The folder must have arrived.
    expected_dir = os.path.expanduser(f"{local_dir}/test_1_azcopy_small_files_to_remote/small_files")
    self.assertTrue(os.path.exists(expected_dir))
def test30_delete(self) -> None:
    """Delete the grass cluster through the CLI.

    Returns:
        None.
    """
    # Run the delete command.
    Subprocess.interactive_run(command=f"maro grass delete --debug {self.cluster_name}")