def _start_node(self, node_name: str):
    """Start a previously stopped node VM and bring its services back online."""
    logger.info(f"Starting node {node_name}")

    # Resolve cluster identifiers and the node's public IP
    cluster_details = self.cluster_details
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']
    executor = self.grass_executor
    node_details = executor.remote_get_node_details(node_name=node_name)
    node_public_ip_address = node_details['public_ip_address']

    # Power on the backing VM
    AzureExecutor.start_vm(resource_group=resource_group, vm_name=f"{cluster_id}-{node_name}-vm")

    # Record the new status, then wait until the node accepts connections
    executor.remote_update_node_status(node_name=node_name, action='start')
    executor.retry_until_connected(node_ip_address=node_public_ip_address)

    # Reload the docker images, then restart the node agent service
    executor.remote_load_images(
        node_name=node_name,
        parallels=GlobalParams.PARALLELS,
        node_ip_address=node_public_ip_address
    )
    executor.remote_load_node_agent_service(
        node_name=node_name,
        node_ip_address=node_public_ip_address
    )

    logger.info_green(f"Node {node_name} is started")
def delete(self):
    """Tear down every Azure resource belonging to this cluster and remove its local folder."""
    logger.info(f"Deleting cluster {self.cluster_name}")

    cluster_details = self.cluster_details
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']

    # Collect the ids of all resources whose name carries this cluster's id prefix
    resource_list = AzureExecutor.list_resources(resource_group=resource_group)
    deletable_ids = [
        resource['id'] for resource in resource_list
        if resource['name'].startswith(cluster_id)
    ]

    # Remove the matching resources in one batch call
    if deletable_ids:
        AzureExecutor.delete_resources(resources=deletable_ids)

    # Drop the local cluster folder
    rmtree(os.path.expanduser(f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}"))

    logger.info_green(f"Cluster {self.cluster_name} is deleted")
def _load_k8s_context(self):
    """Point the local kubectl context at this cluster's AKS instance."""
    cluster_details = self.cluster_details
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']

    # Fetch AKS credentials into the local kube config
    AzureExecutor.load_aks_context(
        resource_group=resource_group,
        aks_name=f"{cluster_id}-aks"
    )
def tearDownClass(cls) -> None:
    """Report timing results, then release the Azure resource group and the tmp test folder."""
    # Dump the collected per-test timings
    print(json.dumps(cls.test_func_to_time, indent=4, sort_keys=True))

    # Azure cleanup: drop the whole resource group in one call
    AzureExecutor.delete_resource_group(resource_group=cls.resource_group)

    # Local cleanup: remove the temporary folder for this test run
    shutil.rmtree(os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}"))
def _attach_acr(self):
    """Grant the cluster's AKS instance pull access to its container registry."""
    cluster_id = self.cluster_details['id']
    resource_group = self.cluster_details['cloud']['resource_group']

    # Both the AKS and the ACR names are derived from the cluster id
    AzureExecutor.attach_acr(
        resource_group=resource_group,
        aks_name=f"{cluster_id}-aks",
        acr_name=f"{cluster_id}acr"
    )
def _create_master(self):
    """Deploy the master VM via ARM and record its addresses in the cluster details."""
    logger.info("Creating master VM")

    # Pull everything needed from the cluster details
    cluster_details = self.cluster_details
    master_details = cluster_details['master']
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']
    admin_username = cluster_details['user']['admin_username']
    node_size = cluster_details['master']['node_size']

    # Render the ARM parameters file for the master node
    self._create_deployment_parameters(
        node_name='master',
        cluster_details=cluster_details,
        node_size=node_size,
        export_dir=os.path.expanduser(f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters")
    )

    # Kick off the ARM deployment
    template_file_location = f"{GlobalPaths.MARO_GRASS_LIB}/azure/grass-create-default-node-template.json"
    parameters_file_location = f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters/master.json"
    AzureExecutor.start_deployment(
        resource_group=resource_group,
        deployment_name='master',
        template_file=template_file_location,
        parameters_file=parameters_file_location
    )

    # Query the deployed VM for its addresses and record them on the master entry
    ip_addresses = AzureExecutor.list_ip_addresses(
        resource_group=resource_group,
        vm_name=f"{cluster_id}-master-vm"
    )
    network_details = ip_addresses[0]["virtualMachine"]["network"]
    public_ip_address = network_details['publicIpAddresses'][0]['ipAddress']
    master_details['public_ip_address'] = public_ip_address
    master_details['private_ip_address'] = network_details['privateIpAddresses'][0]
    master_details['hostname'] = f"{cluster_id}-master-vm"
    master_details['resource_name'] = f"{cluster_id}-master-vm"
    logger.info_green(f"You can login to your master node with: ssh {admin_username}@{public_ip_address}")

    # Persist the updated details locally (no sync back to the master yet)
    save_cluster_details(
        cluster_name=self.cluster_name,
        cluster_details=cluster_details,
        sync=False
    )

    logger.info_green("Master VM is created")
def push_image(self, image_name: str):
    """Tag a local docker image for this cluster's ACR and push it there."""
    cluster_id = self.cluster_details['id']
    remote_image_name = f"{cluster_id}acr.azurecr.io/{image_name}"

    # Authenticate docker against the cluster's registry
    AzureExecutor.login_acr(acr_name=f"{cluster_id}acr")

    # Tag locally, then push the remote tag
    _ = SubProcess.run(f"docker tag {image_name} {remote_image_name}")
    _ = SubProcess.run(f"docker push {remote_image_name}")
def list_image(self):
    """Log the repositories currently stored in this cluster's ACR."""
    cluster_id = self.cluster_details['id']

    # Query the registry; its name is derived from the cluster id
    acr_repositories = AzureExecutor.list_acr_repositories(acr_name=f"{cluster_id}acr")
    logger.info(acr_repositories)
def _stop_node(self, node_name: str):
    """Stop a node VM and mark the node as stopped in the cluster state."""
    logger.info(f"Stopping node {node_name}")

    cluster_details = self.cluster_details
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']

    # Stop the backing VM first, then record the status change
    AzureExecutor.stop_vm(
        resource_group=resource_group,
        vm_name=f"{cluster_id}-{node_name}-vm"
    )
    self.grass_executor.remote_update_node_status(node_name=node_name, action='stop')

    logger.info_green(f"Node {node_name} is stopped")
def _scale_node_pool(self, replicas: int, node_size: str, node_size_to_info: dict):
    """Resize an existing AKS node pool to the requested replica count."""
    logger.info(f"Scaling {node_size} NodePool")

    cluster_details = self.cluster_details
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']

    # The pool name comes from the cached per-size info mapping
    nodepool_name = node_size_to_info[node_size]['name']
    AzureExecutor.scale_nodepool(
        resource_group=resource_group,
        aks_name=f"{cluster_id}-aks",
        nodepool_name=nodepool_name,
        node_count=replicas
    )

    logger.info_green(f"{node_size} NodePool is scaled")
def _build_node_pool(self, replicas: int, node_size: str):
    """Create a new AKS node pool of the given VM size with the requested replica count."""
    logger.info(f"Building {node_size} NodePool")

    cluster_details = self.cluster_details
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']

    # Derive a valid nodepool name from the VM size, then create the pool
    nodepool_name = K8sAzureExecutor._generate_nodepool_name(key=node_size)
    AzureExecutor.add_nodepool(
        resource_group=resource_group,
        aks_name=f"{cluster_id}-aks",
        nodepool_name=nodepool_name,
        node_count=replicas,
        node_size=node_size
    )

    logger.info_green(f"{node_size} NodePool is built")
def _build_image_address(self, image_name: str) -> str:
    """Return the ACR-qualified address for images hosted in the cluster registry, otherwise the name unchanged."""
    cluster_id = self.cluster_details['id']

    # Only images already pushed to the cluster ACR get the registry prefix
    acr_repositories = AzureExecutor.list_acr_repositories(acr_name=f"{cluster_id}acr")
    if image_name not in acr_repositories:
        return image_name
    return f"{cluster_id}acr.azurecr.io/{image_name}"
def _create_resource_group(self):
    """Verify the Azure CLI, select the subscription, and ensure the resource group exists."""
    cluster_details = self.cluster_details
    subscription = cluster_details['cloud']['subscription']
    resource_group = cluster_details['cloud']['resource_group']
    location = cluster_details['cloud']['location']

    # A working Azure CLI is a precondition for everything below
    version_details = AzureExecutor.get_version()
    logger.info_green(f"Your Azure CLI version: {version_details['azure-cli']}")

    # Make sure subsequent calls target the configured subscription
    AzureExecutor.set_subscription(subscription=subscription)
    logger.info_green(f"Set subscription to: {subscription}")

    # Create the resource group only when it is not already there
    if AzureExecutor.get_resource_group(resource_group=resource_group) is not None:
        logger.warning_yellow(f"Azure resource group {resource_group} already exists")
    else:
        AzureExecutor.create_resource_group(resource_group=resource_group, location=location)
        logger.info_green(f"Resource group: {resource_group} is created")
def _delete_node(self, node_name: str):
    """Delete a node's Azure resources, its ARM deployment, its local parameters file,
    and its status entry on the master.

    Args:
        node_name: name of the node to delete.
    """
    logger.info(f"Deleting node {node_name}")

    # Load details
    cluster_details = self.cluster_details
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']

    # Keep only the resources whose name carries this node's prefix,
    # mirroring the comprehension style used by the cluster-level delete()
    resource_list = AzureExecutor.list_resources(resource_group=resource_group)
    deletable_ids = [
        resource_info['id'] for resource_info in resource_list
        if resource_info['name'].startswith(f"{cluster_id}-{node_name}")
    ]

    # Delete resources (idiomatic truthiness check instead of len() > 0)
    if deletable_ids:
        AzureExecutor.delete_resources(resources=deletable_ids)

    # Delete azure deployment
    AzureExecutor.delete_deployment(
        resource_group=resource_group,
        deployment_name=node_name
    )

    # Delete the local parameters file of this node
    parameters_file_location = f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters/{node_name}.json"
    command = f"rm {parameters_file_location}"
    _ = SubProcess.run(command)

    # Update node status on the master
    self.grass_executor.remote_update_node_status(
        node_name=node_name,
        action='delete'
    )

    logger.info_green(f"Node {node_name} is deleted")
def _create_k8s_cluster(self):
    """Deploy the AKS cluster via an ARM template, then attach the cluster's ACR.

    Fix: the original logged with an f-string that had no placeholders
    (f"Creating k8s cluster", ruff F541); a plain string is used instead.
    """
    logger.info("Creating k8s cluster")

    # Load details
    cluster_details = self.cluster_details
    resource_group = cluster_details['cloud']['resource_group']

    # Create ARM parameters
    self._create_deployment_parameters(
        export_dir=os.path.expanduser(f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters")
    )

    # Start deployment
    template_file_location = f"{GlobalPaths.MARO_K8S_LIB}/azure/k8s-create-template.json"
    parameters_file_location = f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters/aks_cluster.json"
    AzureExecutor.start_deployment(
        resource_group=resource_group,
        deployment_name='aks_cluster',
        template_file=template_file_location,
        parameters_file=parameters_file_location
    )

    # Attach ACR so AKS can pull the cluster's images
    self._attach_acr()
def _get_node_size_to_spec(self) -> dict:
    """Return a mapping from VM size name to its full spec for the cluster's location."""
    location = self.cluster_details['cloud']['location']

    # One entry per VM size available in this Azure location
    specs = AzureExecutor.list_vm_sizes(location=location)
    return {spec['name']: spec for spec in specs}
def _get_node_size_to_info(self):
    """Return a mapping from VM size to the info dict of the nodepool using that size.

    Fix: the original named the result ``node_size_to_count`` and commented it
    as a count mapping, although each value is the full nodepool info dict;
    the local names and comments now match the method name.
    """
    # Load details
    cluster_details = self.cluster_details
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']

    # List the nodepools of the cluster's AKS instance
    nodepools = AzureExecutor.list_nodepool(
        resource_group=resource_group,
        aks_name=f"{cluster_id}-aks"
    )

    # Build node_size_to_info, keyed by the pool's VM size
    node_size_to_info = {}
    for nodepool in nodepools:
        node_size_to_info[nodepool['vmSize']] = nodepool
    return node_size_to_info
def list_node(self):
    """Log, per VM size, how many agents the cluster's AKS pools currently run."""
    cluster_details = self.cluster_details
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']

    # Read the agent pool profiles off the AKS resource
    aks_details = AzureExecutor.get_aks(
        resource_group=resource_group,
        aks_name=f"{cluster_id}-aks"
    )

    # vmSize -> agent count
    node_details = {
        agent_pool_details['vmSize']: agent_pool_details['count']
        for agent_pool_details in aks_details['agentPoolProfiles']
    }
    logger.info(json.dumps(node_details, indent=4, sort_keys=True))
def _create_k8s_secret(self):
    """Create the k8s secret that holds the cluster storage account credentials.

    Fix: the original logged the full kubectl command at debug level, which
    leaks the storage account key into the logs; the key is now redacted
    from the logged text before it is emitted.
    """
    # Load details
    cluster_details = self.cluster_details
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']

    # Get storage account key
    storage_account_keys = AzureExecutor.get_storage_account_keys(
        resource_group=resource_group,
        storage_account_name=f"{cluster_id}st"
    )
    storage_key = storage_account_keys[0]['value']

    # Create k8s secret
    command = f'kubectl create secret generic {cluster_id}-k8s-secret ' \
              f'--from-literal=azurestorageaccountname={cluster_id}st ' \
              f'--from-literal=azurestorageaccountkey={storage_key}'
    _ = SubProcess.run(command)
    # SECURITY: never log the raw command - it embeds the storage account key
    logger.debug(command.replace(storage_key, '***'))
def _check_and_get_account_sas(self):
    """Return a cached account SAS for the cluster storage account, generating one on first use.

    Ref: https://msdn.microsoft.com/library/azure/mt584140.aspx
    """
    # Load details
    cluster_details = self.cluster_details
    cloud_details = cluster_details['cloud']
    cluster_id = cluster_details['id']

    # TODO: also regenerate the SAS when the cached one has expired
    if 'account_sas' not in cloud_details:
        cloud_details['account_sas'] = AzureExecutor.get_storage_account_sas(
            account_name=f'{cluster_id}st'
        )
        save_cluster_details(
            cluster_name=self.cluster_name,
            cluster_details=cluster_details
        )

    return cloud_details['account_sas']
def setUpClass(cls) -> None:
    """Provision the Azure fixtures (resource group, VM, storage) shared by all tests.

    Fixes: the config validation used an ``if cond: pass / else: raise``
    anti-pattern (replaced with a direct guard), and a duplicated
    "# Load config" comment is removed.
    """
    # Get and set params
    GlobalParams.LOG_LEVEL = logging.DEBUG
    cls.test_id = uuid.uuid4().hex[:8]
    os.makedirs(
        os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}"),
        exist_ok=True)
    cls.test_file_path = os.path.abspath(__file__)
    cls.test_dir_path = os.path.dirname(cls.test_file_path)
    cls.config_path = os.path.normpath(
        os.path.join(cls.test_dir_path, "./config.yml"))

    # Load and validate config
    with open(cls.config_path) as fr:
        config_details = yaml.safe_load(fr)
    if not (config_details["cloud/subscription"]
            and config_details["user/admin_public_key"]):
        raise Exception("Invalid config")

    # Create resource group
    AzureExecutor.create_resource_group(cls.resource_group, cls.location)

    # Create ARM params
    template_file_location = f"{cls.test_dir_path}/test_checkpoint_template.json"
    base_parameters_file_location = f"{cls.test_dir_path}/test_checkpoint_parameters.json"
    parameters_file_location = os.path.expanduser(
        f"{GlobalPaths.MARO_TEST}/{cls.test_id}/test_checkpoint_parameters.json"
    )
    with open(base_parameters_file_location, "r") as f:
        base_parameters = json.load(f)
    with open(parameters_file_location, "w") as fw:
        parameters = base_parameters["parameters"]
        parameters["location"]["value"] = cls.location
        parameters["networkInterfaceName"]["value"] = f"{cls.test_id}-nic"
        parameters["networkSecurityGroupName"]["value"] = f"{cls.test_id}-nsg"
        parameters["virtualNetworkName"]["value"] = f"{cls.test_id}-vnet"
        parameters["publicIpAddressName"]["value"] = f"{cls.test_id}-pip"
        parameters["virtualMachineName"]["value"] = f"{cls.test_id}-vm"
        parameters["virtualMachineSize"]["value"] = "Standard_B2s"
        parameters["adminUsername"]["value"] = cls.admin_username
        parameters["adminPublicKey"]["value"] = config_details["user/admin_public_key"]
        parameters["storageAccountName"]["value"] = f"{cls.test_id}st"
        json.dump(base_parameters, fw, indent=4)

    # Start ARM deployment
    AzureExecutor.start_deployment(
        resource_group=cls.resource_group,
        deployment_name=cls.test_id,
        template_file=template_file_location,
        parameters_file=parameters_file_location)
    cls._gracefully_wait(15)

    # Get params after ARM deployment
    cls.conn_str = AzureExecutor.get_connection_string(
        storage_account_name=f"{cls.test_id}st")
    ip_addresses = AzureExecutor.list_ip_addresses(
        resource_group=cls.resource_group, vm_name=f"{cls.test_id}-vm")
    cls.ip_address = ip_addresses[0]["virtualMachine"]["network"][
        "publicIpAddresses"][0]["ipAddress"]
def _create_vm(self, node_name: str, node_size: str, node_size_to_spec: dict):
    """Deploy a worker VM via ARM and register its details with the master."""
    logger.info(message=f"Creating VM {node_name}")

    # Gather cluster identifiers
    cluster_details = self.cluster_details
    location = cluster_details['cloud']['location']
    cluster_id = cluster_details['id']
    resource_group = cluster_details['cloud']['resource_group']

    # Render the ARM parameters file for this node
    GrassAzureExecutor._create_deployment_parameters(
        node_name=node_name,
        cluster_details=cluster_details,
        node_size=node_size,
        export_dir=os.path.expanduser(f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters")
    )

    # Read the GPU count off the VM size's SKU (stays 0 when the SKU is unknown)
    gpu_nums = 0
    node_size_sku = AzureExecutor.get_sku(vm_size=node_size, location=location)
    if node_size_sku is not None:
        for capability in node_size_sku["capabilities"]:
            if capability["name"] == "GPUs":
                gpu_nums = int(capability["value"])
                break

    # GPU sizes deploy from the GPU template, everything else from the default one
    if gpu_nums > 0:
        template_file_location = f"{GlobalPaths.MARO_GRASS_LIB}/azure/grass-create-gpu-node-template.json"
    else:
        template_file_location = f"{GlobalPaths.MARO_GRASS_LIB}/azure/grass-create-default-node-template.json"
    parameters_file_location = f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters/{node_name}.json"
    AzureExecutor.start_deployment(
        resource_group=resource_group,
        deployment_name=node_name,
        template_file=template_file_location,
        parameters_file=parameters_file_location
    )

    # Query the deployed VM's addresses
    ip_addresses = AzureExecutor.list_ip_addresses(
        resource_group=resource_group,
        vm_name=f"{cluster_id}-{node_name}-vm"
    )
    network_details = ip_addresses[0]["virtualMachine"]["network"]

    # Register the node's details with the master
    node_details = {
        'public_ip_address': network_details['publicIpAddresses'][0]['ipAddress'],
        'private_ip_address': network_details['privateIpAddresses'][0],
        'node_size': node_size,
        'resource_name': f"{cluster_id}-{node_name}-vm",
        'hostname': f"{cluster_id}-{node_name}-vm",
        'resources': {
            'cpu': node_size_to_spec[node_size]['numberOfCores'],
            'memory': node_size_to_spec[node_size]['memoryInMb'],
            'gpu': gpu_nums
        }
    }
    self.grass_executor.remote_set_node_details(
        node_name=node_name,
        node_details=node_details,
    )

    logger.info_green(f"VM {node_name} is created")
def tearDownClass(cls) -> None:
    """Remove every Azure resource created for the test run."""
    # Dropping the whole resource group cleans up all test resources at once
    AzureExecutor.delete_resource_group(cls.resource_group)