def _create_master_vm(cluster_details: dict) -> None:
    """Deploy the MARO Master VM and record how to reach it.

    Exports the ARM parameters for the master, starts the "master" ARM
    deployment, queries the IP addresses of the resulting VM and then stores
    hostname, username, IP addresses, resource name and the ssh / api_server
    ports in ``cluster_details["master"]`` (the dict is modified in place).

    Args:
        cluster_details (dict): details of the MARO Cluster.

    Returns:
        None.
    """
    logger.info("Creating Master VM")

    # The VM name doubles as hostname and resource name of the master.
    master_vm_name = f"{cluster_details['id']}-master-vm"

    # Export the ARM parameters, then launch the deployment.
    arm_template_path = f"{GrassPaths.ABS_MARO_GRASS_LIB}/modes/azure/create_master/template.json"
    arm_parameters_path = (
        f"{GlobalPaths.ABS_MARO_CLUSTERS}/{cluster_details['name']}"
        f"/master/arm_create_master_parameters.json"
    )
    ArmTemplateParameterBuilder.create_master(
        cluster_details=cluster_details,
        node_size=cluster_details["master"]["node_size"],
        export_path=arm_parameters_path,
    )
    resource_group = cluster_details["cloud"]["resource_group"]
    AzureController.start_deployment(
        resource_group=resource_group,
        deployment_name="master",
        template_file_path=arm_template_path,
        parameters_file_path=arm_parameters_path,
    )

    # Look up the addresses assigned to the freshly deployed VM.
    ip_addresses = AzureController.list_ip_addresses(
        resource_group=cluster_details["cloud"]["resource_group"],
        vm_name=master_vm_name,
    )
    vm_network = ip_addresses[0]["virtualMachine"]["network"]
    master_public_ip = vm_network["publicIpAddresses"][0]["ipAddress"]
    master_private_ip = vm_network["privateIpAddresses"][0]

    # Fill the gathered values into the master details.
    login_name = cluster_details["cloud"]["default_username"]
    master_details = cluster_details["master"]
    master_details.update(
        hostname=master_vm_name,
        username=login_name,
        public_ip_address=master_public_ip,
        private_ip_address=master_private_ip,
        resource_name=master_vm_name,
    )
    master_details["ssh"] = {"port": cluster_details["connection"]["ssh"]["port"]}
    master_details["api_server"] = {"port": cluster_details["connection"]["api_server"]["port"]}

    logger.info_green(f"You can login to your master node with: {login_name}@{master_public_ip}")
    logger.info_green("Master VM is created")
def _prepare_join_cluster_deployment(cls, join_cluster_deployment: dict):
    """Complete the node section of a join-cluster deployment and save it.

    Queries the IP addresses of the VM named "node-vm" in ``cls.resource_group``,
    writes hostname, public and private IP address into
    ``join_cluster_deployment["node"]`` (in place) and dumps the whole
    deployment as YAML to ``cls.join_cluster_deployment_path``.

    Args:
        join_cluster_deployment (dict): deployment whose "node" entry is filled in.
    """
    # Query the addresses of the node VM.
    ip_addresses = AzureController.list_ip_addresses(
        resource_group=cls.resource_group,
        vm_name="node-vm",
    )

    # Record hostname and addresses on the node entry.
    node_details = join_cluster_deployment["node"]
    node_details["hostname"] = "node-vm"
    vm_network = ip_addresses[0]["virtualMachine"]["network"]
    node_details["public_ip_address"] = vm_network["publicIpAddresses"][0]["ipAddress"]
    node_details["private_ip_address"] = vm_network["privateIpAddresses"][0]

    # Persist the completed deployment.
    with open(file=cls.join_cluster_deployment_path, mode="w") as deployment_file:
        yaml.safe_dump(data=join_cluster_deployment, stream=deployment_file)
def setUpClass(cls) -> None:
    """Provision the Azure resources shared by the tests of this class.

    Steps, in order: create a per-run working directory under
    ``GlobalPaths.MARO_TEST``, load and validate ``config.yml`` located next to
    this file, create the resource group, render the ARM parameters file from
    ``test_checkpoint_parameters.json``, start the ARM deployment, and finally
    store the storage connection string (``cls.conn_str``) and the public IP
    address of the VM (``cls.ip_address``) on the class.

    Raises:
        Exception: "Invalid config" when ``cloud/subscription`` or
            ``user/admin_public_key`` is missing from, or empty in, the config.
    """
    # Get and set params
    GlobalParams.LOG_LEVEL = logging.DEBUG
    cls.test_id = uuid.uuid4().hex[:8]
    test_workspace = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}")
    os.makedirs(test_workspace, exist_ok=True)
    cls.test_file_path = os.path.abspath(__file__)
    cls.test_dir_path = os.path.dirname(cls.test_file_path)
    cls.config_path = os.path.normpath(os.path.join(cls.test_dir_path, "./config.yml"))

    # Load config. An empty file makes safe_load return None, so fall back to {}.
    with open(cls.config_path) as fr:
        config_details = yaml.safe_load(fr) or {}
    # .get() so that an absent key is reported as "Invalid config", exactly
    # like an empty value, instead of surfacing as a bare KeyError.
    admin_public_key = config_details.get("user/admin_public_key")
    if not (config_details.get("cloud/subscription") and admin_public_key):
        raise Exception("Invalid config")

    # Create resource group
    AzureController.create_resource_group(cls.resource_group, cls.location)

    # Create ARM params
    template_file_location = f"{cls.test_dir_path}/test_checkpoint_template.json"
    base_parameters_file_location = f"{cls.test_dir_path}/test_checkpoint_parameters.json"
    parameters_file_location = f"{test_workspace}/test_checkpoint_parameters.json"
    with open(base_parameters_file_location, "r") as f:
        base_parameters = json.load(f)

    # Every resource name is derived from the test id to keep runs isolated.
    parameters = base_parameters["parameters"]
    parameters["location"]["value"] = cls.location
    parameters["networkInterfaceName"]["value"] = f"{cls.test_id}-nic"
    parameters["networkSecurityGroupName"]["value"] = f"{cls.test_id}-nsg"
    parameters["virtualNetworkName"]["value"] = f"{cls.test_id}-vnet"
    parameters["publicIpAddressName"]["value"] = f"{cls.test_id}-pip"
    parameters["virtualMachineName"]["value"] = f"{cls.test_id}-vm"
    parameters["virtualMachineSize"]["value"] = "Standard_B2s"
    parameters["adminUsername"]["value"] = cls.admin_username
    parameters["adminPublicKey"]["value"] = admin_public_key
    parameters["storageAccountName"]["value"] = f"{cls.test_id}st"
    with open(parameters_file_location, "w") as fw:
        json.dump(base_parameters, fw, indent=4)

    # Start ARM deployment.
    # NOTE(review): keyword names follow the other AzureController.start_deployment
    # call sites (template_file_path / parameters_file_path) - confirm against
    # the AzureController definition.
    AzureController.start_deployment(
        resource_group=cls.resource_group,
        deployment_name=cls.test_id,
        template_file_path=template_file_location,
        parameters_file_path=parameters_file_location,
    )
    cls._gracefully_wait(15)

    # Get params after ARM deployment
    cls.conn_str = AzureController.get_connection_string(storage_account_name=f"{cls.test_id}st")
    ip_addresses = AzureController.list_ip_addresses(
        resource_group=cls.resource_group,
        vm_name=f"{cls.test_id}-vm",
    )
    cls.ip_address = ip_addresses[0]["virtualMachine"]["network"]["publicIpAddresses"][0]["ipAddress"]
def _create_vm(self, node_name: str, node_size: str) -> dict:
    """Deploy the VM of a MARO Node and describe how it joins the cluster.

    Creates the node's folder under the cluster directory, exports the ARM
    parameters, starts an ARM deployment named after the node and queries the
    IP addresses of the new VM. The resulting join_cluster_deployment is
    written to ``join_cluster_deployment.yml`` in the node's folder.

    Args:
        node_name (str): name of the MARO Node. Also the id of the MARO Node.
        node_size (str): size of the MARO Node VM.

    Returns:
        dict: join_cluster_deployment that needed in "join cluster" operation.
            See /lib/scripts/join_cluster.py for reference.
    """
    logger.info(message=f"Creating VM '{node_name}'")

    # Everything belonging to this node lives in one folder of the cluster.
    node_dir = f"{GlobalPaths.ABS_MARO_CLUSTERS}/{self.cluster_name}/nodes/{node_name}"
    os.makedirs(name=node_dir, exist_ok=True)

    # Export the ARM parameters, then launch the deployment.
    arm_template_path = f"{GrassPaths.ABS_MARO_GRASS_LIB}/modes/azure/create_node/template.json"
    arm_parameters_path = f"{node_dir}/arm_create_node_parameters.json"
    ArmTemplateParameterBuilder.create_node(
        node_name=node_name,
        cluster_details=self.cluster_details,
        node_size=node_size,
        export_path=arm_parameters_path,
    )
    AzureController.start_deployment(
        resource_group=self.resource_group,
        deployment_name=node_name,
        template_file_path=arm_template_path,
        parameters_file_path=arm_parameters_path,
    )

    # The VM name is reused below as resource name and hostname of the node.
    node_vm_name = f"{self.cluster_id}-{node_name}-vm"
    ip_addresses = AzureController.list_ip_addresses(
        resource_group=self.resource_group,
        vm_name=node_vm_name,
    )

    logger.info_green(f"VM '{node_name}' is created")

    # Assemble join_cluster_deployment section by section.
    master_section = {
        "private_ip_address": self.master_private_ip_address,
        "api_server": {"port": self.master_api_server_port},
        "redis": {"port": self.master_redis_port},
    }
    vm_network = ip_addresses[0]["virtualMachine"]["network"]
    node_section = {
        "name": node_name,
        "id": node_name,
        "username": self.default_username,
        "public_ip_address": vm_network["publicIpAddresses"][0]["ipAddress"],
        "private_ip_address": vm_network["privateIpAddresses"][0],
        "node_size": node_size,
        "resource_name": node_vm_name,
        "hostname": node_vm_name,
        "resources": {"cpu": "all", "memory": "all", "gpu": "all"},
        "api_server": {"port": self.api_server_port},
        "ssh": {"port": self.ssh_port},
    }
    join_cluster_deployment = {
        "mode": "grass/azure",
        "master": master_section,
        "node": node_section,
        "configs": {
            "install_node_runtime": False,
            "install_node_gpu_support": False,
        },
    }

    # Keep a copy on disk next to the node's ARM parameters.
    with open(file=f"{node_dir}/join_cluster_deployment.yml", mode="w") as deployment_file:
        yaml.safe_dump(data=join_cluster_deployment, stream=deployment_file)

    return join_cluster_deployment
def _build_node_image(cluster_details: dict) -> None:
    """Build the Azure Image that MARO Nodes are created from.

    A temporary "build-node-image" VM is deployed, the init script is copied
    to it and run remotely, and the VM is then deallocated, generalized and
    captured as ``<cluster id>-node-image``. The temporary resources are
    deleted afterwards.

    See https://docs.microsoft.com/en-us/azure/virtual-machines/linux/capture-image for reference.

    Args:
        cluster_details (dict): details of the MARO Cluster.

    Returns:
        None.
    """
    logger.info("Building MARO Node image")

    # Names of the temporary VM and of the image captured from it.
    resource_name = "build-node-image"
    image_name = f"{cluster_details['id']}-node-image"
    builder_vm_name = f"{cluster_details['id']}-{resource_name}-vm"

    # Export the ARM parameters and launch the deployment.
    # For simplicity, the master node size is reused for the build VM.
    arm_template_path = f"{GrassPaths.ABS_MARO_GRASS_LIB}/modes/azure/create_build_node_image_vm/template.json"
    arm_parameters_path = (
        f"{GlobalPaths.ABS_MARO_CLUSTERS}/{cluster_details['name']}"
        f"/build_node_image_vm/arm_create_build_node_image_vm_parameters.json"
    )
    ArmTemplateParameterBuilder.create_build_node_image_vm(
        cluster_details=cluster_details,
        node_size=cluster_details["master"]["node_size"],
        export_path=arm_parameters_path,
    )
    resource_group = cluster_details["cloud"]["resource_group"]
    AzureController.start_deployment(
        resource_group=resource_group,
        deployment_name=resource_name,
        template_file_path=arm_template_path,
        parameters_file_path=arm_parameters_path,
    )

    # Gracefully wait
    time.sleep(10)

    # Find the public address of the build VM.
    ip_addresses = AzureController.list_ip_addresses(
        resource_group=cluster_details["cloud"]["resource_group"],
        vm_name=builder_vm_name,
    )
    builder_public_ip = ip_addresses[0]["virtualMachine"]["network"]["publicIpAddresses"][0]["ipAddress"]

    # The same SSH coordinates are used by every remote step below.
    ssh_target = {
        "node_username": cluster_details["cloud"]["default_username"],
        "node_hostname": builder_public_ip,
        "node_ssh_port": cluster_details["connection"]["ssh"]["port"],
    }

    # Make sure build_node_image_vm is able to connect
    GrassAzureExecutor.retry_connection(**ssh_target)

    # Copy the init script to the VM and run it there.
    FileSynchronizer.copy_files_to_node(
        local_path=f"{GrassPaths.MARO_GRASS_LIB}/scripts/build_node_image_vm/init_build_node_image_vm.py",
        remote_dir="~/",
        **ssh_target,
    )
    GrassAzureExecutor.remote_init_build_node_image_vm(**ssh_target)

    # Capture the image: deallocate, generalize, then create the image.
    AzureController.deallocate_vm(resource_group=resource_group, vm_name=builder_vm_name)
    AzureController.generalize_vm(resource_group=resource_group, vm_name=builder_vm_name)
    AzureController.create_image_from_vm(
        resource_group=resource_group,
        image_name=image_name,
        vm_name=builder_vm_name,
    )

    # Remove the temporary build resources.
    GrassAzureExecutor._delete_resources(
        resource_group=resource_group,
        resource_name=resource_name,
        cluster_id=cluster_details["id"],
    )

    logger.info_green("MARO Node Image is built")