示例#1
0
    def _start_node(self, node_name: str):
        """Start the VM behind a node and reload its images and agent service.

        Args:
            node_name (str): Name of the node to start.
        """
        logger.info(f"Starting node {node_name}")

        # Collect the cluster-level and node-level values used below
        details = self.cluster_details
        vm_name = f"{details['id']}-{node_name}-vm"
        group = details['cloud']['resource_group']
        node_info = self.grass_executor.remote_get_node_details(node_name=node_name)
        public_ip = node_info['public_ip_address']

        # Boot the VM, then record the new status remotely
        AzureExecutor.start_vm(resource_group=group, vm_name=vm_name)
        self.grass_executor.remote_update_node_status(node_name=node_name, action='start')

        # Block until the node accepts connections again
        self.grass_executor.retry_until_connected(node_ip_address=public_ip)

        # Reload the images on the node
        self.grass_executor.remote_load_images(
            node_name=node_name,
            parallels=GlobalParams.PARALLELS,
            node_ip_address=public_ip,
        )

        # Reload the node agent service
        self.grass_executor.remote_load_node_agent_service(
            node_name=node_name,
            node_ip_address=public_ip,
        )

        logger.info_green(f"Node {node_name} is started")
示例#2
0
    def delete(self):
        """Delete the cluster's Azure resources and its local cluster folder.

        Every resource in the cluster's resource group whose name starts with
        the cluster id is deleted, then the local folder for this cluster is
        removed.
        """
        logger.info(f"Deleting cluster {self.cluster_name}")

        details = self.cluster_details
        name_prefix = details['id']
        group = details['cloud']['resource_group']

        # Keep only the ids of resources named after this cluster
        targets = [
            entry['id']
            for entry in AzureExecutor.list_resources(resource_group=group)
            if entry['name'].startswith(name_prefix)
        ]

        # Nothing to delete when no resource matched
        if targets:
            AzureExecutor.delete_resources(resources=targets)

        # Remove the local cluster folder
        cluster_dir = os.path.expanduser(f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}")
        rmtree(cluster_dir)

        logger.info_green(f"Cluster {self.cluster_name} is deleted")
示例#3
0
    def _load_k8s_context(self):
        """Load the context of this cluster's AKS instance via AzureExecutor."""
        details = self.cluster_details
        group = details['cloud']['resource_group']
        aks_name = f"{details['id']}-aks"

        AzureExecutor.load_aks_context(resource_group=group, aks_name=aks_name)
示例#4
0
    def tearDownClass(cls) -> None:
        """Print the collected timings, then delete the resource group and test folder."""
        # Emit cls.test_func_to_time as indented, key-sorted JSON
        timing_report = json.dumps(cls.test_func_to_time, indent=4, sort_keys=True)
        print(timing_report)

        # Remove the Azure resource group used by the tests
        AzureExecutor.delete_resource_group(resource_group=cls.resource_group)

        # Remove the temporary folder of this test run
        test_dir = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}")
        shutil.rmtree(test_dir)
示例#5
0
    def _attach_acr(self):
        """Attach the cluster's ACR (``<id>acr``) to its AKS instance (``<id>-aks``)."""
        details = self.cluster_details
        cluster_id = details['id']
        group = details['cloud']['resource_group']

        AzureExecutor.attach_acr(
            resource_group=group,
            aks_name=f"{cluster_id}-aks",
            acr_name=f"{cluster_id}acr",
        )
示例#6
0
    def _create_master(self):
        """Deploy the master VM and record its addresses in the cluster details.

        Writes the deployment parameters for the master, starts the deployment,
        reads back the VM's IP addresses, stores them (with hostname and
        resource name) under ``cluster_details['master']`` and saves the
        cluster details with ``sync=False``.
        """
        logger.info("Creating master VM")

        # Values read from the cluster details
        details = self.cluster_details
        master = details['master']
        cluster_id = details['id']
        group = details['cloud']['resource_group']
        admin = details['user']['admin_username']
        master_size = master['node_size']

        # Write the ARM parameters file for the master
        parameters_dir = os.path.expanduser(
            f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters"
        )
        self._create_deployment_parameters(
            node_name='master',
            cluster_details=details,
            node_size=master_size,
            export_dir=parameters_dir,
        )

        # Run the deployment from the default node template
        AzureExecutor.start_deployment(
            resource_group=group,
            deployment_name='master',
            template_file=f"{GlobalPaths.MARO_GRASS_LIB}/azure/grass-create-default-node-template.json",
            parameters_file=f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters/master.json",
        )

        # Read back the addresses assigned to the master VM
        vm_name = f"{cluster_id}-master-vm"
        address_list = AzureExecutor.list_ip_addresses(resource_group=group, vm_name=vm_name)
        network = address_list[0]["virtualMachine"]["network"]
        public_ip = network['publicIpAddresses'][0]['ipAddress']
        private_ip = network['privateIpAddresses'][0]

        # Hostname and resource name are both the VM name
        master['public_ip_address'] = public_ip
        master['private_ip_address'] = private_ip
        master['hostname'] = vm_name
        master['resource_name'] = vm_name
        logger.info_green(f"You can login to your master node with: ssh {admin}@{public_ip}")

        # Persist the updated details
        save_cluster_details(
            cluster_name=self.cluster_name,
            cluster_details=details,
            sync=False,
        )

        logger.info_green("Master VM is created")
示例#7
0
    def push_image(self, image_name: str):
        """Tag a local docker image for the cluster's ACR and push it.

        Args:
            image_name (str): Name of the local image to push.
        """
        registry = f"{self.cluster_details['id']}acr"
        target = f"{registry}.azurecr.io/{image_name}"

        # Authenticate against the registry first
        AzureExecutor.login_acr(acr_name=registry)

        # Tag, then push, in that order
        SubProcess.run(f"docker tag {image_name} {target}")
        SubProcess.run(f"docker push {target}")
示例#8
0
    def list_image(self):
        """Log the repositories found in the cluster's ACR (``<id>acr``)."""
        registry = f"{self.cluster_details['id']}acr"
        repositories = AzureExecutor.list_acr_repositories(acr_name=registry)
        logger.info(repositories)
示例#9
0
    def _stop_node(self, node_name: str):
        """Stop the VM behind a node and record the 'stop' status remotely.

        Args:
            node_name (str): Name of the node to stop.
        """
        logger.info(f"Stopping node {node_name}")

        details = self.cluster_details
        vm_name = f"{details['id']}-{node_name}-vm"
        group = details['cloud']['resource_group']

        # Stop the VM before updating the recorded status
        AzureExecutor.stop_vm(resource_group=group, vm_name=vm_name)
        self.grass_executor.remote_update_node_status(node_name=node_name, action='stop')

        logger.info_green(f"Node {node_name} is stopped")
示例#10
0
    def _scale_node_pool(self, replicas: int, node_size: str, node_size_to_info: dict):
        """Scale the existing node pool of a given VM size to ``replicas`` nodes.

        Args:
            replicas (int): Node count passed to the scale request.
            node_size (str): VM size identifying the node pool.
            node_size_to_info (dict): Maps a VM size to its node pool record;
                the record's ``'name'`` entry is used as the node pool name.
        """
        logger.info(f"Scaling {node_size} NodePool")

        details = self.cluster_details
        aks_name = f"{details['id']}-aks"
        group = details['cloud']['resource_group']
        pool_name = node_size_to_info[node_size]['name']

        AzureExecutor.scale_nodepool(
            resource_group=group,
            aks_name=aks_name,
            nodepool_name=pool_name,
            node_count=replicas,
        )

        logger.info_green(f"{node_size} NodePool is scaled")
示例#11
0
    def _build_node_pool(self, replicas: int, node_size: str):
        """Add a new node pool of the given VM size to the cluster's AKS instance.

        Args:
            replicas (int): Node count for the new pool.
            node_size (str): VM size of the pool; also the key from which the
                pool name is generated.
        """
        logger.info(f"Building {node_size} NodePool")

        details = self.cluster_details
        aks_name = f"{details['id']}-aks"
        group = details['cloud']['resource_group']
        pool_name = K8sAzureExecutor._generate_nodepool_name(key=node_size)

        AzureExecutor.add_nodepool(
            resource_group=group,
            aks_name=aks_name,
            nodepool_name=pool_name,
            node_count=replicas,
            node_size=node_size,
        )

        logger.info_green(f"{node_size} NodePool is built")
示例#12
0
    def _build_image_address(self, image_name: str) -> str:
        """Return the address to use for an image.

        Args:
            image_name (str): Name of the image.

        Returns:
            str: ``<id>acr.azurecr.io/<image_name>`` when the image is listed
            among the cluster ACR's repositories, otherwise ``image_name``
            unchanged.
        """
        registry = f"{self.cluster_details['id']}acr"
        repositories = AzureExecutor.list_acr_repositories(acr_name=registry)

        # Images not held by the cluster registry keep their original name
        if image_name not in repositories:
            return image_name
        return f"{registry}.azurecr.io/{image_name}"
示例#13
0
    def _create_resource_group(self):
        """Select the subscription and create the resource group if it is missing.

        Logs the Azure CLI version, sets the active subscription, then creates
        the configured resource group unless a lookup shows it already exists
        (in which case only a warning is logged).
        """
        cloud = self.cluster_details['cloud']
        subscription = cloud['subscription']
        group = cloud['resource_group']
        location = cloud['location']

        # Querying the version doubles as a check that the Azure CLI is usable
        cli_version = AzureExecutor.get_version()
        logger.info_green(f"Your Azure CLI version: {cli_version['azure-cli']}")

        # Switch to the configured subscription
        AzureExecutor.set_subscription(subscription=subscription)
        logger.info_green(f"Set subscription to: {subscription}")

        # Create the group only when the lookup returns nothing
        existing = AzureExecutor.get_resource_group(resource_group=group)
        if existing is None:
            AzureExecutor.create_resource_group(resource_group=group, location=location)
            logger.info_green(f"Resource group: {group} is created")
        else:
            logger.warning_yellow(f"Azure resource group {group} already exists")
示例#14
0
    def _delete_node(self, node_name: str):
        """Delete a node's Azure resources, deployment, parameters file and status.

        Args:
            node_name (str): Name of the node to delete.
        """
        logger.info(f"Deleting node {node_name}")

        details = self.cluster_details
        cluster_id = details['id']
        group = details['cloud']['resource_group']

        # Pick the resources named after this node.
        # NOTE(review): this is a plain prefix match, so a node whose name
        # extends another node's name would match as well -- confirm node
        # names cannot be prefixes of each other.
        name_prefix = f"{cluster_id}-{node_name}"
        targets = [
            entry['id']
            for entry in AzureExecutor.list_resources(resource_group=group)
            if entry['name'].startswith(name_prefix)
        ]
        if targets:
            AzureExecutor.delete_resources(resources=targets)

        # Drop the deployment named after the node
        AzureExecutor.delete_deployment(resource_group=group, deployment_name=node_name)

        # Remove the node's local parameters file
        parameters_file = f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters/{node_name}.json"
        SubProcess.run(f"rm {parameters_file}")

        # Record the deletion remotely
        self.grass_executor.remote_update_node_status(node_name=node_name, action='delete')

        logger.info_green(f"Node {node_name} is deleted")
示例#15
0
    def _create_k8s_cluster(self):
        """Deploy the AKS cluster from the k8s template, then attach the ACR."""
        logger.info("Creating k8s cluster")

        group = self.cluster_details['cloud']['resource_group']

        # Write the ARM parameters into the cluster's parameters folder
        parameters_dir = os.path.expanduser(
            f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters"
        )
        self._create_deployment_parameters(export_dir=parameters_dir)

        # Run the 'aks_cluster' deployment
        AzureExecutor.start_deployment(
            resource_group=group,
            deployment_name='aks_cluster',
            template_file=f"{GlobalPaths.MARO_K8S_LIB}/azure/k8s-create-template.json",
            parameters_file=f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters/aks_cluster.json",
        )

        # Attach the registry once the deployment call has returned
        self._attach_acr()
示例#16
0
    def _get_node_size_to_spec(self) -> dict:
        """Map each VM size name available in the cluster's location to its spec.

        Returns:
            dict: ``spec['name'] -> spec`` for every entry returned by
            ``AzureExecutor.list_vm_sizes``.
        """
        location = self.cluster_details['cloud']['location']
        available = AzureExecutor.list_vm_sizes(location=location)
        return {entry['name']: entry for entry in available}
示例#17
0
    def _get_node_size_to_info(self):
        """Map each node pool's VM size to the node pool record itself.

        Returns:
            dict: ``nodepool['vmSize'] -> nodepool`` for every pool listed for
            the cluster's AKS instance; a later pool with the same size
            replaces an earlier one.
        """
        details = self.cluster_details
        aks_name = f"{details['id']}-aks"
        group = details['cloud']['resource_group']

        pools = AzureExecutor.list_nodepool(resource_group=group, aks_name=aks_name)
        return {pool['vmSize']: pool for pool in pools}
示例#18
0
    def list_node(self):
        """Log, as JSON, the node count per VM size of the cluster's AKS instance."""
        details = self.cluster_details
        aks_name = f"{details['id']}-aks"
        group = details['cloud']['resource_group']

        aks_info = AzureExecutor.get_aks(resource_group=group, aks_name=aks_name)
        pool_profiles = aks_info['agentPoolProfiles']

        # One entry per VM size; a later profile with the same size wins
        count_by_size = {profile['vmSize']: profile['count'] for profile in pool_profiles}
        logger.info(json.dumps(count_by_size, indent=4, sort_keys=True))
示例#19
0
    def _create_k8s_secret(self):
        """Create the k8s secret holding the cluster's storage account credentials.

        Fetches the keys of the storage account ``<id>st``, takes the value of
        the first one and runs ``kubectl create secret generic
        <id>-k8s-secret`` with the account name and key as literals.

        The command is written to the debug log with the key redacted, so the
        storage account key never ends up in log output.
        """
        # Load details
        cluster_details = self.cluster_details
        cluster_id = cluster_details['id']
        resource_group = cluster_details['cloud']['resource_group']

        # Get storage account key
        storage_account_keys = AzureExecutor.get_storage_account_keys(
            resource_group=resource_group,
            storage_account_name=f"{cluster_id}st")
        storage_key = storage_account_keys[0]['value']

        # Everything except the key itself; shared by the real and the logged command
        command_prefix = f'kubectl create secret generic {cluster_id}-k8s-secret ' \
                         f'--from-literal=azurestorageaccountname={cluster_id}st ' \
                         f'--from-literal=azurestorageaccountkey='

        # Create k8s secret
        _ = SubProcess.run(f"{command_prefix}{storage_key}")

        # Log the command without exposing the secret value
        logger.debug(f"{command_prefix}<redacted>")
示例#20
0
    def _check_and_get_account_sas(self):
        """Return the storage account SAS, generating and saving it when absent.

        Ref: https://msdn.microsoft.com/library/azure/mt584140.aspx

        Returns:
            The value stored under ``cluster_details['cloud']['account_sas']``.
        """
        details = self.cluster_details
        cloud = details['cloud']
        cluster_id = details['id']

        # An already stored SAS is returned as-is.
        # TODO: also regenerate when the stored SAS is None or expired.
        if 'account_sas' in cloud:
            return cloud['account_sas']

        # First use: request a SAS for the <id>st account and persist it
        cloud['account_sas'] = AzureExecutor.get_storage_account_sas(
            account_name=f'{cluster_id}st')
        save_cluster_details(cluster_name=self.cluster_name, cluster_details=details)

        return cloud['account_sas']
示例#21
0
    def setUpClass(cls) -> None:
        """Provision the Azure resources shared by the tests of this class.

        Creates a per-run test folder, validates ``config.yml``, creates the
        resource group, writes an ARM parameters file derived from
        ``test_checkpoint_parameters.json``, runs the deployment, and stores the
        storage connection string and the VM's public IP on the class.

        Raises:
            Exception: If the config lacks a truthy ``cloud/subscription`` or
                ``user/admin_public_key`` value.
        """
        # Per-run identifiers and folders
        GlobalParams.LOG_LEVEL = logging.DEBUG
        cls.test_id = uuid.uuid4().hex[:8]
        run_dir = os.path.expanduser(f"{GlobalPaths.MARO_TEST}/{cls.test_id}")
        os.makedirs(run_dir, exist_ok=True)
        cls.test_file_path = os.path.abspath(__file__)
        cls.test_dir_path = os.path.dirname(cls.test_file_path)

        # The config sits next to this test file
        cls.config_path = os.path.normpath(os.path.join(cls.test_dir_path, "./config.yml"))

        # Load the config and reject it when a required value is empty
        with open(cls.config_path) as fr:
            config_details = yaml.safe_load(fr)
            if not (config_details["cloud/subscription"] and config_details["user/admin_public_key"]):
                raise Exception("Invalid config")

        # Create resource group
        AzureExecutor.create_resource_group(cls.resource_group, cls.location)

        # Locations of the template, the base parameters and the generated parameters
        template_file_location = f"{cls.test_dir_path}/test_checkpoint_template.json"
        base_parameters_file_location = f"{cls.test_dir_path}/test_checkpoint_parameters.json"
        parameters_file_location = os.path.expanduser(
            f"{GlobalPaths.MARO_TEST}/{cls.test_id}/test_checkpoint_parameters.json"
        )

        with open(base_parameters_file_location, "r") as f:
            base_parameters = json.load(f)

        # Override the per-run values and write the result for the deployment
        with open(parameters_file_location, "w") as fw:
            parameters = base_parameters["parameters"]
            overrides = {
                "location": cls.location,
                "networkInterfaceName": f"{cls.test_id}-nic",
                "networkSecurityGroupName": f"{cls.test_id}-nsg",
                "virtualNetworkName": f"{cls.test_id}-vnet",
                "publicIpAddressName": f"{cls.test_id}-pip",
                "virtualMachineName": f"{cls.test_id}-vm",
                "virtualMachineSize": "Standard_B2s",
                "adminUsername": cls.admin_username,
                "adminPublicKey": config_details["user/admin_public_key"],
                "storageAccountName": f"{cls.test_id}st",
            }
            for parameter_name, parameter_value in overrides.items():
                parameters[parameter_name]["value"] = parameter_value
            json.dump(base_parameters, fw, indent=4)

        # Start ARM deployment, then wait before querying its outputs
        AzureExecutor.start_deployment(
            resource_group=cls.resource_group,
            deployment_name=cls.test_id,
            template_file=template_file_location,
            parameters_file=parameters_file_location,
        )
        cls._gracefully_wait(15)

        # Values that only exist once the deployment is done
        cls.conn_str = AzureExecutor.get_connection_string(
            storage_account_name=f"{cls.test_id}st")
        address_list = AzureExecutor.list_ip_addresses(
            resource_group=cls.resource_group,
            vm_name=f"{cls.test_id}-vm",
        )
        network = address_list[0]["virtualMachine"]["network"]
        cls.ip_address = network["publicIpAddresses"][0]["ipAddress"]
示例#22
0
    def _create_vm(self, node_name: str, node_size: str,
                   node_size_to_spec: dict):
        """Deploy a node VM and store its details through the grass executor.

        Args:
            node_name (str): Name of the node; also used as the deployment name.
            node_size (str): VM size of the node.
            node_size_to_spec (dict): Maps a VM size to its spec; the
                ``'numberOfCores'`` and ``'memoryInMb'`` entries are read.
        """
        logger.info(message=f"Creating VM {node_name}")

        # Values read from the cluster details
        details = self.cluster_details
        location = details['cloud']['location']
        cluster_id = details['id']
        group = details['cloud']['resource_group']

        # Write the ARM parameters file for this node
        parameters_dir = os.path.expanduser(
            f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters")
        GrassAzureExecutor._create_deployment_parameters(
            node_name=node_name,
            cluster_details=details,
            node_size=node_size,
            export_dir=parameters_dir,
        )

        # GPU count comes from the first "GPUs" capability of the SKU, 0 if none
        gpu_nums = 0
        sku = AzureExecutor.get_sku(vm_size=node_size, location=location)
        if sku is not None:
            gpu_nums = next(
                (int(item["value"]) for item in sku["capabilities"] if item["name"] == "GPUs"),
                0,
            )

        # GPU sizes use a dedicated template
        template_name = (
            "grass-create-gpu-node-template.json"
            if gpu_nums > 0
            else "grass-create-default-node-template.json"
        )
        AzureExecutor.start_deployment(
            resource_group=group,
            deployment_name=node_name,
            template_file=f"{GlobalPaths.MARO_GRASS_LIB}/azure/{template_name}",
            parameters_file=f"{GlobalPaths.MARO_CLUSTERS}/{self.cluster_name}/parameters/{node_name}.json",
        )

        # Read back the addresses assigned to the new VM
        vm_name = f"{cluster_id}-{node_name}-vm"
        address_list = AzureExecutor.list_ip_addresses(resource_group=group, vm_name=vm_name)
        network = address_list[0]["virtualMachine"]["network"]

        # Resource name and hostname are both the VM name
        spec = node_size_to_spec
        node_details = {
            'public_ip_address': network['publicIpAddresses'][0]['ipAddress'],
            'private_ip_address': network['privateIpAddresses'][0],
            'node_size': node_size,
            'resource_name': vm_name,
            'hostname': vm_name,
            'resources': {
                'cpu': spec[node_size]['numberOfCores'],
                'memory': spec[node_size]['memoryInMb'],
                'gpu': gpu_nums,
            },
        }
        self.grass_executor.remote_set_node_details(
            node_name=node_name,
            node_details=node_details,
        )

        logger.info_green(f"VM {node_name} is created")
示例#23
0
 def tearDownClass(cls) -> None:
     """Delete the resource group used by this test class."""
     group_name = cls.resource_group
     AzureExecutor.delete_resource_group(group_name)