def _add_flannel_cni(self):
    template_file = os.path.join(
        self.capz_flannel_dir, "flannel/kube-flannel.yaml.j2")
    context = {
        "cluster_network_subnet": self.deployer.cluster_network_subnet,
        "flannel_mode": self.opts.flannel_mode
    }
    kube_flannel = "/tmp/kube-flannel.yaml"
    utils.render_template(template_file, kube_flannel, context)

    server_core_tag = "windowsservercore-%s" % (
        self.opts.base_container_image_tag)
    mode = "overlay"
    if self.opts.flannel_mode == constants.FLANNEL_MODE_L2BRIDGE:
        mode = "l2bridge"
    context = {
        "server_core_tag": server_core_tag,
        "container_runtime": self.opts.container_runtime,
        "mode": mode
    }
    kube_flannel_windows = "/tmp/kube-flannel-windows.yaml"
    searchpath = os.path.join(self.capz_flannel_dir, "flannel")
    utils.render_template("kube-flannel-windows.yaml.j2",
                          kube_flannel_windows, context, searchpath)

    cmd = [self.kubectl, "apply", "-f", kube_flannel]
    utils.retry_on_error()(utils.run_shell_cmd)(cmd)

    cmd = [self.kubectl, "apply", "-f", kube_flannel_windows]
    utils.retry_on_error()(utils.run_shell_cmd)(cmd)

    if self.opts.flannel_mode == constants.FLANNEL_MODE_OVERLAY:
        self._set_vxlan_devices_mtu()

def _create_bootstrap_azure_vm(self):
    self.logging.info("Setting up the bootstrap Azure VM")

    vm_nic = self._create_bootstrap_vm_nic()
    vm_parameters = {
        "location": self.azure_location,
        "os_profile": {
            "computer_name": self.bootstrap_vm_name,
            "admin_username": "******",
            "linux_configuration": {
                "disable_password_authentication": True,
                "ssh": {
                    "public_keys": [{
                        "key_data": os.environ["AZURE_SSH_PUBLIC_KEY"],
                        "path": "/home/capi/.ssh/authorized_keys"
                    }]
                }
            }
        },
        "hardware_profile": {
            "vm_size": self.bootstrap_vm_size
        },
        "storage_profile": {
            "image_reference": {
                "publisher": "Canonical",
                "offer": "0001-com-ubuntu-server-focal",
                "sku": "20_04-lts-gen2",
                "version": "latest"
            },
        },
        "network_profile": {
            "network_interfaces": [{"id": vm_nic.id}]
        }
    }

    self.logging.info("Creating bootstrap VM")
    vm = utils.retry_on_error()(
        self.compute_client.virtual_machines.begin_create_or_update)(
            self.cluster_name, self.bootstrap_vm_name,
            vm_parameters).result()
    vm = self._wait_for_bootstrap_vm()

    ip_config = utils.retry_on_error()(
        self.network_client.network_interfaces.get)(
            self.cluster_name, vm_nic.name).ip_configurations[0]
    bootstrap_vm_private_ip = ip_config.private_ip_address

    public_ip = utils.retry_on_error()(
        self.network_client.public_ip_addresses.get)(
            self.cluster_name, self.bootstrap_vm_public_ip_name)
    bootstrap_vm_public_ip = public_ip.ip_address

    self.logging.info("Waiting for bootstrap VM SSH port to be reachable")
    utils.wait_for_port_connectivity(bootstrap_vm_public_ip, 22)

    self.logging.info("Finished setting up the bootstrap VM")

    return {
        'private_ip': bootstrap_vm_private_ip,
        'public_ip': bootstrap_vm_public_ip,
        'vm': vm,
    }

def _set_vxlan_devices_mtu(self):
    self.logging.info(
        "Set the proper MTU for the k8s master vxlan devices")
    ssh_key_path = (os.environ.get("SSH_KEY")
                    or os.path.join(os.environ.get("HOME"), ".ssh/id_rsa"))
    utils.retry_on_error()(utils.run_shell_cmd)([
        "ssh", "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-i", ssh_key_path,
        "capi@%s" % self.deployer.master_public_address,
        "'sudo bash -s' < %s" % os.path.join(
            self.e2e_runner_dir, "scripts/set-vxlan-devices-mtu.sh")
    ])

def _get_agents_private_addresses(self, operating_system):
    cmd = [
        self.kubectl, "get", "nodes", "--kubeconfig",
        self.capz_kubeconfig_path, "-o", "yaml"
    ]
    output, _ = utils.retry_on_error()(utils.run_shell_cmd)(
        cmd, sensitive=True)
    addresses = []
    nodes = yaml.safe_load(output)
    for node in nodes['items']:
        node_os = node['status']['nodeInfo']['operatingSystem']
        if node_os != operating_system:
            continue
        try:
            node_addresses = [
                n['address'] for n in node['status']['addresses']
                if n['type'] == 'InternalIP'
            ]
        except Exception as ex:
            self.logging.warning(
                "Cannot find private address for node %s. Exception "
                "details: %s. Skipping", node["metadata"]["name"], ex)
            continue
        # pick the first node internal address
        addresses.append(node_addresses[0])
    return addresses

def enable_ip_forwarding(self):
    self.logging.info("Enabling IP forwarding for the cluster VMs")
    vm_nics = utils.retry_on_error()(
        self.network_client.network_interfaces.list)(self.cluster_name)
    for nic in vm_nics:
        if nic.name == self.bootstrap_vm_nic_name:
            continue
        if nic.enable_ip_forwarding:
            self.logging.info(
                "IP forwarding is already enabled on nic %s", nic.name)
            continue
        self.logging.info("Enabling IP forwarding on nic %s", nic.name)
        nic_parameters = nic.as_dict()
        nic_parameters["enable_ip_forwarding"] = True
        utils.retry_on_error()(
            self.network_client.network_interfaces.begin_create_or_update)(
                self.cluster_name, nic.name, nic_parameters).result()

def _add_kube_proxy_windows(self):
    template_file = os.path.join(
        self.capz_flannel_dir, "kube-proxy/kube-proxy-windows.yaml.j2")
    server_core_tag = "windowsservercore-%s" % (
        self.opts.base_container_image_tag)
    enable_ipv6dualstack = str(self.opts.enable_ipv6dualstack).lower()
    context = {
        "kubernetes_version": self.kubernetes_version,
        "server_core_tag": server_core_tag,
        "enable_win_dsr": str(self.opts.enable_win_dsr).lower(),
        "enable_ipv6dualstack": enable_ipv6dualstack,
        "flannel_mode": self.opts.flannel_mode
    }
    output_file = "/tmp/kube-proxy-windows.yaml"
    utils.render_template(template_file, output_file, context)

    cmd = [self.kubectl, "apply", "-f", output_file]
    utils.retry_on_error()(utils.run_shell_cmd)(cmd)

@property
def master_public_address(self):
    # Exposed as a property, since callers access it as an attribute
    # (e.g. "capi@%s" % self.deployer.master_public_address).
    cmd = [
        self.kubectl, "get", "cluster", "--kubeconfig",
        self.mgmt_kubeconfig_path, self.cluster_name, "-o",
        "custom-columns=MASTER_ADDRESS:.spec.controlPlaneEndpoint.host",
        "--no-headers"
    ]
    output, _ = utils.retry_on_error()(utils.run_shell_cmd)(cmd)
    return output.decode().strip()

def _create_bootstrap_vm_public_ip(self):
    self.logging.info("Creating bootstrap VM public IP")
    public_ip_parameters = {
        "location": self.azure_location,
        "public_ip_address_version": "IPV4"
    }
    return utils.retry_on_error()(
        self.network_client.public_ip_addresses.begin_create_or_update)(
            self.cluster_name, self.bootstrap_vm_public_ip_name,
            public_ip_parameters).result()

def connect_agents_to_controlplane_subnet(self):
    self.logging.info("Connecting agents VMs to the control-plane subnet")
    control_plane_subnet = utils.retry_on_error()(
        self.network_client.subnets.get)(
            self.cluster_name,
            "{}-vnet".format(self.cluster_name),
            "{}-controlplane-subnet".format(self.cluster_name))
    subnet_id = control_plane_subnet.id
    for vm in self._get_agents_vms():
        self.logging.info("Connecting VM {}".format(vm.name))
        nic_id = vm.network_profile.network_interfaces[0].id
        vm_nic = self._get_vm_nic(nic_id)
        nic_address = vm_nic.ip_configurations[0].private_ip_address
        route = self._get_vm_route(nic_address)

        self.logging.info("Shutting down VM")
        utils.retry_on_error()(
            self.compute_client.virtual_machines.begin_deallocate)(
                self.cluster_name, vm.name).wait()

        self.logging.info("Updating VM NIC subnet")
        nic_parameters = vm_nic.as_dict()
        nic_model = net_models.NetworkInterface(**nic_parameters)
        nic_model.ip_configurations[0]['subnet']['id'] = subnet_id
        utils.retry_on_error()(
            self.network_client.network_interfaces.begin_create_or_update)(
                self.cluster_name, vm_nic.name, nic_model).wait()

        self.logging.info("Starting VM")
        utils.retry_on_error()(
            self.compute_client.virtual_machines.begin_start)(
                self.cluster_name, vm.name).wait()

        self.logging.info("Updating the node routetable")
        route_params = route.as_dict()
        vm_nic = self._get_vm_nic(nic_id)  # Refresh NIC info
        nic_address = vm_nic.ip_configurations[0].private_ip_address
        route_params["next_hop_ip_address"] = nic_address
        utils.retry_on_error()(
            self.network_client.routes.begin_create_or_update)(
                self.cluster_name,
                "{}-node-routetable".format(self.cluster_name),
                route.name, route_params).wait()

        self.logging.info(
            "Waiting until VM address is refreshed in the CAPZ cluster")
        for attempt in Retrying(stop=stop_after_delay(10 * 60),
                                wait=wait_exponential(max=30),
                                reraise=True):
            with attempt:
                addresses = self._get_agents_private_addresses("windows")
                assert nic_address in addresses

def _create_capz_cluster(self):
    bootstrap_vm_address = "{}:8081".format(self.bootstrap_vm_private_ip)
    context = {
        "cluster_name": self.cluster_name,
        "cluster_network_subnet": self.cluster_network_subnet,
        "azure_location": self.azure_location,
        "azure_subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"],
        "azure_tenant_id": os.environ["AZURE_TENANT_ID"],
        "azure_client_id": os.environ["AZURE_CLIENT_ID"],
        "azure_client_secret": os.environ["AZURE_CLIENT_SECRET"],
        "azure_ssh_public_key": os.environ["AZURE_SSH_PUBLIC_KEY"],
        "azure_ssh_public_key_b64": os.environ["AZURE_SSH_PUBLIC_KEY_B64"],
        "master_vm_size": self.master_vm_size,
        "win_minion_count": self.win_minion_count,
        "win_minion_size": self.win_minion_size,
        "win_minion_image_type": self.win_minion_image_type,
        "bootstrap_vm_address": bootstrap_vm_address,
        "ci_version": self.ci_version,
        "flannel_mode": self.flannel_mode,
        "container_runtime": self.container_runtime,
        "k8s_bins": "k8sbins" in self.bins_built,
        "sdn_cni_bins": "sdncnibins" in self.bins_built,
        "containerd_bins": "containerdbins" in self.bins_built,
        "containerd_shim_bins": "containerdshim" in self.bins_built,
    }
    if self.win_minion_image_type == constants.SHARED_IMAGE_GALLERY_TYPE:
        parsed = self._parse_win_minion_image_gallery()
        context["win_minion_image_rg"] = parsed["resource_group"]
        context["win_minion_image_gallery"] = parsed["gallery_name"]
        context["win_minion_image_definition"] = parsed["image_definition"]
        context["win_minion_image_version"] = parsed["image_version"]
    elif self.win_minion_image_type == constants.MANAGED_IMAGE_TYPE:
        context["win_minion_image_id"] = self.win_minion_image_id

    self.logging.info("Create CAPZ cluster")
    output_file = "/tmp/capz-cluster.yaml"
    utils.render_template(
        "cluster.yaml.j2", output_file, context, self.capz_dir)
    utils.retry_on_error()(utils.run_shell_cmd)([
        self.kubectl, "apply", "--kubeconfig", self.mgmt_kubeconfig_path,
        "-f", output_file
    ])

def _prepull_images(self, timeout=3600):
    prepull_yaml_path = "/tmp/prepull-windows-images.yaml"
    utils.download_file(self.opts.prepull_yaml, prepull_yaml_path)

    self.logging.info("Starting Windows images pre-pull")
    utils.retry_on_error()(utils.run_shell_cmd)(
        [self.kubectl, "apply", "-f", prepull_yaml_path])

    self.logging.info(
        "Waiting up to %.2f minutes to pre-pull Windows container images",
        timeout / 60.0)
    cmd = [self.kubectl, "get", "-o", "yaml", "-f", prepull_yaml_path]
    for attempt in Retrying(stop=stop_after_delay(timeout),
                            wait=wait_exponential(max=30),
                            retry=retry_if_exception_type(AssertionError),
                            reraise=True):
        with attempt:
            output, _ = utils.run_shell_cmd(cmd, sensitive=True)
            ds = yaml.safe_load(output.decode())
            ready_nr = ds["status"]["numberReady"]
            desired_ready_nr = ds["status"]["desiredNumberScheduled"]
            assert ready_nr == desired_ready_nr
    self.logging.info("Windows images successfully pre-pulled")

    self.logging.info("Cleaning up")
    utils.retry_on_error()(utils.run_shell_cmd)(
        [self.kubectl, "delete", "--wait", "-f", prepull_yaml_path])

def _create_bootstrap_vm_nic(self):
    self.logging.info("Creating bootstrap VM NIC")
    public_ip = self._create_bootstrap_vm_public_ip()
    control_plane_subnet = utils.retry_on_error()(
        self.network_client.subnets.get)(
            self.cluster_name,
            "%s-vnet" % self.cluster_name,
            "%s-controlplane-subnet" % self.cluster_name)
    nic_parameters = {
        "location": self.azure_location,
        "ip_configurations": [{
            "name": "%s-ipconfig" % self.bootstrap_vm_nic_name,
            "subnet": {
                "id": control_plane_subnet.id
            },
            "public_ip_address": {
                "id": public_ip.id
            }
        }]
    }
    return utils.retry_on_error()(
        self.network_client.network_interfaces.begin_create_or_update)(
            self.cluster_name, self.bootstrap_vm_nic_name,
            nic_parameters).result()

def _create_resource_group(self):
    self.logging.info("Creating Azure resource group")
    resource_group_params = {
        'location': self.azure_location,
        'tags': self.resource_group_tags,
    }
    self.resource_mgmt_client.resource_groups.create_or_update(
        self.cluster_name, resource_group_params)
    for attempt in Retrying(stop=stop_after_delay(600),
                            wait=wait_exponential(max=30),
                            retry=retry_if_exception_type(AssertionError),
                            reraise=True):
        with attempt:
            rg = utils.retry_on_error()(
                self.resource_mgmt_client.resource_groups.get)(
                    self.cluster_name)
            assert rg.properties.provisioning_state == "Succeeded"

def _create_node_subnet(self):
    self.logging.info("Creating Azure vNET node subnet")
    nsg = self._create_node_secgroup()
    route_table = self._node_route_table
    subnet_params = {
        "address_prefix": self.node_subnet_cidr_block,
        "network_security_group": {
            "id": nsg.id
        },
        "route_table": {
            "id": route_table.id
        },
    }
    return utils.retry_on_error()(
        self.network_client.subnets.begin_create_or_update)(
            self.cluster_name,
            "{}-vnet".format(self.cluster_name),
            "{}-node-subnet".format(self.cluster_name),
            subnet_params).result()

def _validate_k8s_api_versions(self):
    self.logging.info("Validating K8s API versions")
    output, _ = utils.retry_on_error()(utils.run_shell_cmd)(
        [self.kubectl, "get", "nodes", "-o", "yaml"])
    nodes = yaml.safe_load(output.decode())
    for node in nodes["items"]:
        node_name = node["metadata"]["name"]
        node_info = node["status"]["nodeInfo"]
        if node_info["kubeletVersion"] != self.ci_version:
            raise Exception(
                "Wrong kubelet version on node %s. "
                "Expected %s, but found %s" %
                (node_name, self.ci_version, node_info["kubeletVersion"]))
        if node_info["kubeProxyVersion"] != self.ci_version:
            raise Exception(
                "Wrong kube-proxy version on node %s. "
                "Expected %s, but found %s" %
                (node_name, self.ci_version, node_info["kubeProxyVersion"]))

def _wait_for_bootstrap_vm(self, timeout=900):
    self.logging.info("Waiting up to %.2f minutes for VM %s to provision",
                      timeout / 60.0, self.bootstrap_vm_name)
    valid_vm_states = ["Creating", "Updating", "Succeeded"]
    for attempt in Retrying(stop=stop_after_delay(timeout),
                            wait=wait_exponential(max=30),
                            retry=retry_if_exception_type(AssertionError),
                            reraise=True):
        with attempt:
            vm = utils.retry_on_error()(
                self.compute_client.virtual_machines.get)(
                    self.cluster_name, self.bootstrap_vm_name)
            if vm.provisioning_state not in valid_vm_states:
                err_msg = 'VM "{}" entered invalid state: "{}"'.format(
                    self.bootstrap_vm_name, vm.provisioning_state)
                self.logging.error(err_msg)
                raise azure_exceptions.AzureError(err_msg)
            assert vm.provisioning_state == "Succeeded"
    return vm

def cleanup_bootstrap_vm(self):
    self.logging.info("Cleaning up the bootstrap VM")

    self.logging.info("Deleting bootstrap VM")
    utils.retry_on_error()(
        self.compute_client.virtual_machines.begin_delete)(
            self.cluster_name, self.bootstrap_vm_name).wait()

    self.logging.info("Deleting bootstrap VM NIC")
    utils.retry_on_error()(
        self.network_client.network_interfaces.begin_delete)(
            self.cluster_name, self.bootstrap_vm_nic_name).wait()

    self.logging.info("Deleting bootstrap VM public IP")
    utils.retry_on_error()(
        self.network_client.public_ip_addresses.begin_delete)(
            self.cluster_name, self.bootstrap_vm_public_ip_name).wait()

def _wait_for_control_plane(self, timeout=2700):
    self.logging.info(
        "Waiting up to %.2f minutes for the control-plane to be ready.",
        timeout / 60.0)
    machines_list_cmd = [
        self.kubectl, "get", "machine", "--kubeconfig",
        self.mgmt_kubeconfig_path,
        "--output=custom-columns=NAME:.metadata.name", "--no-headers"
    ]
    control_plane_name_prefix = "{}-control-plane".format(
        self.cluster_name)
    for attempt in Retrying(stop=stop_after_delay(timeout),
                            wait=wait_exponential(max=30),
                            retry=retry_if_exception_type(AssertionError),
                            reraise=True):
        with attempt:
            output, _ = utils.retry_on_error()(utils.run_shell_cmd)(
                machines_list_cmd, sensitive=True)
            machines = output.decode().strip().split('\n')
            control_plane_machines = [
                m for m in machines
                if m.startswith(control_plane_name_prefix)
            ]
            assert len(control_plane_machines) > 0
            control_plane_ready = True
            for control_plane_machine in control_plane_machines:
                try:
                    status_phase = self._get_mgmt_capz_machine_phase(
                        control_plane_machine)
                except Exception:
                    control_plane_ready = False
                    break
                if status_phase != "Running":
                    control_plane_ready = False
                    break
            assert control_plane_ready
    self.logging.info("Control-plane is ready")

def _validate_k8s_api_container_images(self):
    self.logging.info("Validating K8s API container images")
    output, _ = utils.retry_on_error()(utils.run_shell_cmd)([
        self.kubectl, "get", "nodes", "-o", "yaml", "-l",
        "kubernetes.io/os=linux"
    ])
    nodes = yaml.safe_load(output.decode())
    images_tag = self.ci_version.replace("+", "_").strip("v")
    name_regex = re.compile(r"^(k8s.gcr.io/kube-.*):v(.*)$")
    for node in nodes["items"]:
        non_ci_images_names = []
        for image in node["status"]["images"]:
            non_ci_images_names += [
                name for name in image["names"]
                if (name_regex.match(name)
                    and name_regex.match(name).group(2) != images_tag)
            ]
        if len(non_ci_images_names) > 0:
            self.logging.error(
                "Found the following non-CI images %s on the "
                "node %s.", non_ci_images_names, node["metadata"]["name"])
            raise Exception("Found non-CI container images on "
                            "node %s" % node["metadata"]["name"])

def _wait_for_ready_pods(self):
    self.logging.info("Waiting for all the pods to be ready")
    utils.retry_on_error()(utils.run_shell_cmd)([
        self.kubectl, "wait", "--for=condition=Ready", "--timeout", "30m",
        "pods", "--all", "--all-namespaces"
    ])