Exemplo n.º 1
0
 def start(self):
     """Resolve the network view for this chatflow step.

     When a network name is supplied in kwargs, look it up directly and
     fail the chatflow if it does not exist; otherwise let the user pick
     a network interactively.
     """
     super().start()
     name = self.kwargs.get("name")
     if name:
         self.network_view = deployer.get_network_view(self.kwargs["name"])
         if not self.network_view:
             raise StopChatFlow(f"no network named {self.kwargs['name']}")
     else:
         self.network_view = deployer.select_network(self)
Exemplo n.º 2
0
    def start_vmachine_deployment(
        self,
        farm_name,
        solution_name,
        query,
        vm_size,
        ssh_keys,
        enable_public_ip=False,
        solution_uuid=None,
        vmachine_type=None,
        duration=None,
    ):
        """
        Search for a pool in the same farm and extend it or create a new one with the required capacity,
        then deploy a virtual machine on it.

        Args:
            farm_name (str): farm to deploy on.
            solution_name (str): name of the vmachine solution.
            query (dict): capacity query (cru/mru/sru/...) for the capacity checker and scheduler.
            vm_size: flavor used when preparing the extension pool.
            ssh_keys (list): public SSH keys to authorize on the machine.
            enable_public_ip (bool): whether to reserve a public IP for the machine.
            solution_uuid (str): optional uuid grouping the workloads.
            vmachine_type: optional machine type forwarded to the deployer.
            duration: reservation duration; defaults to the VDC's remaining lifetime.

        Returns:
            the result of ``deploy_vmachine``.

        Raises:
            j.exceptions.Validation: when the farm lacks capacity or the duration is not positive.
            j.exceptions.Runtime: when deployment fails.
        """
        # exclude nodes already hosting this VDC's kubernetes nodes or vmachines
        old_node_ids = [k8s_node.node_id for k8s_node in self.vdc_instance.kubernetes]
        old_node_ids.extend(vmachine.node_id for vmachine in self.vdc_instance.vmachines)

        cc = CapacityChecker(farm_name)
        cc.exclude_nodes(*old_node_ids)

        if not cc.add_query(**query):
            raise j.exceptions.Validation(f"Not enough capacity in farm {farm_name} for deploying vmachine")

        # default to the time remaining until the VDC expires
        # NOTE(review): `.timestamp` without a call assumes an arrow-style property — confirm
        duration = duration or self.vdc_instance.expiration_date.timestamp() - j.data.time.utcnow().timestamp
        if duration <= 0:
            raise j.exceptions.Validation(f"invalid duration {duration}")

        scheduler = Scheduler(farm_name=farm_name)
        scheduler.exclude_nodes(*old_node_ids)
        nodes_generator = scheduler.nodes_by_capacity(**query, public_ip=enable_public_ip)

        pool_id = self._preprare_extension_pool(farm_name, vm_size, duration, enable_public_ip)

        network_view = deployer.get_network_view(self.vdc_name, identity_name=self.identity.instance_name)

        vm_res = self.deploy_vmachine(
            solution_name,
            vm_size,
            pool_id,
            nodes_generator,
            ssh_keys,
            solution_uuid,
            network_view,
            enable_public_ip,
            vmachine_type,
            description=self.vdc_deployer.description,
        )
        if not vm_res:
            self.vdc_deployer.error("Failed to deploy vmachine")
            raise j.exceptions.Runtime("Failed to deploy vmachine")
        return vm_res
Exemplo n.º 3
0
    def redeploy_master(self, old_master_workload=None):
        """Redeploy the VDC kubernetes master container in place of the old master workload.

        Args:
            old_master_workload: the previous master workload; when None the latest
                master workload is looked up, and the method returns early (None)
                if none can be found.

        Returns:
            True on success, None when no old master workload could be found.
        """
        nv = deployer.get_network_view(self.vdc_name, identity_name=self.identity.instance_name)
        if not old_master_workload:
            old_master_workload = self._get_latest_master_workload()
            if not old_master_workload:
                self.vdc_deployer.error("Couldn't find old master workload")
                return

        # delete old master in case of next action is deploy
        if old_master_workload.info.next_action == NextAction.DEPLOY:
            self.vdc_deployer.info(f"Deleting old master workload {old_master_workload.id}")
            self.zos.workloads.decomission(old_master_workload.id)
            deployer.wait_workload_deletion(old_master_workload.id)

        # deleting network old network
        # old_network_workload = None
        # workloads = self.zos.workloads.list_workloads(self.vdc_instance.identity_tid)
        # for workload in workloads:
        #     if workload.info.workload_type == WorkloadType.Network_resource and self._ip_in_network(
        #         old_master_workload.ipaddress, workload.iprange
        #     ):
        #         old_network_workload = workload
        #         break

        # self.vdc_deployer.info(f"Deleting old network on node {old_network_workload.info.node_id}")
        # nv.delete_node(old_network_workload.info.node_id)

        # controller flavor is derived from the VDC flavor
        master_size = VDC_SIZE.VDC_FLAVORS[self.vdc_deployer.flavor]["k8s"]["controller_size"]
        pub_keys = [self.vdc_deployer.ssh_key.public_key.strip()]
        gs = GlobalScheduler()  # NOTE(review): presumably so already-used nodes can be scheduled — confirm
        self.vdc_instance.load_info()
        # comma-separated etcd client endpoints passed to the new master
        endpoints = ",".join([f"http://{etcd.ip_address}:2379" for etcd in self.vdc_instance.etcd])
        self.vdc_deployer.info("Deploying new master")
        # reuse the public IP workload that was attached to the old master
        public_ip_workload = self.zos.workloads.get(old_master_workload.public_ip)
        self.deploy_master(
            old_master_workload.info.pool_id,
            gs,
            master_size,
            self.vdc_instance.get_password(),
            pub_keys,
            self.vdc_uuid,
            nv,
            endpoints,
            public_ip=public_ip_workload.ipaddress,
        )
        return True
Exemplo n.º 4
0
    def reservation(self):
        """Reserve all workloads needed to expose a solution over a domain.

        Steps, in order: schedule a container node in the pool, add it to the
        solution's network, reserve a subdomain (unless a custom domain is
        used), reserve a reverse proxy on the gateway, then deploy the TCP
        router container that tunnels traffic to the solution.

        Raises:
            StopChatFlow: when no free IP is available on the selected node.
            DeploymentFailed: when any reserved workload fails to deploy.
        """
        metadata = {
            "name": self.domain,
            "form_info": {
                "Solution name": self.domain,
                "chatflow": "exposed"
            }
        }
        self.solution_metadata.update(metadata)
        # minimal capacity for the TCP router container
        query = {"mru": 1, "cru": 1, "sru": 1}
        self.selected_node = deployer.schedule_container(self.pool_id, **query)
        self.network_name = self.solution["Network"]

        # make sure the selected node is part of the solution network
        result = deployer.add_network_node(
            self.network_name,
            self.selected_node,
            self.pool_id,
            bot=self,
            owner=self.solution_metadata.get("owner"))
        if result:
            for wid in result["ids"]:
                success = deployer.wait_workload(
                    wid, self, breaking_node_id=self.selected_node.node_id)
                if not success:
                    raise DeploymentFailed(
                        f"Failed to add node to network {wid}", wid=wid)

        self.network_view = deployer.get_network_view(self.network_name)
        self.tcprouter_ip = self.network_view.get_free_ip(self.selected_node)
        if not self.tcprouter_ip:
            raise StopChatFlow(
                f"No available ips one for network {self.network_view.name} node {self.selected_node.node_id}"
            )

        # custom domains are managed by the user, so no subdomain workload is needed
        if self.domain_type != "Custom Domain":
            self.dom_id = deployer.create_subdomain(
                pool_id=self.domain_pool.pool_id,
                gateway_id=self.domain_gateway.node_id,
                subdomain=self.domain,
                **self.solution_metadata,
                solution_uuid=self.solution_id,
            )
            success = deployer.wait_workload(self.dom_id, self)
            if not success:
                raise DeploymentFailed(
                    f"Failed to reserve sub-domain workload {self.dom_id}",
                    solution_uuid=self.solution_id)

        # reverse proxy on the gateway, secured by the shared TRC secret
        self.proxy_id = deployer.create_proxy(
            pool_id=self.domain_pool.pool_id,
            gateway_id=self.domain_gateway.node_id,
            domain_name=self.domain,
            trc_secret=self.secret,
            **self.solution_metadata,
            solution_uuid=self.solution_id,
        )
        success = deployer.wait_workload(self.proxy_id, self)
        if not success:
            raise DeploymentFailed(
                f"Failed to reserve reverse proxy workload {self.proxy_id}",
                solution_uuid=self.solution_id)

        # TCP router container forwarding the solution's local ports to the gateway
        self.tcprouter_id = deployer.expose_address(
            pool_id=self.pool_id,
            gateway_id=self.domain_gateway.node_id,
            network_name=self.network_name,
            local_ip=self.solution_ip,
            port=self.port,
            tls_port=self.tls_port,
            trc_secret=self.secret,
            bot=self,
            **self.solution_metadata,
            solution_uuid=self.solution_id,
        )
        success = deployer.wait_workload(self.tcprouter_id, self)
        if not success:
            raise DeploymentFailed(
                f"Failed to reserve TCP Router container workload {self.tcprouter_id}",
                solution_uuid=self.solution_id,
                wid=self.tcprouter_id,
            )
Exemplo n.º 5
0
    def reservation(self):
        """Reserve all workloads needed to expose a solution over a domain.

        Steps, in order: schedule a container node in the pool, add it to the
        solution's network, reserve a subdomain (unless a custom domain is
        used), reserve a reverse proxy when using TRC, then expose the
        solution either with certificate handling (NGINX) or via a plain TCP
        router (TRC).

        Raises:
            StopChatFlow: when no free IP is available on the selected node.
            DeploymentFailed: when any reserved workload fails to deploy.
        """
        metadata = {
            "name": self.domain,
            "form_info": {
                "Solution name": self.domain,
                "chatflow": "exposed"
            }
        }
        self.solution_metadata.update(metadata)
        # minimal capacity for the exposing container
        query = {"mru": 1, "cru": 1, "sru": 1}
        self.selected_node = deployer.schedule_container(self.pool_id, **query)
        self.network_name = self.solution["Network"]

        # make sure the selected node is part of the solution network
        result = deployer.add_network_node(
            self.network_name,
            self.selected_node,
            self.pool_id,
            bot=self,
            owner=self.solution_metadata.get("owner"))
        if result:
            for wid in result["ids"]:
                success = deployer.wait_workload(
                    wid, self, breaking_node_id=self.selected_node.node_id)
                if not success:
                    raise DeploymentFailed(
                        f"Failed to add node to network {wid}", wid=wid)

        self.network_view = deployer.get_network_view(self.network_name)
        self.tcprouter_ip = self.network_view.get_free_ip(self.selected_node)
        if not self.tcprouter_ip:
            raise StopChatFlow(
                f"No available ips one for network {self.network_view.name} node {self.selected_node.node_id}"
            )

        # custom domains are managed by the user, so no subdomain workload is needed
        if self.domain_type != "Custom Domain":
            self.dom_id = deployer.create_subdomain(
                pool_id=self.domain_pool.pool_id,
                gateway_id=self.domain_gateway.node_id,
                subdomain=self.domain,
                **self.solution_metadata,
                solution_uuid=self.solution_id,
            )
            success = deployer.wait_workload(self.dom_id, self)
            if not success:
                raise DeploymentFailed(
                    f"Failed to reserve sub-domain workload {self.dom_id}",
                    solution_uuid=self.solution_id)

        # only the TRC path needs a separate reverse-proxy workload on the gateway
        if self.proxy_type == "TRC":
            self.proxy_id = deployer.create_proxy(
                pool_id=self.domain_pool.pool_id,
                gateway_id=self.domain_gateway.node_id,
                domain_name=self.domain,
                trc_secret=self.secret,
                **self.solution_metadata,
                solution_uuid=self.solution_id,
            )
            success = deployer.wait_workload(self.proxy_id, self)
            if not success:
                raise DeploymentFailed(
                    f"Failed to reserve reverse proxy workload {self.proxy_id}",
                    solution_uuid=self.solution_id)

        # optional central logging sink; channel name is derived per solution
        trc_log_config = j.core.config.get("LOGGING_SINK", {})
        if trc_log_config:
            trc_log_config[
                "channel_name"] = f"{self.threebot_name}-{self.solution_name}-trc".lower(
                )

        if self.proxy_type == "NGINX":
            # NGINX path also provisions a certificate for the domain
            self.tcprouter_id = deployer.expose_and_create_certificate(
                domain=self.domain,
                email=self.email,
                pool_id=self.pool_id,
                gateway_id=self.domain_gateway.node_id,
                network_name=self.network_name,
                solution_ip=self.solution_ip,
                solution_port=self.port,
                trc_secret=self.secret,
                bot=self,
                enforce_https=self.force_https,
                log_config=trc_log_config,
                **self.solution_metadata,
                solution_uuid=self.solution_id,
            )
        else:
            self.tcprouter_id, _ = deployer.expose_address(
                pool_id=self.pool_id,
                gateway_id=self.domain_gateway.node_id,
                network_name=self.network_name,
                local_ip=self.solution_ip,
                port=self.port,
                tls_port=self.tls_port,
                trc_secret=self.secret,
                bot=self,
                log_config=trc_log_config,
                **self.solution_metadata,
                solution_uuid=self.solution_id,
            )
        success = deployer.wait_workload(self.tcprouter_id, self)
        if not success:
            raise DeploymentFailed(
                f"Failed to reserve TCP Router container workload {self.tcprouter_id}",
                solution_uuid=self.solution_id,
                wid=self.tcprouter_id,
            )
Exemplo n.º 6
0
    def deploy_s3_minio_container(self, pool_id, ak, sk, ssh_key, scheduler,
                                  zdb_wids, solution_uuid, password):
        """Deploy a minio container backed by the given zdb workloads.

        Tries each candidate node from the scheduler in turn: joins it to the
        VDC network, then deploys the minio container on it; on any failure it
        moves on to the next node.

        Args:
            pool_id: capacity pool to deploy in.
            ak / sk: minio access and secret keys.
            ssh_key: public SSH key authorized in the container.
            scheduler: node scheduler used to pick candidate nodes.
            zdb_wids (list): workload ids of the backing zdb namespaces.
            solution_uuid (str): uuid grouping the workloads.
            password: zdb namespaces password (used to build zdb urls).

        Returns:
            the minio container workload id on success, None when no node worked.
        """
        # build the namespace connection strings minio needs
        zdb_configs = []
        self.vdc_deployer.info(f"deploying minio for zdbs: {zdb_wids}")
        for zid in zdb_wids:
            zdb_configs.append(
                deployer.get_zdb_url(
                    zid, password, identity_name=self.identity.instance_name))
        self.vdc_deployer.info(f"zdb_configs: {zdb_configs}")

        network_view = deployer.get_network_view(
            self.vdc_name, identity_name=self.identity.instance_name)
        for node in scheduler.nodes_by_capacity(cru=MINIO_CPU,
                                                mru=MINIO_MEMORY / 1024,
                                                sru=MINIO_DISK / 1024,
                                                ip_version="IPv6"):
            self.vdc_deployer.info(f"node {node.node_id} selected for minio")
            # step 1: join the node to the VDC network; skip the node on failure
            try:
                result = deployer.add_network_node(self.vdc_name, node,
                                                   pool_id, network_view,
                                                   self.bot,
                                                   self.identity.instance_name)
                if result:
                    for wid in result["ids"]:
                        success = deployer.wait_workload(
                            wid,
                            self.bot,
                            5,
                            identity_name=self.identity.instance_name,
                            cancel_by_uuid=False)
                        if not success:
                            self.vdc_deployer.error(
                                f"workload {wid} failed when adding node to network"
                            )
                            raise DeploymentFailed()
            except DeploymentFailed:
                self.vdc_deployer.error(
                    f"failed to deploy minio network on node {node.node_id}.")
                continue

            # refresh the view so the newly added node's range is visible
            network_view = network_view.copy()
            ip_address = network_view.get_free_ip(node)
            self.vdc_deployer.info(f"minio ip address {ip_address}")
            # step 2: deploy the minio container itself; skip the node on failure
            try:
                result = deployer.deploy_minio_containers(
                    pool_id,
                    self.vdc_name,
                    [node.node_id],
                    [ip_address],
                    zdb_configs,
                    ak,
                    sk,
                    ssh_key,
                    MINIO_CPU,
                    MINIO_MEMORY,
                    S3_NO_DATA_NODES,
                    S3_NO_PARITY_NODES,
                    public_ipv6=True,
                    disk_size=int(MINIO_DISK / 1024),
                    bot=self.bot,
                    identity_name=self.identity.instance_name,
                    # form_info={"chatflow": "minio"},
                    # name=self.vdc_name,
                    solution_uuid=solution_uuid,
                    description=self.vdc_deployer.description,
                )
            except DeploymentFailed as e:
                if e.wid:
                    workload = self.zos.workloads.get(e.wid)
                    self.vdc_deployer.error(
                        f"failed to deploy minio volume wid: {e.wid} on node {workload.info.node_id}"
                    )
                else:
                    self.vdc_deployer.error(
                        f"failed to deploy minio volume due to error {str(e)}")
                continue
            wid = result[0]
            try:
                success = deployer.wait_workload(
                    wid,
                    self.bot,
                    identity_name=self.identity.instance_name,
                    cancel_by_uuid=False)
                if not success:
                    raise DeploymentFailed()
                self.vdc_deployer.info(
                    f"minio container deployed successfully wid: {wid}")
                return wid
            except DeploymentFailed:
                self.vdc_deployer.error(
                    f"failed to deploy minio container wid: {wid}")
                continue
        # candidate nodes exhausted without a successful deployment
        self.vdc_deployer.error("no nodes available to deploy minio container")
Exemplo n.º 7
0
    def deploy_external_etcd(self, farm_name, no_nodes=ETCD_CLUSTER_SIZE, solution_uuid=None):
        """Deploy an external etcd cluster of ``no_nodes`` containers on a farm.

        Repeatedly picks a batch of nodes, joins them to the VDC network and
        deploys the etcd containers; on any container failure the batch is
        decommissioned and the next batch of nodes is tried.

        Args:
            farm_name (str): farm to deploy on.
            no_nodes (int): number of etcd members in the cluster.
            solution_uuid (str): optional uuid grouping the workloads; generated when omitted.

        Returns:
            list: the container ip addresses on success, None when no nodes are available.
        """
        network_view = deployer.get_network_view(self.vdc_name, identity_name=self.identity.instance_name)
        pool_id, _ = self.vdc_deployer.get_pool_id_and_reservation_id(farm_name)
        scheduler = Scheduler(pool_id=pool_id)
        nodes_generator = scheduler.nodes_by_capacity(cru=ETCD_CPU, sru=ETCD_DISK / 1024, mru=ETCD_MEMORY / 1024)
        solution_uuid = solution_uuid or uuid.uuid4().hex
        while True:
            deployment_nodes = self._add_nodes_to_network(pool_id, nodes_generator, [], no_nodes, network_view)
            if not deployment_nodes:
                self.vdc_deployer.error("no available nodes to deploy etcd cluster")
                return
            self.vdc_deployer.info(f"deploying etcd cluster on nodes {[node.node_id for node in deployment_nodes]}")

            # refresh the view so the newly added nodes' ranges are visible
            network_view = network_view.copy()
            ip_addresses = []
            node_ids = []
            # etcd initial-cluster string: "etcd_<n>=http://<ip>:2380," per member
            # (trailing comma preserved, matching the original format)
            cluster_members = []
            for idx, node in enumerate(deployment_nodes):
                address = network_view.get_free_ip(node)
                ip_addresses.append(address)
                cluster_members.append(f"etcd_{idx+1}=http://{address}:2380,")
                node_ids.append(node.node_id)
            etcd_cluster = "".join(cluster_members)

            secret_env = None
            # etcd_backup_config = j.core.config.get("VDC_S3_CONFIG", {})
            # restic_url = etcd_backup_config.get("S3_URL", "")
            # restic_bucket = etcd_backup_config.get("S3_BUCKET", "")
            # restic_ak = etcd_backup_config.get("S3_AK", "")
            # restic_sk = etcd_backup_config.get("S3_SK", "")
            # if all([self.vdc_deployer.restore, restic_url, restic_bucket, restic_ak, restic_sk]):
            #     secret_env = {
            #         "RESTIC_REPOSITORY": f"s3:{restic_url}/{restic_bucket}/{self.vdc_instance.owner_tname}/{self.vdc_instance.vdc_name}",
            #         "AWS_ACCESS_KEY_ID": restic_ak,
            #         "AWS_SECRET_ACCESS_KEY": restic_sk,
            #         "RESTIC_PASSWORD": self.vdc_deployer.password_hash,
            #     }

            # map the explorer url to a short logging channel suffix
            if "test" in j.core.identity.me.explorer_url:
                explorer = "test"
            elif "dev" in j.core.identity.me.explorer_url:
                explorer = "dev"
            else:
                explorer = "main"
            log_config = j.core.config.get("VDC_LOG_CONFIG", {})
            if log_config:
                log_config["channel_name"] = f"{self.vdc_instance.instance_name}_{explorer}"
            # all members deploy in the same pool
            pool_ids = [pool_id] * no_nodes
            wids = deployer.deploy_etcd_containers(
                pool_ids,
                node_ids,
                network_view.name,
                ip_addresses,
                etcd_cluster,
                ETCD_FLIST,
                ETCD_CPU,
                ETCD_MEMORY,
                ETCD_DISK,
                entrypoint="",
                ssh_key=self.vdc_deployer.ssh_key.public_key.strip(),
                identity_name=self.identity.instance_name,
                solution_uuid=solution_uuid,
                description=self.vdc_deployer.description,
                secret_env=secret_env,
                log_config=log_config,
            )
            try:
                for wid in wids:
                    success = deployer.wait_workload(
                        wid, self.bot, identity_name=self.identity.instance_name, cancel_by_uuid=False
                    )
                    if not success:
                        self.vdc_deployer.error(f"etcd cluster workload: {wid} failed to deploy")
                        raise DeploymentFailed()
            except DeploymentFailed:
                # roll back the partial cluster and retry with the next batch of nodes
                for wid in wids:
                    self.zos.workloads.decomission(wid)
                continue
            return ip_addresses
Exemplo n.º 8
0
    def extend_cluster(
        self,
        farm_name,
        master_ip,
        k8s_flavor,
        cluster_secret,
        ssh_keys,
        no_nodes=1,
        duration=None,
        public_ip=False,
        solution_uuid=None,
        external=True,
        nodes_ids=None,
        no_extend_pool=False,
    ):
        """
        search for a pool in the same farm and extend it or create a new one with the required capacity,
        then add ``no_nodes`` kubernetes workers of the given flavor to the cluster.

        Args:
            farm_name (str): farm to deploy on.
            master_ip: ip address of the cluster's master node.
            k8s_flavor: worker flavor; sizes come from VDC_SIZE.K8S_SIZES.
            cluster_secret: kubernetes cluster secret.
            ssh_keys (list): public SSH keys authorized on the workers.
            no_nodes (int): number of workers to add.
            duration: duration in days; defaults to the VDC's remaining lifetime (in seconds).
            public_ip (bool): reserve a public IP per worker.
            solution_uuid (str): optional uuid grouping the workloads; generated when omitted.
            external (bool): forwarded to ``_add_workers``.
            nodes_ids (list): restrict deployment to these node ids (must have capacity in the farm).
            no_extend_pool (bool): forwarded to ``_preprare_extension_pool``.

        Returns:
            list: the new worker workload ids.

        Raises:
            j.exceptions.Validation: on insufficient capacity, invalid duration or unavailable nodes.
            j.exceptions.Runtime: when adding the workers fails (the solution is cancelled first).
        """
        # exclude nodes already hosting this VDC's kubernetes nodes
        old_node_ids = []
        for k8s_node in self.vdc_instance.kubernetes:
            old_node_ids.append(k8s_node.node_id)
        cc = CapacityChecker(farm_name)
        cc.exclude_nodes(*old_node_ids)

        # check capacity once per requested worker
        for _ in range(no_nodes):
            if not cc.add_query(**VDC_SIZE.K8S_SIZES[k8s_flavor]):
                raise j.exceptions.Validation(
                    f"Not enough capacity in farm {farm_name} for {no_nodes} kubernetes nodes of flavor {k8s_flavor}"
                )

        # explicit duration is given in days; default is the seconds left until expiration
        # NOTE(review): `.timestamp` without a call assumes an arrow-style property — confirm
        duration = (
            duration * 60 * 60 * 24
            if duration
            else self.vdc_instance.expiration_date.timestamp() - j.data.time.utcnow().timestamp
        )
        if duration <= 0:
            raise j.exceptions.Validation(f"invalid duration {duration}")
        scheduler = Scheduler(farm_name=farm_name)
        scheduler.exclude_nodes(*old_node_ids)
        nodes_generator = scheduler.nodes_by_capacity(**VDC_SIZE.K8S_SIZES[k8s_flavor], public_ip=public_ip)
        if nodes_ids:
            # restrict the candidates to the requested node ids, verifying they all qualify
            nodes_generator = list(nodes_generator)
            nodes_generator_ids = [node.node_id for node in nodes_generator]
            unavailable_nodes_ids = set(nodes_ids) - set(nodes_generator_ids)
            if unavailable_nodes_ids:
                raise j.exceptions.Validation(
                    f"Some nodes: {unavailable_nodes_ids} are not in farm: {farm_name} or don't have capacity"
                )
            nodes_generator = [node for node in nodes_generator if node.node_id in nodes_ids]
        pool_id = self._preprare_extension_pool(
            farm_name, k8s_flavor, no_nodes, duration, public_ip, no_extend=no_extend_pool
        )
        network_view = deployer.get_network_view(self.vdc_name, identity_name=self.identity.instance_name)
        solution_uuid = solution_uuid or uuid.uuid4().hex
        wids = self._add_workers(
            pool_id,
            nodes_generator,
            k8s_flavor,
            cluster_secret,
            ssh_keys,
            solution_uuid,  # NOTE(review): intended to be a different uuid per extension — confirm
            network_view,
            master_ip,
            no_nodes,
            public_ip,
            external,
        )
        if not wids:
            # cancel whatever was partially deployed under this uuid before failing
            self.vdc_deployer.error(
                f"Failed to extend kubernetes cluster with {no_nodes} nodes of flavor {k8s_flavor}, vdc uuid {self.vdc_uuid}"
            )
            j.sals.reservation_chatflow.solutions.cancel_solution_by_uuid(solution_uuid)
            raise j.exceptions.Runtime(
                f"failed to extend kubernetes cluster with {no_nodes} nodes of flavor {k8s_flavor}, vdc uuid {self.vdc_uuid}"
            )
        return wids
Exemplo n.º 9
0
    def deploy_threebot(self,
                        minio_wid,
                        pool_id,
                        kube_config,
                        embed_trc=True,
                        backup_config=None,
                        zdb_farms=None,
                        cert=None):
        """Deploy the VDC threebot container, trying candidate nodes until one succeeds.

        Builds the container's env and secret-env from the VDC instance and
        global config, optionally embeds a TRC proxy, then for each candidate
        node: joins it to the VDC network, picks a free ip and deploys the
        container.

        Args:
            minio_wid: minio workload id (currently unused; see commented-out code).
            pool_id: capacity pool to deploy in.
            kube_config (str): kubeconfig content passed to the container.
            embed_trc (bool): embed a TRC proxy (uses THREEBOT_VDC_FLIST and prepares a proxy).
            backup_config (dict): optional backup configuration serialized into the container.
            zdb_farms (list): farms for S3 auto-topup; defaults to S3_AUTO_TOPUP_FARMS.
            cert: optional certificate object (cert / private_key / fullchain attributes).

        Returns:
            the container workload id on success, None when proxy preparation or
            every candidate node fails.
        """
        backup_config = backup_config or {}
        etcd_backup_config = j.core.config.get("VDC_S3_CONFIG", {})
        flist = THREEBOT_VDC_FLIST if embed_trc else THREEBOT_FLIST
        # workload = self.zos.workloads.get(minio_wid)
        # if workload.info.workload_type != WorkloadType.Container:
        #     raise j.exceptions.Validation(f"workload {minio_wid} is not container workload")
        # minio_ip_address = workload.network_connection[0].ipaddress
        # strip the heavy sub-objects before embedding the VDC instance in the env
        vdc_dict = self.vdc_instance.to_dict()
        vdc_dict.pop("s3", None)
        vdc_dict.pop("kubernetes", None)
        vdc_dict.pop("threebot", None)
        # values that must not appear in the plain env
        secret_env = {
            "BACKUP_CONFIG":
            j.data.serializers.json.dumps(backup_config),
            "VDC_OWNER_TNAME":
            self.vdc_deployer.tname,
            "VDC_EMAIL":
            self.vdc_deployer.email,
            "VDC_PASSWORD_HASH":
            self.vdc_deployer.vdc_instance.get_password(),
            "KUBE_CONFIG":
            kube_config,
            "PROVISIONING_WALLET_SECRET":
            self.vdc_deployer.vdc_instance.provision_wallet.secret,
            "PREPAID_WALLET_SECRET":
            self.vdc_deployer.vdc_instance.prepaid_wallet.secret,
            "VDC_INSTANCE":
            j.data.serializers.json.dumps(vdc_dict),
            "THREEBOT_PRIVATE_KEY":
            self.vdc_deployer.ssh_key.private_key.strip(),
            "S3_URL":
            etcd_backup_config.get("S3_URL", ""),
            "S3_BUCKET":
            etcd_backup_config.get("S3_BUCKET", ""),
            "S3_AK":
            etcd_backup_config.get("S3_AK", ""),
            "S3_SK":
            etcd_backup_config.get("S3_SK", ""),
        }

        if cert:
            secret_env["CERT"] = cert.cert
            secret_env["CERT_PRIVATE_KEY"] = cert.private_key
            secret_env["CERT_FULLCHAIN"] = cert.fullchain

        env = {
            "VDC_NAME":
            self.vdc_name,
            "MONITORING_SERVER_URL":
            j.config.get("MONITORING_SERVER_URL", ""),
            "VDC_UUID":
            self.vdc_uuid,
            "EXPLORER_URL":
            j.core.identity.me.explorer_url,
            "VDC_S3_MAX_STORAGE":
            str(
                int(VDC_SIZE.S3_ZDB_SIZES[VDC_SIZE.VDC_FLAVORS[
                    self.vdc_deployer.flavor]["s3"]["size"]]["sru"] *
                    (1 + (S3_NO_PARITY_NODES /
                          (S3_NO_DATA_NODES + S3_NO_PARITY_NODES))))),
            "S3_AUTO_TOPUP_FARMS":
            ",".join(S3_AUTO_TOPUP_FARMS.get())
            if not zdb_farms else ",".join(zdb_farms),
            "NETWORK_FARMS":
            ",".join(NETWORK_FARMS.get()),
            "COMPUTE_FARMS":
            ",".join(COMPUTE_FARMS.get()),
            # "VDC_MINIO_ADDRESS": minio_ip_address,
            "SDK_VERSION":
            self.branch,
            "SSHKEY":
            self.vdc_deployer.ssh_key.public_key.strip(),
            "MINIMAL":
            "true",
            "TEST_CERT":
            "true" if j.core.config.get("TEST_CERT") else "false",
            "ACME_SERVER_URL":
            self.acme_server_url,
        }
        if embed_trc:
            # prepare the embedded TRC proxy; bail out when no remote is available
            _, secret, remote = self._prepare_proxy()
            if not remote:
                return
            remote_ip, remote_port = remote.split(":")
            env.update({
                "REMOTE_IP": remote_ip,
                "REMOTE_PORT": remote_port,
            })
            secret_env["TRC_SECRET"] = secret
        if not self.vdc_instance.kubernetes:
            self.vdc_instance.load_info()

        scheduler = Scheduler(pool_id=pool_id)
        # try candidate nodes one by one until the container deploys
        for node in scheduler.nodes_by_capacity(THREEBOT_CPU,
                                                THREEBOT_DISK / 1024,
                                                THREEBOT_MEMORY / 1024):
            network_view = deployer.get_network_view(
                self.vdc_name, identity_name=self.identity.instance_name)
            self.vdc_deployer.info(
                f"VDC threebot: node {node.node_id} selected")
            result = deployer.add_network_node(network_view.name, node,
                                               pool_id, network_view, self.bot,
                                               self.identity.instance_name)

            self.vdc_deployer.info(
                f"VDC threebot network update result for node {node.node_id} is {result}"
            )
            if result:
                # all network-update workloads must succeed before continuing
                network_updated = True
                try:
                    for wid in result["ids"]:
                        success = deployer.wait_workload(
                            wid,
                            self.bot,
                            expiry=5,
                            breaking_node_id=node.node_id,
                            identity_name=self.identity.instance_name,
                            cancel_by_uuid=False,
                        )
                        network_updated = network_updated and success
                    if not network_updated:
                        raise DeploymentFailed()
                except DeploymentFailed:
                    self.vdc_deployer.error(
                        f"Failed to deploy network on node {node.node_id}")
                    continue
            # refresh the view so the newly added node's range is visible
            network_view = network_view.copy()
            ip_address = network_view.get_free_ip(node)
            self.vdc_deployer.info(
                f"VDC threebot container ip address {ip_address}")
            if not ip_address:
                continue
            # map the explorer url to a short logging channel suffix
            explorer = None
            if "test" in j.core.identity.me.explorer_url:
                explorer = "test"
            elif "dev" in j.core.identity.me.explorer_url:
                explorer = "dev"
            else:
                explorer = "main"

            log_config = j.core.config.get("VDC_LOG_CONFIG", {})
            if log_config:
                log_config[
                    "channel_name"] = f"{self.vdc_instance.instance_name}_{explorer}"

            wid = deployer.deploy_container(
                pool_id=pool_id,
                node_id=node.node_id,
                network_name=network_view.name,
                ip_address=ip_address,
                flist=flist,
                env=env,
                cpu=THREEBOT_CPU,
                memory=THREEBOT_MEMORY,
                disk_size=THREEBOT_DISK,
                secret_env=secret_env,
                identity_name=self.identity.instance_name,
                description=self.vdc_deployer.description,
                form_info={
                    "chatflow": "threebot",
                    "Solution name": self.vdc_name
                },
                solution_uuid=self.vdc_uuid,
                log_config=log_config,
            )
            self.vdc_deployer.info(f"VDC threebot container wid: {wid}")
            try:
                success = deployer.wait_workload(
                    wid,
                    self.bot,
                    identity_name=self.identity.instance_name,
                    cancel_by_uuid=False)
                if success:
                    return wid
                raise DeploymentFailed()
            except DeploymentFailed:
                # try the next candidate node
                self.vdc_deployer.error(
                    f"failed to deploy threebot container on node: {node.node_id} wid: {wid}"
                )
                continue
Exemplo n.º 10
0
    def add_nodes(self):
        """Extend an existing kubernetes cluster with ``self.nodes_count`` workers.

        Resolves the master workload's metadata and pool, selects nodes not
        already used by the solution, joins each to the cluster network and
        deploys a worker (optionally with a public IP). Failed workers (and
        their public IPs) are decommissioned; the chatflow is stopped when
        nothing (or not everything) deploys.

        Raises:
            StopChatFlow: when a node can't join the network, no free ip is
                available, or deployment fails partially or completely.
        """
        zos = j.sals.zos.get()
        workload = zos.workloads.get(self.master_wid)
        metadata = j.sals.reservation_chatflow.reservation_chatflow.decrypt_reservation_metadata(
            workload.info.metadata)
        metadata = j.data.serializers.json.loads(metadata)
        pool_id = workload.info.pool_id
        # nodes already used by this solution must be excluded from the selection
        old_wids = j.sals.marketplace.solutions.get_workloads_by_uuid(
            metadata.get("solution_uuid"))
        old_nodes = [
            wid.info.node_id for wid in old_wids
            if wid.info.result.state == State.Ok
        ]
        if self.enable_public_ip:
            self.node_query["ipv4u"] = self.nodes_count
        # over-ask by len(old_nodes) so enough fresh nodes remain after filtering
        nodes, pools = deployer.ask_multi_pool_distribution(
            self, self.nodes_count + len(old_nodes), self.node_query)
        selected_nodes = [
            (node, pool) for node, pool in zip(nodes, pools)
            if node.node_id not in old_nodes
        ]
        if len(selected_nodes) < self.nodes_count:
            self.stop(
                f"Failed to find resources to deploy {self.nodes_count}, available nodes are: {len(selected_nodes)}"
            )
        new_nodes = selected_nodes[:self.nodes_count]
        network_view = deployer.get_network_view(workload.network_id)
        master_ip = workload.ipaddress

        self.reservations = []
        for node, pool_id in new_nodes:
            # make sure the node is part of the cluster network
            res = deployer.add_network_node(workload.network_id, node, pool_id)
            if res:
                for wid in res["ids"]:
                    success = deployer.wait_workload(
                        wid, breaking_node_id=node.node_id)
                    if not success:
                        raise StopChatFlow(
                            f"Failed to add node {node.node_id} to network {wid}"
                        )
            network_view = network_view.copy()
            ip_address = network_view.get_free_ip(node)
            if not ip_address:
                # BUGFIX: the original referenced undefined names `network_name`
                # and `node_id` here, raising NameError instead of StopChatFlow
                raise StopChatFlow(
                    f"No free IPs for network {workload.network_id} on the specified node"
                    f" {node.node_id}")

            self.md_show_update(f"Deploying worker on node {node.node_id}")
            # Add public ip
            public_id_wid = 0
            if self.enable_public_ip:
                public_id_wid, _ = deployer.create_public_ip(
                    pool_id,
                    node.node_id,
                    solution_uuid=metadata.get("solution_uuid"))

            self.reservations.append(
                deployer.deploy_kubernetes_worker(
                    pool_id,
                    node.node_id,
                    workload.network_id,
                    workload.cluster_secret,
                    workload.ssh_keys,
                    ip_address,
                    master_ip,
                    size=self.cluster_size,
                    identity_name=None,
                    description="",
                    public_ip_wid=public_id_wid,
                    **metadata,
                ))

        self.success_workload_count = 0
        for resv in self.reservations:
            try:
                # NOTE(review): the wait result is not checked; a False return
                # (no exception) still counts as success — confirm wait_workload
                # raises DeploymentFailed on failure here
                deployer.wait_workload(resv, self, cancel_by_uuid=False)
                self.success_workload_count += 1
            except DeploymentFailed as ex:
                # Cleaning k8s workloads and public IP workloads in case of failure in deployment
                failed_workload = zos.workloads.get(resv)
                if failed_workload.public_ip:
                    zos.workloads.decomission(failed_workload.public_ip)
                # BUGFIX: the original decommissioned the stale `wid` left over
                # from the network loop instead of the failed reservation
                zos.workloads.decomission(resv)
                j.logger.error(
                    f"Failed to deploy  workloads for {resv}, the error: {str(ex)}"
                )

        if not self.success_workload_count:
            raise StopChatFlow(
                msg="Can't extend your cluster, please try again later")

        if self.success_workload_count < len(self.reservations):
            raise StopChatFlow(
                msg=
                f"Some nodes failed to extend, {self.success_workload_count} of {self.nodes_count}, please try again later"
            )