Exemplo n.º 1
0
    def _recover_auto_scaling_groups(self):
        """
        This steps does the following:
            - fetch the previously restored auto scaling group config. If this config cannot be found,
              we can assume that all autoscaling groups have correct configurations. This could happen
              when previous restart failed in the middle but passed this stage already, or the cluster is
              not even paused
            - Wait for all instances to be in service
        :return:
        """
        # Get previously persisted asg status
        logger.info("Fetching last cluster status ...")
        cluster_status_raw = self._cluster_info.download_cluster_status_before_pause(
        )

        asg_mgr = AXUserASGManager(cluster_name_id=self._name_id,
                                   aws_profile=self._cfg.cloud_profile,
                                   region=self._cluster_config.get_region())

        if cluster_status_raw:
            logger.info("Found last cluster status, restoring cluster ...")
            cluster_status = yaml.load(cluster_status_raw)
            all_asg_statuses = cluster_status["asg_status"]

            # Restore minions
            for asg_name in all_asg_statuses.keys():
                asg_status = all_asg_statuses[asg_name]
                min_size = asg_status["min_size"]
                max_size = asg_status["max_size"]
                desired = asg_status["desired_capacity"]
                self._total_nodes += desired
                logger.info(
                    "Recovering autoscaling group %s. Min: %s, Max: %s, Desired: %s",
                    asg_name, min_size, max_size, desired)
                asg_mgr.set_asg_spec(name=asg_name,
                                     minsize=min_size,
                                     maxsize=max_size,
                                     desired=desired)

            logger.info("Waiting for all auto scaling groups to scale up ...")
            asg_mgr.wait_for_desired_asg_state()
            logger.info("%sAll cluster instances are in service%s",
                        COLOR_GREEN, COLOR_NORM)

            # Delete previously stored cluster status
            self._cluster_info.delete_cluster_status_before_pause()
        else:
            all_asgs = asg_mgr.get_all_asgs()
            for asg in all_asgs:
                self._total_nodes += asg["DesiredCapacity"]

            logger.info(
                "Cannot find last cluster status, cluster already resumed with %s nodes",
                self._total_nodes)
Exemplo n.º 2
0
    def _scale_down_auto_scaling_groups(self):
        """
        This step:
            - Persist autoscaling group states to S3,
            - Scale down all autoscaling groups to zero,
            - Wait for all minion to be terminated
        :return:
        """
        logger.info("Discovering autoscaling groups")
        asg_mgr = AXUserASGManager(cluster_name_id=self._name_id,
                                   aws_profile=self._cfg.cloud_profile,
                                   region=self._cluster_config.get_region())
        all_asgs = asg_mgr.get_all_asgs()

        # Generate cluster status before pause. This is used to recover same amount of nodes
        # when we want to restart cluster
        cluster_status = {"asg_status": {}}
        for asg in all_asgs:
            cluster_status["asg_status"][asg["AutoScalingGroupName"]] = {
                "min_size": asg["MinSize"],
                "max_size": asg["MaxSize"],
                "desired_capacity": asg["DesiredCapacity"]
            }
        self._cluster_info.upload_cluster_status_before_pause(
            status=yaml.dump(cluster_status))

        # Scale down asg
        logger.info("Scaling down autoscaling groups ...")
        for asg in all_asgs:
            asg_name = asg["AutoScalingGroupName"]
            asg_mgr.set_asg_spec(name=asg_name, minsize=0, maxsize=0)

        # Waiting for nodes to be terminated
        logger.info("Waiting for all auto scaling groups to scale down ...")
        asg_mgr.wait_for_desired_asg_state()
        logger.info("%sAll cluster nodes are terminated%s", COLOR_GREEN,
                    COLOR_NORM)