예제 #1
0
    def run(self, state_table, scheduler_config, logger, lambda_account=None, context=None):
        """
        Executes one scheduling pass of the service for every account yielded by the _accounts property
        :param state_table: name of the instance state table
        :param scheduler_config: scheduler configuration data
        :param logger: logger to log output of scheduling process
        :param lambda_account: lambda account
        :param context: Lambda context
        :return: dictionary that maps the name of every processed account to the result of _process_account for it
        """

        # keep the arguments of this run on the instance, the other methods read them from there
        self._lambda_account, self._configuration = lambda_account, scheduler_config
        self._logger, self._context = logger, context
        # debug output of the logger follows the trace setting of the configuration
        self._logger.debug_enabled = self._configuration.trace

        # stored instance desired states
        service_name = self._service.service_name
        self._instance_states = InstanceStates(state_table, service_name, self._logger, self._context)

        # the metrics are created with the current UTC time
        self._schedule_metrics = SchedulerMetrics(datetime.utcnow(), self._context)

        # response to caller, one entry per processed account
        response = {account.name: self._process_account(account) for account in self._accounts}

        # usage metrics are only sent if sending them is allowed
        if allow_send_metrics():
            self._send_usage_metrics()

        return response
예제 #2
0
class InstanceScheduler:
    """
    Implements scheduler logic
    """

    def __init__(self, service, scheduler_configuration):
        """
        Initializes instance of instance scheduler
        :param service: service strategy that handles the actual listing, starting and stopping of the instances of that service
        :param scheduler_configuration: scheduler configuration, passed to the service strategy when instances are
        resized, started or stopped
        """
        self._service = service
        self._instance_states = None
        self._schedule_metrics = None
        self._sts_client = None
        self._scheduled_instances = []
        # _configuration, _logger and _context are set for every run by the run method
        self._configuration = None
        self._scheduler_start_list = []
        self._scheduler_stop_list = []
        self._schedule_resize_list = []
        self._scheduler_configuration = scheduler_configuration
        self._stack_name = os.getenv(configuration.ENV_STACK, "")
        self._lambda_account = os.getenv(configuration.ENV_ACCOUNT)
        self._logger = None
        self._context = None

        # valid regions for service, modified for gov-cloud per
        # https://github.com/awslabs/aws-instance-scheduler/issues/11
        # (this replaces the lookup boto3.Session().get_available_regions(service.service_name)).
        # This must be a list and not a plain string: the _regions property tests configured regions with "in",
        # which for a string is a substring test that would accept values like "us-gov" or "west-1".
        self._valid_regions = ["us-gov-west-1"]

        # number of started, stopped and resized instances per instance type, filled by _collect_usage_metrics
        self._usage_metrics = {"Started": {}, "Stopped": {}, "Resized": {}}

    @property
    def _regions(self):

        if len(self._configuration.regions) > 0:
            result = []
            regions = self._configuration.regions
            for r in regions:
                if r not in self._valid_regions:
                    self._logger.error(ERR_INVALID_REGION, r)
                else:
                    result.append(r)
            return result

        # no regions, use region of lambda function
        return [boto3.Session().region_name]

    @property
    def _sts(self):
        if self._sts_client is None:
            self._sts_client = get_client_with_retries("sts", ["assume_role"], self._context)
        return self._sts_client

    @property
    def _accounts(self):

        def get_session_for_account(cross_account_role, aws_account):
            # get a token for the cross account role and use it to create a session
            try:
                session_name = "{}-scheduler-{}".format(self._service.service_name, aws_account)
                # assume a role
                token = self._sts.assume_role_with_retries(RoleArn=cross_account_role, RoleSessionName=session_name)
                credentials = token["Credentials"]
                # create a session using the assumed role credentials
                return boto3.Session(aws_access_key_id=credentials["AccessKeyId"],
                                     aws_secret_access_key=credentials["SecretAccessKey"],
                                     aws_session_token=credentials["SessionToken"])
            except Exception as ex:
                self._logger.error(ERR_ASSUMING_ROLE.format(cross_account_role, aws_account, str(ex)))
                return None

        # keep track of accounts processed
        accounts_done = []

        # return session for lambda account if processing instances in that account
        if self._configuration.schedule_lambda_account:
            accounts_done.append(self._lambda_account)
            yield as_namedtuple("Account", {"session": boto3.Session(), "name": self._lambda_account, "role": None})

        # iterate through cross account roles
        for role in self._configuration.cross_account_roles:

            # get the account
            role_elements = role.split(":")
            if len(role_elements) < 5:
                self._logger.error(ERR_INVALID_ARN, role)
                continue

            # test if account already processed
            account = role_elements[4]
            if account in accounts_done:
                self._logger.warning(WARN_DUPLICATE_ACCOUNT, account, role)
                continue

            # get a session for the role
            session = get_session_for_account(role, account)
            if session is not None:
                yield as_namedtuple("Account", {"session": session, "name": account, "role": role})

    def _instance_display_str(self, inst_id, name):
        s = "{}:{}".format(self._service.service_name.upper(), inst_id)
        if name:
            s += " ({})".format(name)
        return s

    def _scheduled_instances_in_region(self, account, region):
        """
        Generates the instances of the service that can be scheduled for an account in a region
        :param account: account to get the instances for, its session, name and role are passed to the service strategy
        :param region: region to get the instances in
        :return: generator of named tuples, one for every instance returned by the service strategy. The instance data
        is extended with the account, region, service and display string of the instance and converted with
        as_namedtuple, passing "tags" as excluded
        """

        # use service strategy to get a list of instances that can be scheduled for that service
        list_instances = self._service.get_schedulable_instances
        query = {
            schedulers.PARAM_SESSION: account.session,
            schedulers.PARAM_ACCOUNT: account.name,
            schedulers.PARAM_ROLE: account.role,
            schedulers.PARAM_REGION: region,
            schedulers.PARAM_TRACE: self._configuration.trace,
            schedulers.PARAM_TAG_NAME: self._configuration.tag_name,
            schedulers.PARAM_LOGGER: self._logger,
            schedulers.PARAM_CONTEXT: self._context
        }

        for instance in list_instances(**query):
            service_name = self._service.service_name
            instance.update({
                "account": account.name,
                "region": region,
                "service": service_name,
                "instance_str": self._instance_display_str(instance["id"], instance["name"])
            })
            yield as_namedtuple(service_name + "Instance", instance, excludes=["tags"])

    def run(self, state_table, scheduler_config, logger, lambda_account=None, context=None):
        """
        Executes one scheduling pass of the service for every account yielded by the _accounts property
        :param state_table: name of the instance state table
        :param scheduler_config: scheduler configuration data
        :param logger: logger to log output of scheduling process
        :param lambda_account: lambda account
        :param context: Lambda context
        :return: dictionary that maps the name of every processed account to the result of _process_account for it
        """

        # keep the arguments of this run on the instance, the other methods read them from there
        self._lambda_account, self._configuration = lambda_account, scheduler_config
        self._logger, self._context = logger, context
        # debug output of the logger follows the trace setting of the configuration
        self._logger.debug_enabled = self._configuration.trace

        # stored instance desired states
        service_name = self._service.service_name
        self._instance_states = InstanceStates(state_table, service_name, self._logger, self._context)

        # the metrics are created with the current UTC time
        self._schedule_metrics = SchedulerMetrics(datetime.utcnow(), self._context)

        # response to caller, one entry per processed account
        response = {account.name: self._process_account(account) for account in self._accounts}

        # usage metrics are only sent if sending them is allowed
        if allow_send_metrics():
            self._send_usage_metrics()

        return response

    def _process_account(self, account):
        """
        Processes the instances of the service in an account, for every region returned by the _regions property
        :param account: account to process, its name, role and session are used
        :return: dictionary with a "started" and a "stopped" entry, and a "resized" entry if the service allows
        resizing. Every entry maps a region to a list of dictionaries, keyed by instance id, that hold the schedule
        name of the instance and for resized instances also the old and new instance type
        """

        # processes instances for a service in an account
        started_instances = {}
        stopped_instances = {}
        resized_instances = {}

        self._logger.info(INF_PROCESSING_ACCOUNT,
                          self._service.service_name.upper(), account.name, " using role " + account.role if account.role else "",
                          ", ".join(self._configuration.regions))

        # gets the desired state and type of an instance, a maintenance window in which the instance must be
        # running takes precedence over the schedule
        def get_desired_state_and_type(schedule, inst):

            # test if the instance has a maintenance window in which it must be running
            if inst.maintenance_window is not None and schedule.use_maintenance_window is True:
                self._logger.info(INF_MAINTENANCE_WINDOW)

                # get the desired state for the maintenance window at current UTC time
                inst_state, inst_type, _ = inst.maintenance_window.get_desired_state(
                    inst, logger=self._logger, dt=datetime.utcnow().replace(tzinfo=pytz.timezone("UTC")))

                # if we're in the maintenance window return running state
                if inst_state == InstanceSchedule.STATE_RUNNING:
                    return inst_state, inst_type

            # based on the schedule get the desired state and instance type for this instance
            inst_state, inst_type, _ = schedule.get_desired_state(inst, logger=self._logger)

            return inst_state, inst_type

        for region in self._regions:

            state_loaded = False
            instances = []

            # the start, stop and resize lists are rebuilt for every region
            self._scheduler_start_list = []
            self._scheduler_stop_list = []
            self._schedule_resize_list = []

            for instance in self._scheduled_instances_in_region(account, region):

                # delay loading instance state until first instance is returned
                if not state_loaded:
                    self._instance_states.load(account.name, region)
                    state_loaded = True

                instances.append(instance)

                # handle terminated instances
                if instance.is_terminated:
                    self._logger.debug(DEBUG_SKIPPING_TERMINATED_INSTANCE, instance.instance_str, region, instance.account)
                    self._instance_states.delete_instance_state(instance.id)
                    continue

                # get the schedule for this instance
                instance_schedule = self._configuration.get_schedule(instance.schedule_name)
                if not instance_schedule:
                    self._logger.warning(WARN_SKIPPING_UNKNOWN_SCHEDULE, instance.instance_str, region, instance.account,
                                         instance.schedule_name)
                    continue

                self._logger.debug(DEBUG_INSTANCE_HEADER, instance.instance_str)
                self._logger.debug(DEBUG_CURRENT_INSTANCE_STATE, instance.current_state, instance.instancetype,
                                   instance_schedule.name)

                # based on the schedule get the desired state and instance type for this instance
                desired_state, desired_type = get_desired_state_and_type(instance_schedule, instance)

                # get the  previous desired instance state
                last_desired_state = self._instance_states.get_instance_state(instance.id)
                self._logger.debug(DEBUG_CURRENT_AND_DESIRED_STATE, instance_schedule.name, desired_state, last_desired_state,
                                   instance.current_state, INF_DESIRED_TYPE.format(desired_type) if desired_type else "")

                # an unknown last desired state means this is the first time the instance is seen by the scheduler.
                # The states are compared by value, an identity test only matches if the stored state happens to
                # be the very same object as the constant
                if last_desired_state == InstanceSchedule.STATE_UNKNOWN:
                    # new instances that are running are optionally not stopped to allow them to finish possible
                    # initialization, note that no schedule metrics are added for an instance that is skipped here
                    if (instance.is_running and desired_state == InstanceSchedule.STATE_STOPPED
                            and not instance_schedule.stop_new_instances):
                        self._logger.debug(DEBUG_NEW_INSTANCE, instance.instance_str)
                        continue
                    self._process_new_desired_state(account, region, instance, desired_state, desired_type, last_desired_state,
                                                    instance_schedule.retain_running)

                # existing instance

                # if enforced check the actual state with the desired state enforcing the schedule state
                elif instance_schedule.enforced:
                    if (instance.is_running and desired_state == InstanceSchedule.STATE_STOPPED) or (
                                not instance.is_running and desired_state == InstanceSchedule.STATE_RUNNING):
                        self._logger.debug(DEBUG_ENFORCED_STATE, instance.instance_str,
                                           InstanceSchedule.STATE_RUNNING
                                           if instance.is_running
                                           else InstanceSchedule.STATE_STOPPED,
                                           desired_state)
                        self._process_new_desired_state(account, region, instance, desired_state, desired_type, last_desired_state,
                                                        instance_schedule.retain_running)
                # if not enforced then only act when the desired state of the schedule changed, so the state of a
                # manually started/stopped instance is honored until the next change
                elif last_desired_state != desired_state:
                    self._process_new_desired_state(account, region, instance, desired_state, desired_type, last_desired_state,
                                                    instance_schedule.retain_running)

                self._schedule_metrics.add_schedule_metrics(self._service.service_name, instance_schedule, instance)

            # process lists of instances that must be started or stopped
            self._start_and_stop_instances(account, region=region)

            # cleanup desired instance states and save
            # NOTE(review): this also runs when no instance was returned for the region, in which case load() was not
            # called for this account and region - confirm that cleanup() and save() are safe in that case
            self._instance_states.cleanup([i.id for i in instances])
            self._instance_states.save()

            # build output structure, hold started, stopped and resized instances per region
            if len(self._scheduler_start_list) > 0:
                started_instances[region] = [{i.id: {"schedule": i.schedule_name}} for i in self._scheduler_start_list]
            if len(self._scheduler_stop_list) > 0:
                stopped_instances[region] = [{i.id: {"schedule": i.schedule_name}} for i in self._scheduler_stop_list]
            if len(self._schedule_resize_list) > 0:
                # items of the resize list are (instance, new type) tuples
                resized_instances[region] = [{i[0].id: {"schedule": i[0].schedule_name, "old": i[0].instancetype, "new": i[1]}} for
                                             i in self._schedule_resize_list]
            if allow_send_metrics():
                self._collect_usage_metrics()

        # put cloudwatch metrics
        if self._configuration.use_metrics:
            self._schedule_metrics.put_schedule_metrics()

        # output data
        result = {"started": started_instances, "stopped": stopped_instances}
        if self._service.allow_resize:
            result["resized"] = resized_instances
        return result

    def _send_usage_metrics(self):
        for s in self._usage_metrics.keys():
            if len(self._usage_metrics[s]) == 0:
                del self._usage_metrics[s]
        if len(self._usage_metrics) > 0:
            self._usage_metrics["Service"] = self._service.service_name
            send_metrics_data(self._usage_metrics, logger=self._logger)

    def _collect_usage_metrics(self):

        for i in self._scheduler_start_list:
            if i.id in [r[0].id for r in self._schedule_resize_list]:
                instance_type = [r[1] for r in self._schedule_resize_list if r[0].id == i.id][0]
            else:
                instance_type = i.instancetype
            if instance_type in self._usage_metrics["Started"]:
                self._usage_metrics["Started"][instance_type] += 1
            else:
                self._usage_metrics["Started"][instance_type] = 1

        for i in self._scheduler_stop_list:
            if i.instancetype in self._usage_metrics["Stopped"]:
                self._usage_metrics["Stopped"][i.instancetype] += 1
            else:
                self._usage_metrics["Stopped"][i.instancetype] = 1

        for i in self._schedule_resize_list:
            type_change = "{}-{}".format(i[0].instancetype, i[1])
            if type_change in self._usage_metrics["Resized"]:
                self._usage_metrics["Resized"][type_change] += 1
            else:
                self._usage_metrics["Resized"][type_change] = 1

    # handle new state of an instance
    def _process_new_desired_state(self, account, region, instance, desired_state, desired_type, last_desired_state,
                                   retain_running):
        """
        Handles the new desired state of an instance. Depending on the desired state, the last desired state and
        whether the instance is running, the instance is added to the scheduler start or stop list or only its
        desired state is stored
        :param account: account of the instance, used when the instance is resized
        :param region: region of the instance, used when the instance is resized
        :param instance: the instance to handle
        :param desired_state: desired state of the instance
        :param desired_type: desired instance type, None if there is no desired type
        :param last_desired_state: last desired state that was stored for the instance
        :param retain_running: if set, the retain-running state is stored for an instance that is already running
        when its desired state changes from stopped to running
        :return:
        """

        def resize_wanted_and_allowed():
            # no desired type, or the instance already has that type
            if desired_type is None or instance.instancetype == desired_type:
                return False
            if instance.allow_resize:
                return True
            self._logger.warning(WARN_RESIZE_NOT_SUPPORTED, instance.instance_str, instance.instancetype)
            return False

        def change_instance_type(inst, new_type):
            try:
                # adjust instance type before starting using the resize_instance method in the service_strategy
                resize_args = {
                    schedulers.PARAM_SESSION: account.session,
                    schedulers.PARAM_ACCOUNT: account.name,
                    schedulers.PARAM_ROLE: account.role,
                    schedulers.PARAM_REGION: region,
                    schedulers.PARAM_TRACE: self._configuration.trace,
                    schedulers.PARAM_INSTANCE: instance,
                    schedulers.PARAM_DESIRED_TYPE: new_type,
                    schedulers.PARAM_LOGGER: self._logger,
                    schedulers.PARAM_CONTEXT: self._context,
                    schedulers.PARAM_CONFIG: self._scheduler_configuration
                }
                self._service.resize_instance(**resize_args)
                self._schedule_resize_list.append((inst, new_type))
            except Exception as ex:
                # a failed resize is logged and the instance is not added to the resize list.
                # NOTE(review): the caller still adds the instance to the start list - confirm that is intended
                self._logger.error(ERR_SETTING_INSTANCE_TYPE, str(ex))

        store_state = self._instance_states.set_instance_state

        # last desired status was saved as retain-running
        if last_desired_state == InstanceSchedule.STATE_RETAIN_RUNNING:

            # don't change last desired state whilst in a running period
            if desired_state == InstanceSchedule.STATE_RUNNING:
                return

            # at the end of the running period save the stopped state but keep the instance running
            if desired_state == InstanceSchedule.STATE_STOPPED:
                self._logger.debug(INF_DO_NOT_STOP_RETAINED_INSTANCE, instance.id, InstanceSchedule.STATE_STOPPED)
                store_state(instance.id, InstanceSchedule.STATE_STOPPED)
                return

            # just save new desired state
            store_state(instance.id, desired_state)
            return

        if desired_state == InstanceSchedule.STATE_RUNNING:

            if instance.is_running:
                # instance already running with desired state of running, the stored state only changes if the
                # last desired state was stopped
                if last_desired_state == InstanceSchedule.STATE_STOPPED:
                    if retain_running:
                        # retain running option is used, save desired state as retained running
                        self._logger.debug(DEBUG_APPLY_RETAIN_RUNNING_STATE, desired_state, instance.id,
                                           InstanceSchedule.STATE_RETAIN_RUNNING)
                        store_state(instance.id, InstanceSchedule.STATE_RETAIN_RUNNING)
                    else:
                        # instance is running, set last desired state from stopped to started
                        store_state(instance.id, InstanceSchedule.STATE_RUNNING)
                return

            # instance must be started, log it with the type it is started with
            inst_type = instance.instancetype if desired_type is None else desired_type
            self._logger.debug(DEBUG_STARTED_REGION_INSTANCES, instance.instance_str, instance.region, inst_type)

            # for instances to be started test if resizing is required
            if resize_wanted_and_allowed():
                change_instance_type(instance, desired_type)

            # append instance to list of instances to start
            self._scheduler_start_list.append(instance)
            return

        if desired_state == InstanceSchedule.STATE_STOPPED:
            if instance.is_running:
                # instance needs to be stopped, append it to list of instances to stop
                self._logger.debug(DEBUG_STOPPED_REGION_INSTANCES, instance.instance_str, instance.region)
                self._scheduler_stop_list.append(instance)
            else:
                # instance is not running, just save new desired state
                store_state(instance.id, InstanceSchedule.STATE_STOPPED)
            return

        # any other desired state is only saved
        store_state(instance.id, desired_state)

    # start and stop listed instances
    def _start_and_stop_instances(self, account, region):

        if len(self._scheduler_start_list) > 0:
            self._logger.info(INF_STARTING_INSTANCES, ", ".join([i.instance_str for i in self._scheduler_start_list]),
                              region)

            for inst_id, state in self._service.start_instances(**{
                schedulers.PARAM_SESSION: account.session,
                schedulers.PARAM_ACCOUNT: account.name,
                schedulers.PARAM_ROLE: account.role,
                schedulers.PARAM_REGION: region,
                schedulers.PARAM_TRACE: self._configuration.trace,
                schedulers.PARAM_STARTED_INSTANCES: self._scheduler_start_list,
                schedulers.PARAM_LOGGER: self._logger,
                schedulers.PARAM_CONTEXT: self._context,
                schedulers.PARAM_STACK: self._stack_name,
                schedulers.PARAM_CONFIG: self._scheduler_configuration
            }):
                # set state based on returned state from start action
                self._instance_states.set_instance_state(inst_id, state)

        if len(self._scheduler_stop_list) > 0:
            self._logger.info(INF_STOPPED_INSTANCES, ", ".join([i.instance_str for i in self._scheduler_stop_list]),
                              region)
            for inst_id, state in self._service.stop_instances(**{
                schedulers.PARAM_SESSION: account.session,
                schedulers.PARAM_ACCOUNT: account.name,
                schedulers.PARAM_ROLE: account.role,
                schedulers.PARAM_REGION: region,
                schedulers.PARAM_TRACE: self._configuration.trace,
                schedulers.PARAM_STOPPED_INSTANCES: self._scheduler_stop_list,
                schedulers.PARAM_LOGGER: self._logger,
                schedulers.PARAM_CONTEXT: self._context,
                schedulers.PARAM_STACK: self._stack_name,
                schedulers.PARAM_CONFIG: self._scheduler_configuration
            }):
                # set state based on start of stop action
                self._instance_states.set_instance_state(inst_id, state)