Example #1
    def get_pods_to_schedule(self, pods):
        """
        given a list of KubePod objects,
        return a map of (selectors hash -> pods) to be scheduled
        """

        pending_unassigned_pods = [
            p for p in pods
            if p.is_pending_unassigned_and_scaleworthy(self.scale_label)
        ]

        # we only consider a pod to be schedulable if it's pending and
        # unassigned and feasible
        pods_to_schedule = {}
        for pod in pending_unassigned_pods:
            if capacity.is_possible(pod):
                pods_to_schedule.setdefault(
                    utils.selectors_to_hash(pod.selectors), []).append(pod)
            else:
                recommended_capacity = capacity.max_capacity_for_selectors(
                    pod.selectors)
                logger.warn(
                    "Pending pod %s cannot fit %s. "
                    "Please check that requested resource amount is "
                    "consistent with node selectors (recommended max: %s). "
                    "Scheduling skipped." %
                    (pod.name, pod.selectors, recommended_capacity))
                self.notifier.notify_invalid_pod_capacity(
                    pod, recommended_capacity)
        return pods_to_schedule
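
Every example on this page keys its result map on utils.selectors_to_hash(pod.selectors), whose implementation is never shown. A minimal sketch of what such a helper could look like, assuming selectors are a flat string-to-string dict (the body below is a guess, not the autoscaler's actual code):

    import json

    def selectors_to_hash(selectors):
        # Hypothetical stand-in for utils.selectors_to_hash: build a
        # deterministic key from a flat {str: str} selector mapping.
        # Sorting the keys makes equal selector dicts map to the same
        # string regardless of insertion order.
        return json.dumps(selectors, sort_keys=True)

Any deterministic, order-insensitive encoding would do; the property that matters is that two pods with identical selectors land in the same bucket of pods_to_schedule.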
Example #2
    def get_pods_to_schedule(self, pods):
        """
        given a list of KubePod objects,
        return a map of (selectors hash -> pods) to be scheduled
        """
        pending_unassigned_pods = [
            p for p in pods
            if p.status == KubePodStatus.PENDING and (not p.node_name)
        ]

        # we only consider a pod to be schedulable if it's pending and
        # unassigned and feasible
        pods_to_schedule = {}
        now = datetime.datetime.now(pytz.utc)
        for pod in pending_unassigned_pods:
            age = (now - pod.creation_time).total_seconds()
            self.stats.histogram('autoscaler.scaling_loop.pending_pod_age', age)

            if capacity.is_possible(pod):
                pods_to_schedule.setdefault(
                    utils.selectors_to_hash(pod.selectors), []).append(pod)
            else:
                recommended_capacity = capacity.max_capacity_for_selectors(
                    pod.selectors, pod.resources)
                logger.warn(
                    "Pending pod %s cannot fit %s. "
                    "Please check that requested resource amount is "
                    "consistent with node selectors (recommended max: %s). "
                    "Scheduling skipped." % (pod.name, pod.selectors, recommended_capacity))
                self.notifier.notify_invalid_pod_capacity(
                    pod, recommended_capacity)
        return pods_to_schedule
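
A detail worth noting in Example #2: datetime.datetime.now(pytz.utc) is timezone-aware, so the age subtraction only works if pod.creation_time is also aware. A small illustration (the creation_time value here is made up):

    import datetime
    import pytz

    # Subtracting a naive datetime from an aware one raises TypeError,
    # so pod.creation_time must carry tzinfo for the histogram above.
    creation_time = datetime.datetime(2023, 1, 1, 12, 0, tzinfo=pytz.utc)
    age = (datetime.datetime.now(pytz.utc) - creation_time).total_seconds()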
Example #3
    def test_scale_up(self):
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertGreater(response['AutoScalingGroups'][0]['DesiredCapacity'], 0)
Example #4
    def test_scale_up(self):
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertGreater(response['AutoScalingGroups'][0]['DesiredCapacity'], 0)
Example #5
 def test_scale_up_notification(self):
     big_pod_spec = copy.deepcopy(self.dummy_pod)
     for container in big_pod_spec['spec']['containers']:
         container['resources']['requests']['cpu'] = '100'
     pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
     big_pod = KubePod(pykube.Pod(self.api, big_pod_spec))
     selectors_hash = utils.selectors_to_hash(pod.selectors)
     asgs = self.cluster.autoscaling_groups.get_all_groups([])
     self.cluster.fulfill_pending(asgs, selectors_hash, [pod, big_pod])
     self.cluster.notifier.notify_scale.assert_called_with(mock.ANY, mock.ANY, [pod])
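
For notify_scale.assert_called_with(...) to work, self.cluster.notifier must already be a mock when the test runs. A sketch of the fixture this test presumably sits on top of (the helper name is an assumption, not part of the test suite shown here):

    from unittest import mock

    def install_mock_notifier(cluster):
        # Hypothetical setUp helper: replace the real notifier with a Mock
        # so tests can assert on notify_scale(...) call arguments, using
        # mock.ANY for the parameters they don't care about.
        cluster.notifier = mock.Mock()
        return cluster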
Example #6
    def test_scale_up_selector(self):
        self.dummy_pod['spec']['nodeSelector'] = {'aws/type': 'm4.large'}
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'],
                         0)
Example #7
    def test_scale_up_selector(self):
        self.dummy_pod['spec']['nodeSelector'] = {
            'aws/type': 'm4.large'
        }
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 1)
        self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0)
Example #8
    def scale(self, pods_to_schedule, all_nodes, asgs, running_insts_map):
        """
        scale up logic
        """
        # TODO: generalize to azure
        self.autoscaling_timeouts.refresh_timeouts(
            [asg for asg in asgs if asg.provider == 'aws'],
            dry_run=self.dry_run)

        cached_live_nodes = []
        for node in all_nodes:
            # either we know the physical node behind it and know it's alive
            # or we don't know it and assume it's alive
            if (node.instance_id and node.instance_id in running_insts_map) \
                    or (not node.is_managed()):
                cached_live_nodes.append(node)

        # selectors -> pending KubePods
        pending_pods = {}

        # for each pending & unassigned job, try to fit them on current machines or count requested
        #   resources towards future machines
        for selectors_hash, pods in pods_to_schedule.items():
            for pod in pods:
                fitting = None
                for node in cached_live_nodes:
                    if node.unschedulable:
                        continue
                    if node.is_match(pod) and node.can_fit(pod.resources):
                        fitting = node
                        break
                if fitting is None:
                    # a pod may be able to fit in multiple groups,
                    # so pick one group now
                    selectors = dict(pod.selectors)
                    pending_pods.setdefault(utils.selectors_to_hash(selectors), []).append(pod)
                    logger.info(
                        "{pod} is pending ({selectors_hash})".format(
                            pod=pod, selectors_hash=selectors_hash))
                else:
                    fitting.count_pod(pod)
                    logger.info("{pod} fits on {node}".format(pod=pod,
                                                              node=fitting))

        # scale each node type to reach the new capacity
        for selectors_hash in set(pending_pods.keys()):
            self.fulfill_pending(asgs,
                                 selectors_hash,
                                 pending_pods.get(selectors_hash, []))
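
The inner loop of Example #8 is a first-fit placement pass. Pulled out as a standalone function it reads like this (a sketch; is_match, can_fit, and unschedulable come from the code above):

    def first_fit(pod, live_nodes):
        # Return the first schedulable node whose selectors match the pod
        # and whose remaining capacity can hold the pod's request; None
        # means the pod stays pending and counts toward a new machine.
        for node in live_nodes:
            if node.unschedulable:
                continue
            if node.is_match(pod) and node.can_fit(pod.resources):
                return node
        return None

Because fitting.count_pod(pod) presumably reserves the pod's resources on the chosen node, later pods in the same pass see the reduced free capacity.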
Example #9
    def test_timed_out_group(self):
        with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.is_timed_out') as is_timed_out:
            with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.scale') as scale:
                is_timed_out.return_value = True
                scale.return_value = utils.CompletedFuture(None)

                pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
                selectors_hash = utils.selectors_to_hash(pod.selectors)
                asgs = self.cluster.autoscaling_groups.get_all_groups([])
                self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

                scale.assert_not_called()

                response = self.asg_client.describe_auto_scaling_groups()
                self.assertEqual(len(response['AutoScalingGroups']), 1)
                self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0)
Example #10
    def test_scale_up(self):
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        selectors_hash = utils.selectors_to_hash(pod.selectors)
        asgs = self.cluster.autoscaling_groups.get_all_groups([])
        self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

        response = self.asg_client.describe_auto_scaling_groups()
        self.assertEqual(len(response['AutoScalingGroups']), 2)
        if (response['AutoScalingGroups'][0]['AutoScalingGroupName'] ==
                'dummy-asg-small-gpu'):
            small_gpu_asg = response['AutoScalingGroups'][0]
            big_gpu_asg = response['AutoScalingGroups'][1]
        else:
            small_gpu_asg = response['AutoScalingGroups'][1]
            big_gpu_asg = response['AutoScalingGroups'][0]

        self.assertGreater(big_gpu_asg['DesiredCapacity'], 0)
        self.assertEqual(small_gpu_asg['DesiredCapacity'], 0)
Example #11
    def get_pods_to_schedule(self, pods):
        """
        given a list of KubePod objects,
        return a map of (selectors hash -> pods) to be scheduled
        """
        pending_unassigned_pods = [
            p for p in pods
            if p.status == KubePodStatus.PENDING and (not p.node_name)
        ]

        # we only consider a pod to be schedulable if it's pending and unassigned and feasible
        pods_to_schedule = {}
        for pod in pending_unassigned_pods:
            if capacity.is_possible(pod):
                pods_to_schedule.setdefault(utils.selectors_to_hash(pod.selectors), []).append(pod)
            else:
                logger.warn(
                    "Pending pod %s cannot fit %s. Ignored" % (pod.name, pod.selectors))
        return pods_to_schedule
Example #12
    def get_pods_to_schedule(self, pods):
        """
        given a list of KubePod objects,
        return a map of (selectors hash -> pods) to be scheduled
        """
        pending_unassigned_pods = [
            p for p in pods
            if p.status == KubePodStatus.PENDING and (not p.node_name)
        ]

        # we only consider a pod to be schedulable if it's pending and unassigned and feasible
        pods_to_schedule = {}
        for pod in pending_unassigned_pods:
            if capacity.is_possible(pod):
                pods_to_schedule.setdefault(utils.selectors_to_hash(pod.selectors), []).append(pod)
            else:
                logger.warn(
                    "Pending pod %s cannot fit %s. "
                    "Please check that requested resource amount is "
                    "consistent with node selectors. Scheduling skipped." % (pod.name, pod.selectors))
        return pods_to_schedule
Example #13
    def get_pods_to_schedule(self, pods):
        """
        given a list of KubePod objects,
        return a map of (selectors hash -> pods) to be scheduled
        """
        pending_unassigned_pods = [
            p for p in pods
            if p.status == KubePodStatus.PENDING and (not p.node_name)
        ]

        # we only consider a pod to be schedulable if it's pending and
        # unassigned and feasible
        pods_to_schedule = {}
        for pod in pending_unassigned_pods:
            if capacity.is_possible(pod, self.cs_instance_type):
                pods_to_schedule.setdefault(
                    utils.selectors_to_hash(pod.selectors), []).append(pod)
            else:
                logger.warn("Pending pod %s cannot fit %s. "
                            "Please check that requested resource amount is "
                            "consistent with node size."
                            "Scheduling skipped." % (pod.name, pod.selectors))

        return pods_to_schedule
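
capacity.is_possible itself never appears on this page, and its signature varies across the versions above (Example #13 also passes an instance type). A minimal sketch of the feasibility check it plausibly performs, reusing max_capacity_for_selectors from Examples #1 and #2 and the .possible resource test from Example #14 (the body is a guess):

    def is_possible(pod):
        # Hypothetical: a pending pod is worth scheduling only if the
        # largest node type matching its selectors could hold its request.
        max_cap = max_capacity_for_selectors(pod.selectors)  # assumed helper
        return (max_cap - pod.resources).possible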
Example #14
    def get_node_state(self, node, asg, node_pods, pods_to_schedule,
                       running_insts_map, idle_selector_hash):
        """
        returns the ClusterNodeState for the given node

        params:
        node - KubeNode object
        asg - AutoScalingGroup object that this node belongs in. can be None.
        node_pods - list of KubePods assigned to this node
        pods_to_schedule - list of all pending pods
        running_insts_map - map of all (instance_id -> ec2.Instance object)
        idle_selector_hash - current map of idle nodes by type. may be modified.
        """
        pending_list = []
        for pods in pods_to_schedule.values():
            for pod in pods:
                # a pod is considered schedulable onto this node if all the
                # node selectors match
                # AND it doesn't use pod affinity (which we don't support yet)
                if (node.is_match(pod) and
                        'scheduler.alpha.kubernetes.io/affinity' not in pod.annotations):
                    pending_list.append(pod)
        # we consider a node to be busy if it's running any non-DaemonSet pods
        # TODO: we can be a bit more aggressive in killing pods that are
        # replicated
        busy_list = [p for p in node_pods if not p.is_mirrored()]
        undrainable_list = [p for p in node_pods if not p.is_drainable()]
        utilization = sum((p.resources for p in busy_list), KubeResource())
        under_utilized = (self.UTIL_THRESHOLD *
                          node.capacity - utilization).possible
        drainable = not undrainable_list

        maybe_inst = running_insts_map.get(node.instance_id)
        if maybe_inst:
            # use total_seconds(), not .seconds, so ages over a day count in full
            age = (datetime.datetime.now(maybe_inst.launch_time.tzinfo)
                   - maybe_inst.launch_time).total_seconds()
            logger.warn('AGE: %s', age)
            launch_hour_offset = age % 3600
        else:
            age = None

        instance_type = utils.selectors_to_hash(
            asg.selectors) if asg else node.instance_type

        type_spare_capacity = (instance_type and self.type_idle_threshold and
                               idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT)

        if maybe_inst is None:
            return ClusterNodeState.INSTANCE_TERMINATED

        if node.is_detached():
            return ClusterNodeState.DETACHED

        if node.is_dead():
            return ClusterNodeState.DEAD

        if asg and len(asg.nodes) <= asg.min_size:
            return ClusterNodeState.ASG_MIN_SIZE

        if busy_list and not under_utilized:
            if node.unschedulable:
                return ClusterNodeState.BUSY_UNSCHEDULABLE
            return ClusterNodeState.BUSY

        if pending_list and not node.unschedulable:
            # logger.warn('PENDING: %s', pending_list)
            return ClusterNodeState.POD_PENDING

        if launch_hour_offset < self.LAUNCH_HOUR_THRESHOLD[node.provider] and not node.unschedulable:
            return ClusterNodeState.LAUNCH_HR_GRACE_PERIOD

        # elif node.provider == 'azure':
            # disabling scale down in azure for now while we ramp up
            # TODO: remove once azure is bootstrapped
            # state = ClusterNodeState.GRACE_PERIOD

        if (not type_spare_capacity and age <= self.idle_threshold) and not node.unschedulable:
            # there is already an instance of this type sitting idle
            # so we use the regular idle threshold for the grace period
            return ClusterNodeState.GRACE_PERIOD

        if (type_spare_capacity and age <= self.type_idle_threshold) and not node.unschedulable:
            # we don't have an instance of this type yet!
            # use the type idle threshold for the grace period
            # and mark the type as seen
            idle_selector_hash[instance_type] += 1
            return ClusterNodeState.TYPE_GRACE_PERIOD

        if under_utilized and (busy_list or not node.unschedulable):
            # nodes that are under utilized (but not completely idle)
            # have their own states to tell if we should drain them
            # for better binpacking or not
            if drainable:
                return ClusterNodeState.UNDER_UTILIZED_DRAINABLE
            return ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE

        if node.unschedulable:
            return ClusterNodeState.IDLE_UNSCHEDULABLE
        return ClusterNodeState.IDLE_SCHEDULABLE
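
The under-utilization test in Example #14, (self.UTIL_THRESHOLD * node.capacity - utilization).possible, implies that KubeResource supports scalar multiplication, subtraction, summation from an empty value, and a .possible property meaning "non-negative in every dimension". A minimal sketch consistent with that usage (field names and internals are assumptions):

    class KubeResource(object):
        # Toy resource vector matching how the examples use the class; the
        # real implementation also parses Kubernetes quantity strings.
        def __init__(self, raw=None):
            self.raw = dict(raw or {})  # e.g. {'cpu': 2.0, 'memory': 4e9}

        def __add__(self, other):
            keys = set(self.raw) | set(other.raw)
            return KubeResource({k: self.raw.get(k, 0) + other.raw.get(k, 0)
                                 for k in keys})

        def __rmul__(self, scalar):
            return KubeResource({k: scalar * v for k, v in self.raw.items()})

        def __sub__(self, other):
            return self + (-1 * other)

        @property
        def possible(self):
            # True when no dimension is oversubscribed
            return all(v >= 0 for v in self.raw.values())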
Example #15
 def __str__(self):
     return "{}: {} ({})".format(self.name, self.instance_id,
                                 utils.selectors_to_hash(self.selectors))
Example #16
 def __str__(self):
     return 'AzureVirtualScaleSet({name}, {selectors_hash})'.format(name=self.name, selectors_hash=utils.selectors_to_hash(self.selectors))
Example #17
 def __str__(self):
     return "{}: {} ({})".format(self.name, self.instance_id,
                                 utils.selectors_to_hash(self.selectors))
Example #18
    def get_node_state(self, node, asg, node_pods, pods_to_schedule,
                       running_insts_map, idle_selector_hash):
        """
        returns the ClusterNodeState for the given node

        params:
        node - KubeNode object
        asg - AutoScalingGroup object that this node belongs in. can be None.
        node_pods - list of KubePods assigned to this node
        pods_to_schedule - list of all pending pods
        running_insts_map - map of all (instance_id -> ec2.Instance object)
        idle_selector_hash - current map of idle nodes by type. may be modified.
        """
        pending_list = []
        for pods in pods_to_schedule.values():
            for pod in pods:
                if node.is_match(pod):
                    pending_list.append(pod)
        # we consider a node to be busy if it's running any non-DaemonSet pods
        # TODO: we can be a bit more aggressive in killing pods that are
        # replicated
        busy_list = [p for p in node_pods if not p.is_mirrored()]
        undrainable_list = [p for p in node_pods if not p.is_drainable()]
        utilization = sum((p.resources for p in busy_list), KubeResource())
        under_utilized = (self.UTIL_THRESHOLD * node.capacity - utilization).possible
        drainable = not undrainable_list

        maybe_inst = running_insts_map.get(node.instance_id)

        if maybe_inst:
            # use total_seconds(), not .seconds, so ages over a day count in full
            age = (datetime.datetime.now(maybe_inst.launch_time.tzinfo)
                   - maybe_inst.launch_time).total_seconds()
        else:
            age = None

        instance_type = utils.selectors_to_hash(asg.selectors) if asg else node.instance_type

        if maybe_inst is None:
            state = ClusterNodeState.INSTANCE_TERMINATED
        elif asg and len(asg.nodes) <= asg.min_size:
            state = ClusterNodeState.ASG_MIN_SIZE
        elif busy_list and not under_utilized:
            if node.unschedulable:
                state = ClusterNodeState.BUSY_UNSCHEDULABLE
            else:
                state = ClusterNodeState.BUSY
        elif pending_list and not node.unschedulable:
            state = ClusterNodeState.POD_PENDING
        elif ((not self.type_idle_threshold or idle_selector_hash[instance_type] >= self.TYPE_IDLE_COUNT)
              and age <= self.idle_threshold) and not node.unschedulable:
            # there is already an instance of this type sitting idle
            # so we use the regular idle threshold for the grace period
            state = ClusterNodeState.GRACE_PERIOD
        elif (instance_type and idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT
              and age <= self.type_idle_threshold) and not node.unschedulable:
            # we don't have an instance of this type yet!
            # use the type idle threshold for the grace period
            # and mark the type as seen
            idle_selector_hash[instance_type] += 1
            state = ClusterNodeState.TYPE_GRACE_PERIOD
        elif under_utilized and not node.unschedulable:
            if drainable:
                state = ClusterNodeState.UNDER_UTILIZED_DRAINABLE
            else:
                state = ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE
        else:
            if node.unschedulable:
                state = ClusterNodeState.IDLE_UNSCHEDULABLE
            else:
                state = ClusterNodeState.IDLE_SCHEDULABLE

        return state
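
The get_node_state variants on this page read idle_selector_hash[instance_type] before ever writing to it, which only avoids a KeyError if the caller builds the map as a defaultdict. A short sketch of that presumed construction:

    import collections

    # Indexing a not-yet-seen instance type must yield 0, not KeyError,
    # which points to the caller building the map as a defaultdict(int).
    idle_selector_hash = collections.defaultdict(int)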
Example #19
 def __str__(self):
     return 'AutoScalingGroup({name}, {selectors_hash})'.format(
         name=self.name,
         selectors_hash=utils.selectors_to_hash(self.selectors))
Example #20
 def __str__(self):
     return 'AutoScalingGroup({name}, {selectors_hash})'.format(name=self.name, selectors_hash=utils.selectors_to_hash(self.selectors))
Example #21
    def get_node_state(self, node, asg, node_pods, pods_to_schedule,
                       running_insts_map, idle_selector_hash):
        """
        returns the ClusterNodeState for the given node

        params:
        node - KubeNode object
        asg - AutoScalingGroup object that this node belongs in. can be None.
        node_pods - list of KubePods assigned to this node
        pods_to_schedule - list of all pending pods
        running_insts_map - map of all (instance_id -> ec2.Instance object)
        idle_selector_hash - current map of idle nodes by type. may be modified.
        """
        pending_list = []
        for pods in pods_to_schedule.values():  # itervalues() is Python 2 only
            for pod in pods:
                if node.is_match(pod):
                    pending_list.append(pod)
        # we consider a node to be busy if it's running any non-DaemonSet pods
        # TODO: we can be a bit more aggressive in killing pods that are
        # replicated
        busy_list = [p for p in node_pods if not p.is_mirrored()]
        maybe_inst = running_insts_map.get(node.instance_id)

        if maybe_inst:
            # use total_seconds(), not .seconds, so ages over a day count in full
            age = (datetime.datetime.now(maybe_inst.launch_time.tzinfo)
                   - maybe_inst.launch_time).total_seconds()
        else:
            age = None

        instance_type = utils.selectors_to_hash(asg.selectors) if asg else node.instance_type

        if maybe_inst is None:
            state = ClusterNodeState.INSTANCE_TERMINATED
        elif asg and len(asg.nodes) <= asg.min_size:
            state = ClusterNodeState.ASG_MIN_SIZE
        elif busy_list:
            if node.unschedulable:
                state = ClusterNodeState.BUSY_UNSCHEDULABLE
            else:
                state = ClusterNodeState.BUSY
        elif pending_list:
            state = ClusterNodeState.POD_PENDING
        elif (idle_selector_hash[instance_type] >= self.TYPE_IDLE_COUNT
              and age <= self.idle_threshold):
            # there is already an instance of this type sitting idle
            # so we use the regular idle threshold for the grace period
            state = ClusterNodeState.GRACE_PERIOD
        elif (instance_type and idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT
              and age <= self.type_idle_threshold):
            # we don't have an instance of this type yet!
            # use the type idle threshold for the grace period
            # and mark the type as seen
            idle_selector_hash[instance_type] += 1
            state = ClusterNodeState.TYPE_GRACE_PERIOD
        else:
            if node.unschedulable:
                state = ClusterNodeState.IDLE_UNSCHEDULABLE
            else:
                state = ClusterNodeState.IDLE_SCHEDULABLE

        return state