def main(cluster_name, regions, sleep, kubeconfig, aws_access_key,
         aws_secret_key, idle_threshold, type_idle_threshold,
         instance_init_time, slack_hook, dry_run, verbose):
    if verbose > 0:
        logger_handler = logging.StreamHandler(sys.stderr)
        logger_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        logger.addHandler(logger_handler)
        logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.DEBUG))

    if not (aws_secret_key and aws_access_key):
        logger.error("Missing AWS credentials. Please provide aws-access-key and aws-secret-key.")
        sys.exit(1)

    cluster = Cluster(aws_access_key=aws_access_key,
                      aws_secret_key=aws_secret_key,
                      regions=regions.split(','),
                      kubeconfig=kubeconfig,
                      idle_threshold=idle_threshold,
                      instance_init_time=instance_init_time,
                      type_idle_threshold=type_idle_threshold,
                      cluster_name=cluster_name,
                      slack_hook=slack_hook,
                      dry_run=dry_run)

    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
class TestCluster(unittest.TestCase):
    def setUp(self):
        # load dummy kube specs
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            self.dummy_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f:
            self.dummy_ds_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f:
            self.dummy_rc_pod = yaml.load(f.read())
        with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
            self.dummy_node = yaml.load(f.read())

        for condition in self.dummy_node['status']['conditions']:
            if condition['type'] == 'Ready' and condition['status'] == 'True':
                condition['lastHeartbeatTime'] = datetime.now(
                    condition['lastHeartbeatTime'].tzinfo)
        # Convert timestamps to strings to match PyKube
        for condition in self.dummy_node['status']['conditions']:
            condition['lastHeartbeatTime'] = datetime.isoformat(
                condition['lastHeartbeatTime'])
            condition['lastTransitionTime'] = datetime.isoformat(
                condition['lastTransitionTime'])

        # this isn't actually used here
        # only needed to create the KubePod object...
        self.api = pykube.HTTPClient(pykube.KubeConfig.from_file(
            os.path.join(dir_path, './data/kube_config.yaml')))

        self.cluster = Cluster(
            kubeconfig='~/.kube/config',
            idle_threshold=60,
            spare_agents=1,
            instance_init_time=60,
            resource_group='my-rg',
            notifier=None,
            service_principal_app_id='dummy',
            service_principal_secret='dummy',
            service_principal_tenant_id='dummy',
            subscription_id='dummy',
            client_private_key='dummy',
            ca_private_key='dummy',
            ignore_pools='',
            over_provision=0
        )

    def test_get_pending_pods(self):
        dummy_node = copy.deepcopy(self.dummy_node)
        dummy_node['metadata']['name'] = 'k8s-agentpool1-16334397-0'
        node = KubeNode(pykube.Node(self.api, dummy_node))
        node.capacity = capacity.get_capacity_for_instance_type(node.instance_type)

        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        act = self.cluster.get_pending_pods([pod], [node])
        self.assertEqual(len(act), 0)

        node = KubeNode(pykube.Node(self.api, dummy_node))
        node.capacity = capacity.get_capacity_for_instance_type(node.instance_type)
        pod2 = KubePod(pykube.Pod(self.api, self.dummy_pod))
        pod3 = KubePod(pykube.Pod(self.api, self.dummy_pod))
        act = self.cluster.get_pending_pods([pod, pod2, pod3], [node])
        # only one should fit
        self.assertEqual(len(act), 2)
def setUp(self): # load dummy kube specs dir_path = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f: self.dummy_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f: self.dummy_ds_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f: self.dummy_rc_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f: self.dummy_node = yaml.load(f.read()) # this isn't actually used here # only needed to create the KubePod object... self.api = pykube.HTTPClient( pykube.KubeConfig.from_file('~/.kube/config')) # start creating our mock ec2 environment self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()] for moto_mock in self.mocks: moto_mock.start() client = boto3.client('autoscaling', region_name='us-west-2') self.asg_client = client client.create_launch_configuration(LaunchConfigurationName='dummy-lc', ImageId='ami-deadbeef', KeyName='dummy-key', SecurityGroups=[ 'sg-cafebeef', ], InstanceType='t2.medium') client.create_auto_scaling_group(AutoScalingGroupName='dummy-asg', LaunchConfigurationName='dummy-lc', MinSize=0, MaxSize=10, VPCZoneIdentifier='subnet-beefbeef', Tags=[{ 'Key': 'KubernetesCluster', 'Value': 'dummy-cluster', 'PropagateAtLaunch': True }, { 'Key': 'KubernetesRole', 'Value': 'worker', 'PropagateAtLaunch': True }]) # finally our cluster self.cluster = Cluster(aws_access_key='', aws_secret_key='', regions=['us-west-2', 'us-east-1', 'us-west-1'], kubeconfig='~/.kube/config', pod_namespace=None, idle_threshold=60, instance_init_time=60, type_idle_threshold=60, cluster_name='dummy-cluster', notifier=Notifier(), dry_run=False)
def main(cluster_name, aws_regions, azure_resource_groups, azure_slow_scale_classes,
         sleep, kubeconfig, azure_client_id, azure_client_secret, azure_subscription_id,
         azure_tenant_id, aws_access_key, aws_secret_key, pod_namespace, datadog_api_key,
         idle_threshold, type_idle_threshold, max_scale_in_fraction, drain_utilization,
         over_provision, instance_init_time, no_scale, no_maintenance, slack_hook,
         slack_bot_token, dry_run, verbose):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%dT%H:%M:%S%z'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    aws_regions_list = aws_regions.split(',') if aws_regions else []
    if not (aws_secret_key and aws_access_key) and aws_regions_list:
        logger.error("Missing AWS credentials. Please provide aws-access-key and aws-secret-key.")
        sys.exit(1)

    notifier = Notifier(slack_hook, slack_bot_token)

    cluster = Cluster(
        aws_access_key=aws_access_key,
        aws_secret_key=aws_secret_key,
        aws_regions=aws_regions_list,
        azure_client_id=azure_client_id,
        azure_client_secret=azure_client_secret,
        azure_subscription_id=azure_subscription_id,
        azure_tenant_id=azure_tenant_id,
        azure_resource_group_names=azure_resource_groups.split(',') if azure_resource_groups else [],
        azure_slow_scale_classes=azure_slow_scale_classes.split(',') if azure_slow_scale_classes else [],
        kubeconfig=kubeconfig,
        pod_namespace=pod_namespace,
        idle_threshold=idle_threshold,
        instance_init_time=instance_init_time,
        type_idle_threshold=type_idle_threshold,
        cluster_name=cluster_name,
        max_scale_in_fraction=max_scale_in_fraction,
        drain_utilization_below=drain_utilization,
        scale_up=not no_scale,
        maintainance=not no_maintenance,
        over_provision=over_provision,
        datadog_api_key=datadog_api_key,
        notifier=notifier,
        dry_run=dry_run,
    )

    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
def main(resource_group, acs_deployment, sleep, kubeconfig, service_principal_app_id,
         service_principal_secret, kubeconfig_private_key, client_private_key,
         service_principal_tenant_id, spare_agents, idle_threshold, no_scale,
         over_provision, no_maintenance, ignore_pools, slack_hook, slack_bot_token,
         dry_run, verbose, debug):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    if not (service_principal_app_id and service_principal_secret and service_principal_tenant_id):
        logger.error("Missing Azure credentials. Please provide service_principal_app_id, "
                     "service_principal_secret and service_principal_tenant_id.")
        sys.exit(1)
    if not client_private_key:
        logger.error('Missing client_private_key. Provide it through --client-private-key '
                     'or the CLIENT_PRIVATE_KEY environment variable')
    if not kubeconfig_private_key:
        logger.error('Missing kubeconfig_private_key. Provide it through --kubeconfig-private-key '
                     'or the KUBECONFIG_PRIVATE_KEY environment variable')

    notifier = None
    if slack_hook and slack_bot_token:
        notifier = Notifier(slack_hook, slack_bot_token)

    instance_init_time = 600
    cluster = Cluster(kubeconfig=kubeconfig,
                      instance_init_time=instance_init_time,
                      spare_agents=spare_agents,
                      idle_threshold=idle_threshold,
                      resource_group=resource_group,
                      acs_deployment=acs_deployment,
                      service_principal_app_id=service_principal_app_id,
                      service_principal_secret=service_principal_secret,
                      service_principal_tenant_id=service_principal_tenant_id,
                      kubeconfig_private_key=kubeconfig_private_key,
                      client_private_key=client_private_key,
                      scale_up=not no_scale,
                      ignore_pools=ignore_pools,
                      maintainance=not no_maintenance,
                      over_provision=over_provision,
                      notifier=notifier,
                      dry_run=dry_run)
    cluster.login()

    backoff = sleep
    while True:
        scaled = cluster.loop(debug)
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
def main(container_service_name, resource_group, sleep, kubeconfig,
         service_principal_app_id, service_principal_secret, service_principal_tenant_id,
         cpu_per_node, datadog_api_key, idle_threshold, reserve_idle_threshold,
         over_provision, instance_init_time, no_scale, no_maintenance, slack_hook,
         slack_bot_token, dry_run, verbose):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    if not (service_principal_app_id and service_principal_secret and service_principal_tenant_id):
        logger.error("Missing Azure credentials. Please provide service_principal_app_id, "
                     "service_principal_secret and service_principal_tenant_id.")
        sys.exit(1)

    notifier = Notifier(slack_hook, slack_bot_token)

    cluster = Cluster(
        service_principal_app_id=service_principal_app_id,
        service_principal_secret=service_principal_secret,
        service_principal_tenant_id=service_principal_tenant_id,
        kubeconfig=kubeconfig,
        idle_threshold=idle_threshold,
        instance_init_time=instance_init_time,
        reserve_idle_threshold=reserve_idle_threshold,
        container_service_name=container_service_name,
        resource_group=resource_group,
        scale_up=not no_scale,
        maintainance=not no_maintenance,
        over_provision=over_provision,
        datadog_api_key=datadog_api_key,
        notifier=notifier,
        dry_run=dry_run,
    )

    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
def setUp(self): # load dummy kube specs dir_path = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f: self.dummy_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f: self.dummy_ds_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f: self.dummy_rc_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f: self.dummy_node = yaml.load(f.read()) for condition in self.dummy_node['status']['conditions']: if condition['type'] == 'Ready' and condition[ 'status'] == 'True': condition['lastHeartbeatTime'] = datetime.now( condition['lastHeartbeatTime'].tzinfo) # Convert timestamps to strings to match PyKube for condition in self.dummy_node['status']['conditions']: condition['lastHeartbeatTime'] = datetime.isoformat( condition['lastHeartbeatTime']) condition['lastTransitionTime'] = datetime.isoformat( condition['lastTransitionTime']) # this isn't actually used here # only needed to create the KubePod object... dir_path = os.path.dirname(os.path.realpath(__file__)) self.api = pykube.HTTPClient( pykube.KubeConfig.from_file( os.path.join(dir_path, './data/kube_config.yaml'))) self.cluster = Cluster(kubeconfig='~/.kube/config', idle_threshold=60, spare_agents=1, instance_init_time=60, resource_group='my-rg', notifier=None, service_principal_app_id='dummy', service_principal_secret='dummy', service_principal_tenant_id='dummy', etcd_client_private_key='dummy', etcd_server_private_key='dummy', subscription_id='dummy', kubeconfig_private_key='dummy', client_private_key='dummy', ca_private_key='dummy', ignore_pools='', over_provision=0)
def main(cluster_name, regions, sleep, kubeconfig, pod_namespace, aws_access_key,
         aws_secret_key, datadog_api_key, idle_threshold, type_idle_threshold,
         over_provision, instance_init_time, no_scale, no_maintenance, slack_hook,
         slack_bot_token, dry_run, verbose, drainable_labels, scale_label,
         instance_type_priorities):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    if not (aws_secret_key and aws_access_key):
        logger.error("Missing AWS credentials. Please provide aws-access-key and aws-secret-key.")
        sys.exit(1)

    notifier = Notifier(slack_hook, slack_bot_token)

    cluster = Cluster(aws_access_key=aws_access_key,
                      aws_secret_key=aws_secret_key,
                      regions=regions.split(','),
                      kubeconfig=kubeconfig,
                      pod_namespace=pod_namespace,
                      idle_threshold=idle_threshold,
                      instance_init_time=instance_init_time,
                      type_idle_threshold=type_idle_threshold,
                      cluster_name=cluster_name,
                      scale_up=not no_scale,
                      maintainance=not no_maintenance,
                      over_provision=over_provision,
                      datadog_api_key=datadog_api_key,
                      notifier=notifier,
                      dry_run=dry_run,
                      drainable_labels=drainable_labels,
                      scale_label=scale_label,
                      instance_type_priorities=instance_type_priorities)

    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
def main(sleep, kubeconfig, kubecontext, scale_out_webhook, scale_in_webhook,
         spare_agents, pool_name_regex, idle_threshold, drain, no_scale,
         over_provision, no_maintenance, ignore_pools, slack_hook, verbose, debug):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    notifier = None
    if slack_hook:
        notifier = Notifier(slack_hook)

    cluster = Cluster(kubeconfig=kubeconfig,
                      kubecontext=kubecontext,
                      scale_out_webhook=scale_out_webhook,
                      scale_in_webhook=scale_in_webhook,
                      pool_name_regex=pool_name_regex,
                      spare_agents=spare_agents,
                      idle_threshold=idle_threshold,
                      drain=drain,
                      scale_up=not no_scale,
                      ignore_pools=ignore_pools,
                      maintainance=not no_maintenance,
                      over_provision=over_provision,
                      notifier=notifier)
    cluster.login()

    backoff = sleep
    while True:
        scaled = cluster.loop(debug)
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
def main(cluster_name, regions, sleep, kubeconfig, aws_access_key, aws_secret_key,
         datadog_api_key, idle_threshold, type_idle_threshold, instance_init_time,
         no_scale, no_maintenance, slack_hook, dry_run, verbose):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    if not (aws_secret_key and aws_access_key):
        logger.error("Missing AWS credentials. Please provide aws-access-key and aws-secret-key.")
        sys.exit(1)

    cluster = Cluster(aws_access_key=aws_access_key,
                      aws_secret_key=aws_secret_key,
                      regions=regions.split(','),
                      kubeconfig=kubeconfig,
                      idle_threshold=idle_threshold,
                      instance_init_time=instance_init_time,
                      type_idle_threshold=type_idle_threshold,
                      cluster_name=cluster_name,
                      scale_up=not no_scale,
                      maintainance=not no_maintenance,
                      datadog_api_key=datadog_api_key,
                      slack_hook=slack_hook,
                      dry_run=dry_run)

    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warning("backoff: %s", backoff)
            backoff *= 2
            time.sleep(backoff)
def main(provider_name, timer, scale_up_cap, scale_down_cap, scale_max, scale_min,
         azure_subscription_id, azure_tenant_id, azure_client_id, azure_client_secret,
         azure_location, azure_resource_group, azure_vmss_name, verbose):
    # Logger settings
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))
    logger.debug("Debug mode activated")

    if not provider_name:
        logger.error("Provider not specified, ex : --provider Azure")
        sys.exit(1)

    logger.debug("Provider Name : " + provider_name)
    logger.debug("Timer : " + str(timer))
    logger.debug("Scale Up Cap : " + str(scale_up_cap))
    logger.debug("Scale Down Cap : " + str(scale_down_cap))
    logger.debug("Maximum Nodes : " + str(scale_max))
    logger.debug("Minimum Nodes : " + str(scale_min))
    logger.debug("Azure Subscription ID : " + azure_subscription_id)
    logger.debug("Azure Tenant ID : " + azure_tenant_id)
    logger.debug("Azure Client ID : " + azure_client_id)
    logger.debug("Azure Client Secret : " + azure_client_secret)
    logger.debug("Azure Resource Group : " + azure_resource_group)
    logger.debug("Azure Location : " + azure_location)
    logger.debug("Azure VMSS Targeted : " + azure_vmss_name)

    logger.info("DC/OS Autoscaler Started")
    cluster = Cluster(provider_name=provider_name,
                      scale_up_cap=scale_up_cap,
                      scale_down_cap=scale_down_cap,
                      scale_max=scale_max,
                      scale_min=scale_min,
                      azure_subscription_id=azure_subscription_id,
                      azure_tenant_id=azure_tenant_id,
                      azure_client_id=azure_client_id,
                      azure_client_secret=azure_client_secret,
                      azure_location=azure_location,
                      azure_resource_group=azure_resource_group,
                      azure_vmss_name=azure_vmss_name)
    while True:
        cluster.check_health()
        cluster.decide_to_scale()
        time.sleep(timer)
    logger.info("DC/OS Autoscaler Stopped")
class TestCluster(unittest.TestCase): def setUp(self): # load dummy kube specs dir_path = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f: self.dummy_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f: self.dummy_ds_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f: self.dummy_rc_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f: self.dummy_node = yaml.load(f.read()) for condition in self.dummy_node['status']['conditions']: if condition['type'] == 'Ready' and condition['status'] == 'True': condition['lastHeartbeatTime'] = datetime.now(condition['lastHeartbeatTime'].tzinfo) # Convert timestamps to strings to match PyKube for condition in self.dummy_node['status']['conditions']: condition['lastHeartbeatTime'] = datetime.isoformat(condition['lastHeartbeatTime']) condition['lastTransitionTime'] = datetime.isoformat(condition['lastTransitionTime']) # this isn't actually used here # only needed to create the KubePod object... self.api = pykube.HTTPClient(pykube.KubeConfig.from_file('~/.kube/config')) # start creating our mock ec2 environment self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()] for moto_mock in self.mocks: moto_mock.start() client = boto3.client('autoscaling', region_name='us-west-2') self.asg_client = client client.create_launch_configuration( LaunchConfigurationName='dummy-lc', ImageId='ami-deadbeef', KeyName='dummy-key', SecurityGroups=[ 'sg-cafebeef', ], InstanceType='t2.medium' ) client.create_auto_scaling_group( AutoScalingGroupName='dummy-asg', LaunchConfigurationName='dummy-lc', MinSize=0, MaxSize=10, VPCZoneIdentifier='subnet-beefbeef', Tags=[ { 'Key': 'KubernetesCluster', 'Value': 'dummy-cluster', 'PropagateAtLaunch': True }, { 'Key': 'KubernetesRole', 'Value': 'worker', 'PropagateAtLaunch': True } ] ) # finally our cluster self.cluster = Cluster( aws_access_key='fake', aws_secret_key='fake', aws_regions=['us-west-2', 'us-east-1', 'us-west-1'], azure_client_id='', azure_client_secret='', azure_subscription_id='', azure_tenant_id='', azure_resource_group_names=[], azure_slow_scale_classes=[], kubeconfig='~/.kube/config', pod_namespace=None, idle_threshold=60, instance_init_time=60, type_idle_threshold=60, cluster_name='dummy-cluster', notifier=mock.Mock(), dry_run=False ) def tearDown(self): for moto_mock in self.mocks: moto_mock.stop() def _spin_up_node(self, launch_time=None): return self._spin_up_nodes(1, launch_time=launch_time)[0] def _spin_up_nodes(self, count, launch_time=None): assert count <= 256 # spin up dummy ec2 node self.asg_client.set_desired_capacity(AutoScalingGroupName='dummy-asg', DesiredCapacity=count) response = self.asg_client.describe_auto_scaling_groups() nodes = [] for i, instance in enumerate(response['AutoScalingGroups'][0]['Instances']): instance_id = instance['InstanceId'] dummy_node = copy.deepcopy(self.dummy_node) dummy_node['metadata']['labels']['aws/id'] = instance_id dummy_node['metadata']['name'] = '10.0.' 
+ str(i) + '.228' node = KubeNode(pykube.Node(self.api, dummy_node)) node.cordon = mock.Mock(return_value="mocked stuff") node.drain = mock.Mock(return_value="mocked stuff") node.uncordon = mock.Mock(return_value="mocked stuff") node.delete = mock.Mock(return_value="mocked stuff") nodes.append(node) return nodes def test_reap_dead_node(self): node = copy.deepcopy(self.dummy_node) TestInstance = collections.namedtuple('TestInstance', ['launch_time']) instance = TestInstance(datetime.now(pytz.utc)) ready_condition = None for condition in node['status']['conditions']: if condition['type'] == 'Ready': ready_condition = condition break ready_condition['status'] = 'Unknown' ready_condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(minutes=30)) kube_node = KubeNode(pykube.Node(self.api, node)) kube_node.delete = mock.Mock(return_value="mocked stuff") self.cluster.maintain([kube_node], {kube_node.instance_id: instance}, {}, [], []) kube_node.delete.assert_not_called() ready_condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(hours=2)) kube_node = KubeNode(pykube.Node(self.api, node)) kube_node.delete = mock.Mock(return_value="mocked stuff") self.cluster.maintain([kube_node], {kube_node.instance_id: instance}, {}, [], []) kube_node.delete.assert_called_once_with() def test_max_scale_in(self): node1 = copy.deepcopy(self.dummy_node) node2 = copy.deepcopy(self.dummy_node) TestInstance = collections.namedtuple('TestInstance', ['launch_time']) instance1 = TestInstance(datetime.now(pytz.utc)) instance2 = TestInstance(datetime.now(pytz.utc)) for node in [node1, node2]: for condition in node['status']['conditions']: if condition['type'] == 'Ready': condition['status'] = 'Unknown' condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(hours=2)) break kube_node1 = KubeNode(pykube.Node(self.api, node1)) kube_node1.delete = mock.Mock(return_value="mocked stuff") kube_node2 = KubeNode(pykube.Node(self.api, node2)) kube_node2.delete = mock.Mock(return_value="mocked stuff") self.cluster.maintain([kube_node1, kube_node2], {kube_node1.instance_id: instance1, kube_node2.instance_id: instance2}, {}, [], []) kube_node1.delete.assert_not_called() kube_node2.delete.assert_not_called() def test_scale_up_selector(self): self.dummy_pod['spec']['nodeSelector'] = { 'aws/type': 'm4.large' } pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) selectors_hash = utils.selectors_to_hash(pod.selectors) asgs = self.cluster.autoscaling_groups.get_all_groups([]) self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) def test_scale_up(self): pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) selectors_hash = utils.selectors_to_hash(pod.selectors) asgs = self.cluster.autoscaling_groups.get_all_groups([]) self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertGreater(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) def test_scale_up_notification(self): big_pod_spec = copy.deepcopy(self.dummy_pod) for container in big_pod_spec['spec']['containers']: container['resources']['requests']['cpu'] = '100' pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) big_pod = KubePod(pykube.Pod(self.api, big_pod_spec)) selectors_hash = 
utils.selectors_to_hash(pod.selectors) asgs = self.cluster.autoscaling_groups.get_all_groups([]) self.cluster.fulfill_pending(asgs, selectors_hash, [pod, big_pod]) self.cluster.notifier.notify_scale.assert_called_with(mock.ANY, mock.ANY, [pod]) def test_timed_out_group(self): with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.is_timed_out') as is_timed_out: with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.scale') as scale: is_timed_out.return_value = True scale.return_value = utils.CompletedFuture(None) pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) selectors_hash = utils.selectors_to_hash(pod.selectors) asgs = self.cluster.autoscaling_groups.get_all_groups([]) self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) scale.assert_not_called() response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) def test_scale_down(self): """ kube node with daemonset and no pod --> cordon """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) running_or_pending_assigned_pods = [ds_pod] self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1 self.cluster.maintain( managed_nodes, running_insts_map, pods_to_schedule, running_or_pending_assigned_pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_called_once_with() def test_scale_down_launch_grace_period(self): """ kube node with daemonset and no pod + launch grace period --> noop """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) running_or_pending_assigned_pods = [ds_pod] self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = 60*30 self.cluster.maintain( managed_nodes, running_insts_map, pods_to_schedule, running_or_pending_assigned_pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_not_called() def test_scale_down_grace_period(self): """ kube node with daemonset and no pod + grace period --> noop """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # kube node with daemonset and no pod --> cordon ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) running_or_pending_assigned_pods = [ds_pod] self.cluster.maintain( managed_nodes, running_insts_map, pods_to_schedule, running_or_pending_assigned_pods, asgs) response = self.asg_client.describe_auto_scaling_groups() 
self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_not_called() def test_scale_down_busy(self): """ kube node with daemonset and pod/rc-pod --> noop """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # kube node with daemonset and pod --> noop ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) pod_scenarios = [ # kube node with daemonset and pod --> noop [ds_pod, pod], # kube node with daemonset and rc pod --> noop [ds_pod, rc_pod] ] # make sure we're not on grace period self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 for pods in pod_scenarios: state = self.cluster.get_node_state( node, asgs[0], pods, pods_to_schedule, running_insts_map, collections.Counter()) self.assertEqual(state, ClusterNodeState.BUSY) self.cluster.maintain( managed_nodes, running_insts_map, pods_to_schedule, pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_not_called() def test_scale_down_under_utilized_undrainable(self): """ kube node with daemonset and pod/rc-pod --> noop """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # create some undrainable pods ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) for container in self.dummy_pod['spec']['containers']: container.pop('resources', None) pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) self.dummy_rc_pod['metadata']['labels']['openai/do-not-drain'] = 'true' for container in self.dummy_rc_pod['spec']['containers']: container.pop('resources', None) rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) pod_scenarios = [ # kube node with daemonset and pod with no resource ask --> noop [ds_pod, pod], # kube node with daemonset and critical rc pod --> noop [ds_pod, rc_pod] ] # make sure we're not on grace period self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1 for pods in pod_scenarios: state = self.cluster.get_node_state( node, asgs[0], pods, pods_to_schedule, running_insts_map, collections.Counter()) self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE) self.cluster.maintain( managed_nodes, running_insts_map, pods_to_schedule, pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_not_called() def test_scale_down_under_utilized_drainable(self): """ kube node with daemonset and rc-pod --> cordon+drain """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # 
create some undrainable pods ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) for container in self.dummy_rc_pod['spec']['containers']: container.pop('resources', None) rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) pods = [ds_pod, rc_pod] # make sure we're not on grace period self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1 state = self.cluster.get_node_state( node, asgs[0], pods, pods_to_schedule, running_insts_map, collections.Counter()) self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_DRAINABLE) self.cluster.maintain( managed_nodes, running_insts_map, pods_to_schedule, pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_called_once_with() node.drain.assert_called_once_with(pods, notifier=mock.ANY) def test_prioritization(self): TestingGroup = collections.namedtuple('TestingGroup', ['region', 'name', 'selectors', 'global_priority', 'is_spot']) high_pri = TestingGroup('test', 'test', {}, -1, False) low_pri = TestingGroup('test', 'test', {}, 0, False) self.assertEqual([high_pri, low_pri], list(self.cluster._prioritize_groups([low_pri, high_pri])))
class TestClusterWithPrioritiesConfiguredReversed(unittest.TestCase): def setUp(self): # load dummy kube specs dir_path = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f: self.dummy_pod = yaml.load(f.read()) # this isn't actually used here # only needed to create the KubePod object... self.api = pykube.HTTPClient( pykube.KubeConfig.from_file('~/.kube/config')) # start creating our mock ec2 environment self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()] for moto_mock in self.mocks: moto_mock.start() client = boto3.client('autoscaling', region_name='us-west-2') self.asg_client = client client.create_launch_configuration( LaunchConfigurationName='dummy-lc-large-gpu', ImageId='ami-deadbeef', KeyName='dummy-key', SecurityGroups=[ 'sg-cafebeef', ], InstanceType='p2.8xlarge') client.create_launch_configuration( LaunchConfigurationName='dummy-lc-small-gpu', ImageId='ami-deadbeef', KeyName='dummy-key', SecurityGroups=[ 'sg-cafebeef', ], InstanceType='p2.xlarge') client.create_auto_scaling_group( AutoScalingGroupName='dummy-asg-large-gpu', LaunchConfigurationName='dummy-lc-large-gpu', MinSize=0, MaxSize=2, VPCZoneIdentifier='subnet-beefbeef', Tags=[{ 'Key': 'KubernetesCluster', 'Value': 'dummy-cluster-with-priorities', 'PropagateAtLaunch': True }, { 'Key': 'KubernetesRole', 'Value': 'worker', 'PropagateAtLaunch': True }]) client.create_auto_scaling_group( AutoScalingGroupName='dummy-asg-small-gpu', LaunchConfigurationName='dummy-lc-small-gpu', MinSize=0, MaxSize=2, VPCZoneIdentifier='subnet-beefbeef', Tags=[{ 'Key': 'KubernetesCluster', 'Value': 'dummy-cluster-with-priorities', 'PropagateAtLaunch': True }, { 'Key': 'KubernetesRole', 'Value': 'worker', 'PropagateAtLaunch': True }]) # Note that instance_type_priorities is set. # p2.8xlarges have a higher priority here. self.cluster = Cluster(aws_access_key='', aws_secret_key='', regions=['us-west-2'], kubeconfig='~/.kube/config', pod_namespace=None, idle_threshold=60, instance_init_time=60, type_idle_threshold=60, cluster_name='dummy-cluster-with-priorities', instance_type_priorities={ 'p2.xlarge': set(['10']), 'p2.8xlarge': set(['2']) }, notifier=Notifier(), dry_run=False) def tearDown(self): for moto_mock in self.mocks: moto_mock.stop() def test_scale_up(self): pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) selectors_hash = utils.selectors_to_hash(pod.selectors) asgs = self.cluster.autoscaling_groups.get_all_groups([]) self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 2) big_gpu_asg, small_gpu_asg = {}, {} if (response['AutoScalingGroups'][0]['AutoScalingGroupName'] == 'dummy-asg-small-gpu'): small_gpu_asg = response['AutoScalingGroups'][0] big_gpu_asg = response['AutoScalingGroups'][1] else: small_gpu_asg = response['AutoScalingGroups'][1] big_gpu_asg = response['AutoScalingGroups'][0] self.assertGreater(big_gpu_asg['DesiredCapacity'], 0) self.assertEqual(small_gpu_asg['DesiredCapacity'], 0)
class TestCluster(unittest.TestCase): def setUp(self): # load dummy kube specs dir_path = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f: self.dummy_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f: self.dummy_ds_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f: self.dummy_rc_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f: self.dummy_node = yaml.load(f.read()) # this isn't actually used here # only needed to create the KubePod object... self.api = pykube.HTTPClient(pykube.KubeConfig.from_file('~/.kube/config')) # start creating our mock ec2 environment self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()] for moto_mock in self.mocks: moto_mock.start() client = boto3.client('autoscaling') self.asg_client = client client.create_launch_configuration( LaunchConfigurationName='dummy-lc', ImageId='ami-deadbeef', KeyName='dummy-key', SecurityGroups=[ 'sg-cafebeef', ], InstanceType='t2.medium' ) client.create_auto_scaling_group( AutoScalingGroupName='dummy-asg', LaunchConfigurationName='dummy-lc', MinSize=0, MaxSize=10, VPCZoneIdentifier='subnet-beefbeef', Tags=[ { 'Key': 'KubernetesCluster', 'Value': 'dummy-cluster', 'PropagateAtLaunch': True }, { 'Key': 'KubernetesRole', 'Value': 'worker', 'PropagateAtLaunch': True } ] ) # finally our cluster self.cluster = Cluster( aws_access_key='', aws_secret_key='', regions=['us-west-2', 'us-east-1', 'us-west-1'], kubeconfig='~/.kube/config', idle_threshold=60, instance_init_time=60, type_idle_threshold=60, cluster_name='dummy-cluster', slack_hook='', dry_run=False ) def tearDown(self): for moto_mock in self.mocks: moto_mock.stop() def _spin_up_node(self): # spin up dummy ec2 node self.asg_client.set_desired_capacity(AutoScalingGroupName='dummy-asg', DesiredCapacity=1) response = self.asg_client.describe_auto_scaling_groups() instance_id = response['AutoScalingGroups'][0]['Instances'][0]['InstanceId'] self.dummy_node['metadata']['labels']['aws/id'] = instance_id node = KubeNode(pykube.Node(self.api, self.dummy_node)) node.cordon = mock.Mock(return_value="mocked stuff") node.drain = mock.Mock(return_value="mocked stuff") node.uncordon = mock.Mock(return_value="mocked stuff") node.delete = mock.Mock(return_value="mocked stuff") return node def test_scale_up_selector(self): self.dummy_pod['spec']['nodeSelector'] = { 'aws/type': 'm4.large' } pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) selectors_hash = utils.selectors_to_hash(pod.selectors) asgs = self.cluster.autoscaling_groups.get_all_groups([]) self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) def test_scale_up(self): pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) selectors_hash = utils.selectors_to_hash(pod.selectors) asgs = self.cluster.autoscaling_groups.get_all_groups([]) self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertGreater(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) def test_scale_down(self): """ kube node with daemonset and no pod --> cordon """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = 
self.cluster.get_running_instances_map(managed_nodes) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) running_or_pending_assigned_pods = [ds_pod] self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 self.cluster.maintain( managed_nodes, running_insts_map, pods_to_schedule, running_or_pending_assigned_pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_called_once_with() def test_scale_down_grace_period(self): """ kube node with daemonset and no pod + grace period --> noop """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map(managed_nodes) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # kube node with daemonset and no pod --> cordon ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) running_or_pending_assigned_pods = [ds_pod] self.cluster.maintain( managed_nodes, running_insts_map, pods_to_schedule, running_or_pending_assigned_pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_not_called() def test_scale_down_busy(self): """ kube node with daemonset and pod/rc-pod --> noop """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map(managed_nodes) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # kube node with daemonset and pod --> noop ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) pod_scenarios = [ # kube node with daemonset and pod --> noop [ds_pod, pod], # kube node with daemonset and rc pod --> noop [ds_pod, rc_pod] ] # make sure we're not on grace period self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 for pods in pod_scenarios: state = self.cluster.get_node_state( node, asgs[0], pods, pods_to_schedule, running_insts_map, collections.Counter()) self.assertEqual(state, ClusterNodeState.BUSY) self.cluster.maintain( managed_nodes, running_insts_map, pods_to_schedule, pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_not_called() def test_scale_down_under_utilized_undrainable(self): """ kube node with daemonset and pod/rc-pod --> noop """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map(managed_nodes) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # create some undrainable pods ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) for container in self.dummy_pod['spec']['containers']: container.pop('resources', None) pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) self.dummy_rc_pod['metadata']['labels']['openai/do-not-drain'] = 'true' for container in 
self.dummy_rc_pod['spec']['containers']: container.pop('resources', None) rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) pod_scenarios = [ # kube node with daemonset and pod with no resource ask --> noop [ds_pod, pod], # kube node with daemonset and critical rc pod --> noop [ds_pod, rc_pod] ] # make sure we're not on grace period self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 for pods in pod_scenarios: state = self.cluster.get_node_state( node, asgs[0], pods, pods_to_schedule, running_insts_map, collections.Counter()) self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE) self.cluster.maintain( managed_nodes, running_insts_map, pods_to_schedule, pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_not_called() def test_scale_down_under_utilized_drainable(self): """ kube node with daemonset and rc-pod --> cordon+drain """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map(managed_nodes) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # create some undrainable pods ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) for container in self.dummy_rc_pod['spec']['containers']: container.pop('resources', None) rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) pods = [ds_pod, rc_pod] # make sure we're not on grace period self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 state = self.cluster.get_node_state( node, asgs[0], pods, pods_to_schedule, running_insts_map, collections.Counter()) self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_DRAINABLE) self.cluster.maintain( managed_nodes, running_insts_map, pods_to_schedule, pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_called_once_with() node.drain.assert_called_once_with(pods)
def setUp(self): # load dummy kube specs dir_path = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f: self.dummy_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f: self.dummy_ds_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f: self.dummy_rc_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f: self.dummy_node = yaml.load(f.read()) # this isn't actually used here # only needed to create the KubePod object... self.api = pykube.HTTPClient(pykube.KubeConfig.from_file('~/.kube/config')) # start creating our mock ec2 environment self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()] for moto_mock in self.mocks: moto_mock.start() client = boto3.client('autoscaling') self.asg_client = client client.create_launch_configuration( LaunchConfigurationName='dummy-lc', ImageId='ami-deadbeef', KeyName='dummy-key', SecurityGroups=[ 'sg-cafebeef', ], InstanceType='t2.medium' ) client.create_auto_scaling_group( AutoScalingGroupName='dummy-asg', LaunchConfigurationName='dummy-lc', MinSize=0, MaxSize=10, VPCZoneIdentifier='subnet-beefbeef', Tags=[ { 'Key': 'KubernetesCluster', 'Value': 'dummy-cluster', 'PropagateAtLaunch': True }, { 'Key': 'KubernetesRole', 'Value': 'worker', 'PropagateAtLaunch': True } ] ) # finally our cluster self.cluster = Cluster( aws_access_key='', aws_secret_key='', regions=['us-west-2', 'us-east-1', 'us-west-1'], kubeconfig='~/.kube/config', idle_threshold=60, instance_init_time=60, type_idle_threshold=60, cluster_name='dummy-cluster', slack_hook='', dry_run=False )
class TestCluster(unittest.TestCase): def setUp(self): # load dummy kube specs dir_path = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f: self.dummy_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f: self.dummy_ds_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f: self.dummy_rc_pod = yaml.load(f.read()) with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f: self.dummy_node = yaml.load(f.read()) # this isn't actually used here # only needed to create the KubePod object... self.api = pykube.HTTPClient( pykube.KubeConfig.from_file('~/.kube/config')) # start creating our mock ec2 environment self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()] for moto_mock in self.mocks: moto_mock.start() client = boto3.client('autoscaling') self.asg_client = client client.create_launch_configuration(LaunchConfigurationName='dummy-lc', ImageId='ami-deadbeef', KeyName='dummy-key', SecurityGroups=[ 'sg-cafebeef', ], InstanceType='t2.medium') client.create_auto_scaling_group(AutoScalingGroupName='dummy-asg', LaunchConfigurationName='dummy-lc', MinSize=0, MaxSize=10, VPCZoneIdentifier='subnet-beefbeef', Tags=[{ 'Key': 'KubernetesCluster', 'Value': 'dummy-cluster', 'PropagateAtLaunch': True }, { 'Key': 'KubernetesRole', 'Value': 'worker', 'PropagateAtLaunch': True }]) # finally our cluster self.cluster = Cluster(aws_access_key='', aws_secret_key='', regions=['us-west-2', 'us-east-1', 'us-west-1'], kubeconfig='~/.kube/config', idle_threshold=60, instance_init_time=60, type_idle_threshold=60, cluster_name='dummy-cluster', slack_hook='', dry_run=False) def tearDown(self): for moto_mock in self.mocks: moto_mock.stop() def _spin_up_node(self): # spin up dummy ec2 node self.asg_client.set_desired_capacity(AutoScalingGroupName='dummy-asg', DesiredCapacity=1) response = self.asg_client.describe_auto_scaling_groups() instance_id = response['AutoScalingGroups'][0]['Instances'][0][ 'InstanceId'] self.dummy_node['metadata']['labels']['aws/id'] = instance_id node = KubeNode(pykube.Node(self.api, self.dummy_node)) node.cordon = mock.Mock(return_value="mocked stuff") node.drain = mock.Mock(return_value="mocked stuff") node.uncordon = mock.Mock(return_value="mocked stuff") node.delete = mock.Mock(return_value="mocked stuff") return node def test_scale_up_selector(self): self.dummy_pod['spec']['nodeSelector'] = {'aws/type': 'm4.large'} pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) selectors_hash = utils.selectors_to_hash(pod.selectors) asgs = self.cluster.autoscaling_groups.get_all_groups([]) self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) def test_scale_up(self): pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) selectors_hash = utils.selectors_to_hash(pod.selectors) asgs = self.cluster.autoscaling_groups.get_all_groups([]) self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertGreater(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) def test_scale_down(self): """ kube node with daemonset and no pod --> cordon """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = 
self.cluster.get_running_instances_map( managed_nodes) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) running_or_pending_assigned_pods = [ds_pod] self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 self.cluster.maintain(managed_nodes, running_insts_map, pods_to_schedule, running_or_pending_assigned_pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_called_once_with() def test_scale_down_grace_period(self): """ kube node with daemonset and no pod + grace period --> noop """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map( managed_nodes) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # kube node with daemonset and no pod --> cordon ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) running_or_pending_assigned_pods = [ds_pod] self.cluster.maintain(managed_nodes, running_insts_map, pods_to_schedule, running_or_pending_assigned_pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_not_called() def test_scale_down_busy(self): """ kube node with daemonset and pod/rc-pod --> noop """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map( managed_nodes) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # kube node with daemonset and pod --> noop ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) pod_scenarios = [ # kube node with daemonset and pod --> noop [ds_pod, pod], # kube node with daemonset and rc pod --> noop [ds_pod, rc_pod] ] # make sure we're not on grace period self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 for pods in pod_scenarios: state = self.cluster.get_node_state(node, asgs[0], pods, pods_to_schedule, running_insts_map, collections.Counter()) self.assertEqual(state, ClusterNodeState.BUSY) self.cluster.maintain(managed_nodes, running_insts_map, pods_to_schedule, pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual( response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_not_called() def test_scale_down_under_utilized_undrainable(self): """ kube node with daemonset and pod/rc-pod --> noop """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map( managed_nodes) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # create some undrainable pods ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) for container in self.dummy_pod['spec']['containers']: container.pop('resources', None) pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) self.dummy_rc_pod['metadata']['labels']['openai/do-not-drain'] = 'true' for container in 
self.dummy_rc_pod['spec']['containers']: container.pop('resources', None) rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) pod_scenarios = [ # kube node with daemonset and pod with no resource ask --> noop [ds_pod, pod], # kube node with daemonset and critical rc pod --> noop [ds_pod, rc_pod] ] # make sure we're not on grace period self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 for pods in pod_scenarios: state = self.cluster.get_node_state(node, asgs[0], pods, pods_to_schedule, running_insts_map, collections.Counter()) self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE) self.cluster.maintain(managed_nodes, running_insts_map, pods_to_schedule, pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual( response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_not_called() def test_scale_down_under_utilized_drainable(self): """ kube node with daemonset and rc-pod --> cordon+drain """ node = self._spin_up_node() all_nodes = [node] managed_nodes = [n for n in all_nodes if node.is_managed()] running_insts_map = self.cluster.get_running_instances_map( managed_nodes) pods_to_schedule = {} asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) # create some undrainable pods ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) for container in self.dummy_rc_pod['spec']['containers']: container.pop('resources', None) rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) pods = [ds_pod, rc_pod] # make sure we're not on grace period self.cluster.idle_threshold = -1 self.cluster.type_idle_threshold = -1 state = self.cluster.get_node_state(node, asgs[0], pods, pods_to_schedule, running_insts_map, collections.Counter()) self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_DRAINABLE) self.cluster.maintain(managed_nodes, running_insts_map, pods_to_schedule, pods, asgs) response = self.asg_client.describe_auto_scaling_groups() self.assertEqual(len(response['AutoScalingGroups']), 1) self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) node.cordon.assert_called_once_with() node.drain.assert_called_once_with(pods)
def main(provider_name, timer, scale_up_cap, scale_down_cap, scale_max,
         scale_min, endpoint_path, azure_subscription_id, azure_tenant_id,
         azure_client_id, azure_client_secret, azure_location,
         azure_resource_group, azure_vmss_name, verbose):
    # Logger settings
    logger_handler = logging.StreamHandler(sys.stderr)
    loginformater = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logger_handler.setFormatter(logging.Formatter(loginformater))
    LOGGER.addHandler(logger_handler)
    LOGGER.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))
    LOGGER.debug("Debug mode activated")

    if not os.environ.get('AS_PROVIDER_NAME', provider_name):
        LOGGER.error("Provider not specified, ex : --provider-name Azure")
        sys.exit(1)

    LOGGER.debug("Provider Name : " + str(os.environ.get('AS_PROVIDER_NAME', provider_name)))
    LOGGER.debug("Timer : " + str(os.environ.get('AS_TIMER', timer)))
    LOGGER.debug("Scale Up Cap : " + str(os.environ.get('AS_SCALE_UP_MAX', scale_up_cap)))
    LOGGER.debug("Scale Down Cap : " + str(os.environ.get('AS_SCALE_DOWN_MAX', scale_down_cap)))
    LOGGER.debug("Maximum Nodes : " + str(os.environ.get('AS_SCALE_MAX', scale_max)))
    LOGGER.debug("Minimum Nodes : " + str(os.environ.get('AS_SCALE_MIN', scale_min)))
    LOGGER.debug("EndPoint Path : " + str(os.environ.get('AS_ENDPOINT', endpoint_path)))
    LOGGER.debug("Azure Subscription ID : " + str(os.environ.get('AZURE_SUBSCRIPTION_ID', azure_subscription_id)))
    LOGGER.debug("Azure Tenant ID : " + str(os.environ.get('AZURE_TENANT_ID', azure_tenant_id)))
    LOGGER.debug("Azure Client ID : " + str(os.environ.get('AZURE_CLIENT_ID', azure_client_id)))
    LOGGER.debug("Azure Client Secret : " + str(os.environ.get('AZURE_CLIENT_SECRET', azure_client_secret)))
    LOGGER.debug("Azure Resource Group : " + str(os.environ.get('AZURE_RG', azure_resource_group)))
    LOGGER.debug("Azure Location : " + str(os.environ.get('AZURE_LOCATION', azure_location)))
    LOGGER.debug("Azure VMSS Targeted : " + str(os.environ.get('AZURE_VMSS', azure_vmss_name)))

    LOGGER.info("DC/OS Autoscaler Started")
    cluster = Cluster(
        provider_name=os.environ.get('AS_PROVIDER_NAME', provider_name),
        scale_up_cap=os.environ.get('AS_SCALE_UP_MAX', scale_up_cap),
        scale_down_cap=os.environ.get('AS_SCALE_DOWN_MAX', scale_down_cap),
        scale_max=os.environ.get('AS_SCALE_MAX', scale_max),
        scale_min=os.environ.get('AS_SCALE_MIN', scale_min),
        endpoint_path=os.environ.get('AS_ENDPOINT', endpoint_path),
        azure_subscription_id=os.environ.get('AZURE_SUBSCRIPTION_ID', azure_subscription_id),
        azure_tenant_id=os.environ.get('AZURE_TENANT_ID', azure_tenant_id),
        azure_client_id=os.environ.get('AZURE_CLIENT_ID', azure_client_id),
        azure_client_secret=os.environ.get('AZURE_CLIENT_SECRET', azure_client_secret),
        azure_location=os.environ.get('AZURE_LOCATION', azure_location),
        azure_resource_group=os.environ.get('AZURE_RG', azure_resource_group),
        azure_vmss_name=os.environ.get('AZURE_VMSS', azure_vmss_name))

    while True:
        metrics = {
            "totalCPU": 0,
            "totalMEM": 0,
            "usedCPU": 0,
            "usedMEM": 0,
            "ratioCPU": 0,
            "ratioMEM": 0,
            "nbNodes": 0
        }
        cluster.check_health(metrics)
        LOGGER.info("Total Cluster CPU = " + str(metrics["totalCPU"]) +
                    " - Total Cluster MEM = " + str(metrics["totalMEM"]))
        LOGGER.info("Total Used CPU = " + str(metrics["usedCPU"]) +
                    " - Total Used MEM = " + str(metrics["usedMEM"]))
        LOGGER.info("Ratio CPU = " + str(metrics["ratioCPU"]) +
                    "% - Ratio MEM = " + str(metrics["ratioMEM"]) + "%")

        if cluster.decide_to_scale(metrics) == 1:
            LOGGER.info("Scale Up Kicked... In Progress")
            cluster.scale_cluster_up(metrics)
        if cluster.decide_to_scale(metrics) == -1:
            LOGGER.info("Scale Down Kicked... In Progress")
            cluster.scale_cluster_down(metrics)

        time.sleep(timer)

    LOGGER.info("DC/OS Autoscaler Stopped")
def main(container_service_name, resource_group, sleep, kubeconfig,
         service_principal_app_id, service_principal_secret,
         service_principal_tenant_id, datadog_api_key,
         idle_threshold, spare_agents, template_file, parameters_file,
         template_file_url, parameters_file_url, over_provision,
         instance_init_time, no_scale, no_maintenance,
         slack_hook, slack_bot_token, dry_run, verbose, debug):
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    if not (service_principal_app_id and service_principal_secret and service_principal_tenant_id):
        logger.error("Missing Azure credentials. Please provide service_principal_app_id, "
                     "service_principal_secret and service_principal_tenant_id.")
        sys.exit(1)

    if (template_file and not parameters_file) or (not template_file and parameters_file):
        logger.error("Both --template-file and --parameters-file should be provided "
                     "when running on acs-engine")
        sys.exit(1)

    if template_file and template_file_url:
        logger.error('--template-file and --template-file-url are mutually exclusive.')
        sys.exit(1)

    if parameters_file and parameters_file_url:
        logger.error('--parameters-file and --parameters-file-url are mutually exclusive.')
        sys.exit(1)

    if template_file and container_service_name:
        logger.error("--template-file and --container-service-name cannot be specified "
                     "simultaneously. Provide --container-service-name when running on ACS, "
                     "or --template-file and --parameters-file when running on acs-engine")
        sys.exit(1)

    notifier = None
    if slack_hook and slack_bot_token:
        notifier = Notifier(slack_hook, slack_bot_token)

    cluster = Cluster(service_principal_app_id=service_principal_app_id,
                      service_principal_secret=service_principal_secret,
                      service_principal_tenant_id=service_principal_tenant_id,
                      kubeconfig=kubeconfig,
                      template_file=template_file,
                      template_file_url=template_file_url,
                      parameters_file_url=parameters_file_url,
                      parameters_file=parameters_file,
                      idle_threshold=idle_threshold,
                      instance_init_time=instance_init_time,
                      spare_agents=spare_agents,
                      container_service_name=container_service_name,
                      resource_group=resource_group,
                      scale_up=not no_scale,
                      maintainance=not no_maintenance,
                      over_provision=over_provision,
                      datadog_api_key=datadog_api_key,
                      notifier=notifier,
                      dry_run=dry_run)

    backoff = sleep
    while True:
        scaled = cluster.scale_loop(debug)
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            logger.warn("backoff: %s" % backoff)
            backoff *= 2
            time.sleep(backoff)
def setUp(self):
    # load dummy kube specs
    dir_path = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
        self.dummy_pod = yaml.load(f.read())
    with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f:
        self.dummy_ds_pod = yaml.load(f.read())
    with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f:
        self.dummy_rc_pod = yaml.load(f.read())
    with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
        self.dummy_node = yaml.load(f.read())

    for condition in self.dummy_node['status']['conditions']:
        if condition['type'] == 'Ready' and condition['status'] == 'True':
            condition['lastHeartbeatTime'] = datetime.now(
                condition['lastHeartbeatTime'].tzinfo)

    # Convert timestamps to strings to match PyKube
    for condition in self.dummy_node['status']['conditions']:
        condition['lastHeartbeatTime'] = datetime.isoformat(
            condition['lastHeartbeatTime'])
        condition['lastTransitionTime'] = datetime.isoformat(
            condition['lastTransitionTime'])

    # this isn't actually used here
    # only needed to create the KubePod object...
    self.api = pykube.HTTPClient(
        pykube.KubeConfig.from_file('~/.kube/config'))

    # start creating our mock ec2 environment
    self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()]
    for moto_mock in self.mocks:
        moto_mock.start()

    client = boto3.client('autoscaling', region_name='us-west-2')
    self.asg_client = client

    client.create_launch_configuration(LaunchConfigurationName='dummy-lc',
                                       ImageId='ami-deadbeef',
                                       KeyName='dummy-key',
                                       SecurityGroups=['sg-cafebeef'],
                                       InstanceType='t2.medium')

    client.create_auto_scaling_group(AutoScalingGroupName='dummy-asg',
                                     LaunchConfigurationName='dummy-lc',
                                     MinSize=0,
                                     MaxSize=10,
                                     VPCZoneIdentifier='subnet-beefbeef',
                                     Tags=[{
                                         'Key': 'KubernetesCluster',
                                         'Value': 'dummy-cluster',
                                         'PropagateAtLaunch': True
                                     }, {
                                         'Key': 'KubernetesRole',
                                         'Value': 'worker',
                                         'PropagateAtLaunch': True
                                     }])

    # finally our cluster
    self.cluster = Cluster(
        aws_access_key='fake',
        aws_secret_key='fake',
        aws_regions=['us-west-2', 'us-east-1', 'us-west-1'],
        azure_client_id='',
        azure_client_secret='',
        azure_subscription_id='',
        azure_tenant_id='',
        azure_resource_group_names=[],
        azure_slow_scale_classes=[],
        kubeconfig='~/.kube/config',
        pod_namespace=None,
        drain_utilization_below=0.3,
        idle_threshold=60,
        instance_init_time=60,
        type_idle_threshold=60,
        cluster_name='dummy-cluster',
        notifier=mock.Mock(),
        dry_run=False)
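
# An illustrative sanity-check test (not part of the original suite): it only verifies
# that the moto-backed Auto Scaling API wired up in setUp above is queryable and that
# the dummy ASG was registered with the expected bounds. The method name is an assumption.
def test_dummy_asg_is_registered(self):
    response = self.asg_client.describe_auto_scaling_groups()
    groups = {g['AutoScalingGroupName']: g for g in response['AutoScalingGroups']}
    self.assertIn('dummy-asg', groups)
    self.assertEqual(groups['dummy-asg']['MinSize'], 0)
    self.assertEqual(groups['dummy-asg']['MaxSize'], 10)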