def _get_historical_weighted_resource_value(self) -> ClustermanResources:
    """ Compute the weighted value of each type of resource in the cluster

    :returns: a ClustermanResources object with the weighted resource values, or all zeroes
        if they couldn't be determined
    """
    capacity_history = self._get_smoothed_non_zero_metadata(
        'non_orphan_fulfilled_capacity',
        time_start=arrow.now().shift(weeks=-1).timestamp,
        time_end=arrow.now().timestamp,
    )
    if not capacity_history:
        return ClustermanResources()

    time_start, time_end, non_orphan_fulfilled_capacity = capacity_history
    weighted_resource_dict: MutableMapping[str, float] = {}
    for resource in ClustermanResources._fields:
        resource_history = self._get_smoothed_non_zero_metadata(
            f'{resource}_total',
            time_start=time_start,
            time_end=time_end,
        )
        if not resource_history:
            weighted_resource_dict[resource] = 0
        else:
            weighted_resource_dict[resource] = resource_history[2] / non_orphan_fulfilled_capacity
    return ClustermanResources(**weighted_resource_dict)
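For concreteness, a small sketch of the weighting math above, using made-up smoothed values (the real numbers come from the metrics store):

# Illustration only: each smoothed resource total is divided by the smoothed
# non_orphan_fulfilled_capacity, yielding "resource units per unit of capacity".
non_orphan_fulfilled_capacity = 100.0  # assumed smoothed value
smoothed_totals = {'cpus': 400.0, 'mem': 1.6e6, 'disk': 4e6, 'gpus': 0.0}  # assumed smoothed values
weighted = {r: v / non_orphan_fulfilled_capacity for r, v in smoothed_totals.items()}
assert weighted['cpus'] == 4.0  # one unit of cluster capacity is "worth" 4 cpus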
class AgentMetadata(NamedTuple):
    agent_id: str = ''
    allocated_resources: ClustermanResources = ClustermanResources()
    batch_task_count: int = 0
    state: AgentState = AgentState.UNKNOWN
    task_count: int = 0
    total_resources: ClustermanResources = ClustermanResources()
def get_cluster_allocated_resources(self) -> ClustermanResources:
    """Get all allocated resources for the cluster"""
    allocated_resources = {
        resource: self.get_resource_allocation(resource)
        for resource in ClustermanResources._fields
    }
    return ClustermanResources(**allocated_resources)
def test_get_resource_request_only_pending_pods(pending_pods):
    assert _get_resource_request(ClustermanResources(), pending_pods) == SignalResourceRequest(
        cpus=6,
        mem=1000,
        disk=0,
        gpus=0,
    )
def _get_cluster_total_resources(self) -> ClustermanResources:
    total_resources = {
        resource: self.pool_manager.cluster_connector.get_resource_total(resource)
        for resource in ClustermanResources._fields
    }
    return ClustermanResources(**total_resources)
def total_node_resources(node: KubernetesNode) -> ClustermanResources:
    return ClustermanResources(
        cpus=ResourceParser.cpus(node.status.capacity),
        mem=ResourceParser.mem(node.status.capacity),
        disk=ResourceParser.disk(node.status.capacity),
        gpus=ResourceParser.gpus(node.status.capacity),
    )
def get_cluster_total_resources(self) -> ClustermanResources:
    """Get the total available resources for the cluster"""
    total_resources = {
        resource: self.get_resource_total(resource)
        for resource in ClustermanResources._fields
    }
    return ClustermanResources(**total_resources)
def total_agent_resources(agent: MesosAgentDict) -> ClustermanResources:
    resources = agent.get('resources', {})
    return ClustermanResources(
        cpus=resources.get('cpus', 0),
        mem=resources.get('mem', 0),
        disk=resources.get('disk', 0),
        gpus=resources.get('gpus', 0),
    )
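A minimal usage sketch for total_agent_resources; the agent dict shape (a 'resources' mapping, as in the Mesos /slaves response) is inferred from the .get calls above:

# Hypothetical agent entry; 'gpus' is absent, so it falls back to 0 via .get('gpus', 0)
agent = {'resources': {'cpus': 10, 'mem': 32768, 'disk': 100000}}
print(total_agent_resources(agent))  # ClustermanResources(cpus=10, mem=32768, disk=100000, gpus=0)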
def _get_cluster_allocated_resources(self) -> ClustermanResources:
    allocated_resources = {
        resource: self.pool_manager.cluster_connector.get_resource_allocation(resource)
        for resource in ClustermanResources._fields
    }
    return ClustermanResources(**allocated_resources)
def allocated_agent_resources(agent_dict: MesosAgentDict) -> ClustermanResources:
    used_resources = agent_dict.get('used_resources', {})
    return ClustermanResources(
        cpus=used_resources.get('cpus', 0),
        mem=used_resources.get('mem', 0),
        disk=used_resources.get('disk', 0),
        gpus=used_resources.get('gpus', 0),
    )
def total_node_resources(node: KubernetesNode, excluded_pods: List[KubernetesPod]) -> ClustermanResources:
    base_total = ClustermanResources(
        cpus=ResourceParser.cpus(node.status.allocatable),
        mem=ResourceParser.mem(node.status.allocatable),
        disk=ResourceParser.disk(node.status.allocatable),
        gpus=ResourceParser.gpus(node.status.allocatable),
    )
    excluded_resources = allocated_node_resources(excluded_pods)
    return base_total - excluded_resources
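Unlike the single-argument total_node_resources above, this variant reads node.status.allocatable (capacity minus system-reserved resources) and then subtracts the usage of excluded pods. The final line assumes ClustermanResources supports elementwise subtraction; a purely illustrative sketch of such an operator:

# Illustrative only: elementwise subtraction consistent with base_total - excluded_resources;
# the real ClustermanResources may define __sub__ differently.
def subtract_resources(a: ClustermanResources, b: ClustermanResources) -> ClustermanResources:
    # NamedTuples are iterable, so we can zip the two field tuples positionally
    return ClustermanResources(*(x - y for x, y in zip(a, b)))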
def test_scale_most_constrained_resource(self, mock_autoscaler):
    resource_request = SignalResourceRequest(cpus=500, mem=30000, disk=19000, gpus=0)
    resource_totals = ClustermanResources(cpus=1000, mem=50000, disk=20000, gpus=0)
    mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 100
    mock_autoscaler.pool_manager.cluster_connector.get_cluster_total_resources.return_value = resource_totals
    new_target_capacity = mock_autoscaler._compute_target_capacity(resource_request)

    # disk would be the most constrained resource, so we should scale the target capacity (100)
    # by an amount such that requested / (total * scale_factor) = setpoint
    expected_new_target_capacity = 100 * 19000 / (20000 * 0.7)
    assert new_target_capacity == pytest.approx(expected_new_target_capacity)
def test_excluded_resources(self, mock_autoscaler):
    resource_request = SignalResourceRequest(cpus=500, mem=30000, disk=19000, gpus=0)
    resource_totals = ClustermanResources(cpus=1000, mem=50000, disk=20000, gpus=0)
    mock_autoscaler.autoscaling_config = AutoscalingConfig(['disk'], 0.7, 0.1)
    mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 100
    mock_autoscaler.pool_manager.cluster_connector.get_cluster_total_resources.return_value = resource_totals
    new_target_capacity = mock_autoscaler._compute_target_capacity(resource_request)

    # disk would be the most constrained resource, but it's excluded,
    # so we scale on the next most constrained resource (mem)
    expected_new_target_capacity = 100 * 30000 / (50000 * 0.7)
    assert new_target_capacity == pytest.approx(expected_new_target_capacity)
def test_get_historical_weighted_resource_value(mock_autoscaler):
    mock_autoscaler._get_smoothed_non_zero_metadata = mock.Mock(side_effect=[
        (100, 200, 78),   # historical non_orphan_fulfilled_capacity
        (100, 200, 20),   # cpus
        None,             # mem
        (100, 200, 0.1),  # disk
        (100, 200, 1),    # gpus
    ])
    assert mock_autoscaler._get_historical_weighted_resource_value() == ClustermanResources(
        cpus=20 / 78,
        mem=0,
        disk=0.1 / 78,
        gpus=1 / 78,
    )
def total_pod_resources(pod: KubernetesPod) -> ClustermanResources:
    return ClustermanResources(
        cpus=sum(ResourceParser.cpus(c.resources.requests) for c in pod.spec.containers),
        mem=sum(ResourceParser.mem(c.resources.requests) for c in pod.spec.containers),
        disk=sum(ResourceParser.disk(c.resources.requests) for c in pod.spec.containers),
        gpus=sum(ResourceParser.gpus(c.resources.requests) for c in pod.spec.containers),
    )
def test_get_options_for_instance_type(mock_asrg):
    mock_asrg._group_config['AvailabilityZones'] = ['us-west-1a', 'us-west-2a']
    result = mock_asrg._get_options_for_instance_type('m5.4xlarge')
    assert len(result) == 2
    assert all(
        r.agent.total_resources == ClustermanResources(
            cpus=16,
            mem=64 * 1024,
            disk=DEFAULT_VOLUME_SIZE_GB * 1024,
            gpus=0,
        )
        for r in result
    )
    assert result[0].instance.market == InstanceMarket('m5.4xlarge', 'us-west-1a')
    assert result[1].instance.market == InstanceMarket('m5.4xlarge', 'us-west-2a')
def test_current_target_capacity_no_historical_data(self, mock_autoscaler):
    mock_autoscaler.pool_manager.cluster_connector.get_resource_total.return_value = 0
    mock_autoscaler.pool_manager.target_capacity = 0
    mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0
    mock_autoscaler._get_historical_weighted_resource_value = mock.Mock(return_value=ClustermanResources())
    new_target_capacity = mock_autoscaler._compute_target_capacity(
        {'cpus': 7, 'mem': 400, 'disk': 70, 'gpus': 0},
    )
    assert new_target_capacity == 1
def _get_options_for_instance_type(
    self,
    instance_type: str,
    weight: Optional[float] = None,
) -> List[ClusterNodeMetadata]:
    """ Generate a list of possible ClusterNode types that could be added to this ASG,
    given a particular instance type """
    options = []
    az_options = self._group_config['AvailabilityZones']
    for az in az_options:
        instance_market = InstanceMarket(instance_type, az)
        weight = weight or self.market_weight(instance_market)
        options.append(ClusterNodeMetadata(
            agent=AgentMetadata(total_resources=ClustermanResources.from_instance_type(instance_type)),
            instance=InstanceMetadata(market=instance_market, weight=weight),
        ))
    return options
def _get_agent_metadata(self, instance_ip: str) -> AgentMetadata:
    for c in self.simulator.aws_clusters:
        for i in c.instances.values():
            if instance_ip == i.ip_address:
                return AgentMetadata(
                    agent_id=str(uuid.uuid4()),
                    state=(
                        AgentState.ORPHANED
                        if self.simulator.current_time < i.join_time
                        else AgentState.IDLE
                    ),
                    total_resources=ClustermanResources(
                        cpus=i.resources.cpus,
                        mem=i.resources.mem * 1000,
                        disk=(i.resources.disk or staticconf.read_int('ebs_volume_size', 0)) * 1000,
                        gpus=i.resources.gpus,
                    ),
                )

    # if we don't know the given IP then it's orphaned
    return AgentMetadata(state=AgentState.ORPHANED)
def allocated_node_resources(pods: List[KubernetesPod]) -> ClustermanResources:
    cpus = mem = disk = gpus = 0
    for pod in pods:
        cpus += sum(ResourceParser.cpus(c.resources.requests) for c in pod.spec.containers)
        mem += sum(ResourceParser.mem(c.resources.requests) for c in pod.spec.containers)
        disk += sum(ResourceParser.disk(c.resources.requests) for c in pod.spec.containers)
        gpus += sum(ResourceParser.gpus(c.resources.requests) for c in pod.spec.containers)
    return ClustermanResources(
        cpus=cpus,
        mem=mem,
        disk=disk,
        gpus=gpus,
    )
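A short usage sketch with the kubernetes client models that the behave fixture below also uses; it assumes ResourceParser.cpus can parse the '2' request string:

# Hypothetical usage: two single-container pods, each requesting 2 cpus
pod = V1Pod(spec=V1PodSpec(containers=[
    V1Container(name='c1', resources=V1ResourceRequirements(requests={'cpu': '2'})),
]))
print(allocated_node_resources([pod, pod]).cpus)  # expected: 4, assuming ResourceParser parses '2'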
def _get_most_constrained_resource_for_request(
    self,
    resource_request: SignalResponseDict,
    cluster_total_resources: ClustermanResources,
) -> Tuple[str, float]:
    """Determine what the most constrained resource would be if we fulfilled a resource_request
    without scaling the cluster.

    :param resource_request: dictionary of resource name (cpus, mem, disk) to the requested
        quantity of that resource
    :param cluster_total_resources: the currently available resources in the cluster
    :returns: a tuple of the most constrained resource name and its utilization percentage
        if the provided request were to be fulfilled
    """
    requested_resource_usage_pcts = {}
    for resource, resource_total in cluster_total_resources._asdict().items():
        resource_request_value = resource_request.get(resource)
        if resource_request_value is None:
            continue
        if resource in self.autoscaling_config.excluded_resources:
            logger.info(f'Signal requested {resource_request_value} {resource} but it is excluded from scaling decisions')
            continue
        if resource_total == 0:
            if resource_request_value > 0:
                raise ResourceRequestError(
                    f'Signal requested {resource_request_value} for {resource} '
                    "but the cluster doesn't have any of that resource"
                )
            requested_resource_usage_pcts[resource] = 0
        else:
            requested_resource_usage_pcts[resource] = resource_request_value / resource_total
    return max(requested_resource_usage_pcts.items(), key=lambda x: x[1])
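To make the selection concrete, with the same numbers as test_scale_most_constrained_resource above: 500/1000 cpus is 50% utilization, 30000/50000 mem is 60%, and 19000/20000 disk is 95%, so the function returns ('disk', 0.95). A standalone sketch of that arithmetic:

# Worked example using the totals from test_scale_most_constrained_resource above;
# gpus (total 0, request 0) is skipped here, while the real function records it as 0.
request = {'cpus': 500, 'mem': 30000, 'disk': 19000, 'gpus': 0}
totals = {'cpus': 1000, 'mem': 50000, 'disk': 20000, 'gpus': 0}
pcts = {r: request[r] / totals[r] for r in request if totals[r] > 0}
assert max(pcts.items(), key=lambda x: x[1]) == ('disk', 0.95)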
def allocated_resources():
    return ClustermanResources(cpus=150, mem=1000, disk=500, gpus=0)
def create_k8s_autoscaler(context, prevent_scale_down_after_capacity_loss=False):
    behave.use_fixture(autoscaler_patches, context)
    context.mock_cluster_connector.__class__ = KubernetesClusterConnector
    context.mock_cluster_connector.get_cluster_allocated_resources.return_value = ClustermanResources(
        cpus=context.allocated_cpus,
    )
    context.mock_cluster_connector._pending_pods = []
    if float(context.pending_cpus) > 0:
        context.mock_cluster_connector.get_unschedulable_pods = \
            lambda: KubernetesClusterConnector.get_unschedulable_pods(context.mock_cluster_connector)
        context.mock_cluster_connector._get_pod_unschedulable_reason.side_effect = lambda pod: (
            PodUnschedulableReason.InsufficientResources
            if pod.metadata.name == 'pod1'
            else PodUnschedulableReason.Unknown
        )
        context.mock_cluster_connector._pending_pods = [
            V1Pod(
                metadata=V1ObjectMeta(name='pod1'),
                status=V1PodStatus(
                    phase='Pending',
                    conditions=[V1PodCondition(status='False', type='PodScheduled', reason='Unschedulable')],
                ),
                spec=V1PodSpec(containers=[
                    V1Container(
                        name='container1',
                        resources=V1ResourceRequirements(requests={'cpu': context.pending_cpus}),
                    ),
                ]),
            ),
            V1Pod(
                metadata=V1ObjectMeta(name='pod2'),
                status=V1PodStatus(
                    phase='Pending',
                    conditions=[V1PodCondition(status='False', type='PodScheduled', reason='Unschedulable')],
                ),
                spec=V1PodSpec(containers=[
                    V1Container(
                        name='container1',
                        resources=V1ResourceRequirements(requests={'cpu': context.pending_cpus}),
                    ),
                ]),
            ),
        ]
    context.autoscaler = Autoscaler(
        cluster='kube-test',
        pool='bar',
        apps=['bar'],
        scheduler='kubernetes',
        metrics_client=mock.Mock(),
        monitoring_enabled=False,
    )
    if prevent_scale_down_after_capacity_loss:
        context.autoscaler.autoscaling_config = AutoscalingConfig(
            excluded_resources=[],
            setpoint=0.7,
            target_capacity_margin=0.1,
            prevent_scale_down_after_capacity_loss=True,
            instance_loss_threshold=0,
        )
def change_allocated_cpus():
    reload_fn()
    context.mock_cluster_connector.get_cluster_allocated_resources.return_value = ClustermanResources(
        cpus=context.allocated_cpus,
    )
def autoscaler_patches(context):
    behave.use_fixture(boto_patches, context)
    resource_groups = {}
    for i in range(context.rgnum):
        resource_groups[f'rg{i}'] = mock.Mock(
            spec=SpotFleetResourceGroup,
            id=f'rg{i}',
            target_capacity=context.target_capacity / context.rgnum,
            fulfilled_capacity=context.target_capacity / context.rgnum,
            is_stale=False,
            min_capacity=0,
            max_capacity=float('inf'),
        )

    resource_totals = ClustermanResources(cpus=context.cpus, mem=context.mem, disk=context.disk, gpus=context.gpus)

    with staticconf.testing.PatchConfiguration(
        {'autoscaling': {'default_signal_role': 'bar'}},
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.get_monitoring_client',
    ), mock.patch(
        'clusterman.aws.util.SpotFleetResourceGroup.load',
        return_value=resource_groups,
    ), mock.patch(
        'clusterman.autoscaler.pool_manager.PoolManager',
        wraps=PoolManager,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.PoolManager.prune_excess_fulfilled_capacity',
    ), mock.patch(
        'clusterman.autoscaler.pool_manager.ClusterConnector.load',
    ) as mock_cluster_connector, mock.patch(
        'clusterman.autoscaler.autoscaler.PoolManager._calculate_non_orphan_fulfilled_capacity',
        return_value=context.target_capacity,
    ), mock.patch(
        'clusterman.signals.external_signal.ExternalSignal._connect_to_signal_process',
    ), mock.patch(
        'clusterman.signals.external_signal.get_metrics_for_signal',
    ) as mock_metrics, mock_dynamodb2():
        dynamodb.create_table(
            TableName=CLUSTERMAN_STATE_TABLE,
            KeySchema=[
                {'AttributeName': 'state', 'KeyType': 'HASH'},
                {'AttributeName': 'entity', 'KeyType': 'RANGE'},
            ],
            AttributeDefinitions=[
                {'AttributeName': 'state', 'AttributeType': 'S'},
                {'AttributeName': 'entity', 'AttributeType': 'S'},
            ],
        )
        mock_metrics.return_value = {}  # don't know why this is necessary but we get flaky tests if it's not set
        mock_cluster_connector.return_value.get_cluster_total_resources.return_value = resource_totals
        context.mock_cluster_connector = mock_cluster_connector.return_value
        yield
def test_single_resource(self, mock_autoscaler, resource, signal_resource, total_resource, expected_capacity):
    mock_autoscaler.pool_manager.target_capacity = 125
    mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 125
    mock_autoscaler.pool_manager.cluster_connector.get_cluster_total_resources.return_value = ClustermanResources(
        cpus=total_resource,
        mem=total_resource,
        disk=total_resource,
        gpus=total_resource,
    )
    new_target_capacity = mock_autoscaler._compute_target_capacity(
        SignalResourceRequest(**{resource: signal_resource}),
    )
    assert new_target_capacity == pytest.approx(expected_capacity)
def test_current_target_capacity_no_historical_data(self, mock_autoscaler):
    mock_autoscaler.pool_manager.cluster_connector.get_resource_total.return_value = 0
    mock_autoscaler.pool_manager.target_capacity = 0
    mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0
    mock_autoscaler._get_historical_weighted_resource_value = mock.Mock(return_value=ClustermanResources())
    new_target_capacity = mock_autoscaler._compute_target_capacity(
        SignalResourceRequest(cpus=7, mem=400, disk=70, gpus=0),
    )
    assert new_target_capacity == 1
def test_get_historical_weighted_resource_value_no_historical_data(mock_autoscaler):
    mock_autoscaler._get_smoothed_non_zero_metadata = mock.Mock(return_value=None)
    assert mock_autoscaler._get_historical_weighted_resource_value() == ClustermanResources()