示例#1
0
def test_pool_manager_init(mock_pool_manager, mock_resource_groups):
    """Check fixture wiring and that a configured 'inf' kill limit parses to float('inf')."""
    assert mock_pool_manager.cluster == 'mesos-test'
    assert mock_pool_manager.pool == 'bar'
    assert mock_pool_manager.scheduler == 'mesos'

    limits_config = {
        'scaling_limits': {
            'max_tasks_to_kill': 'inf',
            'max_weight_to_add': 100,
            'max_weight_to_remove': 100,
            'min_capacity': 3,
            'max_capacity': 3,
        },
    }
    with staticconf.testing.MockConfiguration(limits_config, namespace='bar.mesos_config'):
        with mock.patch(
            'clusterman.aws.spot_fleet_resource_group.SpotFleetResourceGroup.load',
            return_value={},
        ):
            with mock.patch(
                'clusterman.autoscaler.pool_manager.DrainingClient',
                autospec=True,
            ):
                with mock.patch('clusterman.autoscaler.pool_manager.PoolManager.reload_state'):
                    manager = PoolManager('mesos-test', 'bar', 'mesos')
                    manager.resource_groups = mock_resource_groups
                    assert manager.max_tasks_to_kill == float('inf')
示例#2
0
def mark_stale(manager: PoolManager, dry_run: bool) -> str:
    """Mark all resource groups in the pool stale, prompting first unless dry_run.

    Returns an audit log message, or the empty string if the user aborted.
    """
    prompt = (
        f'Marking all resource groups in {manager.cluster}, {manager.pool}.{manager.scheduler} stale.  Proceed? '
    )
    if not dry_run and not ask_for_confirmation(prompt):
        print('Aborting operation.')
        return ''

    manager.mark_stale(dry_run)
    return (
        f'All resource groups in {manager.pool}.{manager.scheduler} on {manager.cluster} manually '
        f'marked as stale by {getuser()}'
    )
示例#3
0
def mock_pool_manager(mock_resource_groups):
    """Build a PoolManager with its external collaborators patched out."""
    with mock.patch(
        'clusterman.aws.spot_fleet_resource_group.SpotFleetResourceGroup.load',
        return_value={},
    ):
        with mock.patch(
            'clusterman.autoscaler.pool_manager.DrainingClient',
            autospec=True,
        ):
            with mock.patch('clusterman.autoscaler.pool_manager.PoolManager.reload_state'):
                with mock.patch('clusterman.autoscaler.pool_manager.ClusterConnector.load'):
                    manager = PoolManager('mesos-test', 'bar', 'mesos')
                    manager.resource_groups = mock_resource_groups
                    return manager
示例#4
0
def main(args: argparse.Namespace) -> None:
    """CLI entry point: change target capacity or mark the pool stale, then log the result.

    The two operations are mutually exclusive; any produced log message is
    printed and, outside of dry runs, forwarded to scribe.
    """
    if args.target_capacity and args.mark_stale:
        raise ValueError(
            'Cannot specify --target-capacity and --mark-stale simultaneously')

    manager = PoolManager(args.cluster, args.pool, args.scheduler)
    log_messages = []
    if args.target_capacity:
        log_messages.append(
            change_target_capacity(manager, args.target_capacity, args.dry_run))
    elif args.mark_stale:
        log_messages.append(mark_stale(manager, args.dry_run))

    # Aborted operations produce empty messages; skip them.
    for log_message in filter(None, log_messages):
        print(log_message)
        if args.dry_run:
            continue
        scribe_stream = get_autoscaler_scribe_stream(
            args.cluster, args.pool, args.scheduler)
        log_to_scribe(scribe_stream, f'{LOG_TEMPLATE} {log_message}')
示例#5
0
def print_status(manager: PoolManager, args) -> None:
    """Print a human-readable status report for the pool to stdout.

    With --verbose, agent details are listed under each resource group,
    optionally filtered to orphaned or idle agents.
    """
    sys.stdout.write('\n')
    print(
        f'Current status for the {manager.pool} pool in the {manager.cluster} cluster:\n'
    )
    print(
        f'Resource groups (target capacity: {manager.target_capacity}, fulfilled: {manager.fulfilled_capacity}, '
        f'non-orphan: {manager.non_orphan_fulfilled_capacity}):')

    # Only pay for the metadata fetch when verbose output was requested.
    node_metadatas = manager.get_node_metadatas() if args.verbose else {}

    for group in manager.resource_groups.values():
        _write_resource_group_line(group)
        for metadata in node_metadatas:
            # Show only agents belonging to this group that pass the filters.
            if metadata.instance.group_id != group.id:
                continue
            if args.only_orphans and metadata.agent.state != AgentState.ORPHANED:
                continue
            if args.only_idle and metadata.agent.state != AgentState.IDLE:
                continue
            _write_agent_details(metadata)

        sys.stdout.write('\n')

    _write_summary(manager)
    sys.stdout.write('\n')
示例#6
0
    def configure_initial(self):
        """Load configuration and build the pool manager and autoscaler."""
        setup_config(self.options)
        self.autoscaler = None
        self.logger = logger

        # TODO (CLUSTERMAN-126) someday these should not be the same thing
        self.apps = [self.options.pool]

        manager = PoolManager(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
        )
        self.autoscaler = Autoscaler(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
            self.apps,
            monitoring_enabled=(not self.options.dry_run),
            pool_manager=manager,
        )

        # The autoscaler bootstrap script already watches config for us,
        # so drop every watcher registered here.
        self.config.watchers.clear()
示例#7
0
def _status_json(manager: PoolManager, get_node_metadatas: bool) -> StatusJsonObject:
    """Assemble the machine-readable status payload for the pool."""
    # Metadata collection is expensive; skip it unless the caller asked for it.
    metadatas = manager.get_node_metadatas() if get_node_metadatas else []
    paused = autoscaling_is_paused(manager.cluster, manager.pool, manager.scheduler, arrow.now())
    groups_json = _get_resource_groups_json(manager.resource_groups.values(), metadatas)
    return {
        'disabled': paused,
        'target_capacity': manager.target_capacity,
        'fulfilled_capacity': manager.fulfilled_capacity,
        'non_orphan_fulfilled_capacity': manager.non_orphan_fulfilled_capacity,
        'resource_groups': groups_json,
    }
示例#8
0
def make_pool_manager(context, num, rg_type):
    """behave step: build a PoolManager backed by mocked resource groups.

    rg_type/rg_num are stashed on the context BEFORE the reload fixture is
    applied, since the fixture is expected to read them.
    """
    behave.use_fixture(boto_patches, context)
    behave.use_fixture(mock_agents_by_ip_and_tasks, context)
    context.rg_type = rg_type
    context.rg_num = int(num)
    behave.use_fixture(mock_reload_resource_groups, context)
    context.pool_manager = PoolManager('mesos-test', 'bar', 'mesos')
    context.rg_ids = list(context.pool_manager.resource_groups)
    context.pool_manager.max_capacity = 101
示例#9
0
 def load_pool_managers(self) -> None:
     """Recreate a PoolManager for every (pool, scheduler) pair we track."""
     logger.info('Reloading all PoolManagers')
     self.pool_managers: Mapping[str, PoolManager] = {}
     for scheduler, pools in self.pools.items():
         for pool in pools:
             key = f'{pool}.{scheduler}'
             logger.info(
                 f'Loading resource groups for {pool}.{scheduler} on {self.options.cluster}'
             )
             self.pool_managers[key] = PoolManager(self.options.cluster, pool, scheduler)
示例#10
0
def change_target_capacity(manager: PoolManager, target_capacity: str,
                           dry_run: bool) -> str:
    """Change the pool's target capacity, prompting for confirmation unless dry_run.

    Returns an audit log message, or the empty string if the user aborted.
    """
    old_target = manager.target_capacity
    requested_target = get_target_capacity_value(
        target_capacity, manager.pool, manager.scheduler)
    prompt = (
        f'Modifying target capacity for {manager.cluster}, {manager.pool}.{manager.scheduler} '
        f'from {old_target} to {requested_target}.  Proceed? '
    )
    if not dry_run and not ask_for_confirmation(prompt):
        print('Aborting operation.')
        return ''

    new_target = manager.modify_target_capacity(requested_target, dry_run)
    return (
        f'Target capacity for {manager.pool}.{manager.scheduler} on {manager.cluster} manually changed '
        f'from {old_target} to {new_target} by {getuser()}'
    )
示例#11
0
    def configure(self) -> None:
        """Load configuration and build the pool manager and autoscaler."""
        setup_config(self.options)
        self.autoscaler = None
        self.logger = logger

        # TODO (CLUSTERMAN-126) someday these should not be the same thing
        self.apps = [self.options.pool]

        manager = PoolManager(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
        )
        self.autoscaler = Autoscaler(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
            self.apps,
            monitoring_enabled=(not self.options.dry_run),
            pool_manager=manager,
        )
示例#12
0
def make_pool_manager(context, num, rg_type):
    """behave step: build a PoolManager with `num` mocked groups of `rg_type`.

    All three resource-group loaders are patched; only the one matching
    rg_type is given non-empty mock groups.
    """
    behave.use_fixture(boto_patches, context)
    behave.use_fixture(mock_agents_by_ip_and_tasks, context)
    context.rg_type = rg_type
    with mock.patch(
            'clusterman.aws.auto_scaling_resource_group.AutoScalingResourceGroup.load',
            return_value={},
    ) as mock_asg_load, mock.patch(
            'clusterman.aws.spot_fleet_resource_group.SpotFleetResourceGroup.load',
            return_value={},
    ) as mock_sfr_load, mock.patch(
            'clusterman.aws.ec2_fleet_resource_group.EC2FleetResourceGroup.load',
            return_value={},
    ) as mock_fleet_load:
        # Dispatch table: rg_type -> (patched loader, mock-group factory).
        group_mocks = {
            'asg': (mock_asg_load, mock_asgs),
            'sfr': (mock_sfr_load, mock_sfrs),
            'fleet': (mock_fleet_load, mock_fleets),
        }
        if context.rg_type in group_mocks:
            loader, factory = group_mocks[context.rg_type]
            loader.return_value = factory(int(num), context.subnet_id)
        context.pool_manager = PoolManager('mesos-test', 'bar', 'mesos')
    context.rg_ids = list(context.pool_manager.resource_groups)
    context.pool_manager.max_capacity = 101
示例#13
0
    def __init__(
        self,
        cluster: str,
        pool: str,
        scheduler: str,
        apps: List[str],
        pool_manager: Optional[PoolManager] = None,
        metrics_client: Optional[ClustermanMetricsBotoClient] = None,
        monitoring_enabled: bool = True,
    ) -> None:
        """ Class containing the core logic for autoscaling a cluster

        :param cluster: the name of the cluster to autoscale
        :param pool: the name of the pool to autoscale
        :param scheduler: the scheduler managing the pool (e.g. 'mesos')
        :param apps: a list of apps running on the pool
        :param pool_manager: a PoolManager object (used for simulations)
        :param metrics_client: a ClustermanMetricsBotoClient object (used for simulations)
        :param monitoring_enabled: set to False to disable sensu alerts during scaling
        :raises NotImplementedError: if more than one app is passed
        """
        self.cluster = cluster
        self.pool = pool
        self.scheduler = scheduler
        self.apps = apps
        self.monitoring_enabled = monitoring_enabled

        # TODO: handle multiple apps in the autoscaler (CLUSTERMAN-126)
        if len(self.apps) > 1:
            raise NotImplementedError(
                'Scaling multiple apps in a cluster is not yet supported')

        logger.info(
            f'Initializing autoscaler engine for {self.pool} in {self.cluster}...'
        )

        # One gauge for the target capacity, plus one request gauge per resource type,
        # all tagged with the cluster/pool dimensions.
        gauge_dimensions = {'cluster': cluster, 'pool': pool}
        monitoring_client = get_monitoring_client()
        self.target_capacity_gauge = monitoring_client.create_gauge(
            TARGET_CAPACITY_GAUGE_NAME, gauge_dimensions)
        self.resource_request_gauges: Dict[str, Any] = {}
        for resource in ('cpus', 'mem', 'disk'):
            self.resource_request_gauges[
                resource] = monitoring_client.create_gauge(
                    RESOURCE_GAUGE_BASE_NAME.format(resource=resource),
                    gauge_dimensions,
                )

        self.autoscaling_config = get_autoscaling_config(
            POOL_NAMESPACE.format(pool=self.pool, scheduler=self.scheduler), )
        # Callers (e.g. the simulator) may inject a pre-built pool manager;
        # otherwise build one for the real cluster.
        self.pool_manager = pool_manager or PoolManager(
            self.cluster, self.pool, self.scheduler)

        self.mesos_region = staticconf.read_string('aws.region')
        # The metrics client is likewise injectable for simulations.
        self.metrics_client = metrics_client or ClustermanMetricsBotoClient(
            self.mesos_region)
        # Fallback signal used when an app has no signal of its own.
        self.default_signal = Signal(
            self.cluster,
            self.pool,
            self.scheduler,
            '__default__',
            DEFAULT_NAMESPACE,
            self.metrics_client,
            signal_namespace=staticconf.read_string(
                'autoscaling.default_signal_role'),
        )
        # Only a single app is supported (checked above), so take the first.
        self.signal = self._get_signal_for_app(self.apps[0])
        logger.info('Initialization complete')
示例#14
0
def main(args: argparse.Namespace) -> None:  # pragma: no cover
    """CLI entry point: print pool status, as JSON if --json was passed."""
    manager = PoolManager(args.cluster, args.pool, args.scheduler)
    if not args.json:
        print_status(manager, args)
    else:
        print_status_json(manager)