Example #1
def get_metrics_for_signal(
    cluster: str,
    pool: str,
    scheduler: str,
    app: str,
    metrics_client: ClustermanMetricsBotoClient,
    required_metrics: List[MetricsConfigDict],
    end_time: arrow.Arrow,
) -> MetricsValuesDict:
    """ Get the metrics required for a signal """

    metrics: MetricsValuesDict = defaultdict(list)
    for metric_dict in required_metrics:
        if metric_dict['type'] not in (SYSTEM_METRICS, APP_METRICS):
            raise MetricsError(
                f"Metrics of type {metric_dict['type']} cannot be queried by signals."
            )

        # Need to add the cluster/pool to get the right system metrics
        # TODO (CLUSTERMAN-126) this should probably be cluster/pool/app eventually
        # TODO (CLUSTERMAN-446) if a mesos pool and a k8s pool share the same app_name,
        #      APP_METRICS will be used for both
        if metric_dict['type'] == SYSTEM_METRICS:
            dims_list = [get_cluster_dimensions(cluster, pool, scheduler)]
            if scheduler == 'mesos':  # handle old (non-scheduler-aware) metrics
                dims_list.insert(0, get_cluster_dimensions(cluster, pool, None))
        else:
            dims_list = [{}]

        # We only support regex expressions for APP_METRICS
        if 'regex' not in metric_dict:
            metric_dict['regex'] = False

        start_time = end_time.shift(minutes=-metric_dict['minute_range'])
        for dims in dims_list:
            query_results = metrics_client.get_metric_values(
                metric_dict['name'],
                metric_dict['type'],
                start_time.timestamp,
                end_time.timestamp,
                is_regex=metric_dict['regex'],
                extra_dimensions=dims,
                app_identifier=app,
            )
            for metric_name, timeseries in query_results.items():
                metrics[metric_name].extend(timeseries)
                # safeguard; the metrics _should_ already be sorted since we inserted the old
                # (non-scheduler-aware) metrics before the new metrics above, so this should be fast
                metrics[metric_name].sort()
    return metrics
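
# A minimal usage sketch, assuming SYSTEM_METRICS and get_metrics_for_signal
# are imported as above; the cluster/pool/app names are hypothetical, and
# `metrics_client` is assumed to be a ClustermanMetricsBotoClient constructed
# elsewhere.
import arrow

required_metrics = [
    {'name': 'cpus_allocated', 'type': SYSTEM_METRICS, 'minute_range': 30},
]
signal_metrics = get_metrics_for_signal(
    cluster='norcal-prod',
    pool='default',
    scheduler='mesos',
    app='my-app',
    metrics_client=metrics_client,
    required_metrics=required_metrics,
    end_time=arrow.now(),
)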
Example #2
    def _get_smoothed_non_zero_metadata(
        self,
        metric_name: str,
        time_start: arrow.Arrow,
        time_end: arrow.Arrow,
        smoothing: int = 5,
    ) -> Optional[Tuple[int, int, float]]:
        """ Compute some smoothed-out historical metrics metadata

        :param metric_name: the metadata metric to query
        :param time_start: the beginning of the historical time window to query
        :param time_end: the end of the historical time window to query
        :param smoothing: take this many non-zero metric values and average them together
        :returns: the start and end times over which the average was taken, and the smoothed-out
            metric value during this time period; or None, if no historical data exists
        """
        metrics = self.metrics_client.get_metric_values(
            metric_name,
            METADATA,
            time_start.timestamp,
            time_end.timestamp,
            extra_dimensions=get_cluster_dimensions(self.cluster, self.pool, self.scheduler),
        )[metric_name]
        latest_non_zero_values = [(ts, val) for ts, val in metrics if val > 0][-smoothing:]
        if not latest_non_zero_values:
            return None
        return (
            latest_non_zero_values[0][0],
            latest_non_zero_values[-1][0],
            sum(float(val) for __, val in latest_non_zero_values) / len(latest_non_zero_values),
        )
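
# An isolated sketch of the smoothing step above, on made-up data: keep the
# last `smoothing` non-zero points and average them.
metrics = [(100, 0.0), (160, 3.0), (220, 0.0), (280, 5.0), (340, 4.0)]
smoothing = 2
latest_non_zero_values = [(ts, val) for ts, val in metrics if val > 0][-smoothing:]
# latest_non_zero_values == [(280, 5.0), (340, 4.0)], so the method would
# return (280, 340, 4.5): the window endpoints plus the smoothed value.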
Example #3
def _populate_cluster_size_events(simulator, start_time, end_time):
    capacity_metrics = simulator.metrics_client.get_metric_values(
        'fulfilled_capacity',
        METADATA,
        start_time.timestamp,
        end_time.timestamp,
        use_cache=False,
        extra_dimensions=get_cluster_dimensions(
            simulator.metadata.cluster,
            simulator.metadata.pool,
            simulator.metadata.scheduler,
        ),
    )
    for i, (timestamp, data) in enumerate(capacity_metrics['fulfilled_capacity']):
        market_data = {}
        for market_str, value in data.items():
            market = InstanceMarket.parse(market_str)
            weight = get_market_resources(market).cpus // staticconf.read_int('cpus_per_weight')
            market_data[market] = int(value) // weight
        simulator.markets |= set(market_data.keys())
        use_join_delay = (i != 0)  # Want to start the cluster out at the expected capacity
        simulator.add_event(
            ModifyClusterSizeEvent(arrow.get(timestamp), market_data, use_join_delay),
        )
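
# A worked example of the weight arithmetic above, with hypothetical numbers:
# instances in some market have 16 CPUs and cpus_per_weight is configured to 8,
# so each instance carries weight 2; a fulfilled capacity of 10 then maps to
# 10 // 2 == 5 instances.
cpus, cpus_per_weight, fulfilled_capacity = 16, 8, 10
weight = cpus // cpus_per_weight               # 2
instance_count = fulfilled_capacity // weight  # 5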
Example #4
def generate_simple_metadata(manager: PoolManager) -> Generator[ClusterMetric, None, None]:
    dimensions = get_cluster_dimensions(manager.cluster, manager.pool, manager.scheduler)
    for metric_name, value_method in SIMPLE_METADATA.items():
        yield ClusterMetric(metric_name, value_method(manager), dimensions=dimensions)
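
# A minimal sketch of consuming the generator, assuming `manager` is an
# initialized PoolManager; the repr in the comment is illustrative.
for cluster_metric in generate_simple_metadata(manager):
    print(cluster_metric)  # e.g. ClusterMetric('target_capacity', 42, dimensions={...})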
Example #5
def generate_system_metrics(manager: PoolManager) -> Generator[ClusterMetric, None, None]:
    dimensions = get_cluster_dimensions(manager.cluster, manager.pool, manager.scheduler)
    for metric_name, value_method in SYSTEM_METRICS.items():
        yield ClusterMetric(metric_name, value_method(manager), dimensions=dimensions)
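
# SYSTEM_METRICS is assumed to map metric names to callables that read a value
# off a PoolManager; a hypothetical entry might look like:
SYSTEM_METRICS_EXAMPLE = {
    'cpus_allocated': lambda manager: manager.get_resource_allocation('cpus'),  # hypothetical accessor
}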
Example #6
def generate_simple_metadata(manager: PoolManager) -> Generator[ClusterMetric, None, None]:
    dimensions = get_cluster_dimensions(manager.cluster, manager.pool, manager.scheduler)
    for metric_name, value_method in SIMPLE_METADATA.items():
        try:
            result = value_method(manager)
        except NoResourceGroupsFoundError:
            logger.warning(f'Resources for metric {metric_name} in cluster {manager.cluster} not found')
            continue

        yield ClusterMetric(metric_name, result, dimensions=dimensions)
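
# The try/except above trades completeness for availability: a pool with no
# resource groups logs a warning and skips that metric instead of aborting the
# whole generator. A sketch of the observable behavior, assuming a hypothetical
# `empty_pool_manager` whose value methods all raise NoResourceGroupsFoundError:
collected = list(generate_simple_metadata(empty_pool_manager))
# collected == [] and one warning is logged per metric in SIMPLE_METADATA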
Example #7
def _populate_allocated_resources(simulator, start_time, end_time):
    allocated_metrics = simulator.metrics_client.get_metric_values(
        'cpus_allocated',
        SYSTEM_METRICS,
        start_time.timestamp,
        end_time.timestamp,
        use_cache=False,
        extra_dimensions=get_cluster_dimensions(
            simulator.metadata.cluster,
            simulator.metadata.pool,
            simulator.metadata.scheduler,
        ),
    )
    # It's OK to just directly set up the timeseries here, instead of using events; if the autoscaler
    # depends on these values it will re-read it from the metrics client anyways.
    #
    # In the future, we may want to make the simulator smarter (if the value of cpus_allocated exceeds the
    # simulated total cpus, for example), but for right now I don't care (CLUSTERMAN-145)
    for timestamp, data in allocated_metrics['cpus_allocated']:
        simulator.mesos_cpus_allocated.add_breakpoint(arrow.get(timestamp), float(data))
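
# An illustrative sketch of the breakpoint idea: `mesos_cpus_allocated` is
# assumed to behave like a step function, where add_breakpoint(t, v) makes the
# value v hold from time t until the next breakpoint. A toy stand-in:
import arrow

class StepFunction:
    def __init__(self):
        self.breakpoints = []  # (time, value) pairs, appended in time order

    def add_breakpoint(self, time, value):
        self.breakpoints.append((time, value))

    def value_at(self, time):
        value = 0.0
        for bp_time, bp_value in self.breakpoints:
            if bp_time <= time:
                value = bp_value
        return value

allocated = StepFunction()
allocated.add_breakpoint(arrow.get(0), 4.0)
allocated.add_breakpoint(arrow.get(60), 6.0)
assert allocated.value_at(arrow.get(30)) == 4.0  # still 4 CPUs before t=60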
Example #8
    def _make_autoscaler(self, autoscaler_config_file: str) -> None:
        fetch_count, signal_count = setup_signals_environment(
            self.metadata.pool, self.metadata.scheduler)
        signal_dir = os.path.join(os.path.expanduser('~'), '.cache', 'clusterman')

        endpoint_url = staticconf.read_string('aws.endpoint_url', '').format(svc='s3')
        env = os.environ.copy()
        if endpoint_url:
            env['AWS_ENDPOINT_URL_ARGS'] = f'--endpoint-url {endpoint_url}'

        for i in range(fetch_count):
            subprocess.run(['fetch_clusterman_signal', str(i), signal_dir], check=True, env=env)
        for i in range(signal_count):
            subprocess.Popen(['run_clusterman_signal', str(i), signal_dir], env=env)

        with open(autoscaler_config_file) as f:
            autoscaler_config = yaml.safe_load(f)
        configs = autoscaler_config.get('configs', [])
        if 'sfrs' in autoscaler_config:
            aws_configs = ec2.describe_spot_fleet_requests(
                SpotFleetRequestIds=autoscaler_config['sfrs'])
            configs.extend([
                config['SpotFleetRequestConfig']
                for config in aws_configs['SpotFleetRequestConfigs']
            ])
        pool_manager = SimulatedPoolManager(self.metadata.cluster,
                                            self.metadata.pool, configs, self)
        metric_values = self.metrics_client.get_metric_values(
            'target_capacity',
            METADATA,
            self.start_time.timestamp,
            # metrics collector runs 1x/min, but we'll try to get five data points in case some data is missing
            self.start_time.shift(minutes=5).timestamp,
            use_cache=False,
            extra_dimensions=get_cluster_dimensions(self.metadata.cluster,
                                                    self.metadata.pool,
                                                    self.metadata.scheduler),
        )
        # take the earliest data point available - this is a Decimal, which doesn't play nicely, so convert to an int
        with patch_join_delay():
            actual_target_capacity = int(metric_values['target_capacity'][0][1])
            pool_manager.modify_target_capacity(actual_target_capacity, force=True, prune=False)

        for config in configs:
            for spec in config['LaunchSpecifications']:
                self.markets |= {get_instance_market(spec)}
        self.autoscaler = Autoscaler(
            self.metadata.cluster,
            self.metadata.pool,
            self.metadata.scheduler,
            [self.metadata.pool],
            pool_manager=pool_manager,
            metrics_client=self.metrics_client,
            monitoring_enabled=False,  # no sensu alerts during simulations
        )
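
# A worked example of the endpoint-URL templating in _make_autoscaler, with a
# hypothetical config value: 'http://{svc}.localhost:4566' formatted with
# svc='s3' yields the URL handed to the signal subprocesses.
endpoint_url = 'http://{svc}.localhost:4566'.format(svc='s3')
assert endpoint_url == 'http://s3.localhost:4566'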