Example #1
    def test_aggregator(self):
        # do not raise with multiple valid rate limits
        rate_limit_params_outer = RateLimitParameters("foo", "bar", None, 5)
        rate_limit_params_inner = RateLimitParameters("foo", "bar", None, 5)

        with RateLimitAggregator(
            [rate_limit_params_outer, rate_limit_params_inner]):
            pass

        # raise when the inner rate limit should fail
        rate_limit_params_outer = RateLimitParameters("foo", "bar", None, 0)
        rate_limit_params_inner = RateLimitParameters("foo", "bar", None, 5)

        with pytest.raises(RateLimitExceeded):
            with RateLimitAggregator(
                [rate_limit_params_outer, rate_limit_params_inner]):
                pass

        # raise when the outer rate limit should fail
        rate_limit_params_outer = RateLimitParameters("foo", "bar", None, 5)
        rate_limit_params_inner = RateLimitParameters("foo", "bar", None, 0)

        with pytest.raises(RateLimitExceeded):
            with RateLimitAggregator(
                [rate_limit_params_outer, rate_limit_params_inner]):
                pass
Example #2
def execute_query_with_rate_limits(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    request_settings: RequestSettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    query_settings: MutableMapping[str, Any],
) -> Result:
    # XXX: We should consider moving this so that it applies to the logical
    # query, not the physical query.
    with RateLimitAggregator(
        request_settings.get_rate_limit_params()
    ) as rate_limit_stats_container:
        stats.update(rate_limit_stats_container.to_dict())
        timer.mark("rate_limit")

        project_rate_limit_stats = rate_limit_stats_container.get_stats(
            PROJECT_RATE_LIMIT_NAME)

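        # Throttle ClickHouse threads for busy projects: reduce max_threads by
        # the number of other queries this project is running concurrently,
        # never dropping below a single thread.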
        if ("max_threads" in query_settings
                and project_rate_limit_stats is not None
                and project_rate_limit_stats.concurrent > 1):
            maxt = query_settings["max_threads"]
            query_settings["max_threads"] = max(
                1, maxt - project_rate_limit_stats.concurrent + 1)

        return execute_query(
            clickhouse_query,
            request_settings,
            formatted_query,
            reader,
            timer,
            stats,
            query_settings,
        )
Example #3
File: db_query.py Project: getsentry/snuba
def execute_query_with_rate_limits(
    clickhouse_query: Union[Query, CompositeQuery[Table]],
    query_settings: QuerySettings,
    formatted_query: FormattedQuery,
    reader: Reader,
    timer: Timer,
    stats: MutableMapping[str, Any],
    clickhouse_query_settings: MutableMapping[str, Any],
    robust: bool,
) -> Result:
    # Global rate limiter is added at the end of the chain to be
    # the last for evaluation.
    # This allows us not to borrow capacity from the global quota
    # during the evaluation if one of the more specific limiters
    # (like the project rate limiter) rejects the query first.
    query_settings.add_rate_limit(get_global_rate_limit_params())
    # XXX: We should consider moving this so that it applies to the logical
    # query, not the physical query.
    with RateLimitAggregator(
        query_settings.get_rate_limit_params()
    ) as rate_limit_stats_container:
        stats.update(rate_limit_stats_container.to_dict())
        timer.mark("rate_limit")

        project_rate_limit_stats = rate_limit_stats_container.get_stats(
            PROJECT_RATE_LIMIT_NAME
        )

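        # Thread throttling: prefer an explicit resource quota over the
        # per-query max_threads setting, then reduce the thread count by the
        # number of other concurrent queries from the same project (minimum 1).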
        thread_quota = query_settings.get_resource_quota()
        if (
            ("max_threads" in clickhouse_query_settings or thread_quota is not None)
            and project_rate_limit_stats is not None
            and project_rate_limit_stats.concurrent > 1
        ):
            maxt = (
                clickhouse_query_settings["max_threads"]
                if thread_quota is None
                else thread_quota.max_threads
            )
            clickhouse_query_settings["max_threads"] = max(
                1, maxt - project_rate_limit_stats.concurrent + 1
            )

        _record_rate_limit_metrics(rate_limit_stats_container, reader, stats)

        return execute_query(
            clickhouse_query,
            query_settings,
            formatted_query,
            reader,
            timer,
            stats,
            clickhouse_query_settings,
            robust=robust,
        )
Example #4
    def test_concurrent_limit(self):
        # No concurrent limit should not raise
        rate_limit_params = RateLimitParameters("foo", "bar", None, None)
        with rate_limit(rate_limit_params) as stats:
            assert stats is not None

        # 0 concurrent limit
        rate_limit_params = RateLimitParameters("foo", "bar", None, 0)

        with pytest.raises(RateLimitExceeded):
            with rate_limit(rate_limit_params):
                pass

        # Concurrent limit 1 with consecutive queries should not raise
        rate_limit_params = RateLimitParameters("foo", "bar", None, 1)

        with rate_limit(rate_limit_params):
            pass

        with rate_limit(rate_limit_params):
            pass

        # Concurrent limit with concurrent queries
        rate_limit_params = RateLimitParameters("foo", "bar", None, 1)

        with pytest.raises(RateLimitExceeded):
            with rate_limit(rate_limit_params):
                with rate_limit(rate_limit_params):
                    pass

        # Concurrent with different buckets should not raise
        rate_limit_params1 = RateLimitParameters("foo", "bar", None, 1)
        rate_limit_params2 = RateLimitParameters("shoe", "star", None, 1)

        with RateLimitAggregator([rate_limit_params1]):
            with RateLimitAggregator([rate_limit_params2]):
                pass
Example #5
    def test_concurrent_limit(self):
        # No concurrent limit should not raise
        rate_limit_params = RateLimitParameters('foo', 'bar', None, None)
        with rate_limit(rate_limit_params):
            pass

        # 0 concurrent limit
        rate_limit_params = RateLimitParameters('foo', 'bar', None, 0)

        with pytest.raises(RateLimitExceeded):
            with rate_limit(rate_limit_params):
                pass

        # Concurrent limit 1 with consecutive queries should not raise
        rate_limit_params = RateLimitParameters('foo', 'bar', None, 1)

        with rate_limit(rate_limit_params):
            pass

        with rate_limit(rate_limit_params):
            pass

        # Concurrent limit with concurrent queries
        rate_limit_params = RateLimitParameters('foo', 'bar', None, 1)

        with pytest.raises(RateLimitExceeded):
            with rate_limit(rate_limit_params):
                with rate_limit(rate_limit_params):
                    pass

        # Concurrent with different buckets should not raise
        rate_limit_params1 = RateLimitParameters('foo', 'bar', None, 1)
        rate_limit_params2 = RateLimitParameters('shoe', 'star', None, 1)

        with RateLimitAggregator([rate_limit_params1]):
            with RateLimitAggregator([rate_limit_params2]):
                pass
Example #6
File: query.py Project: jiankunking/snuba
def raw_query(
    request: Request,
    query: DictClickhouseQuery,
    reader: Reader[ClickhouseQuery],
    timer: Timer,
    stats: Optional[MutableMapping[str, Any]] = None,
) -> ClickhouseQueryResult:
    """
    Submit a raw SQL query to clickhouse and do some post-processing on it to
    fix some of the formatting issues in the result JSON
    """

    stats = stats or {}
    use_cache, use_deduper, uc_max = state.get_configs(
        [("use_cache", 0), ("use_deduper", 1), ("uncompressed_cache_max_cols", 5)]
    )

    all_confs = state.get_all_configs()
    query_settings = {
        k.split("/", 1)[1]: v
        for k, v in all_confs.items()
        if k.startswith("query_settings/")
    }

    # Experiment, if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in clickhouse, or result cache in snuba.
    if len(request.query.get_all_referenced_columns()) > uc_max:
        query_settings["use_uncompressed_cache"] = 0
        use_cache = 0

    timer.mark("get_configs")

    sql = query.format_sql()
    query_id = md5(force_bytes(sql)).hexdigest()
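    # Identical in-flight queries share an id (the md5 of the formatted SQL);
    # the deduper reports whether the same query is already running via is_dupe.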
    with state.deduper(query_id if use_deduper else None) as is_dupe:
        timer.mark("dedupe_wait")

        result = cache.get(query_id) if use_cache else None
        timer.mark("cache_get")

        stats.update(
            {
                "is_duplicate": is_dupe,
                "query_id": query_id,
                "use_cache": bool(use_cache),
                "cache_hit": bool(result),
            }
        )

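        # Only uncached queries go through rate limiting and execution below.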
        if not result:
            try:
                with RateLimitAggregator(
                    request.settings.get_rate_limit_params()
                ) as rate_limit_stats_container:
                    stats.update(rate_limit_stats_container.to_dict())
                    timer.mark("rate_limit")

                    project_rate_limit_stats = rate_limit_stats_container.get_stats(
                        PROJECT_RATE_LIMIT_NAME
                    )

                    if (
                        "max_threads" in query_settings
                        and project_rate_limit_stats is not None
                        and project_rate_limit_stats.concurrent > 1
                    ):
                        maxt = query_settings["max_threads"]
                        query_settings["max_threads"] = max(
                            1, maxt - project_rate_limit_stats.concurrent + 1
                        )

                    # Force query to use the first shard replica, which
                    # should have synchronously received any cluster writes
                    # before this query is run.
                    consistent = request.settings.get_consistent()
                    stats["consistent"] = consistent
                    if consistent:
                        query_settings["load_balancing"] = "in_order"
                        query_settings["max_threads"] = 1

                    try:
                        result = reader.execute(
                            query,
                            query_settings,
                            # All queries should already be deduplicated at this point
                            # But the query_id will let us know if they aren't
                            query_id=query_id if use_deduper else None,
                            with_totals=request.query.has_totals(),
                        )

                        timer.mark("execute")
                        stats.update(
                            {
                                "result_rows": len(result["data"]),
                                "result_cols": len(result["meta"]),
                            }
                        )

                        if use_cache:
                            cache.set(query_id, result)
                            timer.mark("cache_set")

                    except BaseException as ex:
                        error = str(ex)
                        logger.exception("Error running query: %s\n%s", sql, error)
                        stats = log_query_and_update_stats(
                            request, sql, timer, stats, "error", query_settings
                        )
                        meta = {}
                        if isinstance(ex, ClickHouseError):
                            err_type = "clickhouse"
                            meta["code"] = ex.code
                        else:
                            err_type = "unknown"
                        raise RawQueryException(
                            err_type=err_type,
                            message=error,
                            stats=stats,
                            sql=sql,
                            **meta,
                        )
            except RateLimitExceeded as ex:
                stats = log_query_and_update_stats(
                    request, sql, timer, stats, "rate-limited", query_settings
                )
                raise RawQueryException(
                    err_type="rate-limited",
                    message="rate limit exceeded",
                    stats=stats,
                    sql=sql,
                    detail=str(ex),
                )

    stats = log_query_and_update_stats(
        request, sql, timer, stats, "success", query_settings
    )

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        result["stats"] = stats
        result["sql"] = sql

    return result
Example #7
def raw_query(
    request: Request,
    query: ClickhouseQuery,
    client: ClickhousePool,
    timer: Timer,
    stats=None,
) -> QueryResult:
    """
    Submit a raw SQL query to clickhouse and do some post-processing on it to
    fix some of the formatting issues in the result JSON
    """
    from snuba.clickhouse.native import NativeDriverReader

    stats = stats or {}
    use_cache, use_deduper, uc_max = state.get_configs([
        ('use_cache', 0),
        ('use_deduper', 1),
        ('uncompressed_cache_max_cols', 5),
    ])

    all_confs = state.get_all_configs()
    query_settings = {
        k.split('/', 1)[1]: v
        for k, v in all_confs.items() if k.startswith('query_settings/')
    }

    # Experiment, if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in clickhouse, or result cache in snuba.
    if len(all_referenced_columns(request.query)) > uc_max:
        query_settings['use_uncompressed_cache'] = 0
        use_cache = 0

    timer.mark('get_configs')

    sql = query.format_sql()
    query_id = md5(force_bytes(sql)).hexdigest()
    with state.deduper(query_id if use_deduper else None) as is_dupe:
        timer.mark('dedupe_wait')

        result = state.get_result(query_id) if use_cache else None
        timer.mark('cache_get')

        stats.update({
            'is_duplicate': is_dupe,
            'query_id': query_id,
            'use_cache': bool(use_cache),
            'cache_hit': bool(result)
        })

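        # Cached results are returned with a 200 without touching the rate
        # limiter or ClickHouse.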
        if result:
            status = 200
        else:
            try:
                with RateLimitAggregator(
                    request.settings.get_rate_limit_params()
                ) as rate_limit_stats_container:
                    stats.update(rate_limit_stats_container.to_dict())
                    timer.mark('rate_limit')

                    project_rate_limit_stats = rate_limit_stats_container.get_stats(
                        PROJECT_RATE_LIMIT_NAME)

                    if 'max_threads' in query_settings and \
                            project_rate_limit_stats is not None and \
                            project_rate_limit_stats.concurrent > 1:
                        maxt = query_settings['max_threads']
                        query_settings['max_threads'] = max(
                            1, maxt - project_rate_limit_stats.concurrent + 1)

                    # Force query to use the first shard replica, which
                    # should have synchronously received any cluster writes
                    # before this query is run.
                    consistent = request.settings.get_consistent()
                    stats['consistent'] = consistent
                    if consistent:
                        query_settings['load_balancing'] = 'in_order'
                        query_settings['max_threads'] = 1

                    try:
                        result = NativeDriverReader(client).execute(
                            query,
                            query_settings,
                            # All queries should already be deduplicated at this point
                            # But the query_id will let us know if they aren't
                            query_id=query_id if use_deduper else None,
                            with_totals=request.query.has_totals(),
                        )
                        status = 200

                        logger.debug(sql)
                        timer.mark('execute')
                        stats.update({
                            'result_rows': len(result['data']),
                            'result_cols': len(result['meta']),
                        })

                        if use_cache:
                            state.set_result(query_id, result)
                            timer.mark('cache_set')

                    except BaseException as ex:
                        error = str(ex)
                        status = 500
                        logger.exception("Error running query: %s\n%s", sql,
                                         error)
                        if isinstance(ex, ClickHouseError):
                            result = {
                                'error': {
                                    'type': 'clickhouse',
                                    'code': ex.code,
                                    'message': error,
                                }
                            }
                        else:
                            result = {
                                'error': {
                                    'type': 'unknown',
                                    'message': error,
                                }
                            }

            except RateLimitExceeded as ex:
                error = str(ex)
                status = 429
                result = {
                    'error': {
                        'type': 'ratelimit',
                        'message': 'rate limit exceeded',
                        'detail': error
                    }
                }

    stats.update(query_settings)

    if settings.RECORD_QUERIES:
        # send to redis
        state.record_query({
            'request': request.body,
            'sql': sql,
            'timing': timer,
            'stats': stats,
            'status': status,
        })

        timer.send_metrics_to(metrics,
                              tags={
                                  'status': str(status),
                                  'referrer': stats.get('referrer', 'none'),
                                  'final': str(stats.get('final', False)),
                              },
                              mark_tags={
                                  'final': str(stats.get('final', False)),
                              })

    result['timing'] = timer

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        result['stats'] = stats
        result['sql'] = sql

    return QueryResult(result, status)
Example #8
def raw_query(
    request: Request,
    query: ClickhouseQuery,
    reader: Reader[ClickhouseQuery],
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    stats: MutableMapping[str, Any],
    trace_id: Optional[str] = None,
) -> RawQueryResult:
    """
    Submits a raw SQL query to the DB and does some post-processing on it to
    fix some of the formatting issues in the result JSON.
    This function is not supposed to depend on anything higher level than the storage
    query (ClickhouseQuery as of now). If this function ends up depending on the
    dataset, something is wrong.

    TODO: As soon as we have a StorageQuery abstraction remove all the references
    to the original query from the request.
    """

    use_cache, use_deduper, uc_max = state.get_configs([
        ("use_cache", settings.USE_RESULT_CACHE),
        ("use_deduper", 1),
        ("uncompressed_cache_max_cols", 5),
    ])

    all_confs = state.get_all_configs()
    query_settings: MutableMapping[str, Any] = {
        k.split("/", 1)[1]: v
        for k, v in all_confs.items() if k.startswith("query_settings/")
    }

    # Experiment, if we are going to grab more than X columns worth of data,
    # don't use uncompressed_cache in clickhouse, or result cache in snuba.
    if len(request.query.get_all_referenced_columns()) > uc_max:
        query_settings["use_uncompressed_cache"] = 0
        use_cache = 0

    timer.mark("get_configs")

    sql = query.format_sql()
    query_id = md5(force_bytes(sql)).hexdigest()
    with state.deduper(query_id if use_deduper else None) as is_dupe:
        timer.mark("dedupe_wait")

        result = cache.get(query_id) if use_cache else None
        timer.mark("cache_get")

        stats.update({
            "is_duplicate": is_dupe,
            "query_id": query_id,
            "use_cache": bool(use_cache),
            "cache_hit": bool(result),
        })

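        # Bind the request context once so every exit path (success, error,
        # rate-limited) records the same metadata with only the status changing.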
        update_with_status = partial(
            update_query_metadata_and_stats,
            request,
            sql,
            timer,
            stats,
            query_metadata,
            query_settings,
            trace_id,
        )

        if not result:
            try:
                with RateLimitAggregator(
                    request.settings.get_rate_limit_params()
                ) as rate_limit_stats_container:
                    stats.update(rate_limit_stats_container.to_dict())
                    timer.mark("rate_limit")

                    project_rate_limit_stats = rate_limit_stats_container.get_stats(
                        PROJECT_RATE_LIMIT_NAME)

                    if ("max_threads" in query_settings
                            and project_rate_limit_stats is not None
                            and project_rate_limit_stats.concurrent > 1):
                        maxt = query_settings["max_threads"]
                        query_settings["max_threads"] = max(
                            1, maxt - project_rate_limit_stats.concurrent + 1)

                    # Force query to use the first shard replica, which
                    # should have synchronously received any cluster writes
                    # before this query is run.
                    consistent = request.settings.get_consistent()
                    stats["consistent"] = consistent
                    if consistent:
                        query_settings["load_balancing"] = "in_order"
                        query_settings["max_threads"] = 1

                    try:
                        result = reader.execute(
                            query,
                            query_settings,
                            # All queries should already be deduplicated at this point
                            # But the query_id will let us know if they aren't
                            query_id=query_id if use_deduper else None,
                            with_totals=request.query.has_totals(),
                        )

                        timer.mark("execute")
                        stats.update({
                            "result_rows": len(result["data"]),
                            "result_cols": len(result["meta"]),
                        })

                        if use_cache:
                            cache.set(query_id, result)
                            timer.mark("cache_set")

                    except BaseException as ex:
                        error = str(ex)
                        logger.exception("Error running query: %s\n%s", sql,
                                         error)
                        stats = update_with_status("error")
                        meta = {}
                        if isinstance(ex, ClickhouseError):
                            err_type = "clickhouse"
                            meta["code"] = ex.code
                        else:
                            err_type = "unknown"
                        raise RawQueryException(
                            err_type=err_type,
                            message=error,
                            stats=stats,
                            sql=sql,
                            **meta,
                        )
            except RateLimitExceeded as ex:
                stats = update_with_status("rate-limited")
                raise RawQueryException(
                    err_type="rate-limited",
                    message="rate limit exceeded",
                    stats=stats,
                    sql=sql,
                    detail=str(ex),
                )

    stats = update_with_status("success")

    return RawQueryResult(result, {"stats": stats, "sql": sql})