Example #1
def test_get_time_range() -> None:
    """
    Test finding the time range of a query.
    """
    body = {
        "selected_columns": ["event_id"],
        "conditions": [
            # Cannot test complex conditions based on explicit calls to
            # the `and` and `or` functions, because they would not be
            # parsed as datetime by the old parser.
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", ">=", "2000-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            [("timestamp", "<", "2019-09-18T12:00:00"),
             ("project_id", "IN", [1])],
            ("project_id", "IN", [1]),
        ],
    }

    events = get_dataset("events")
    query = parse_query(body, events)
    processors = events.get_default_entity().get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(query, HTTPRequestSettings())

    from_date_ast, to_date_ast = get_time_range(identity_translate(query),
                                                "timestamp")
    assert (from_date_ast is not None and isinstance(from_date_ast, datetime)
            and from_date_ast.isoformat() == "2019-09-18T10:00:00")
    assert (to_date_ast is not None and isinstance(to_date_ast, datetime)
            and to_date_ast.isoformat() == "2019-09-19T12:00:00")
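
From the assertions above one can infer the contract of get_time_range: among the top-level AND conditions on the requested column, the tightest bounds win (the latest lower bound and the earliest upper bound), while conditions nested under OR are ignored. Below is a minimal self-contained sketch of that contract; the function name and the simplified (operator, datetime) condition representation are assumptions for illustration, since the real implementation walks the query AST:

from datetime import datetime
from typing import List, Optional, Tuple

def naive_time_range(
    conditions: List[Tuple[str, datetime]],
) -> Tuple[Optional[datetime], Optional[datetime]]:
    # Tightest bounds win: max of the lower bounds, min of the upper bounds.
    lower = [d for op, d in conditions if op == ">="]
    upper = [d for op, d in conditions if op == "<"]
    return (max(lower) if lower else None, min(upper) if upper else None)

assert naive_time_range([
    (">=", datetime(2019, 9, 18, 10)),
    (">=", datetime(2000, 9, 18, 10)),
    ("<", datetime(2019, 9, 19, 12)),
]) == (datetime(2019, 9, 18, 10), datetime(2019, 9, 19, 12))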
Example #2
def _get_date_range(query: Query) -> Optional[int]:
    """
    Best guess to find the time range for the query.
    We pick the first column that is compared with a datetime Literal.
    """
    pattern = FunctionCall(
        Or([String(ConditionFunctions.GT),
            String(ConditionFunctions.GTE)]),
        (Column(None, Param("col_name", Any(str))), Literal(Any(datetime))),
    )

    condition = query.get_condition_from_ast()
    if condition is None:
        return None
    for exp in condition:
        result = pattern.match(exp)
        if result is not None:
            from_date, to_date = get_time_range(query,
                                                result.string("col_name"))
            if from_date is None or to_date is None:
                return None
            else:
                return (to_date - from_date).days

    return None
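
Note that the final (to_date - from_date).days truncates to whole days: the 26-hour range used in the tests above yields 1, not 2. A quick check:

from datetime import datetime

delta = datetime(2019, 9, 19, 12) - datetime(2019, 9, 18, 10)
assert delta.days == 1  # timedelta.days drops the partial day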
Example #3
def test_get_time_range() -> None:
    """
    Test finding the time range of a query.
    """
    body = {
        "selected_columns": ["event_id"],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", ">=", "2000-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            [("timestamp", "<", "2019-09-18T12:00:00"),
             ("project_id", "IN", [1])],
            ("project_id", "IN", [1]),
        ],
    }

    events = get_dataset("events")
    query = parse_query(body, events)
    processors = events.get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(query, HTTPRequestSettings())

    from_date_ast, to_date_ast = get_time_range(ClickhouseQuery(query),
                                                "timestamp")
    assert (from_date_ast is not None and isinstance(from_date_ast, datetime)
            and from_date_ast.isoformat() == "2019-09-18T10:00:00")
    assert (to_date_ast is not None and isinstance(to_date_ast, datetime)
            and to_date_ast.isoformat() == "2019-09-19T12:00:00")
Example #4
    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
    ) -> QueryResult:
        from_date_ast, to_date_ast = get_time_range(query, "timestamp")
        assert from_date_ast is not None and isinstance(from_date_ast, datetime)
        assert to_date_ast is not None and isinstance(to_date_ast, datetime)

        conditions = query.get_conditions() or []
        from_date_str = next(
            (condition[2] for condition in conditions
             if condition[0] == "timestamp" and condition[1] == ">="),
            None,
        )
        to_date_str = next(
            (condition[2] for condition in conditions
             if condition[0] == "timestamp" and condition[1] == "<"),
            None,
        )
        assert from_date_str == from_date_ast.isoformat()
        assert to_date_str == to_date_ast.isoformat()

        found_timestamps.append(
            (from_date_ast.isoformat(), to_date_ast.isoformat()))

        return QueryResult({"data": []}, {})
Example #5
def test_get_time_range() -> None:
    """
    Test finding the time range of a query.
    """
    body = """
        MATCH (events)
        SELECT event_id
        WHERE timestamp >= toDateTime('2019-09-18T10:00:00')
            AND timestamp >= toDateTime('2000-09-18T10:00:00')
            AND timestamp < toDateTime('2019-09-19T12:00:00')
            AND (timestamp < toDateTime('2019-09-18T12:00:00') OR project_id IN tuple(1))
            AND project_id IN tuple(1)
        """

    events = get_dataset("events")
    query, _ = parse_snql_query(body, events)
    processors = events.get_default_entity().get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(query, HTTPQuerySettings())

    from_date_ast, to_date_ast = get_time_range(identity_translate(query),
                                                "timestamp")
    assert (from_date_ast is not None and isinstance(from_date_ast, datetime)
            and from_date_ast.isoformat() == "2019-09-18T10:00:00")
    assert (to_date_ast is not None and isinstance(to_date_ast, datetime)
            and to_date_ast.isoformat() == "2019-09-19T12:00:00")
Example #6
    def process_query(self, query: Query, request_settings: RequestSettings) -> None:
        # NOTE: the product side is restricted to a 6h window, however it rounds
        # outwards, which extends the window to 7h.
        from_date, to_date = get_time_range(query, "started")
        if not from_date or not to_date or (to_date - from_date) > timedelta(hours=7):
            raise ValidationException(
                "Minute-resolution queries are restricted to a 7-hour time window."
            )
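
The boundary behaviour of the check above: a window of exactly 7 hours is still allowed because the comparison is strict, while anything longer raises. Illustrated with plain timedelta arithmetic:

from datetime import datetime, timedelta

start = datetime(2022, 1, 1, 0, 0)
assert not (datetime(2022, 1, 1, 7, 0) - start) > timedelta(hours=7)  # exactly 7h: allowed
assert (datetime(2022, 1, 1, 7, 1) - start) > timedelta(hours=7)      # 7h01m: rejected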
Example #7
    def do_query(
        query: ClickhouseQuery, request_settings: RequestSettings,
    ) -> QueryResult:
        from_date_ast, to_date_ast = get_time_range(query, "timestamp")
        assert from_date_ast is not None and isinstance(from_date_ast, datetime)
        assert to_date_ast is not None and isinstance(to_date_ast, datetime)

        found_timestamps.append((from_date_ast.isoformat(), to_date_ast.isoformat()))

        return QueryResult({"data": []}, {})
Example #8
def parse_and_run_query(
    dataset: Dataset,
    request: Request,
    timer: Timer,
    robust: bool = False,
    concurrent_queries_gauge: Optional[Gauge] = None,
) -> QueryResult:
    """
    Runs a Snuba Query, then records the metadata about each split query that was run.
    """
    start, end = None, None
    entity_name = "unknown"
    if isinstance(request.query, LogicalQuery):
        entity_key = request.query.get_from_clause().key
        entity = get_entity(entity_key)
        entity_name = entity_key.value
        if entity.required_time_column is not None:
            start, end = get_time_range(request.query,
                                        entity.required_time_column)

    query_metadata = SnubaQueryMetadata(
        request=request,
        start_timestamp=start,
        end_timestamp=end,
        dataset=get_dataset_name(dataset),
        entity=entity_name,
        timer=timer,
        query_list=[],
        projects=ProjectsFinder().visit(request.query),
        snql_anonymized=request.snql_anonymized,
    )

    try:
        result = _run_query_pipeline(
            dataset=dataset,
            request=request,
            timer=timer,
            query_metadata=query_metadata,
            robust=robust,
            concurrent_queries_gauge=concurrent_queries_gauge,
        )
        _set_query_final(request, result.extra)
        if not request.query_settings.get_dry_run():
            record_query(request, timer, query_metadata, result.extra)
    except QueryException as error:
        _set_query_final(request, error.extra)
        record_query(request, timer, query_metadata, error.extra)
        raise error

    return result
Example #9
    def _query_overlaps_replacements(
        self,
        query: Query,
        latest_replacement_time: Optional[datetime],
    ) -> bool:
        """
        Given a Query and the latest replacement time for any project
        this query touches, returns whether or not this Query's time
        range overlaps that replacement.
        """
        query_from, _ = get_time_range(query, "timestamp")
        return (
            latest_replacement_time > query_from
            if latest_replacement_time and query_from
            else True
        )
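
The method falls back to True (assume overlap) whenever either timestamp is unknown; only a query that starts at or after the latest replacement is known not to overlap. The three branches, lifted out of the class into a standalone sketch:

from datetime import datetime
from typing import Optional

def overlaps(latest_replacement: Optional[datetime],
             query_from: Optional[datetime]) -> bool:
    # Same expression as in the method above.
    return latest_replacement > query_from if latest_replacement and query_from else True

assert overlaps(None, datetime(2022, 1, 1))                      # unknown: assume overlap
assert overlaps(datetime(2022, 1, 2), datetime(2022, 1, 1))      # replacement after query start
assert not overlaps(datetime(2022, 1, 1), datetime(2022, 1, 2))  # query starts after replacement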
Example #10
def v2_selector_function(query: Query, referrer: str) -> Tuple[str, List[str]]:
    if settings.TRANSACTIONS_UPGRADE_BEGINING_OF_TIME is None or not isinstance(
            query, ProcessableQuery):
        return ("transactions_v1", [])

    start, _ = get_time_range(query, "timestamp")
    if start is None or start < settings.TRANSACTIONS_UPGRADE_BEGINING_OF_TIME:
        return ("transactions_v1", [])

    mapping = {
        Option.TRANSACTIONS: "transactions_v1",
        Option.TRANSACTIONS_V2: "transactions_v2",
    }
    choice = RolloutSelector(Option.TRANSACTIONS, Option.TRANSACTIONS_V2,
                             "transactions").choose(referrer)
    if choice.secondary is None:
        return (mapping[choice.primary], [])
    else:
        return (mapping[choice.primary], [mapping[choice.secondary]])
Example #11
    def select_storage(self, query: Query,
                       query_settings: QuerySettings) -> StorageAndMappers:

        # If the passed-in `query_settings` arg is an instance of `SubscriptionQuerySettings`,
        # then it is a crash rate alert subscription, and we decide whether to use the
        # materialized storage or the raw storage by examining the time window:
        # if the time window is <= 1h, select the raw storage; otherwise select the
        # materialized storage.
        # NOTE: If we were to support other types of subscriptions over the sessions
        # dataset that do not follow this method of identifying which storage to use,
        # we would need a different way to distinguish them.
        if isinstance(query_settings, SubscriptionQuerySettings):
            from_date, to_date = get_time_range(query, "started")
            if from_date and to_date:
                use_materialized_storage = to_date - from_date > timedelta(hours=1)
            else:
                use_materialized_storage = True
        else:
            granularity = extract_granularity_from_query(query, "started") or 3600
            use_materialized_storage = granularity >= 3600 and granularity % 3600 == 0

        metrics.increment(
            "query.selector",
            tags={
                "selected_storage": "materialized" if use_materialized_storage else "raw",
            },
        )

        if use_materialized_storage:
            return StorageAndMappers(self.materialized_storage, sessions_hourly_translators)
        else:
            return StorageAndMappers(self.raw_storage, sessions_raw_translators)
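
Concretely: under the subscription branch a 90-minute window selects the materialized storage and a 45-minute window selects raw, while under the granularity branch the bucket size must be a whole multiple of an hour. A standalone sketch of the two decisions (function names are illustrative, not Snuba API):

from datetime import timedelta

def subscription_uses_materialized(window: timedelta) -> bool:
    return window > timedelta(hours=1)

def granularity_uses_materialized(granularity_secs: int) -> bool:
    return granularity_secs >= 3600 and granularity_secs % 3600 == 0

assert subscription_uses_materialized(timedelta(minutes=90))
assert not subscription_uses_materialized(timedelta(minutes=45))
assert granularity_uses_materialized(7200)    # 2h buckets: materialized
assert not granularity_uses_materialized(60)  # 1m buckets: raw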
Example #12
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        If a query is:
            - ORDER BY timestamp DESC
            - has no grouping
            - has an offset/limit
            - has a large time range
        We know we have to reverse-sort the entire set of rows to return the small
        chunk at the end of the time range, so optimistically split the time range
        into smaller increments, and start with the last one, so that we can potentially
        avoid querying the entire range.
        """
        limit = query.get_limit()
        if limit is None or query.get_groupby():
            return None

        if query.get_offset() >= 1000:
            return None

        orderby = query.get_orderby()
        if not orderby or orderby[0] != f"-{self.__timestamp_col}":
            return None

        conditions = query.get_conditions() or []
        from_date_str = next(
            (condition[2] for condition in conditions
             if _identify_condition(condition, self.__timestamp_col, ">=")),
            None,
        )

        to_date_str = next(
            (condition[2] for condition in conditions
             if _identify_condition(condition, self.__timestamp_col, "<")),
            None,
        )
        from_date_ast, to_date_ast = get_time_range(query,
                                                    self.__timestamp_col)

        if not from_date_str or not to_date_str:
            return None

        date_align, split_step = state.get_configs([("date_align_seconds", 1),
                                                    ("split_step", 3600)
                                                    ]  # default 1 hour
                                                   )
        to_date = util.parse_datetime(to_date_str, date_align)
        from_date = util.parse_datetime(from_date_str, date_align)

        if from_date != from_date_ast:
            logger.warning(
                "Mismatch in start date on time splitter.",
                extra={
                    "ast": str(from_date_ast),
                    "legacy": str(from_date)
                },
                exc_info=True,
            )
            metrics.increment("mismatch.ast_from_date")

        remaining_offset = query.get_offset()

        overall_result = None
        split_end = to_date
        split_start = max(split_end - timedelta(seconds=split_step), from_date)
        total_results = 0
        while split_start < split_end and total_results < limit:
            # We need to make a copy to use during the query execution because we replace
            # the start-end conditions on the query at each iteration of this loop.
            split_query = copy.deepcopy(query)

            _replace_condition(split_query, self.__timestamp_col, ">=",
                               split_start.isoformat())
            _replace_ast_condition(split_query, self.__timestamp_col, ">=",
                                   LiteralExpr(None, split_start))
            _replace_condition(split_query, self.__timestamp_col, "<",
                               split_end.isoformat())
            _replace_ast_condition(split_query, self.__timestamp_col, "<",
                                   LiteralExpr(None, split_end))

            # Because it's paged, we have to ask for (limit+offset) results
            # and set offset=0 so we can then trim them ourselves.
            split_query.set_offset(0)
            split_query.set_limit(limit - total_results + remaining_offset)

            # At every iteration we only append the "data" key from the results returned by
            # the runner. The "extra" key is only populated at the first iteration of the
            # loop and never changed.
            result = runner(split_query, request_settings)

            if overall_result is None:
                overall_result = result
            else:
                overall_result.result["data"].extend(result.result["data"])

            if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
                to_trim = min(remaining_offset,
                              len(overall_result.result["data"]))
                overall_result.result["data"] = overall_result.result["data"][
                    to_trim:]
                remaining_offset -= to_trim

            total_results = len(overall_result.result["data"])

            if total_results < limit:
                if len(result.result["data"]) == 0:
                    # If we got nothing from the last query, expand the range by a static factor
                    split_step = split_step * STEP_GROWTH
                else:
                    # If we got some results but not all of them, estimate how big the time
                    # range should be for the next query based on how many results we got for
                    # our last query and its time range, and how many we have left to fetch.
                    remaining = limit - total_results
                    split_step = split_step * math.ceil(
                        remaining / float(len(result.result["data"])))

                # Set the start and end of the next query based on the new range.
                split_end = split_start
                try:
                    split_start = max(
                        split_end - timedelta(seconds=split_step), from_date)
                except OverflowError:
                    split_start = from_date

        return overall_result
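
The step-growth arithmetic in numbers: with limit=100, a first one-hour window that returns 20 rows leaves 80 to fetch, so the next window is widened to 3600 * ceil(80 / 20) = 4 hours; an empty window instead grows by the static STEP_GROWTH factor. Checking the estimate (variable values assumed for illustration):

import math

limit, split_step = 100, 3600
rows_from_last_window = 20
remaining = limit - rows_from_last_window
next_step = split_step * math.ceil(remaining / float(rows_from_last_window))
assert next_step == 4 * 3600  # scan a window four times as wide next time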
Example #13
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        If a query is:
            - ORDER BY timestamp DESC
            - has no grouping
            - has an offset/limit
            - has a large time range
        We know we have to reverse-sort the entire set of rows to return the small
        chunk at the end of the time range, so optimistically split the time range
        into smaller increments, and start with the last one, so that we can potentially
        avoid querying the entire range.
        """
        limit = query.get_limit()
        if limit is None or query.get_groupby_from_ast():
            return None

        if query.get_offset() >= 1000:
            return None

        orderby = query.get_orderby_from_ast()
        if (
            not orderby
            or orderby[0].direction != OrderByDirection.DESC
            or not isinstance(orderby[0].expression, ColumnExpr)
            or orderby[0].expression.column_name != self.__timestamp_col
        ):
            return None

        from_date_ast, to_date_ast = get_time_range(query,
                                                    self.__timestamp_col)

        if from_date_ast is None or to_date_ast is None:
            return None

        date_align, split_step = state.get_configs([("date_align_seconds", 1),
                                                    ("split_step", 3600)
                                                    ]  # default 1 hour
                                                   )
        assert isinstance(split_step, int)
        remaining_offset = query.get_offset()

        overall_result: Optional[QueryResult] = None
        split_end = to_date_ast
        split_start = max(split_end - timedelta(seconds=split_step),
                          from_date_ast)
        total_results = 0
        while split_start < split_end and total_results < limit:
            # We need to make a copy to use during the query execution because we replace
            # the start-end conditions on the query at each iteration of this loop.
            split_query = copy.deepcopy(query)

            _replace_ast_condition(split_query, self.__timestamp_col, ">=",
                                   LiteralExpr(None, split_start))
            _replace_ast_condition(split_query, self.__timestamp_col, "<",
                                   LiteralExpr(None, split_end))

            # Because it's paged, we have to ask for (limit+offset) results
            # and set offset=0 so we can then trim them ourselves.
            split_query.set_offset(0)
            split_query.set_limit(limit - total_results + remaining_offset)

            # At every iteration we only append the "data" key from the results returned by
            # the runner. The "extra" key is only populated at the first iteration of the
            # loop and never changed.
            result = runner(split_query, request_settings)

            if overall_result is None:
                overall_result = result
            else:
                overall_result.result["data"].extend(result.result["data"])

            if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
                to_trim = min(remaining_offset,
                              len(overall_result.result["data"]))
                overall_result.result["data"] = overall_result.result["data"][
                    to_trim:]
                remaining_offset -= to_trim

            total_results = len(overall_result.result["data"])

            if total_results < limit:
                if len(result.result["data"]) == 0:
                    # If we got nothing from the last query, expand the range by a static factor
                    split_step = split_step * STEP_GROWTH
                else:
                    # If we got some results but not all of them, estimate how big the time
                    # range should be for the next query based on how many results we got for
                    # our last query and its time range, and how many we have left to fetch.
                    remaining = limit - total_results
                    split_step = split_step * math.ceil(
                        remaining / float(len(result.result["data"])))

                # Set the start and end of the next query based on the new range.
                split_end = split_start
                try:
                    split_start = max(
                        split_end - timedelta(seconds=split_step),
                        from_date_ast)
                except OverflowError:
                    split_start = from_date_ast

        return overall_result
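
Why each split query asks for limit + offset rows with offset=0: the requested offset can straddle split boundaries, so the splitter trims it client-side as pages arrive, exactly as in the remaining_offset block above. A worked example with plain lists standing in for result rows:

rows = []
remaining_offset = 5
for chunk in ([1, 2, 3], [4, 5, 6, 7], [8, 9]):
    rows.extend(chunk)
    if remaining_offset > 0 and len(rows) > 0:
        to_trim = min(remaining_offset, len(rows))
        rows = rows[to_trim:]
        remaining_offset -= to_trim

assert rows == [6, 7, 8, 9]  # rows 1-5 were consumed by the offset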