Example #1
 def wrapped_match_method(self, segments: tuple, parse_context):
     """A wrapper on the match function to do some basic validation."""
     # Use the ephemeral_segment if present. This should only
     # be the case for grammars where `ephemeral_name` is defined.
     if self.ephemeral_name:
         # We're going to return as though it's a full match, similar to Anything().
         new_grammar = copy.copy(self)
         # Reset the ephemeral name on the new version of the grammar otherwise
         # we get infinite recursion.
         new_grammar.ephemeral_name = None
         # We shouldn't allow nested ephemerals. If they're present, don't create
         # another. This can happen when grammars call super() on their match method.
         if len(segments) == 1 and segments[0].is_type("ephemeral"):
             return MatchResult.from_matched(segments)
         else:
             return MatchResult.from_matched((
                 EphemeralSegment(
                     segments=segments,
                     pos_marker=None,
                     # Ephemeral segments get a copy of the parent grammar.
                     parse_grammar=new_grammar,
                     name=self.ephemeral_name,
                 ), ))
     else:
         # Otherwise carry on through with wrapping the function.
         return func(self, segments, parse_context=parse_context)
Example #2
    def match(cls, segments, parse_context):
        """Compare input segments for a match, return a `MatchResult`.

        ReSegment implements its own matching function where
        we assume that ._template is an r"" string, and is formatted
        for use directly as a regex. This only matches on a single segment.
        """
        # If we've been passed the singular, make it a list
        if isinstance(segments, BaseSegment):
            segments = [segments]
        # Regardless of what we're passed, make a string.
        # NB: We only match on the first element of a set of segments.
        s = segments[0].raw
        # Case sensitivity is not supported
        sc = s.upper()
        if len(s) == 0:
            raise ValueError("Zero length string passed to ReSegment!?")
        # Try the regex
        result = re.match(cls._template, sc)
        if result:
            r = result.group(0)
            # Check that we've fully matched
            if r == sc:
                # Check that the _anti_template (if set) hasn't also matched
                if cls._anti_template and re.match(cls._anti_template, sc):
                    return MatchResult.from_unmatched(segments)
                else:
                    m = (
                        cls(raw=s, pos_marker=segments[0].pos_marker),
                    )  # Return a tuple
                    return MatchResult(m, segments[1:])
        return MatchResult.from_unmatched(segments)
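
A minimal, self-contained sketch of the same idea with plain `re` — a full-template match on the raw text plus an optional anti-template veto. The helper name and its return convention are illustrative, not part of the library code above:

import re

def simple_re_match(template, anti_template, raw):
    """Return `raw` only if `template` consumes the whole (upper-cased) string
    and `anti_template` (if given) does not also match."""
    candidate = raw.upper()  # case-insensitive comparison, as in the example
    result = re.match(template, candidate)
    if not result or result.group(0) != candidate:
        return None  # no match, or only a partial one
    if anti_template and re.match(anti_template, candidate):
        return None  # vetoed by the anti-template
    return raw

# simple_re_match(r"[A-Z_]+", r"NULL", "my_column") -> "my_column"
# simple_re_match(r"[A-Z_]+", r"NULL", "null")      -> None
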
Example #3
    def match(
        self,
        segments: Union[BaseSegment, Tuple[BaseSegment, ...]],
        parse_context: "ParseContext",
    ) -> MatchResult:
        """Compare input segments for a match, return a `MatchResult`.

        Note: For matching here, we only consider the *first* element,
        because we assume that a keyword can only span one raw segment.
        """
        # If we've been passed the singular, make it a tuple
        if isinstance(segments, BaseSegment):
            segments = (segments, )

        # We're only going to match against the first element
        if len(segments) >= 1:
            # Is the first one already of this type?
            if (isinstance(segments[0], self.raw_class)
                    and segments[0].name == self.name
                    and segments[0].is_type(self.type)):
                return MatchResult((segments[0], ), segments[1:])
            # Does it match?
            elif self._is_first_match(segments[0]):
                return self._make_match_from_first_result(segments)
        return MatchResult.from_unmatched(segments)
Example #4
 def match(
     cls, segments: Tuple["BaseSegment", ...], parse_context: ParseContext
 ) -> MatchResult:
     """Only useful as a terminator."""
     if segments and isinstance(segments[0], cls):
         return MatchResult((segments[0],), segments[1:])
     return MatchResult.from_unmatched(segments)
Example #5
    def match(self, segments, parse_context):
        """Evaluate conditionals and return content."""
        if not self.is_enabled(parse_context):
            return MatchResult.from_unmatched(segments)

        # Instantiate the new element and return
        new_seg = self._elements[0]()
        return MatchResult((new_seg, ), segments)
Example #6
    def match(self, segments, parse_context):
        """Match if this sequence starts with a match."""
        first_code_idx = None
        # Work through to find the first code segment...
        for idx, seg in enumerate(segments):
            if seg.is_code:
                first_code_idx = idx
                break
        else:
            # We're trying to match on a sequence of segments which contains no code.
            # That means this isn't a match.
            return MatchResult.from_unmatched(
                segments)  # pragma: no cover TODO?
        with parse_context.deeper_match() as ctx:
            match = self.target.match(segments=segments[first_code_idx:],
                                      parse_context=ctx)

        if not match:
            return MatchResult.from_unmatched(segments)

        # The match will probably have returned a mutated version rather
        # than the raw segment sent for matching. We need to reinsert it
        # back into the sequence in place of the raw one, but we can't
        # just assign at the index because it's a tuple and not a list.
        # To get around that, we do this slightly more elaborate construction.

        # NB: This match may be partial or full, either is cool. In the case
        # of a partial match, given that we're only interested in what it STARTS
        # with, we can still use the unmatched parts on the end.
        # We still need to deal with any non-code segments at the start.
        greedy_match = self.greedy_match(
            match.unmatched_segments,
            parse_context,
            matchers=[self.terminator],
            enforce_whitespace_preceding_terminator=(
                self.enforce_whitespace_preceding_terminator),
            include_terminator=self.include_terminator,
        )

        # NB: If all we matched in the greedy match was non-code then we can't
        # claim it.
        if not any(seg.is_code for seg in greedy_match.matched_segments):
            # So just return the original match.
            return match

        # Otherwise combine the results.
        return MatchResult(
            match.matched_segments + greedy_match.matched_segments,
            greedy_match.unmatched_segments,
        )
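
A rough, self-contained sketch of the "starts with a target, then consume greedily up to a terminator" shape on plain string tokens. Everything here (the token lists and the helper name) is illustrative rather than the grammar API used above:

def starts_with(tokens, target, terminator):
    """Return (matched, unmatched) if the first code-like token equals `target`."""
    # Find the first non-whitespace token, mirroring the first_code_idx search.
    idx = next((i for i, tok in enumerate(tokens) if tok.strip()), None)
    if idx is None or tokens[idx] != target:
        return (), tuple(tokens)  # no match
    # Greedily take everything up to (but not including) the terminator.
    try:
        end = tokens.index(terminator, idx + 1)
    except ValueError:
        end = len(tokens)
    return tuple(tokens[:end]), tuple(tokens[end:])

# starts_with(["  ", "SELECT", "a", ";", "x"], "SELECT", ";")
#   -> (("  ", "SELECT", "a"), (";", "x"))
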
Example #7
    def _match_once(self, segments: Tuple[BaseSegment, ...],
                    parse_context: ParseContext) -> MatchResult:
        """Match the forward segments against the available elements once.

        This serves as the main body of OneOf, but also a building block
        for AnyNumberOf.
        """
        # For efficiency, we'll be pruning options if we can
        # based on their simpleness. This provides a shortcut
        # to return earlier if we can.
        # `segments` may already be nested so we need to break out
        # the raw segments within it.
        available_options, _ = self._prune_options(segments,
                                                   parse_context=parse_context)

        # If we've pruned all the options, return unmatched (with some logging).
        if not available_options:
            return MatchResult.from_unmatched(segments)

        with parse_context.deeper_match() as ctx:
            match, _ = self._longest_trimmed_match(
                segments,
                available_options,
                parse_context=ctx,
                trim_noncode=False,
            )

        return match
Example #8
    def match(self, segments, parse_context):
        """Matches... Anything.

        Most useful in match grammars, where a later parse grammar
        will work out what's inside.
        """
        return MatchResult.from_matched(segments)
Example #9
    def match(self, segments, parse_context):
        """Matches... nothing.

        Useful for placeholders which might be overwritten by other
        dialects.
        """
        return MatchResult.from_unmatched(segments)
Example #10
        def wrapped_match_method(self_cls, segments: tuple, parse_context):
            """A wrapper on the match function to do some basic validation."""
            # Use the ephemeral_segment if present. This should only
            # be the case for grammars where `ephemeral_name` is defined.
            ephemeral_segment = getattr(self_cls, "ephemeral_segment", None)
            if ephemeral_segment:
                # We're going to return as though it's a full match, similar to Anything().
                m = MatchResult.from_matched(
                    ephemeral_segment(segments=segments))
            else:
                # Otherwise carry on through with wrapping the function.
                m = func(self_cls, segments, parse_context=parse_context)

            # Validate result
            if not isinstance(m, MatchResult):
                parse_context.logger.warning(
                    "{0}.match, returned {1} rather than MatchResult".format(
                        func.__qualname__, type(m)))

            # Log the result.
            WrapParseMatchLogObject(
                grammar=func.__qualname__,
                func="match",
                match=m,
                parse_context=parse_context,
                segments=segments,
                v_level=v_level,
            ).log()

            # Basic Validation, skipped here because it still happens in the parse commands.
            return m
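
The wrapper above boils down to "call the wrapped function, warn if the return type is wrong, log, return". A minimal decorator-shaped sketch of just the validation step (the logger and message wording are illustrative):

import functools
import logging

logger = logging.getLogger(__name__)

def validate_return_type(expected_type):
    """Decorator: warn if the wrapped function returns something unexpected."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            if not isinstance(result, expected_type):
                logger.warning(
                    "%s returned %s rather than %s",
                    func.__qualname__, type(result), expected_type.__name__,
                )
            return result
        return wrapper
    return decorator
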
Example #11
    def match(self, segments: Tuple[BaseSegment, ...],
              parse_context: ParseContext) -> "MatchResult":
        """Matches... nothing.

        Useful for placeholders which might be overwritten by other
        dialects.
        """
        return MatchResult.from_unmatched(segments)
Example #12
    def match(self, segments: Tuple[BaseSegment, ...],
              parse_context: ParseContext) -> "MatchResult":
        """Match a list of segments against this segment.

        Matching can be done from either the raw or the segments.
        This raw function can be overridden, or a grammar defined
        on the underlying class.

        The match element of Ref also implements caching,
        using the parse_context `denylist` methods.
        """
        elem = self._get_elem(dialect=parse_context.dialect)

        # First, if we have an *exclude* option, check it, because a match
        # there would prevent the rest of this grammar from matching.
        if self.exclude:
            with parse_context.deeper_match() as ctx:
                if self.exclude.match(segments, parse_context=ctx):
                    return MatchResult.from_unmatched(segments)

        # First check against the efficiency Cache.
        # We rely on segments not being mutated within a given
        # match cycle and so the ids should continue to refer to unchanged
        # objects.
        seg_tuple = tuple(id(seg) for seg in segments)
        self_name = self._get_ref()
        if parse_context.denylist.check(self_name,
                                        seg_tuple):  # pragma: no cover TODO?
            # This has been tried before.
            parse_match_logging(
                self.__class__.__name__,
                "match",
                "SKIP",
                parse_context=parse_context,
                v_level=3,
                self_name=self_name,
            )
            return MatchResult.from_unmatched(segments)

        # Match against that. NB We're not incrementing the match_depth here.
        # References shouldn't really count as a depth of match.
        with parse_context.matching_segment(self._get_ref()) as ctx:
            resp = elem.match(segments=segments, parse_context=ctx)
        if not resp:
            parse_context.denylist.mark(self_name, seg_tuple)
        return resp
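
A minimal sketch of the denylist-style cache used above, keyed on the grammar name plus the ids of the segments. The ids need to be materialised into a tuple (as in the example) so the key is hashable and can be reused for both check() and mark(). The class below is illustrative, not the library's implementation:

class Denylist:
    """Remember (name, segment-ids) combinations that already failed to match."""

    def __init__(self):
        self._failed = set()

    def check(self, name, seg_ids):
        # seg_ids should already be a tuple, e.g. tuple(id(s) for s in segments),
        # so the key is hashable and stable across check() and mark().
        return (name, seg_ids) in self._failed

    def mark(self, name, seg_ids):
        self._failed.add((name, seg_ids))
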
Example #13
    def match(self, segments: Tuple[BaseSegment, ...],
              parse_context: ParseContext) -> "MatchResult":
        """Matches... Anything.

        Most useful in match grammars, where a later parse grammar
        will work out what's inside.
        """
        return MatchResult.from_matched(segments)
Example #14
 def match(self, segments, parse_context):
     """Match any starting non-code segments."""
     if not isinstance(segments, tuple):  # pragma: no cover
         raise TypeError("NonCodeMatcher expects a tuple.")
     idx = 0
     while idx < len(segments) and not segments[idx].is_code:
         idx += 1
     return MatchResult(segments[:idx], segments[idx:])
Example #15
def test__parser__match_add(input_func, raw_seg):
    """Test construction of MatchResults."""
    m1 = MatchResult.from_matched([raw_seg])
    # Test adding
    m2 = m1 + input_func(raw_seg)
    # Check it's a match result
    assert isinstance(m2, MatchResult)
    # In all cases, it should also be of length 2
    assert len(m2) == 2
Example #16
    def match(self, segments: Tuple[BaseSegment, ...],
              parse_context: ParseContext) -> MatchResult:
        """Match against any of the elements a relevant number of times.

        If it matches multiple, it returns the longest, and if any are the same
        length it returns the first (unless we explicitly just match first).
        """
        # First, if we have an *exclude* option, check it, because a match
        # there would prevent the rest of this grammar from matching.
        if self.exclude:
            with parse_context.deeper_match() as ctx:
                if self.exclude.match(segments, parse_context=ctx):
                    return MatchResult.from_unmatched(segments)

        # Match on each of the options
        matched_segments: MatchResult = MatchResult.from_empty()
        unmatched_segments: Tuple[BaseSegment, ...] = segments
        n_matches = 0
        while True:
            if self.max_times and n_matches >= self.max_times:
                # We've matched as many times as we can
                return MatchResult(matched_segments.matched_segments,
                                   unmatched_segments)

            # Is there anything left to match?
            if len(unmatched_segments) == 0:
                # No...
                if n_matches >= self.min_times:
                    return MatchResult(matched_segments.matched_segments,
                                       unmatched_segments)
                else:
                    # We didn't meet the hurdle
                    return MatchResult.from_unmatched(unmatched_segments)

            # If we've already matched once...
            if n_matches > 0 and self.allow_gaps:
                # Consume any non-code if there is any
                pre_seg, mid_seg, post_seg = trim_non_code_segments(
                    unmatched_segments)
                unmatched_segments = mid_seg + post_seg
            else:
                pre_seg = ()  # empty tuple

            match = self._match_once(unmatched_segments,
                                     parse_context=parse_context)
            if match:
                matched_segments += pre_seg + match.matched_segments
                unmatched_segments = match.unmatched_segments
                n_matches += 1
            else:
                # If we get here, then we've not managed to match. And the next
                # unmatched segments are meaningful, i.e. they're not what we're
                # looking for.
                if n_matches >= self.min_times:
                    return MatchResult(matched_segments.matched_segments,
                                       pre_seg + unmatched_segments)
                else:
                    # We didn't meet the hurdle
                    return MatchResult.from_unmatched(unmatched_segments)
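
A stripped-down sketch of the bounded repetition loop above, matching a single literal token instead of a grammar. The function name and the (matched, unmatched) return shape are illustrative:

def match_repeated(tokens, target, min_times=1, max_times=None):
    """Greedily consume leading `target` tokens, honouring min/max bounds."""
    matched = []
    remaining = list(tokens)
    while remaining and remaining[0] == target:
        if max_times is not None and len(matched) >= max_times:
            break
        matched.append(remaining.pop(0))
    if len(matched) < min_times:
        return (), tuple(tokens)  # didn't meet the hurdle
    return tuple(matched), tuple(remaining)

# match_repeated(["a", "a", "b"], "a")      -> (("a", "a"), ("b",))
# match_repeated(["b"], "a", min_times=1)   -> ((), ("b",))
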
Example #17
    def match(cls, segments, parse_context):
        """Compare input segments for a match, return a `MatchResult`.

        Note: For Keyword matching, we only consider the *first* element,
        because we assume that a keyword can only span one raw segment.
        """
        # If we've been passed the singular, make it a list
        if isinstance(segments, BaseSegment):
            segments = [segments]

        # We're only going to match against the first element
        if len(segments) >= 1:
            raw = segments[0].raw
            pos = segments[0].pos_marker
            raw_comp = raw.upper()

            if cls._template == raw_comp:
                m = (cls(raw=raw, pos_marker=pos), )  # Return as a tuple
                return MatchResult(m, segments[1:])
        return MatchResult.from_unmatched(segments)
Example #18
    def match(cls, segments, parse_context):
        """Compare input segments for a match, return a `MatchResult`.

        NamedSegment implements its own matching function where
        we assume that ._template is the `name` of a segment.
        """
        # If we've been passed the singular, make it a list
        if isinstance(segments, BaseSegment):
            segments = [segments]

        # We only match on the first element of a set of segments
        if len(segments) >= 1:
            s = segments[0]
            # Case sensitivity is not supported.
            n = s.name.upper()
            if cls._template == n:
                m = (cls(raw=s.raw, pos_marker=segments[0].pos_marker),
                     )  # Return a tuple
                return MatchResult(m, segments[1:])
        return MatchResult.from_unmatched(segments)
Example #19
    def _make_match_from_first_result(self, segments: Tuple[BaseSegment, ...]):
        """Make a MatchResult from the first segment in the given list.

        This is a helper function for reuse by other parsers.
        """
        # Construct the segment object
        new_seg = self.raw_class(
            raw=segments[0].raw,
            pos_marker=segments[0].pos_marker,
            type=self.type,
            name=self.name,
            **self.segment_kwargs,
        )
        # Return as a tuple
        return MatchResult((new_seg, ), segments[1:])
Example #20
    def match(self, segments, parse_context):
        """Match a list of segments against this segment.

        Matching can be done from either the raw or the segments.
        This raw function can be overridden, or a grammar defined
        on the underlying class.

        The match element of Ref also implements caching,
        using the parse_context `blacklist` methods.
        """
        elem = self._get_elem(dialect=parse_context.dialect)

        if not elem:
            raise ValueError(
                "Null Element returned! _elements: {0!r}".format(self._elements)
            )

        # First check against the efficiency Cache.
        # We used to use seg_to_tuple here, but it was too slow,
        # so instead we rely on segments not being mutated within a given
        # match cycle and so the ids should continue to refer to unchanged
        # objects.
        seg_tuple = tuple(id(seg) for seg in segments)
        self_name = self._get_ref()
        if parse_context.blacklist.check(self_name, seg_tuple):
            # This has been tried before.
            parse_match_logging(
                self.__class__.__name__,
                "match",
                "SKIP",
                parse_context=parse_context,
                v_level=3,
                self_name=self_name,
            )
            return MatchResult.from_unmatched(segments)

        # Match against that. NB We're not incrementing the match_depth here.
        # References shouldn't really count as a depth of match.
        with parse_context.matching_segment(self._get_ref()) as ctx:
            resp = elem.match(segments=segments, parse_context=ctx)
        if not resp:
            parse_context.blacklist.mark(self_name, seg_tuple)
        return resp
Example #21
    def match(
        self,
        segments: Union[BaseSegment, Tuple[BaseSegment, ...]],
        parse_context: "ParseContext",
    ) -> MatchResult:
        """Compare input segments for a match, return a `MatchResult`.

        Note: For matching here, we only consider the *first* element,
        because we assume that a keyword can only span one raw segment.
        """
        # If we've been passed the singular, make it a tuple
        if isinstance(segments, BaseSegment):
            segments = (segments, )

        # We're only going to match against the first element
        if len(segments) >= 1:
            if self._is_first_match(segments[0]):
                return self._make_match_from_first_result(segments)
        return MatchResult.from_unmatched(segments)
Example #22
 def wrapped_match_method(self, segments: tuple, parse_context):
     """A wrapper on the match function to do some basic validation."""
     # Use the ephemeral_segment if present. This should only
     # be the case for grammars where `ephemeral_name` is defined.
     if self.ephemeral_name:
         # We're going to return as though it's a full match, similar to Anything().
         new_grammar = copy.copy(self)
         # Reset the ephemeral name on the new version of the grammar otherwise
         # we get infinite recursion.
         new_grammar.ephemeral_name = None
         return MatchResult.from_matched((
             EphemeralSegment(
                 segments=segments,
                 pos_marker=None,
                 # Ephemeral segments get a copy of the parent grammar.
                 parse_grammar=new_grammar,
                 name=self.ephemeral_name,
             ), ))
     else:
         # Otherwise carry on through with wrapping the function.
         return func(self, segments, parse_context=parse_context)
Example #23
    def match(cls, segments: Tuple["BaseSegment", ...],
              parse_context: ParseContext) -> MatchResult:
        """Match a list of segments against this segment.

        Note: Match for segments is done in the ABSTRACT.
        When dealing with concrete segments we're always in parse.
        Parse is what happens during expand.

        Matching can be done from either the raw or the segments.
        This raw function can be overridden, or a grammar defined
        on the underlying class.
        """
        # Edge case, but it's possible that we have *already matched* on
        # a previous cycle, so we should first check whether this is a
        # case of that.
        if len(segments) == 1 and isinstance(segments[0], cls):
            # This has already matched. Winner.
            parse_match_logging(
                cls.__name__,
                "_match",
                "SELF",
                parse_context=parse_context,
                v_level=3,
                symbol="+++",
            )
            return MatchResult.from_matched(segments)
        elif len(segments) > 1 and isinstance(segments[0], cls):
            parse_match_logging(
                cls.__name__,
                "_match",
                "SELF",
                parse_context=parse_context,
                v_level=3,
                symbol="+++",
            )
            # This has already matched, but only partially.
            return MatchResult((segments[0], ), segments[1:])

        if cls.match_grammar:
            # Call the private method
            with parse_context.deeper_match() as ctx:
                m = cls.match_grammar.match(segments=segments,
                                            parse_context=ctx)

            # Check the type of the result here, so everything downstream can rely on a MatchResult.
            if not isinstance(m, MatchResult):
                raise TypeError(
                    "[PD:{0} MD:{1}] {2}.match. Result is {3}, not a MatchResult!"
                    .format(
                        parse_context.parse_depth,
                        parse_context.match_depth,
                        cls.__name__,
                        type(m),
                    ))
            # From here on we can deal with it simply as a MatchResult
            if m.has_match():
                return MatchResult((cls(segments=m.matched_segments), ),
                                   m.unmatched_segments)
            else:
                return MatchResult.from_unmatched(segments)
        else:
            raise NotImplementedError(
                "{0} has no match function implemented".format(cls.__name__))
Example #24
    def _bracket_sensitive_look_ahead_match(
        cls,
        segments,
        matchers,
        parse_context,
        start_bracket=None,
        end_bracket=None,
        bracket_pairs_set="bracket_pairs",
    ):
        """Same as `_look_ahead_match` but with bracket counting.

        NB: Given we depend on `_look_ahead_match` we can also utilise
        the same performance optimisations which are implemented there.

        bracket_pairs_set: Allows specific segments to override the available
            bracket pairs. See the definition of "angle_bracket_pairs" in the
            BigQuery dialect for additional context on why this exists.

        Returns:
            `tuple` of (unmatched_segments, match_object, matcher).

        """
        # Type munging
        matchers = list(matchers)
        if isinstance(segments, BaseSegment):
            segments = [segments]

        # Have we been passed an empty list?
        if len(segments) == 0:
            return ((), MatchResult.from_unmatched(segments), None)

        # Get hold of the bracket matchers from the dialect, and append them
        # to the list of matchers. We get them from the relevant set on the
        # dialect. We use zip twice to "unzip" them. We ignore the first
        # argument because that's just the name.
        _, start_bracket_refs, end_bracket_refs = zip(
            *parse_context.dialect.sets(bracket_pairs_set)
        )
        # These are matchables, probably StringParsers.
        start_brackets = [
            parse_context.dialect.ref(seg_ref) for seg_ref in start_bracket_refs
        ]
        end_brackets = [
            parse_context.dialect.ref(seg_ref) for seg_ref in end_bracket_refs
        ]
        # Add any bracket-like things passed as arguments
        if start_bracket:
            start_brackets += [start_bracket]
        if end_bracket:
            end_brackets += [end_bracket]
        bracket_matchers = start_brackets + end_brackets

        # Make some buffers
        seg_buff = segments
        pre_seg_buff = ()  # NB: Tuple
        bracket_stack: List[BracketInfo] = []

        # Iterate
        while True:
            # Do we have anything left to match on?
            if seg_buff:
                # Yes we have buffer left to work with.
                # Are we already in a bracket stack?
                if bracket_stack:
                    # Yes, we're just looking for the closing bracket, or
                    # another opening bracket.
                    pre, match, matcher = cls._look_ahead_match(
                        seg_buff,
                        bracket_matchers,
                        parse_context=parse_context,
                    )

                    if match:
                        # NB: We can only consider this as a nested bracket if the start
                        # and end tokens are not the same. If a matcher is both a start and
                        # end token we cannot deepen the bracket stack. In general, quoted
                        # strings are a typical example where the start and end tokens are
                        # the same. Currently, though, quoted strings are handled elsewhere
                        # in the parser, and there are no cases where *this* code has to
                        # handle identical start and end brackets. For now, consider this
                        # a small, speculative investment in a possible future requirement.
                        if matcher in start_brackets and matcher not in end_brackets:
                            # Same procedure as below in finding brackets.
                            bracket_stack.append(
                                BracketInfo(
                                    bracket=match.matched_segments[0],
                                )
                            )
                            pre_seg_buff += pre
                            pre_seg_buff += match.matched_segments
                            seg_buff = match.unmatched_segments
                            continue
                        elif matcher in end_brackets:
                            # Found an end bracket. Does its type match that of
                            # the innermost start bracket? E.g. ")" matches "(",
                            # "]" matches "[".
                            # For the start bracket we don't have the matcher
                            # but we can work out the name, so we use that for
                            # the lookup.
                            start_index = [
                                bracket.name for bracket in start_brackets
                            ].index(bracket_stack[-1].bracket.name)
                            # For the end index, we can just look for the matcher
                            end_index = end_brackets.index(matcher)
                            bracket_types_match = start_index == end_index
                            if bracket_types_match:
                                # Yes, the types match. So we've found a
                                # matching end bracket. Pop the stack and carry
                                # on.
                                bracket_stack.pop()
                                pre_seg_buff += pre
                                pre_seg_buff += match.matched_segments
                                seg_buff = match.unmatched_segments
                                continue
                            else:
                                # The types don't match. Error.
                                raise SQLParseError(
                                    f"Found unexpected end bracket!, was expecting {end_brackets[start_index]}, but got {matcher}",
                                    segment=match.matched_segments[0],
                                )

                        else:
                            raise RuntimeError("I don't know how we get here?!")
                    else:
                        # No match, we're in a bracket stack. Error.
                        raise SQLParseError(
                            "Couldn't find closing bracket for opening bracket.",
                            segment=bracket_stack[-1].bracket,
                        )
                else:
                    # No, we're open to more opening brackets or the thing(s)
                    # that we're otherwise looking for.
                    pre, match, matcher = cls._look_ahead_match(
                        seg_buff,
                        matchers + bracket_matchers,
                        parse_context=parse_context,
                    )

                    if match:
                        if matcher in matchers:
                            # It's one of the things we were looking for!
                            # Return.
                            return (pre_seg_buff + pre, match, matcher)
                        elif matcher in start_brackets:
                            # We've found the start of a bracket segment.
                            # NB: It might not *actually* be the bracket itself,
                            # but could be some non-code element preceding it.
                            # That's actually ok.

                            # Add the bracket to the stack.
                            bracket_stack.append(
                                BracketInfo(
                                    bracket=match.matched_segments[0],
                                )
                            )
                            # Add the matched elements and anything before it to the
                            # pre segment buffer. Reset the working buffer.
                            pre_seg_buff += pre
                            pre_seg_buff += match.matched_segments
                            seg_buff = match.unmatched_segments
                            continue
                        elif matcher in end_brackets:
                            # We've found an unexpected end bracket! This is likely
                            # because we're matching a section which should have ended.
                            # If we had a match, it would have matched by now, so this
                            # means no match.
                            parse_match_logging(
                                cls.__name__,
                                "_bracket_sensitive_look_ahead_match",
                                "UEXB",
                                parse_context=parse_context,
                                v_level=3,
                                got=matcher,
                            )
                            return ((), MatchResult.from_unmatched(segments), None)
                        else:
                            # This shouldn't happen!?
                            raise NotImplementedError(
                                "This shouldn't happen. Panic in _bracket_sensitive_look_ahead_match."
                            )
                    else:
                        # Not in a bracket stack, but no match. This is a happy
                        # unmatched exit.
                        return ((), MatchResult.from_unmatched(segments), None)
            else:
                # No we're at the end:
                # Now check have we closed all our brackets?
                if bracket_stack:
                    # No we haven't.
                    raise SQLParseError(
                        f"Couldn't find closing bracket for opened brackets: `{bracket_stack}`.",
                        segment=bracket_stack[-1].bracket,
                    )

                # We reached the end with no open brackets. This is a friendly
                # unmatched return.
                return ((), MatchResult.from_unmatched(segments), None)
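
The heart of the function above is the bracket stack: push on an opening bracket, pop on a closing bracket of the matching type, and error on a mismatch or anything left open at the end. A tiny sketch of that bookkeeping on plain characters, pairing brackets by index in the start/end sets (illustrative only):

def check_brackets(text, starts="([{", ends=")]}"):
    """Raise ValueError on mismatched or unclosed brackets."""
    stack = []
    for ch in text:
        if ch in starts:
            stack.append(ch)
        elif ch in ends:
            if not stack:
                raise ValueError(f"Unexpected end bracket: {ch!r}")
            # The closer must pair with the innermost opener by index.
            if ends.index(ch) != starts.index(stack[-1]):
                raise ValueError(f"Mismatched bracket: {ch!r} closes {stack[-1]!r}")
            stack.pop()
    if stack:
        raise ValueError(f"Unclosed brackets: {stack}")

# check_brackets("f(a[1], {b: 2})")  # fine
# check_brackets("f(a[1)")           # raises: ')' closes '['
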
Example #25
def test__parser__match_construct_from_empty():
    """Test construction of MatchResults from empty."""
    m = MatchResult.from_empty()
    assert len(m) == 0
Example #26
    def _look_ahead_match(cls, segments, matchers, parse_context):
        """Look ahead for matches beyond the first element of the segments list.

        This function also contains the performance improved hash-matching approach to
        searching for matches, which should significantly improve performance.

        Prioritise the first match, and if multiple match at the same point the longest.
        If two matches of the same length match at the same time, then it's the first in
        the iterable of matchers.

        Returns:
            `tuple` of (unmatched_segments, match_object, matcher).

        """
        parse_match_logging(
            cls.__name__,
            "_look_ahead_match",
            "IN",
            parse_context=parse_context,
            v_level=4,
            ls=len(segments),
            seg=LateBoundJoinSegmentsCurtailed(segments),
        )

        # Do some type munging
        matchers = list(matchers)
        if isinstance(segments, BaseSegment):
            segments = [segments]

        # Have we been passed an empty list?
        if len(segments) == 0:
            return ((), MatchResult.from_empty(), None)

        # Here we enable a performance optimisation. Most of the time in this
        # cycle is spent in loops looking for simple matchers, for which we
        # should be able to find a shortcut.
        # First, assess the matchers passed in: if any are "simple", then we
        # effectively use a hash lookup across the content of segments to
        # quickly evaluate whether the segment is present. Matchers which
        # aren't "simple" still take the slower route.
        _matchers = [
            (matcher, matcher.simple(parse_context=parse_context))
            for matcher in matchers
        ]
        simple_matchers = [matcher for matcher in _matchers if matcher[1]]
        non_simple_matchers = [matcher[0] for matcher in _matchers if not matcher[1]]
        best_simple_match = None
        if simple_matchers:
            # If they're all simple we can use a hash match to identify the first one.
            # Build a buffer of all the upper case raw segments ahead of us.
            str_buff = []
            # For existing compound segments, we should assume that within
            # that segment things are internally consistent. That means that
            # rather than enumerating all the individual segments of a longer
            # one, we just dump out the whole segment, splitting off the
            # first element separated by whitespace. This is a) faster and
            # also b) prevents some really horrible bugs with bracket matching.
            # See https://github.com/sqlfluff/sqlfluff/issues/433

            def _trim_elem(seg):
                s = seg.raw_upper.split(maxsplit=1)
                return s[0] if s else ""

            str_buff = [_trim_elem(seg) for seg in segments]
            match_queue = []

            for matcher, simple in simple_matchers:
                # Simple will be a tuple of options
                for simple_option in simple:
                    # NOTE: We use iter_indices to make sure we capture
                    # all instances of potential matches if there are many.
                    # This is important for bracket counting.
                    for buff_pos in iter_indices(str_buff, simple_option):
                        match_queue.append((matcher, buff_pos, simple_option))

            # Sort the match queue. First to process AT THE END.
            # That means we pop from the end.
            match_queue = sorted(match_queue, key=lambda x: x[1])

            parse_match_logging(
                cls.__name__,
                "_look_ahead_match",
                "SI",
                parse_context=parse_context,
                v_level=4,
                mq=match_queue,
                sb=str_buff,
            )

            while match_queue:
                # We've managed to match. We can shortcut home.
                # NB: We may still need to deal with whitespace.
                queued_matcher, queued_buff_pos, queued_option = match_queue.pop()
                # Here we do the actual transform to the new segment.
                match = queued_matcher.match(segments[queued_buff_pos:], parse_context)
                if not match:
                    # We had a simple-match hit, but the full match then excluded it.
                    # Log, then move on to the next item in the queue.
                    parse_match_logging(
                        cls.__name__,
                        "_look_ahead_match",
                        "NM",
                        parse_context=parse_context,
                        v_level=4,
                        _so=queued_option,
                    )
                    continue
                # Ok we have a match. Because we sorted the list, we'll take it!
                best_simple_match = (segments[:queued_buff_pos], match, queued_matcher)

        if not non_simple_matchers:
            # There are no other matchers, we can just shortcut now.

            parse_match_logging(
                cls.__name__,
                "_look_ahead_match",
                "SC",
                parse_context=parse_context,
                v_level=4,
                bsm=None
                if not best_simple_match
                else (
                    len(best_simple_match[0]),
                    len(best_simple_match[1]),
                    best_simple_match[2],
                ),
            )

            if best_simple_match:
                return best_simple_match
            else:
                return ((), MatchResult.from_unmatched(segments), None)

        # Make some buffers
        seg_buff = segments
        pre_seg_buff = ()  # NB: Tuple

        # Loop
        while True:
            # Do we have anything left to match on?
            if seg_buff:
                # Great, carry on.
                pass
            else:
                # We've got to the end without a match, return unmatched
                return ((), MatchResult.from_unmatched(segments), None)

            # We only check the NON-simple ones here for brevity.
            mat, m = cls._longest_trimmed_match(
                seg_buff,
                non_simple_matchers,
                parse_context=parse_context,
                trim_noncode=False,
            )

            if mat and not best_simple_match:
                return (pre_seg_buff, mat, m)
            elif mat:
                # We have both a non-simple and a simple match. Prefer whichever
                # starts earlier; on a tie, prefer the longer match, and then
                # the matcher which appears first in the list.
                pre_lengths = (len(pre_seg_buff), len(best_simple_match[0]))
                mat_lengths = (len(mat), len(best_simple_match[1]))
                mat_indexes = (matchers.index(m), matchers.index(best_simple_match[2]))
                if (
                    (pre_lengths[0] < pre_lengths[1])
                    or (
                        pre_lengths[0] == pre_lengths[1]
                        and mat_lengths[0] > mat_lengths[1]
                    )
                    or (
                        pre_lengths[0] == pre_lengths[1]
                        and mat_lengths[0] == mat_lengths[1]
                        and mat_indexes[0] < mat_indexes[1]
                    )
                ):
                    return (pre_seg_buff, mat, m)
                else:
                    return best_simple_match
            else:
                # If there aren't any matches, then advance the buffer and try again.
                # Two improvements:
                # 1) if we get as far as the first simple match, then return that.
                # 2) be eager in consuming non-code segments if allowed
                if best_simple_match and len(pre_seg_buff) >= len(best_simple_match[0]):
                    return best_simple_match

                pre_seg_buff += (seg_buff[0],)
                seg_buff = seg_buff[1:]
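
A small sketch of the "simple matcher" shortcut above: build an upper-cased buffer of the first word of each raw segment, scan it for every simple option, and keep the earliest hit. The helper and its return value are illustrative, not the library API:

def earliest_simple_hit(segment_raws, simple_options):
    """Return (position, option) of the earliest hit, or None."""
    # Only the first whitespace-separated word of each raw is considered,
    # mirroring _trim_elem above.
    buff = [raw.upper().split(maxsplit=1)[0] if raw.strip() else "" for raw in segment_raws]
    hits = [
        (pos, option)
        for option in simple_options
        for pos, word in enumerate(buff)
        if word == option
    ]
    return min(hits) if hits else None  # earliest position wins

# earliest_simple_hit(["select", "a", "from t", "where"], ("FROM", "WHERE"))
#   -> (2, "FROM")
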
Example #27
    def _longest_trimmed_match(
        cls,
        segments: Tuple["BaseSegment", ...],
        matchers: List["MatchableType"],
        parse_context: ParseContext,
        trim_noncode=True,
    ) -> Tuple[MatchResult, Optional["MatchableType"]]:
        """Return longest match from a selection of matchers.

        Prioritise the first match, and if multiple match at the same point the longest.
        If two matches of the same length match at the same time, then it's the first in
        the iterable of matchers.

        Returns:
            `tuple` of (match_object, matcher).

        """
        # Have we been passed an empty list?
        if len(segments) == 0:
            return MatchResult.from_empty(), None

        # If gaps are allowed, trim the ends.
        if trim_noncode:
            pre_nc, segments, post_nc = trim_non_code_segments(segments)

        best_match_length = 0
        # iterate at this position across all the matchers
        for matcher in matchers:
            # MyPy seems to require a type hint here. Not quite sure why.
            res_match: MatchResult = matcher.match(
                segments, parse_context=parse_context
            )
            if res_match.is_complete():
                # Just return it! (WITH THE RIGHT OTHER STUFF)
                if trim_noncode:
                    return (
                        MatchResult.from_matched(
                            pre_nc + res_match.matched_segments + post_nc
                        ),
                        matcher,
                    )
                else:
                    return res_match, matcher
            elif res_match:
                # We've got an incomplete match, if it's the best so far keep it.
                if res_match.matched_length > best_match_length:
                    best_match = res_match, matcher
                    best_match_length = res_match.matched_length

        # If we get here, then there wasn't a complete match. If we
        # have a best_match, return that.
        if best_match_length > 0:
            if trim_noncode:
                return (
                    MatchResult(
                        pre_nc + best_match[0].matched_segments,
                        best_match[0].unmatched_segments + post_nc,
                    ),
                    best_match[1],
                )
            else:
                return best_match
        # If no match at all, return nothing
        return MatchResult.from_unmatched(segments), None
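
The selection rule above is: return the first *complete* match immediately, otherwise keep the longest partial match, otherwise return unmatched. A compact sketch with matchers modelled as callables returning (matched, unmatched) token tuples (illustrative only):

def longest_match(tokens, matchers):
    """Try each matcher in order; first complete match wins, else the longest."""
    best = ((), tuple(tokens))  # default: nothing matched
    for matcher in matchers:
        matched, unmatched = matcher(tuple(tokens))
        if matched and not unmatched:
            return matched, unmatched  # complete match: return immediately
        if len(matched) > len(best[0]):
            best = (matched, unmatched)  # best incomplete match so far
    return best
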
Example #28
    def _bracket_sensitive_look_ahead_match(cls,
                                            segments,
                                            matchers,
                                            parse_context,
                                            start_bracket=None,
                                            end_bracket=None):
        """Same as `_look_ahead_match` but with bracket counting.

        NB: Given we depend on `_look_ahead_match` we can also utilise
        the same performance optimisations which are implemented there.

        Returns:
            `tuple` of (unmatched_segments, match_object, matcher).

        """
        # Type munging
        matchers = list(matchers)
        if isinstance(segments, BaseSegment):
            segments = [segments]

        # Have we been passed an empty list?
        if len(segments) == 0:
            return ((), MatchResult.from_unmatched(segments), None)

        # Get hold of the bracket matchers from the dialect, and append them
        # to the list of matchers. We get them from the relevant set on the
        # dialect. We use zip twice to "unzip" them. We ignore the first
        # argument because that's just the name.
        _, start_bracket_refs, end_bracket_refs, definitely_bracket = zip(
            *parse_context.dialect.sets("bracket_pairs"))
        # These are currently strings which need rehydrating
        start_brackets = [
            parse_context.dialect.ref(seg_ref)
            for seg_ref in start_bracket_refs
        ]
        end_brackets = [
            parse_context.dialect.ref(seg_ref) for seg_ref in end_bracket_refs
        ]
        start_definite = list(definitely_bracket)
        end_definite = list(definitely_bracket)
        # Add any bracket-like things passed as arguments
        if start_bracket:
            start_brackets += [start_bracket]
            start_definite += [True]
        if end_bracket:
            end_brackets += [end_bracket]
            end_definite += [True]
        bracket_matchers = start_brackets + end_brackets

        # Make some buffers
        seg_buff = segments
        pre_seg_buff = ()  # NB: Tuple
        bracket_stack: List[BracketInfo] = []

        # Iterate
        while True:
            # Do we have anything left to match on?
            if seg_buff:
                # Yes we have buffer left to work with.
                # Are we already in a bracket stack?
                if bracket_stack:
                    # Yes, we're just looking for the closing bracket, or
                    # another opening bracket.
                    pre, match, matcher = cls._look_ahead_match(
                        seg_buff,
                        bracket_matchers,
                        parse_context=parse_context,
                    )

                    if match:
                        # NB: We can only consider this as a nested bracket if the start
                        # and end tokens are not the same. If a matcher is both a start and
                        # end token we cannot deepen the bracket stack. In general, quoted
                        # strings are a typical example where the start and end tokens are
                        # the same. Currently, though, quoted strings are handled elsewhere
                        # in the parser, and there are no cases where *this* code has to
                        # handle identical start and end brackets. For now, consider this
                        # a small, speculative investment in a possible future requirement.
                        if matcher in start_brackets and matcher not in end_brackets:
                            # Same procedure as below in finding brackets.
                            bracket_stack.append(
                                BracketInfo(
                                    bracket=match.matched_segments[0],
                                    is_definite=start_definite[
                                        start_brackets.index(matcher)],
                                ))
                            pre_seg_buff += pre
                            pre_seg_buff += match.matched_segments
                            seg_buff = match.unmatched_segments
                            continue
                        elif matcher in end_brackets:
                            # Found an end bracket. Does its type match that of
                            # the innermost start bracket? (E.g. ")" matches "(",
                            # "]" matches "[".)
                            start_index = start_brackets.index(
                                type(bracket_stack[-1].bracket))
                            end_index = end_brackets.index(matcher)
                            bracket_types_match = start_index == end_index
                            if bracket_types_match:
                                # Yes, the types match. So we've found a
                                # matching end bracket. Pop the stack and carry
                                # on.
                                bracket_stack.pop()
                                pre_seg_buff += pre
                                pre_seg_buff += match.matched_segments
                                seg_buff = match.unmatched_segments
                                continue
                            else:
                                # The types don't match. Check whether the end
                                # bracket is a definite bracket.
                                end_is_definite = end_definite[end_index]
                                if not end_is_definite:
                                    # The end bracket whose type didn't match
                                    # the innermost open bracket is not
                                    # definite. Assume it's not a bracket and
                                    # carry on.
                                    pre_seg_buff += pre
                                    pre_seg_buff += match.matched_segments
                                    seg_buff = match.unmatched_segments
                                else:
                                    # Definite end bracket does not match the
                                    # innermost start bracket. Was the innermost
                                    # start bracket definite? If yes, error. If
                                    # no, assume it was not a bracket.
                                    # Can we remove any brackets from the stack which aren't definites
                                    # to resolve the issue?
                                    for idx in range(
                                            len(bracket_stack) - 1, -1, -1):
                                        if not bracket_stack[idx].is_definite:
                                            del bracket_stack[idx]
                                            # We don't change the string buffer, we assume that was ok.
                                            break
                                    else:
                                        raise SQLParseError(
                                            f"Found unexpected end bracket!, was expecting {end_brackets[start_index]}, but got {matcher}",
                                            segment=match.matched_segments[0],
                                        )

                        else:
                            raise RuntimeError(
                                "I don't know how we get here?!")
                    else:
                        # No match, we're in a bracket stack. Either this is an error,
                        # OR we were mistaken in our initial identification of the opening
                        # bracket. That's only allowed if `not definitely_bracket`.

                        # Can we remove any brackets from the stack which aren't definites
                        # to resolve the issue?
                        for idx, elem in enumerate(reversed(bracket_stack)):
                            if not elem.is_definite:
                                # NB: idx counts from the end of the stack.
                                del bracket_stack[-(idx + 1)]
                                # We don't change the string buffer, we assume that was ok.
                                break
                        else:
                            # No we can't. We don't have a match and we're in a bracket stack.
                            raise SQLParseError(
                                "Couldn't find closing bracket for opening bracket.",
                                segment=bracket_stack[-1].bracket,
                            )
                        # We have attempted a potential solution to the problem. Loop around.
                        continue
                else:
                    # No, we're open to more opening brackets or the thing(s)
                    # that we're otherwise looking for.
                    pre, match, matcher = cls._look_ahead_match(
                        seg_buff,
                        matchers + bracket_matchers,
                        parse_context=parse_context,
                    )

                    if match:
                        if matcher in matchers:
                            # It's one of the things we were looking for!
                            # Return.
                            return (pre_seg_buff + pre, match, matcher)
                        elif matcher in start_brackets:
                            # We've found the start of a bracket segment.
                            # NB: It might not *actually* be the bracket itself,
                            # but could be some non-code element preceding it.
                            # That's actually ok.

                            # Add the bracket to the stack.
                            bracket_stack.append(
                                BracketInfo(
                                    bracket=match.matched_segments[0],
                                    is_definite=start_definite[
                                        start_brackets.index(matcher)],
                                ))
                            # Add the matched elements and anything before it to the
                            # pre segment buffer. Reset the working buffer.
                            pre_seg_buff += pre
                            pre_seg_buff += match.matched_segments
                            seg_buff = match.unmatched_segments
                            continue
                        elif matcher in end_brackets:
                            # Look up whether this end bracket is a "definite" bracket.
                            bracket_is_definite = end_definite[
                                end_brackets.index(matcher)]
                            if bracket_is_definite:
                                # We've found an unexpected end bracket!
                                raise SQLParseError(
                                    f"Found unexpected end bracket!, was expecting one of: {matchers + bracket_matchers}, but got {matcher}",
                                    segment=match.matched_segments[0],
                                )
                            pre_seg_buff += pre
                            pre_seg_buff += match.matched_segments
                            seg_buff = match.unmatched_segments
                            continue
                        else:
                            # This shouldn't happen!?
                            raise NotImplementedError(
                                "This shouldn't happen. Panic in _bracket_sensitive_look_ahead_match."
                            )
                    else:
                        # Not in a bracket stack, but no match. This is a happy
                        # unmatched exit.
                        return ((), MatchResult.from_unmatched(segments), None)
            else:
                # No, we're at the end.
                # Now check: have we closed all our brackets?
                if bracket_stack:
                    # No we haven't.
                    # Check that the unclosed brackets are definite
                    definite_bracket_stack = [
                        b for b in bracket_stack if b.is_definite
                    ]
                    if definite_bracket_stack:
                        raise SQLParseError(
                            f"Couldn't find closing bracket for opened brackets: `{bracket_stack}`.",
                            segment=bracket_stack[-1].bracket,
                        )

                # We're at the end, with no definite brackets left open. This is a
                # friendly unmatched return.
                return ((), MatchResult.from_unmatched(segments), None)
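The bracket handling above is easier to follow in miniature. Below is a small, self-contained sketch of the same idea: scan forward for a target token, push opening brackets onto a stack, pop them on closing brackets, and only report a hit at bracket depth zero. The function name and the string tokens are made up for illustration; this is not the grammar's real implementation.

def find_outside_brackets(tokens, target, start="(", end=")"):
    """Return the index of `target` at bracket depth zero, or None."""
    stack = []  # indexes of currently open brackets
    for idx, tok in enumerate(tokens):
        if tok == start:
            stack.append(idx)
        elif tok == end:
            if not stack:
                raise ValueError(f"Unexpected closing bracket at index {idx}")
            stack.pop()
        elif tok == target and not stack:
            # Only a match at depth zero counts.
            return idx
    if stack:
        raise ValueError(f"Unclosed bracket opened at index {stack[-1]}")
    return None

# The comma inside the brackets is skipped; only the outer one is reported.
print(find_outside_brackets(["a", "(", "b", ",", "c", ")", ",", "d"], ","))  # 6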
Example #29
    def match(self, segments: Tuple[BaseSegment, ...],
              parse_context: ParseContext) -> MatchResult:
        """Match an arbitrary number of elements separated by a delimiter.

        Note that if there are multiple elements passed in, they will be treated
        as different options of what can be delimited, rather than a sequence.
        """
        # Have we been passed an empty list?
        if len(segments) == 0:
            return MatchResult.from_empty()

        # Make some buffers
        seg_buff = segments
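        # matched_segments accumulates everything we've matched so far.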
        matched_segments = MatchResult.from_empty()
        # delimiters is a list of tuples containing delimiter segments as we find them.
        delimiters: List[BaseSegment] = []

        # First iterate through all the segments, looking for the delimiter.
        # Second, split the list on each of the delimiters, and ensure that
        # each sublist in turn matches one of the elements.

        # In more detail, match against delimiter, if we match, put a slice
        # up to that point onto a list of slices. Carry on.
        while True:
            # Check to see whether we've exhausted the buffer, either by iterating through it,
            # or by consuming all the non-code segments already.
            # NB: If we're here then we've already tried matching the remaining segments against
            # the content, so we must be in a trailing case.
            if len(seg_buff) == 0:
                # Append the remaining buffer in case we're in the not is_code case.
                matched_segments += seg_buff
                # Nothing left, this is potentially a trailing case?
                if self.allow_trailing and (
                        self.min_delimiters is None
                        or len(delimiters) >= self.min_delimiters):
                    # It is! (nothing left so no unmatched segments to append)
                    return MatchResult.from_matched(
                        matched_segments.matched_segments)
                else:
                    return MatchResult.from_unmatched(segments)

            # We rely on _bracket_sensitive_look_ahead_match to do the bracket counting
            # element of this now. We look ahead to find a delimiter or terminator.
            matchers = [self.delimiter]
            if self.terminator:
                matchers.append(self.terminator)
            # If gaps aren't allowed, a gap (or non-code segment) acts like a terminator.
            if not self.allow_gaps:
                matchers.append(NonCodeMatcher())

            with parse_context.deeper_match() as ctx:
                (
                    pre_content,
                    delimiter_match,
                    delimiter_matcher,
                ) = self._bracket_sensitive_look_ahead_match(
                    seg_buff,
                    matchers,
                    parse_context=ctx,
                )
            # Keep track of the *length* of this pre-content section before we start
            # to change it later. We need this for dealing with terminators.
            pre_content_len = len(pre_content)

            # Have we found a delimiter or terminator looking forward?
            if delimiter_match:
                if delimiter_matcher is self.delimiter:
                    # Yes. Store it and then match the contents up to now.
                    delimiters.append(delimiter_match.matched_segments)
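                    # NB: Only the *number* of delimiters is used later (for the
                    # min_delimiters checks), so storing the whole matched tuple is fine.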

                # We now test the intervening section as to whether it matches one
                # of the things we're looking for. NB: If it's of zero length then
                # we return without trying it.
                if len(pre_content) > 0:
                    with parse_context.deeper_match() as ctx:
                        match, matcher = self._longest_trimmed_match(
                            segments=pre_content,
                            matchers=self._elements,
                            parse_context=ctx,
                            trim_noncode=self.allow_gaps,
                        )

                    # No match, or an incomplete match: Not allowed
                    if not match or not match.is_complete():
                        return MatchResult.from_unmatched(segments)

                    # We have a complete match!

                    # First add the segment up to the delimiter to the matched segments
                    matched_segments += match.matched_segments
                    # Then it depends what we matched.
                    # Delimiter
                    if delimiter_matcher is self.delimiter:
                        # Then add the delimiter to the matched segments
                        matched_segments += delimiter_match.matched_segments
                        # Then move on to the next iteration of the loop, looking for the next delimiter.
                        seg_buff = delimiter_match.unmatched_segments
                        # Still got some buffer left. Carry on.
                        continue
                    # Terminator (or the gap terminator).
                    elif delimiter_matcher is self.terminator or isinstance(
                            delimiter_matcher, NonCodeMatcher):
                        # We just return straight away here. We don't add the terminator to
                        # this match, it should go with the unmatched parts. The terminator
                        # may also have mutated the returned segments so we also DON'T want
                        # the mutated version, it can do that itself (so we return `seg_buff`
                        # and not `delimiter_match.all_segments()`).

                        # First check we've had enough delimiters
                        if (self.min_delimiters
                                and len(delimiters) < self.min_delimiters):
                            return MatchResult.from_unmatched(segments)
                        else:
                            return MatchResult(
                                matched_segments.matched_segments,
                                # Return the part of the seg_buff which isn't in the
                                # pre-content.
                                seg_buff[pre_content_len:],
                            )
                    else:
                        raise RuntimeError((
                            "I don't know how I got here. Matched instead on {0}, which "
                            "doesn't appear to be delimiter or terminator"
                        ).format(delimiter_matcher))
                else:
                    # Zero length section between delimiters, or zero code
                    # elements if appropriate. Return unmatched.
                    return MatchResult.from_unmatched(segments)
            else:
                # No match for a delimiter looking forward, this means we're
                # at the end. In this case we look for a potential partial match
                # looking forward. We know it's a non-zero length section because
                # we checked that up front.

                # First check we've had enough delimiters, because if we haven't
                # then there's no point trying to match.
                if self.min_delimiters and len(
                        delimiters) < self.min_delimiters:
                    return MatchResult.from_unmatched(segments)

                # We use the longest trimmed match here, hoovering up surrounding
                # non-code if gaps are allowed. We don't care which element matches.
                with parse_context.deeper_match() as ctx:
                    mat, _ = self._longest_trimmed_match(
                        seg_buff,
                        self._elements,
                        parse_context=ctx,
                        trim_noncode=self.allow_gaps,
                    )
                if mat:
                    # We've got something at the end. Return!
                    if mat.unmatched_segments:
                        # We have something unmatched, so let it also have the trailing elements.
                        return MatchResult(
                            matched_segments.matched_segments +
                            mat.matched_segments,
                            mat.unmatched_segments,
                        )
                    else:
                        # If there's nothing unmatched in the most recent match, then
                        # we can consume the trailing non-code segments.
                        return MatchResult.from_matched(
                            matched_segments.matched_segments +
                            mat.matched_segments, )
                else:
                    # No match at the end, are we allowed to trail? If we are then return,
                    # otherwise we fail because we can't match the last element.
                    if self.allow_trailing:
                        return MatchResult(matched_segments.matched_segments,
                                           seg_buff)
                    else:
                        return MatchResult.from_unmatched(segments)
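Stripped of segments, parse contexts and bracket handling, the delimited-matching loop above reduces to: split on the delimiter, check the delimiter count, optionally allow a trailing delimiter, and require every chunk to match an element. The sketch below is an illustration under those assumptions; the names allow_trailing and min_delimiters mirror the options used above, but this is not the library's API.

def match_delimited(tokens, element, delimiter=",",
                    allow_trailing=False, min_delimiters=None):
    """Return True if `tokens` is a `delimiter`-separated list of `element`."""
    chunks, current, n_delimiters = [], [], 0
    for tok in tokens:
        if tok == delimiter:
            chunks.append(current)
            current = []
            n_delimiters += 1
        else:
            current.append(tok)
    chunks.append(current)

    # A trailing delimiter leaves an empty final chunk.
    if chunks[-1] == []:
        if not allow_trailing:
            return False
        chunks.pop()

    if min_delimiters is not None and n_delimiters < min_delimiters:
        return False

    # Every chunk must be exactly one matching element.
    return all(chunk == [element] for chunk in chunks)

print(match_delimited(["a", ",", "a", ",", "a"], "a"))                  # True
print(match_delimited(["a", ",", "a", ","], "a"))                       # False
print(match_delimited(["a", ",", "a", ","], "a", allow_trailing=True))  # True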
Example #30
def test__parser__match_add_raises(fail_case, raw_seg):
    """Test construction of MatchResults."""
    m1 = MatchResult.from_matched([raw_seg])
    # Test adding, and check we get an exception of the right type
    with pytest.raises(TypeError):
        m1 + fail_case
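For context, the behaviour this test asserts is simply that adding an unsupported type to a match result raises TypeError. A toy version of that kind of guard might look like the following; the class name and the exact checks are invented for illustration and are not MatchResult's actual implementation.

class ToyMatchResult:
    """A minimal stand-in showing the kind of type guard the test exercises."""

    def __init__(self, matched):
        self.matched = tuple(matched)

    def __add__(self, other):
        # Combine with another result or a plain tuple of segments...
        if isinstance(other, ToyMatchResult):
            return ToyMatchResult(self.matched + other.matched)
        if isinstance(other, tuple):
            return ToyMatchResult(self.matched + other)
        # ...and refuse anything else, which is what pytest.raises(TypeError) checks for.
        raise TypeError(
            f"Unexpected type passed to ToyMatchResult.__add__: {type(other)}"
        )

try:
    ToyMatchResult(["seg"]) + 1
except TypeError as exc:
    print(exc)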