def match(
    self, segments: Tuple["BaseSegment", ...], parse_context: "ParseContext"
) -> "MatchResult":
    """Match a list of segments against the element this reference resolves to.

    Matching can be done from either the raw or the segments.
    This raw function can be overridden, or a grammar defined
    on the underlying class.

    The match element of Ref also implements caching using the
    parse_context `denylist` methods: a failed match is recorded under this
    reference's name together with the identities of the segments it was
    tried against, so the same attempt can be skipped later.

    Args:
        segments: The segments to match against.
        parse_context: The current parse context. Its `dialect` is used to
            resolve the referenced element and its `denylist` is the cache.

    Returns:
        An unmatched result if the `exclude` grammar matches or if the
        denylist reports this attempt as already tried; otherwise whatever
        the referenced element's own `match` returns.
    """
    elem = self._get_elem(dialect=parse_context.dialect)

    # First if we have an *exclude* option, we should check that
    # which would prevent the rest of this grammar from matching.
    if self.exclude:
        with parse_context.deeper_match() as ctx:
            if self.exclude.match(segments, parse_context=ctx):
                return MatchResult.from_unmatched(segments)

    # First check against the efficiency Cache.
    # We rely on segments not being mutated within a given
    # match cycle and so the ids should continue to refer to unchanged
    # objects.
    # NB: This must be a real tuple. A generator expression compares and
    # hashes by object identity, so a key built from one can never equal the
    # key built by a later call and the cache would never register a hit.
    seg_tuple = tuple(id(seg) for seg in segments)
    self_name = self._get_ref()
    if parse_context.denylist.check(self_name, seg_tuple):  # pragma: no cover TODO?
        # This has been tried before.
        parse_match_logging(
            self.__class__.__name__,
            "match",
            "SKIP",
            parse_context=parse_context,
            v_level=3,
            self_name=self_name,
        )
        return MatchResult.from_unmatched(segments)

    # Match against that. NB We're not incrementing the match_depth here.
    # References shouldn't really count as a depth of match.
    with parse_context.matching_segment(self_name) as ctx:
        resp = elem.match(segments=segments, parse_context=ctx)
    if not resp:
        # Remember the failure so that the same attempt can be skipped.
        parse_context.denylist.mark(self_name, seg_tuple)
    return resp
def match(self, segments, parse_context):
    """Match a list of segments against the element this reference resolves to.

    Matching can be done from either the raw or the segments.
    This raw function can be overridden, or a grammar defined
    on the underlying class.

    The match element of Ref also implements caching using the
    parse_context `blacklist` methods: a failed match is recorded under this
    reference's name together with the identities of the segments it was
    tried against, so the same attempt can be skipped later.

    Args:
        segments: The segments to match against.
        parse_context: The current parse context. Its `dialect` is used to
            resolve the referenced element and its `blacklist` is the cache.

    Returns:
        An unmatched result if the blacklist reports this attempt as already
        tried; otherwise whatever the referenced element's `match` returns.

    Raises:
        ValueError: If the reference resolves to a falsy element.
    """
    elem = self._get_elem(dialect=parse_context.dialect)

    if not elem:
        raise ValueError(
            "Null Element returned! _elements: {0!r}".format(self._elements)
        )

    # First check against the efficiency Cache.
    # We used to use seg_to_tuple here, but it was too slow,
    # so instead we rely on segments not being mutated within a given
    # match cycle and so the ids should continue to refer to unchanged
    # objects.
    # NB: This must be a real tuple. A generator expression compares and
    # hashes by object identity, so a key built from one can never equal the
    # key built by a later call and the cache would never register a hit.
    seg_tuple = tuple(id(seg) for seg in segments)
    self_name = self._get_ref()
    if parse_context.blacklist.check(self_name, seg_tuple):
        # This has been tried before.
        parse_match_logging(
            self.__class__.__name__,
            "match",
            "SKIP",
            parse_context=parse_context,
            v_level=3,
            self_name=self_name,
        )
        return MatchResult.from_unmatched(segments)

    # Match against that. NB We're not incrementing the match_depth here.
    # References shouldn't really count as a depth of match.
    with parse_context.matching_segment(self_name) as ctx:
        resp = elem.match(segments=segments, parse_context=ctx)
    if not resp:
        # Remember the failure so that the same attempt can be skipped.
        parse_context.blacklist.mark(self_name, seg_tuple)
    return resp
def match(
    cls, segments: Tuple["BaseSegment", ...], parse_context: ParseContext
) -> MatchResult:
    """Match a sequence of segments against this segment class.

    Note: Match for segments is done in the ABSTRACT.
    When dealing with concrete then we're always in parse.
    Parse is what happens during expand.

    Matching can be done from either the raw or the segments.
    This raw function can be overridden, or a grammar defined
    on the underlying class.

    Returns:
        A match result. If the first segment is already an instance of this
        class it is reported as matched as-is. Otherwise the class's
        `match_grammar` is applied and, on success, the matched segments are
        wrapped in a new instance of this class.

    Raises:
        TypeError: If the grammar returns something other than a MatchResult.
        NotImplementedError: If the class defines no `match_grammar`.
    """
    # Edge case: we may have *already matched* on a previous cycle, in which
    # case the leading segment is already an instance of this class.
    leading_is_self = len(segments) >= 1 and isinstance(segments[0], cls)
    if leading_is_self:
        parse_match_logging(
            cls.__name__,
            "_match",
            "SELF",
            parse_context=parse_context,
            v_level=3,
            symbol="+++",
        )
        if len(segments) == 1:
            # The whole input is the previously matched segment.
            return MatchResult.from_matched(segments)
        # Previously matched, but only partially: the rest stays unmatched.
        return MatchResult((segments[0],), segments[1:])

    if not cls.match_grammar:
        raise NotImplementedError(
            "{0} has no match function implemented".format(cls.__name__)
        )

    # Delegate to the grammar, one match level deeper.
    with parse_context.deeper_match() as ctx:
        grammar_result = cls.match_grammar.match(
            segments=segments, parse_context=ctx
        )

    # Everything below relies on the MatchResult interface, so insist on it.
    if not isinstance(grammar_result, MatchResult):
        raise TypeError(
            "[PD:{0} MD:{1}] {2}.match. Result is {3}, not a MatchResult!".format(
                parse_context.parse_depth,
                parse_context.match_depth,
                cls.__name__,
                type(grammar_result),
            )
        )

    if not grammar_result.has_match():
        return MatchResult.from_unmatched(segments)

    # Wrap what the grammar matched in an instance of this segment class.
    wrapped = cls(segments=grammar_result.matched_segments)
    return MatchResult((wrapped,), grammar_result.unmatched_segments)
def _bracket_sensitive_look_ahead_match(
    cls,
    segments,
    matchers,
    parse_context,
    start_bracket=None,
    end_bracket=None,
    bracket_pairs_set="bracket_pairs",
):
    """Same as `_look_ahead_match` but with bracket counting.

    NB: Given we depend on `_look_ahead_match` we can also utilise
    the same performance optimisations which are implemented there.

    bracket_pairs_set: Allows specific segments to override the available
        bracket pairs. See the definition of "angle_bracket_pairs" in the
        BigQuery dialect for additional context on why this exists.

    Returns:
        `tuple` of (segments preceding the match, match_object, matcher).
        When nothing is found this is an empty tuple, an unmatched result
        built from the input segments, and `None`.

    Raises:
        SQLParseError: If an end bracket of the wrong type closes an open
            bracket, or if an opened bracket is never closed.
    """
    # Type munging
    matchers = list(matchers)
    if isinstance(segments, BaseSegment):
        segments = [segments]

    # Have we been passed an empty list?
    if len(segments) == 0:
        return ((), MatchResult.from_unmatched(segments), None)

    # Fetch the bracket pairs from the relevant set on the dialect and
    # "unzip" them. The first field is just the name, so it is discarded.
    _, start_bracket_refs, end_bracket_refs = zip(
        *parse_context.dialect.sets(bracket_pairs_set)
    )
    # These are matchables, probably StringParsers.
    dialect_ref = parse_context.dialect.ref
    start_brackets = [dialect_ref(seg_ref) for seg_ref in start_bracket_refs]
    end_brackets = [dialect_ref(seg_ref) for seg_ref in end_bracket_refs]
    # Add any bracket-like things passed as arguments
    if start_bracket:
        start_brackets += [start_bracket]
    if end_bracket:
        end_brackets += [end_bracket]
    bracket_matchers = start_brackets + end_brackets

    # Make some buffers
    seg_buff = segments
    pre_seg_buff = ()  # NB: Tuple
    bracket_stack: List[BracketInfo] = []

    while True:
        # Nothing left to look at?
        if not seg_buff:
            if bracket_stack:
                # We ran out of input with brackets still open.
                raise SQLParseError(
                    "Couldn't find closing bracket for opened brackets: "
                    f"`{bracket_stack}`.",
                    segment=bracket_stack[-1].bracket,
                )
            # We reached the end with no open brackets. This is a friendly
            # unmatched return.
            return ((), MatchResult.from_unmatched(segments), None)

        # Inside brackets we only care about brackets. Outside, we also look
        # for the things we were actually asked to find.
        in_brackets = bool(bracket_stack)
        candidates = bracket_matchers if in_brackets else matchers + bracket_matchers
        pre, match, matcher = cls._look_ahead_match(
            seg_buff,
            candidates,
            parse_context=parse_context,
        )

        if in_brackets:
            if not match:
                # No match, we're in a bracket stack. Error.
                raise SQLParseError(
                    "Couldn't find closing bracket for opening bracket.",
                    segment=bracket_stack[-1].bracket,
                )

            # NB: We can only consider this as a nested bracket if the start
            # and end tokens are not the same. If a matcher is both a start
            # and end token we cannot deepen the bracket stack. In general,
            # quoted strings are a typical example where the start and end
            # tokens are the same. Currently, though, quoted strings are
            # handled elsewhere in the parser, and there are no cases where
            # *this* code has to handle identical start and end brackets.
            # For now, consider this a small, speculative investment in a
            # possible future requirement.
            if matcher in start_brackets and matcher not in end_brackets:
                # A nested opening bracket: go one level deeper.
                bracket_stack.append(
                    BracketInfo(
                        bracket=match.matched_segments[0],
                    )
                )
            elif matcher in end_brackets:
                # Found an end bracket. Does its type match that of the
                # innermost start bracket? E.g. ")" matches "(", "]" matches
                # "[". For the start bracket we don't have the matcher but we
                # can work out the name, so we use that for the lookup.
                start_names = [bracket.name for bracket in start_brackets]
                start_index = start_names.index(bracket_stack[-1].bracket.name)
                # For the end index, we can just look for the matcher.
                end_index = end_brackets.index(matcher)
                if start_index != end_index:
                    # The types don't match. Error.
                    raise SQLParseError(
                        "Found unexpected end bracket!, was expecting "
                        f"{end_brackets[start_index]}, but got {matcher}",
                        segment=match.matched_segments[0],
                    )
                # The types match, so the innermost bracket is now closed.
                bracket_stack.pop()
            else:
                raise RuntimeError("I don't know how we get here?!")
        else:
            if not match:
                # Not in a bracket stack, but no match. This is a happy
                # unmatched exit.
                return ((), MatchResult.from_unmatched(segments), None)

            if matcher in matchers:
                # It's one of the things we were looking for! Return.
                return (pre_seg_buff + pre, match, matcher)

            if matcher in start_brackets:
                # We've found the start of a bracket segment.
                # NB: It might not *actually* be the bracket itself,
                # but could be some non-code element preceding it.
                # That's actually ok.
                bracket_stack.append(
                    BracketInfo(
                        bracket=match.matched_segments[0],
                    )
                )
            elif matcher in end_brackets:
                # We've found an unexpected end bracket! This is likely
                # because we're matching a section which should have ended.
                # If we had a match, it would have matched by now, so this
                # means no match.
                parse_match_logging(
                    cls.__name__,
                    "_bracket_sensitive_look_ahead_match",
                    "UEXB",
                    parse_context=parse_context,
                    v_level=3,
                    got=matcher,
                )
                return ((), MatchResult.from_unmatched(segments), None)
            else:
                # This shouldn't happen!?
                raise NotImplementedError(
                    "This shouldn't happen. Panic in _bracket_sensitive_look_ahead_match."
                )

        # A bracket was opened or closed: move it, and everything before it,
        # into the pre segment buffer and carry on with the remainder.
        pre_seg_buff = pre_seg_buff + pre + match.matched_segments
        seg_buff = match.unmatched_segments
def _look_ahead_match(cls, segments, matchers, parse_context):
    """Look ahead for matches beyond the first element of the segments list.

    This function also contains the performance improved hash-matching approach to
    searching for matches, which should significantly improve performance.

    Prioritise the first match. For "simple" matchers competing at the same
    position, the first in the iterable of matchers wins. When a non-simple
    match competes with the best simple match, the earlier one wins, then
    the longer one, then the one that is first in the iterable of matchers.

    Returns:
        `tuple` of (segments preceding the match, match_object, matcher).
        When nothing is found the first item is an empty tuple and the
        matcher is `None`.
    """
    parse_match_logging(
        cls.__name__,
        "_look_ahead_match",
        "IN",
        parse_context=parse_context,
        v_level=4,
        ls=len(segments),
        seg=LateBoundJoinSegmentsCurtailed(segments),
    )

    # Do some type munging
    matchers = list(matchers)
    if isinstance(segments, BaseSegment):
        segments = [segments]

    # Have we been passed an empty list?
    if len(segments) == 0:
        return ((), MatchResult.from_empty(), None)

    # Here we enable a performance optimisation. Most of the time in this cycle
    # happens in loops looking for simple matchers which we should
    # be able to find a shortcut for.
    # First: Assess the matchers passed in, if any are
    # "simple", then we effectively use a hash lookup across the
    # content of segments to quickly evaluate if the segment is present.
    # Matchers which aren't "simple" still take a slower route.
    _matchers = [
        (matcher, matcher.simple(parse_context=parse_context))
        for matcher in matchers
    ]
    simple_matchers = [pair for pair in _matchers if pair[1]]
    non_simple_matchers = [pair[0] for pair in _matchers if not pair[1]]
    best_simple_match = None
    if simple_matchers:
        # If they're all simple we can use a hash match to identify the first one.
        # Build a buffer of all the upper case raw segments ahead of us.

        # For existing compound segments, we should assume that within
        # that segment, things are internally consistent, that means
        # rather than enumerating all the individual segments of a longer
        # one we just dump out the whole segment, but splitting off the
        # first element separated by whitespace. This is a) faster and
        # also b) prevents some really horrible bugs with bracket matching.
        # See https://github.com/sqlfluff/sqlfluff/issues/433
        def _trim_elem(seg):
            s = seg.raw_upper.split(maxsplit=1)
            return s[0] if s else ""

        str_buff = [_trim_elem(seg) for seg in segments]

        match_queue = []
        for matcher, simple in simple_matchers:
            # Simple will be a tuple of options
            for simple_option in simple:
                # NOTE: We use iter_indices to make sure we capture
                # all instances of potential matches if there are many.
                # This is important for bracket counting.
                for buff_pos in iter_indices(str_buff, simple_option):
                    match_queue.append((matcher, buff_pos, simple_option))

        # Sort the match queue by buffer position, earliest first. The sort
        # is stable, so candidates at the same position keep the order of
        # the matchers they came from.
        match_queue = sorted(match_queue, key=lambda x: x[1])

        parse_match_logging(
            cls.__name__,
            "_look_ahead_match",
            "SI",
            parse_context=parse_context,
            v_level=4,
            mq=match_queue,
            sb=str_buff,
        )

        # Walk the candidates from the earliest position and stop at the
        # first one which really matches. Nothing later in the queue can
        # beat it, so there is no point in running those (potentially
        # expensive) matches as well.
        for queued_matcher, queued_buff_pos, queued_option in match_queue:
            # Here we do the actual transform to the new segment.
            # NB: We may still need to deal with whitespace.
            match = queued_matcher.match(segments[queued_buff_pos:], parse_context)
            if not match:
                # We've had something match in simple matching, but then later excluded.
                # Log but then move on to the next item on the list.
                parse_match_logging(
                    cls.__name__,
                    "_look_ahead_match",
                    "NM",
                    parse_context=parse_context,
                    v_level=4,
                    _so=queued_option,
                )
                continue
            # Ok we have a match. Because we sorted the list, we'll take it!
            best_simple_match = (segments[:queued_buff_pos], match, queued_matcher)
            break

    if not non_simple_matchers:
        # There are no other matchers, we can just shortcut now.
        parse_match_logging(
            cls.__name__,
            "_look_ahead_match",
            "SC",
            parse_context=parse_context,
            v_level=4,
            bsm=None
            if not best_simple_match
            else (
                len(best_simple_match[0]),
                len(best_simple_match[1]),
                best_simple_match[2],
            ),
        )
        if best_simple_match:
            return best_simple_match
        return ((), MatchResult.from_unmatched(segments), None)

    # Make some buffers
    seg_buff = segments
    pre_seg_buff = ()  # NB: Tuple

    # Loop
    while True:
        # Do we have anything left to match on?
        if not seg_buff:
            # We've got to the end without a match, return empty
            return ((), MatchResult.from_unmatched(segments), None)

        # We only check the NON-simple ones here for brevity.
        mat, m = cls._longest_trimmed_match(
            seg_buff,
            non_simple_matchers,
            parse_context=parse_context,
            trim_noncode=False,
        )

        if mat and not best_simple_match:
            return (pre_seg_buff, mat, m)
        elif mat:
            # It will be earlier than the simple one if we've even checked,
            # but there's a chance that this might be *longer*, or just FIRST.
            pre_lengths = (len(pre_seg_buff), len(best_simple_match[0]))
            mat_lengths = (len(mat), len(best_simple_match[1]))
            mat_indexes = (matchers.index(m), matchers.index(best_simple_match[2]))
            if (
                (pre_lengths[0] < pre_lengths[1])
                or (
                    pre_lengths[0] == pre_lengths[1]
                    and mat_lengths[0] > mat_lengths[1]
                )
                or (
                    pre_lengths[0] == pre_lengths[1]
                    and mat_lengths[0] == mat_lengths[1]
                    and mat_indexes[0] < mat_indexes[1]
                )
            ):
                return (pre_seg_buff, mat, m)
            else:
                return best_simple_match
        else:
            # If there aren't any matches, then advance the buffer and try again.
            # Two improvements:
            # 1) if we get as far as the first simple match, then return that.
            # 2) be eager in consuming non-code segments if allowed
            if best_simple_match and len(pre_seg_buff) >= len(best_simple_match[0]):
                return best_simple_match

            pre_seg_buff += (seg_buff[0],)
            seg_buff = seg_buff[1:]
def _prune_options(
    self, segments: Tuple[BaseSegment, ...], parse_context: ParseContext
) -> Tuple[List[MatchableType], List[str]]:
    """Use the simple matchers to prune which options to match on.

    Each element of `self._elements` is asked for its "simple" form. Elements
    without one are always kept. Elements with one are kept only if one of
    their simple options equals the first non-whitespace raw string (upper
    cased) found in `segments`.

    Args:
        segments: The segments about to be matched.
        parse_context: The current parse context, passed on to `simple()`
            and to the logger.

    Returns:
        A tuple of (options still worth matching, the simple option strings
        which caused an option to be kept).

    Raises:
        NotImplementedError: If a simple option consists only of whitespace.
    """
    # Find the first code element to match against. Only this element is ever
    # compared with the options, so stop reading raw segments once it is found
    # rather than building a buffer of all of them.
    first_elem = next(
        (
            raw
            for raw in (seg.raw_upper for seg in self._iter_raw_segs(segments))
            if raw.strip()
        ),
        None,
    )

    available_options = []
    simple_opts = []
    prune_buff = []
    non_simple = 0
    pruned_simple = 0
    matched_simple = 0

    for opt in self._elements:
        simple = opt.simple(parse_context=parse_context)
        if simple is None:
            # This element is not simple, we have to do a
            # full match with it...
            available_options.append(opt)
            non_simple += 1
            continue

        # Otherwise we have a simple option, so let's use
        # it for pruning.
        for simple_opt in simple:
            # Check it's not a whitespace option
            if not simple_opt.strip():
                raise NotImplementedError(
                    "_prune_options not supported for whitespace matching."
                )
            # We want to know if the first meaningful element of the input
            # matches the option. Comparing with `first_elem` is enough: it
            # was taken from the input, so equality implies presence.
            if simple_opt != first_elem:
                # No match, carry on.
                continue
            # If we get here, it's matched the FIRST element of the input.
            available_options.append(opt)
            simple_opts.append(simple_opt)
            matched_simple += 1
            break
        else:
            # Ditch this option, the simple match has failed
            prune_buff.append(opt)
            pruned_simple += 1

    parse_match_logging(
        self.__class__.__name__,
        "match",
        "PRN",
        parse_context=parse_context,
        v_level=3,
        ns=non_simple,
        ps=pruned_simple,
        ms=matched_simple,
        pruned=prune_buff,
        opts=available_options or "ALL",
    )

    return available_options, simple_opts
def _bracket_sensitive_look_ahead_match(
    cls,
    segments: Tuple[BaseSegment, ...],
    matchers: List[MatchableType],
    parse_context: ParseContext,
    start_bracket: Optional[Matchable] = None,
    end_bracket: Optional[Matchable] = None,
    bracket_pairs_set: str = "bracket_pairs",
) -> Tuple[Tuple[BaseSegment, ...], MatchResult, Optional[MatchableType]]:
    """Same as `_look_ahead_match` but with bracket counting.

    NB: Given we depend on `_look_ahead_match` we can also utilise
    the same performance optimisations which are implemented there.

    bracket_pairs_set: Allows specific segments to override the available
        bracket pairs. See the definition of "angle_bracket_pairs" in the
        BigQuery dialect for additional context on why this exists.

    Returns:
        `tuple` of (segments preceding the match, match_object, matcher).
        When nothing is found this is an empty tuple, an unmatched result
        and `None`. The unmatched result is built from the working buffers,
        so it carries any bracket handling done along the way.

    Raises:
        SQLParseError: If an end bracket of the wrong type closes an open
            bracket, or if an opened bracket is never closed.
    """
    # Have we been passed an empty tuple?
    if not segments:
        return ((), MatchResult.from_unmatched(segments), None)

    # Fetch the bracket pairs from the relevant set on the dialect and
    # "unzip" them. The first field is just the name, so it is discarded.
    _, start_bracket_refs, end_bracket_refs, persists = zip(
        *parse_context.dialect.sets(bracket_pairs_set)
    )
    # These are matchables, probably StringParsers.
    dialect_ref = parse_context.dialect.ref
    start_brackets = [dialect_ref(seg_ref) for seg_ref in start_bracket_refs]
    end_brackets = [dialect_ref(seg_ref) for seg_ref in end_bracket_refs]
    # Add any bracket-like things passed as arguments
    if start_bracket:
        start_brackets += [start_bracket]
    if end_bracket:
        end_brackets += [end_bracket]
    bracket_matchers = start_brackets + end_brackets

    # Make some buffers
    seg_buff: Tuple[BaseSegment, ...] = segments
    pre_seg_buff: Tuple[BaseSegment, ...] = ()
    bracket_stack: List[BracketInfo] = []

    while True:
        # Nothing left to look at?
        if not seg_buff:
            if bracket_stack:  # pragma: no cover
                # We ran out of input with brackets still open.
                raise SQLParseError(
                    "Couldn't find closing bracket for opened brackets: "
                    f"`{bracket_stack}`.",
                    segment=bracket_stack[-1].bracket,
                )
            # Reached the end with no open brackets: happy unmatched exit.
            break

        if bracket_stack:
            # We're inside brackets, so we're just looking for the closing
            # bracket, or another opening bracket.
            pre, match, matcher = cls._look_ahead_match(
                seg_buff,
                bracket_matchers,
                parse_context=parse_context,
            )
            if not match:  # pragma: no cover
                # No match, we're in a bracket stack. Error.
                raise SQLParseError(
                    "Couldn't find closing bracket for opening bracket.",
                    segment=bracket_stack[-1].bracket,
                )

            # NB: We can only consider this as a nested bracket if the start
            # and end tokens are not the same. If a matcher is both a start
            # and end token we cannot deepen the bracket stack. In general,
            # quoted strings are a typical example where the start and end
            # tokens are the same. Currently, though, quoted strings are
            # handled elsewhere in the parser, and there are no cases where
            # *this* code has to handle identical start and end brackets.
            # For now, consider this a small, speculative investment in a
            # possible future requirement.
            if matcher in start_brackets and matcher not in end_brackets:
                # Anything leading up to the nested bracket belongs to the
                # bracket we are currently inside.
                bracket_stack[-1].segments += pre
                # Then open the nested bracket, seeded with what matched.
                bracket_stack.append(
                    BracketInfo(
                        bracket=match.matched_segments[0],
                        segments=match.matched_segments,
                    )
                )
                seg_buff = match.unmatched_segments
                continue

            if matcher not in end_brackets:  # pragma: no cover
                raise RuntimeError("I don't know how we get here?!")

            # Found an end bracket. Does its type match that of the innermost
            # start bracket? E.g. ")" matches "(", "]" matches "[".
            # For the start bracket we don't have the matcher but we can work
            # out the type, so we use that for the lookup.
            innermost = bracket_stack[-1]
            start_types = [bracket.type for bracket in start_brackets]
            start_index = start_types.index(innermost.bracket.get_type())
            # For the end index, we can just look for the matcher.
            end_index = end_brackets.index(matcher)
            if start_index != end_index:
                # The types don't match. Error.
                raise SQLParseError(
                    "Found unexpected end bracket!, was expecting "
                    f"{end_brackets[start_index]}, but got {matcher}",
                    segment=match.matched_segments[0],
                )

            # The types match, so this closes the innermost bracket.
            # Complete the bracketed info.
            innermost.segments += pre + match.matched_segments
            # Construct a bracketed segment (as a tuple) if allowed.
            # NOTE(review): `persists` only has entries for the dialect's
            # bracket pairs, so a caller-supplied `end_bracket` that equals
            # none of them would presumably index past its end - confirm.
            if persists[end_index]:
                new_segments: Tuple[BaseSegment, ...] = (
                    innermost.to_segment(end_bracket=match.matched_segments),
                )
            else:
                new_segments = innermost.segments
            # Remove the bracket set from the stack
            bracket_stack.pop()
            # If we're still in a bracket, add the new segments to
            # that bracket, otherwise add them to the buffer
            if bracket_stack:
                bracket_stack[-1].segments += new_segments
            else:
                pre_seg_buff += new_segments
            seg_buff = match.unmatched_segments
            continue

        # Not inside brackets: we're open to more opening brackets or the
        # thing(s) that we're otherwise looking for.
        pre, match, matcher = cls._look_ahead_match(
            seg_buff,
            matchers + bracket_matchers,
            parse_context=parse_context,
        )
        if not match:
            # Not in a bracket stack, but no match: happy unmatched exit.
            break

        if matcher in matchers:
            # It's one of the things we were looking for! Return.
            return (pre_seg_buff + pre, match, matcher)

        if matcher in start_brackets:
            # We've found the start of a bracket segment.
            # NB: It might not *actually* be the bracket itself,
            # but could be some non-code element preceding it.
            # That's actually ok.
            bracket_stack.append(
                BracketInfo(
                    bracket=match.matched_segments[0],
                    segments=match.matched_segments,
                )
            )
            # The matched element has already been added to the bracket.
            # Add anything before it to the pre segment buffer.
            # Reset the working buffer.
            pre_seg_buff += pre
            seg_buff = match.unmatched_segments
            continue

        if matcher in end_brackets:
            # We've found an unexpected end bracket! This is likely
            # because we're matching a section which should have ended.
            # If we had a match, it would have matched by now, so this
            # means no match.
            parse_match_logging(
                cls.__name__,
                "_bracket_sensitive_look_ahead_match",
                "UEXB",
                parse_context=parse_context,
                v_level=3,
                got=matcher,
            )
            # Happy unmatched exit.
            break

        # This shouldn't happen!?
        raise NotImplementedError(  # pragma: no cover
            "This shouldn't happen. Panic in "
            "_bracket_sensitive_look_ahead_match."
        )

    # This is the happy unmatched path. This occurs when:
    # - We reached the end with no open brackets.
    # - No match while outside a bracket stack.
    # - We found an unexpected end bracket before matching something
    #   interesting.
    # We return with the mutated segments so we can reuse any bracket matching.
    return ((), MatchResult.from_unmatched(pre_seg_buff + seg_buff), None)