def match(
    self, segments: Tuple[BaseSegment, ...], parse_context: ParseContext
) -> MatchResult:
    """Match against any of the elements a relevant number of times.

    Repeatedly calls ``self._match_once`` on whatever is still unmatched
    until one of the following happens:

    * ``self.max_times`` repetitions have matched,
    * the input is exhausted, or
    * an attempt fails to match.

    If ``self.exclude`` is set and matches the input, nothing is matched.

    Args:
        segments: The segments to match against.
        parse_context: The current parse context.

    Returns:
        A ``MatchResult`` holding the matched segments and the remainder
        when at least ``self.min_times`` repetitions matched. Otherwise an
        unmatched result containing *all* of the input ``segments``.
    """
    # First if we have an *exclude* option, we should check that
    # which would prevent the rest of this grammar from matching.
    if self.exclude:
        with parse_context.deeper_match() as ctx:
            if self.exclude.match(segments, parse_context=ctx):
                return MatchResult.from_unmatched(segments)

    # Match on each of the options
    matched_segments: MatchResult = MatchResult.from_empty()
    unmatched_segments: Tuple[BaseSegment, ...] = segments
    n_matches = 0

    while True:
        if self.max_times and n_matches >= self.max_times:
            # We've matched as many times as we can
            return MatchResult(matched_segments.matched_segments, unmatched_segments)

        # Is there anything left to match?
        if not unmatched_segments:
            if n_matches >= self.min_times:
                return MatchResult(
                    matched_segments.matched_segments, unmatched_segments
                )
            # We didn't meet the hurdle. Hand back the *whole* input: by now
            # `unmatched_segments` no longer contains what earlier
            # repetitions consumed.
            return MatchResult.from_unmatched(segments)

        # If we've already matched once...
        if n_matches > 0 and self.allow_gaps:
            # Consume any non-code if there is any
            pre_seg, mid_seg, post_seg = trim_non_code_segments(unmatched_segments)
            unmatched_segments = mid_seg + post_seg
        else:
            pre_seg = ()  # empty tuple

        once = self._match_once(unmatched_segments, parse_context=parse_context)
        if once:
            matched_segments += pre_seg + once.matched_segments
            unmatched_segments = once.unmatched_segments
            n_matches += 1
            continue

        # If we get here, then we've not managed to match. And the next
        # unmatched segments are meaningful, i.e. they're not what we're
        # looking for.
        if n_matches >= self.min_times:
            return MatchResult(
                matched_segments.matched_segments, pre_seg + unmatched_segments
            )
        # We didn't meet the hurdle. As above, return the full original
        # input so that neither earlier matches nor the trimmed `pre_seg`
        # are lost.
        return MatchResult.from_unmatched(segments)
def _look_ahead_match(cls, segments, matchers, parse_context):
    """Look ahead for matches beyond the first element of the segments list.

    This function also contains the performance improved hash-matching
    approach to searching for matches, which should significantly improve
    performance.

    Prioritise the first match, and if multiple match at the same point the
    longest. If two matches of the same length match at the same time, then
    it's the first in the iterable of matchers.

    Args:
        segments: The segments to search (a single ``BaseSegment`` is
            wrapped in a list).
        matchers: Iterable of matchers to look for.
        parse_context: The current parse context.

    Returns:
        `tuple` of (unmatched_segments, match_object, matcher). The first
        item holds the segments which precede the match. When nothing
        matches the tuple is ``((), <unmatched result>, None)``.
    """
    parse_match_logging(
        cls.__name__,
        "_look_ahead_match",
        "IN",
        parse_context=parse_context,
        v_level=4,
        ls=len(segments),
        seg=LateBoundJoinSegmentsCurtailed(segments),
    )

    # Do some type munging
    matchers = list(matchers)
    if isinstance(segments, BaseSegment):
        segments = [segments]

    # Have we been passed an empty list?
    if len(segments) == 0:
        return ((), MatchResult.from_empty(), None)

    # Here we enable a performance optimisation. Most of the time in this
    # cycle happens in loops looking for simple matchers which we should
    # be able to find a shortcut for.
    # First: Assess the matchers passed in, if any are "simple", then we
    # effectively use a hash lookup across the content of segments to
    # quickly evaluate if the segment is present.
    # Matchers which aren't "simple" still take a slower route.
    _matchers = [
        (matcher, matcher.simple(parse_context=parse_context))
        for matcher in matchers
    ]
    simple_matchers = [pair for pair in _matchers if pair[1]]
    non_simple_matchers = [pair[0] for pair in _matchers if not pair[1]]
    best_simple_match = None

    if simple_matchers:
        # For existing compound segments, we should assume that within
        # that segment, things are internally consistent, that means
        # rather than enumerating all the individual segments of a longer
        # one we just dump out the whole segment, but splitting off the
        # first element separated by whitespace. This is a) faster and
        # also b) prevents some really horrible bugs with bracket matching.
        # See https://github.com/sqlfluff/sqlfluff/issues/433
        def _trim_elem(seg):
            s = seg.raw_upper.split(maxsplit=1)
            return s[0] if s else ""

        # Build a buffer of all the upper case raw segments ahead of us.
        str_buff = [_trim_elem(seg) for seg in segments]

        match_queue = []
        for matcher, simple in simple_matchers:
            # Simple will be a tuple of options
            for simple_option in simple:
                # NOTE: We use iter_indices to make sure we capture
                # all instances of potential matches if there are many.
                # This is important for bracket counting.
                for buff_pos in iter_indices(str_buff, simple_option):
                    match_queue.append((matcher, buff_pos, simple_option))

        # Order candidates by buffer position, earliest first. The sort is
        # stable, so candidates at the same position stay in matcher order.
        match_queue = sorted(match_queue, key=lambda x: x[1])

        parse_match_logging(
            cls.__name__,
            "_look_ahead_match",
            "SI",
            parse_context=parse_context,
            v_level=4,
            mq=match_queue,
            sb=str_buff,
        )

        # Walk the candidates from the earliest position and stop at the
        # first one which really matches. There is no need to run the
        # remaining (later) candidates once we have a winner.
        for queued_matcher, queued_buff_pos, queued_option in match_queue:
            # Here we do the actual transform to the new segment.
            # NB: We may still need to deal with whitespace.
            candidate = queued_matcher.match(
                segments[queued_buff_pos:], parse_context
            )
            if not candidate:
                # We've had something match in simple matching, but then
                # later excluded. Log but then move on to the next item.
                parse_match_logging(
                    cls.__name__,
                    "_look_ahead_match",
                    "NM",
                    parse_context=parse_context,
                    v_level=4,
                    _so=queued_option,
                )
                continue
            # Ok we have a match. Because we sorted the list, we'll take it!
            best_simple_match = (
                segments[:queued_buff_pos],
                candidate,
                queued_matcher,
            )
            break

    if not non_simple_matchers:
        # There are no other matchers, we can just shortcut now.
        parse_match_logging(
            cls.__name__,
            "_look_ahead_match",
            "SC",
            parse_context=parse_context,
            v_level=4,
            bsm=None
            if not best_simple_match
            else (
                len(best_simple_match[0]),
                len(best_simple_match[1]),
                best_simple_match[2],
            ),
        )
        if best_simple_match:
            return best_simple_match
        return ((), MatchResult.from_unmatched(segments), None)

    # Make some buffers
    seg_buff = segments
    pre_seg_buff = ()  # NB: Tuple

    # Loop
    while True:
        # Do we have anything left to match on?
        if not seg_buff:
            # We've got to the end without a match, return empty
            return ((), MatchResult.from_unmatched(segments), None)

        # We only check the NON-simple ones here for brevity.
        mat, m = cls._longest_trimmed_match(
            seg_buff,
            non_simple_matchers,
            parse_context=parse_context,
            trim_noncode=False,
        )

        if mat and not best_simple_match:
            return (pre_seg_buff, mat, m)

        if mat:
            # It will be earlier than the simple one if we've even checked,
            # but there's a chance that this might be *longer*, or just
            # FIRST. Priority: earliest start, then longest match, then
            # earliest matcher. Negating the length lets one tuple
            # comparison express all three rules.
            non_simple_rank = (len(pre_seg_buff), -len(mat), matchers.index(m))
            simple_rank = (
                len(best_simple_match[0]),
                -len(best_simple_match[1]),
                matchers.index(best_simple_match[2]),
            )
            if non_simple_rank < simple_rank:
                return (pre_seg_buff, mat, m)
            return best_simple_match

        # If there aren't any matches, then advance the buffer and try
        # again. If we get as far as the first simple match, return that.
        if best_simple_match and len(pre_seg_buff) >= len(best_simple_match[0]):
            return best_simple_match

        pre_seg_buff += (seg_buff[0],)
        seg_buff = seg_buff[1:]
def _longest_trimmed_match(
    cls,
    segments: Tuple["BaseSegment", ...],
    matchers: List["MatchableType"],
    parse_context: ParseContext,
    trim_noncode=True,
) -> Tuple[MatchResult, Optional["MatchableType"]]:
    """Return longest match from a selection of matchers.

    Prioritise the first match, and if multiple match at the same point the
    longest. If two matches of the same length match at the same time, then
    it's the first in the iterable of matchers.

    Args:
        segments: The segments to match against.
        matchers: The matchers to try, in priority order.
        parse_context: The current parse context.
        trim_noncode: When true, leading and trailing non-code segments
            are trimmed before matching and re-attached to the result.

    Returns:
        `tuple` of (match_object, matcher). The matcher is ``None`` when
        nothing matched, in which case the match object is an unmatched
        result containing all of the input ``segments``.
    """
    # Have we been passed an empty list?
    if len(segments) == 0:
        return MatchResult.from_empty(), None

    # Remember the untrimmed input so that a failed match can hand back
    # everything it was given (including any trimmed non-code).
    all_segments = segments

    # If gaps are allowed, trim the ends.
    if trim_noncode:
        pre_nc, segments, post_nc = trim_non_code_segments(segments)

    best_res: Optional[MatchResult] = None
    best_matcher = None
    best_match_length = 0

    # iterate at this position across all the matchers
    for matcher in matchers:
        # MyPy seems to require a type hint here. Not quite sure why.
        res_match: MatchResult = matcher.match(segments, parse_context=parse_context)
        if res_match.is_complete():
            # Just return it! (WITH THE RIGHT OTHER STUFF)
            if trim_noncode:
                return (
                    MatchResult.from_matched(
                        pre_nc + res_match.matched_segments + post_nc
                    ),
                    matcher,
                )
            return res_match, matcher
        if res_match and res_match.matched_length > best_match_length:
            # We've got an incomplete match, and it's the best so far.
            # The strict `>` keeps the earliest matcher on ties.
            best_res = res_match
            best_matcher = matcher
            best_match_length = res_match.matched_length

    # If we get here, then there wasn't a complete match. If we
    # have a best match, return that.
    if best_res is not None:
        if trim_noncode:
            return (
                MatchResult(
                    pre_nc + best_res.matched_segments,
                    best_res.unmatched_segments + post_nc,
                ),
                best_matcher,
            )
        return best_res, best_matcher

    # If no match at all, return nothing matched. Use the untrimmed input
    # so no non-code segments are dropped.
    return MatchResult.from_unmatched(all_segments), None
def match(
    self, segments: Tuple[BaseSegment, ...], parse_context: ParseContext
) -> MatchResult:
    """Match an arbitrary number of elements separated by a delimiter.

    Multiple configured elements are alternatives for what may appear
    between delimiters; they are not treated as a sequence.

    The method repeatedly looks ahead (bracket-sensitively) for the next
    delimiter or terminator, and requires the content before it to be a
    complete match for one of the elements.
    """
    # Nothing to do for an empty input.
    if len(segments) == 0:
        return MatchResult.from_empty()

    remaining = segments
    matched = MatchResult.from_empty()
    # Holds the matched segments of each delimiter as we find them.
    delimiters: List[BaseSegment] = []

    while True:
        # An exhausted buffer at this point means the previous thing we
        # consumed was a delimiter, i.e. we are in a trailing case.
        if len(remaining) == 0:
            matched += remaining
            if self.allow_trailing and (
                self.min_delimiters is None
                or len(delimiters) >= self.min_delimiters
            ):
                # Nothing is left, so there is nothing unmatched to append.
                return MatchResult.from_matched(matched.matched_segments)
            return MatchResult.from_unmatched(segments)

        # Look ahead for a delimiter or a terminator. Bracket counting is
        # delegated to _bracket_sensitive_look_ahead_match.
        lookahead_matchers = [self.delimiter]
        if self.terminator:
            lookahead_matchers.append(self.terminator)
        # Without gaps, any non-code segment behaves like a terminator.
        if not self.allow_gaps:
            lookahead_matchers.append(NonCodeMatcher())

        with parse_context.deeper_match() as ctx:
            (
                pre_content,
                delimiter_match,
                delimiter_matcher,
            ) = self._bracket_sensitive_look_ahead_match(
                remaining,
                lookahead_matchers,
                parse_context=ctx,
            )

        # Record the length of the pre-content now; it is needed to slice
        # the buffer when we stop on a terminator.
        pre_content_len = len(pre_content)

        if not delimiter_match:
            # No delimiter or terminator ahead: we're at the end, so look
            # for a (possibly partial) match on what remains.
            # Without enough delimiters there's no point in trying.
            if self.min_delimiters and len(delimiters) < self.min_delimiters:
                return MatchResult.from_unmatched(segments)

            with parse_context.deeper_match() as ctx:
                tail_match, _ = self._longest_trimmed_match(
                    remaining,
                    self._elements,
                    parse_context=ctx,
                    trim_noncode=self.allow_gaps,
                )

            if not tail_match:
                # Nothing matched at the end. That's only acceptable when
                # a trailing delimiter is allowed.
                if self.allow_trailing:
                    return MatchResult(matched.matched_segments, remaining)
                return MatchResult.from_unmatched(segments)

            if tail_match.unmatched_segments:
                # Leave the leftovers (and anything trailing) unmatched.
                return MatchResult(
                    matched.matched_segments + tail_match.matched_segments,
                    tail_match.unmatched_segments,
                )
            # Everything was consumed by the final element.
            return MatchResult.from_matched(
                matched.matched_segments + tail_match.matched_segments,
            )

        # From here on we found either a delimiter or a terminator.
        found_delimiter = delimiter_matcher is self.delimiter
        if found_delimiter:
            delimiters.append(delimiter_match.matched_segments)

        # A zero length section between delimiters is not a match.
        if pre_content_len == 0:
            return MatchResult.from_unmatched(segments)

        # The section before the delimiter must completely match one of
        # the elements.
        with parse_context.deeper_match() as ctx:
            content_match, _ = self._longest_trimmed_match(
                segments=pre_content,
                matchers=self._elements,
                parse_context=ctx,
                trim_noncode=self.allow_gaps,
            )

        if not content_match or not content_match.is_complete():
            return MatchResult.from_unmatched(segments)

        matched += content_match.matched_segments

        if found_delimiter:
            # Consume the delimiter and go round again for the next one.
            matched += delimiter_match.matched_segments
            remaining = delimiter_match.unmatched_segments
            continue

        if delimiter_matcher is self.terminator or isinstance(
            delimiter_matcher, NonCodeMatcher
        ):
            # Terminator (or the gap terminator): stop here. The terminator
            # is not part of this match, so return the slice of the
            # original buffer after the pre-content rather than the
            # (possibly mutated) segments from the look-ahead.
            if self.min_delimiters and len(delimiters) < self.min_delimiters:
                return MatchResult.from_unmatched(segments)
            return MatchResult(
                matched.matched_segments,
                remaining[pre_content_len:],
            )

        raise RuntimeError((
            "I don't know how I got here. Matched instead on {0}, which "
            "doesn't appear to be delimiter or terminator"
        ).format(delimiter_matcher))
def test__parser__match_construct_from_empty():
    """An empty MatchResult should report a length of zero."""
    empty_result = MatchResult.from_empty()
    assert len(empty_result) == 0
def match(self, segments, parse_context):
    """Match a specific sequence of elements.

    Each element of ``self._elements`` is matched in turn against the
    remaining segments. Meta elements are instantiated rather than
    matched, disabled ``Conditional`` elements are skipped, and optional
    elements may fail to match without failing the sequence.

    Args:
        segments: Tuple of segments to match (a single ``BaseSegment`` is
            wrapped in a tuple).
        parse_context: The current parse context.

    Returns:
        A ``MatchResult`` with the matched segments (re-positioned via
        ``BaseSegment._position_segments``) and any leftovers, or an
        unmatched result holding all of ``segments`` when a required
        element could not be matched.
    """
    if isinstance(segments, BaseSegment):
        # Wrap a lone segment; `tuple(segments)` would try to iterate it.
        segments = (segments,)

    matched_segments = MatchResult.from_empty()
    unmatched_segments = segments

    # Buffers of uninstantiated meta segments.
    meta_pre_nc = ()
    meta_post_nc = ()
    early_break = False

    for idx, elem in enumerate(self._elements):
        # Check for an early break.
        if early_break:
            break

        # NB: Every path through this loop body either breaks or returns,
        # so it runs exactly once per element.
        while True:
            # Consume non-code if appropriate
            if self.allow_gaps:
                pre_nc, mid_seg, post_nc = trim_non_code_segments(
                    unmatched_segments
                )
            else:
                pre_nc = ()
                mid_seg = unmatched_segments
                post_nc = ()

            # Is it an indent or dedent?
            if elem.is_meta:
                # Elements with a negative indent value come AFTER
                # the whitespace. Positive or neutral come BEFORE.
                if elem.indent_val < 0:
                    meta_post_nc += (elem(),)
                else:
                    meta_pre_nc += (elem(),)
                break

            # Is it a conditional? If so is it active
            if isinstance(elem, Conditional) and not elem.is_enabled(parse_context):
                # If it's not active, skip it.
                break

            if len(pre_nc + mid_seg + post_nc) == 0:
                # We've run out of sequence without matching everything.
                # Do only optional, meta or conditional elements remain?
                # NB: each remaining element `e` is tested, not the
                # current `elem`.
                if all(
                    e.is_optional() or e.is_meta or isinstance(e, Conditional)
                    for e in self._elements[idx:]
                ):
                    # then it's ok, and we can return what we've got so far.
                    # No need to deal with anything left over because we're
                    # at the end, unless it's a meta segment.
                    # We'll add those meta segments after any existing ones.
                    # So they go on the meta_post_nc stack.
                    for e in self._elements[idx:]:
                        # If it's meta, instantiate it.
                        if e.is_meta:
                            meta_post_nc += (e(),)
                        # If it's conditional and it's enabled, match it.
                        if isinstance(e, Conditional) and e.is_enabled(
                            parse_context
                        ):
                            meta_match = e.match(tuple(), parse_context)
                            if meta_match:
                                meta_post_nc += meta_match.matched_segments

                    # Early break to exit via the happy match path.
                    early_break = True
                    break

                # We've got to the end of the sequence without matching all
                # required elements.
                return MatchResult.from_unmatched(segments)

            # We've already dealt with potential whitespace above, so carry
            # on to matching.
            with parse_context.deeper_match() as ctx:
                elem_match = elem.match(mid_seg, parse_context=ctx)

            if elem_match.has_match():
                # We're expecting mostly partial matches here, but complete
                # matches are possible. Don't be greedy with whitespace!
                matched_segments += (
                    meta_pre_nc
                    + pre_nc
                    + meta_post_nc
                    + elem_match.matched_segments
                )
                meta_pre_nc = ()
                meta_post_nc = ()
                unmatched_segments = elem_match.unmatched_segments + post_nc
                # Each time we do this, we do a sense check to make sure we
                # haven't dropped anything. (Because it's happened before!).
                check_still_complete(
                    segments,
                    matched_segments.matched_segments,
                    unmatched_segments,
                )
                # Break out of the while loop and move to the next element.
                break

            # If we can't match an element, we should ascertain whether
            # it's required. If it's optional then fine, move on, but
            # otherwise we should crash out without a match.
            if elem.is_optional():
                # This will take us out of the while loop and on to the
                # next matching element.
                break
            return MatchResult.from_unmatched(segments)

    # If we get to here, we've matched all of the elements (or skipped them)
    # but still have some segments left (or perhaps have precisely zero
    # left). In either case, we're golden. Return successfully, with any
    # leftovers as the unmatched elements. Meta all go at the end regardless
    # of any trailing whitespace.
    return MatchResult(
        BaseSegment._position_segments(
            matched_segments.matched_segments + meta_pre_nc + meta_post_nc,
        ),
        unmatched_segments,
    )
def match(
    self,
    segments: Tuple[BaseSegment, ...],
    parse_context: ParseContext,
) -> MatchResult:
    """Match an arbitrary number of elements separated by a delimiter.

    Multiple configured elements are alternatives for what may appear
    between delimiters; they are not treated as a sequence.

    The loop alternates between looking for an element and looking for a
    delimiter, checking for a terminator before each attempt.
    """
    # Nothing to do for an empty input.
    if len(segments) == 0:
        return MatchResult.from_empty()

    remaining = segments
    consumed: Tuple[BaseSegment, ...] = ()
    leftover: Tuple[BaseSegment, ...] = ()
    # Snapshot taken each time a delimiter matches, so that a dangling
    # delimiter can be handed back when trailing isn't allowed.
    cached_consumed: Tuple[BaseSegment, ...] = ()
    cached_remaining: Tuple[BaseSegment, ...] = ()

    delimiter_count = 0
    ended_on_delimiter = False

    # We want to render progress bar only for the main matching loop,
    # so disable it when in deeper parsing.
    disable_progress_bar = (
        parse_context.parse_depth > 0
        or progress_bar_configuration.disable_progress_bar
    )

    # The number of `NewlineSegment`s is used as a rough estimate of how
    # many steps there could be in a big file.
    newline_segments = [s for s in segments if isinstance(s, NewlineSegment)]
    progressbar_matching = tqdm(
        total=len(newline_segments),
        desc="matching",
        miniters=30,
        disable=disable_progress_bar,
        leave=False,
    )

    seeking_delimiter = False
    has_matched_segs = False
    terminated = False

    delimiter_matchers = [self.delimiter]
    terminator_matchers = []
    if self.terminator:
        terminator_matchers.append(self.terminator)
    # Without gaps, any non-code segment behaves like a terminator.
    if not self.allow_gaps:
        terminator_matchers.append(NonCodeMatcher())

    while True:
        progressbar_matching.update(n=1)

        elements = delimiter_matchers if seeking_delimiter else self._elements

        if len(remaining) == 0:
            break  # pragma: no cover

        pre_non_code, seg_content, post_non_code = trim_non_code_segments(remaining)

        # Leading whitespace ends the match when gaps aren't allowed.
        if not self.allow_gaps and any(seg.is_whitespace for seg in pre_non_code):
            leftover = remaining
            break

        if not seg_content:  # pragma: no cover
            consumed += pre_non_code
            break

        # Check whether there is a terminator before checking for content
        with parse_context.deeper_match() as ctx:
            term_match, _ = self._longest_trimmed_match(
                segments=seg_content,
                matchers=terminator_matchers,
                parse_context=ctx,
                # We've already trimmed
                trim_noncode=False,
            )
        if term_match:
            terminated = True
            leftover = pre_non_code + term_match.all_segments() + post_non_code
            break

        with parse_context.deeper_match() as ctx:
            step_match, _ = self._longest_trimmed_match(
                segments=seg_content,
                matchers=elements,
                parse_context=ctx,
                # We've already trimmed
                trim_noncode=False,
                terminators=delimiter_matchers
                if elements != delimiter_matchers
                else None,
            )

        if not step_match:
            # Neither an element nor a delimiter here: stop.
            consumed += pre_non_code
            leftover = step_match.unmatched_segments + post_non_code
            break

        if elements == delimiter_matchers:
            delimiter_count += 1
            ended_on_delimiter = True
            cached_consumed = consumed
            cached_remaining = remaining
        else:
            ended_on_delimiter = False

        has_matched_segs = True
        remaining = step_match.unmatched_segments + post_non_code
        leftover = step_match.unmatched_segments

        if step_match.is_complete():
            # Everything was used up, so take the trailing non-code too.
            consumed += pre_non_code + step_match.matched_segments + post_non_code
            break

        consumed += pre_non_code + step_match.matched_segments
        seeking_delimiter = not seeking_delimiter

    everything = consumed + leftover

    if self.min_delimiters and delimiter_count < self.min_delimiters:
        return MatchResult.from_unmatched(everything)

    if terminated:
        if has_matched_segs:
            return MatchResult(consumed, leftover)
        return MatchResult.from_unmatched(everything)

    if ended_on_delimiter and not self.allow_trailing:
        if not leftover:
            return MatchResult.from_unmatched(everything)
        # Roll back to the state before the dangling delimiter.
        return MatchResult(cached_consumed, cached_remaining)

    if not has_matched_segs:
        return MatchResult.from_unmatched(everything)

    if not leftover:
        return MatchResult.from_matched(consumed)

    return MatchResult(consumed, leftover)
def match(
    self,
    segments: Tuple[BaseSegment, ...],
    parse_context: ParseContext,
) -> MatchResult:
    """Match an arbitrary number of elements separated by a delimiter.

    Multiple configured elements are alternatives for what may appear
    between delimiters; they are not treated as a sequence.

    The method repeatedly looks ahead (bracket-sensitively) for the next
    delimiter or terminator and matches the content before it against the
    elements.

    NOTE(review): most failure paths return only the current look-ahead
    window (`mutated`) as unmatched, not the full `segments`. After an
    earlier iteration has matched something, that excludes the earlier
    matches. Confirm whether callers rely on this.
    """
    # Nothing to do for an empty input.
    if len(segments) == 0:
        return MatchResult.from_empty()

    remaining = segments
    matched = MatchResult.from_empty()
    # Holds the matched segments of each delimiter as we find them.
    delimiters: List[BaseSegment] = []

    # We want to render progress bar only for the main matching loop,
    # so disable it when in deeper parsing.
    disable_progress_bar = (
        parse_context.parse_depth > 0
        or progress_bar_configuration.disable_progress_bar
    )

    # The number of `NewlineSegment`s is used as a rough estimate of how
    # many steps there could be in a big file.
    newline_segments = [s for s in segments if isinstance(s, NewlineSegment)]
    progressbar_matching = tqdm(
        total=len(newline_segments),
        desc="matching",
        miniters=30,
        disable=disable_progress_bar,
        leave=False,
    )

    while True:
        progressbar_matching.update(n=1)

        # An exhausted buffer at this point means the previous thing we
        # consumed was a delimiter, i.e. we are in a trailing case.
        if len(remaining) == 0:
            matched += remaining
            if self.allow_trailing and (
                self.min_delimiters is None
                or len(delimiters) >= self.min_delimiters
            ):  # pragma: no cover TODO?
                # Nothing is left, so there is nothing unmatched to append.
                return MatchResult.from_matched(matched.matched_segments)
            return MatchResult.from_unmatched(segments)  # pragma: no cover TODO?

        # Look ahead for a delimiter or a terminator. Bracket counting is
        # delegated to _bracket_sensitive_look_ahead_match.
        lookahead_matchers = [self.delimiter]
        if self.terminator:
            lookahead_matchers.append(self.terminator)
        # Without gaps, any non-code segment behaves like a terminator.
        if not self.allow_gaps:
            lookahead_matchers.append(NonCodeMatcher())

        with parse_context.deeper_match() as ctx:
            (
                pre_content,
                delimiter_match,
                delimiter_matcher,
            ) = self._bracket_sensitive_look_ahead_match(
                remaining,
                lookahead_matchers,
                parse_context=ctx,
                bracket_pairs_set=self.bracket_pairs_set,
            )

        # Store the mutated segments to reuse.
        mutated = pre_content + delimiter_match.all_segments()

        if not delimiter_match:
            # No delimiter or terminator ahead: we're at the end, so look
            # for a (possibly partial) match on what remains.
            # Without enough delimiters there's no point in trying.
            if self.min_delimiters and len(delimiters) < self.min_delimiters:
                return MatchResult.from_unmatched(mutated)

            lead_nc, tail_content, trail_nc = trim_non_code_segments(mutated)

            # Whitespace is checked explicitly here (rather than via an
            # untrimmed match) so that it can be treated differently from
            # other non-code segments like placeholders.
            if not self.allow_gaps and any(
                seg.is_whitespace for seg in lead_nc + trail_nc
            ):
                return MatchResult.from_unmatched(
                    mutated
                )  # pragma: no cover TODO?

            with parse_context.deeper_match() as ctx:
                tail_match, _ = self._longest_trimmed_match(
                    tail_content,
                    self._elements,
                    parse_context=ctx,
                    # We've already trimmed
                    trim_noncode=False,
                )

            if not tail_match:
                # Nothing matched at the end. That's only acceptable when
                # a trailing delimiter is allowed.
                if self.allow_trailing:
                    return MatchResult(matched.matched_segments, remaining)
                return MatchResult.from_unmatched(mutated)

            if tail_match.unmatched_segments:
                # Leave the leftovers and the trailing non-code unmatched.
                return MatchResult(
                    matched.matched_segments
                    + lead_nc
                    + tail_match.matched_segments,
                    tail_match.unmatched_segments + trail_nc,
                )
            # Everything was consumed, so take the trailing non-code too.
            return MatchResult.from_matched(
                matched.matched_segments
                + lead_nc
                + tail_match.matched_segments
                + trail_nc,
            )

        # From here on we found either a delimiter or a terminator.
        found_delimiter = delimiter_matcher is self.delimiter
        if found_delimiter:
            delimiters.append(delimiter_match.matched_segments)

        # A zero length section between delimiters is not a match.
        if not len(pre_content) > 0:
            return MatchResult.from_unmatched(mutated)

        lead_nc, pre_content, trail_nc = trim_non_code_segments(pre_content)

        # Whitespace is checked explicitly here (rather than via an
        # untrimmed match) so that it can be treated differently from
        # other non-code segments like placeholders.
        if not self.allow_gaps and any(
            seg.is_whitespace for seg in lead_nc + trail_nc
        ):
            return MatchResult.from_unmatched(
                mutated
            )  # pragma: no cover TODO?

        with parse_context.deeper_match() as ctx:
            content_match, _ = self._longest_trimmed_match(
                segments=pre_content,
                matchers=self._elements,
                parse_context=ctx,
                # We've already trimmed
                trim_noncode=False,
            )

        if not content_match:
            if self.allow_trailing:
                # The look-ahead hit a delimiter beyond the scope of this
                # Delimited section. Trailing delimiters are allowed, so
                # return what has matched up to this section.
                return MatchResult(
                    matched.matched_segments,
                    lead_nc
                    + content_match.unmatched_segments
                    + trail_nc
                    + delimiter_match.all_segments(),
                )
            return MatchResult.from_unmatched(mutated)

        if not content_match.is_complete():
            # The look-ahead hit a delimiter beyond the scope of this
            # Delimited section. Return a partial match, with the
            # delimiter left unmatched.
            return MatchResult(
                matched.matched_segments
                + lead_nc
                + content_match.matched_segments,
                content_match.unmatched_segments
                + trail_nc
                + delimiter_match.all_segments(),
            )

        # A complete match: add the section up to the delimiter.
        matched += lead_nc + content_match.matched_segments + trail_nc

        if found_delimiter:
            # Consume the delimiter and go round again for the next one.
            matched += delimiter_match.matched_segments
            remaining = delimiter_match.unmatched_segments
            continue

        if delimiter_matcher is self.terminator or isinstance(
            delimiter_matcher, NonCodeMatcher
        ):
            # Terminator (or the gap terminator): stop here. The terminator
            # is not part of this match; it goes with the unmatched parts.
            if self.min_delimiters and len(delimiters) < self.min_delimiters:
                return MatchResult.from_unmatched(mutated)
            return MatchResult(
                matched.matched_segments,
                delimiter_match.all_segments(),
            )

        raise RuntimeError(  # pragma: no cover
            (
                "I don't know how I got here. Matched instead on {}, "
                "which doesn't appear to be delimiter or terminator"
            ).format(delimiter_matcher)
        )
def _longest_trimmed_match(
    cls,
    segments: Tuple[BaseSegment, ...],
    matchers: List[MatchableType],
    parse_context: ParseContext,
    trim_noncode=True,
    terminators: Optional[List[MatchableType]] = None,
) -> Tuple[MatchResult, Optional[MatchableType]]:
    """Return longest match from a selection of matchers.

    Prioritise the first match, and if multiple match at the same point the
    longest. If two matches of the same length match at the same time, then
    it's the first in the iterable of matchers.

    Args:
        segments: The segments to match against.
        matchers: The matchers to try, in priority order.
        parse_context: The current parse context.
        trim_noncode: When true, leading and trailing non-code segments
            are trimmed before matching and re-attached to the result.
        terminators: Optional matchers. When a new best partial match is
            immediately followed (ignoring non-code) by something one of
            these matches, the search stops and that partial match is used.

    Returns:
        `tuple` of (match_object, matcher). The matcher is ``None`` when
        nothing matched, in which case the match object is an unmatched
        result containing all of the input ``segments``.
    """
    # Have we been passed an empty list?
    if len(segments) == 0:  # pragma: no cover
        return MatchResult.from_empty(), None

    # Remember the untrimmed input so that a failed match can hand back
    # everything it was given (including any trimmed non-code).
    all_segments = segments

    # If gaps are allowed, trim the ends.
    if trim_noncode:
        pre_nc, segments, post_nc = trim_non_code_segments(segments)

    best_res: Optional[MatchResult] = None
    best_matcher = None
    best_match_length = 0

    # iterate at this position across all the matchers
    for matcher in matchers:
        # MyPy seems to require a type hint here. Not quite sure why.
        res_match: MatchResult = matcher.match(segments, parse_context=parse_context)
        if res_match.is_complete():
            # Just return it! (WITH THE RIGHT OTHER STUFF)
            if trim_noncode:
                return (
                    MatchResult.from_matched(
                        pre_nc + res_match.matched_segments + post_nc
                    ),
                    matcher,
                )
            return res_match, matcher

        if res_match and res_match.trimmed_matched_length > best_match_length:
            # We've got an incomplete match, and it's the best so far.
            # The strict `>` keeps the earliest matcher on ties.
            best_res = res_match
            best_matcher = matcher
            best_match_length = res_match.trimmed_matched_length

            if terminators:
                # If what follows this match is a terminator, then there
                # is no point trying the remaining matchers.
                _, following, _ = trim_non_code_segments(
                    res_match.unmatched_segments
                )
                if any(
                    terminator.match(
                        following, parse_context=parse_context
                    ).matched_segments
                    for terminator in terminators
                ):
                    break

        # We could stash segments here, but given we might have some
        # successful matches here, we shouldn't, because they'll be mutated
        # in the wrong way. Eventually there might be a performance gain
        # from doing that sensibly here.

    # If we get here, then there wasn't a complete match. If we
    # have a best match, return that.
    if best_res is not None:
        if trim_noncode:
            return (
                MatchResult(
                    pre_nc + best_res.matched_segments,
                    best_res.unmatched_segments + post_nc,
                ),
                best_matcher,
            )
        return best_res, best_matcher

    # If no match at all, return nothing matched. Use the untrimmed input
    # so no non-code segments are dropped.
    return MatchResult.from_unmatched(all_segments), None