def replace_impl(interp, pce, replace_obj, subject, limit=-1):
    """Replace matches of the compiled pattern ``pce`` inside ``subject``.

    Text between matches is copied unchanged into a string builder, and
    ``replace_obj.next_replace`` is asked to emit the replacement for every
    match.  ``limit`` is decremented once per match and the loop stops when
    it reaches 0, so a negative value (the default) imposes no limit.

    Returns ``(w_result, n_replaced)`` where ``w_result`` comes from
    ``space.newstr`` and ``n_replaced`` is the number of matches handled.
    If ``pcre_exec`` reports anything other than a match or "no match",
    ``handle_exec_error`` is called and ``(None, -1)`` is returned.
    The raw subject copy and the offsets vector are always freed.
    """
    replace_obj.setup(interp, pce)
    space = interp.space
    rffi.setintfield(pce.extra, 'c_match_limit',
                     interp.regexp_backtrack_limit)
    rffi.setintfield(pce.extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    # Offsets vector: three ints for the whole match and for each capture.
    size_offsets = (pce.capturecount + 1) * 3

    subject_len = len(subject)
    builder = StringBuilder(subject_len)

    # Raw (non-GC) memory handed to pcre_exec; released in the finally.
    rawsubject = rffi.str2charp(subject)
    offsets = lltype.malloc(rffi.INTP.TO, size_offsets, flavor='raw')
    try:
        exoptions = 0
        g_notempty = 0
        start_offset = 0
        original_limit = limit
        interp.regexp_error_code = PREG_NO_ERROR
        while limit != 0:
            count = _pcre.pcre_exec(pce.re, pce.extra, rawsubject,
                                    subject_len, start_offset,
                                    exoptions | g_notempty,
                                    offsets, size_offsets)
            # After the first call the subject is known to be valid UTF-8,
            # so later calls can skip that check.
            exoptions |= _pcre.PCRE_NO_UTF8_CHECK

            if count == 0:
                # Matched, but the offsets vector was too small.
                interp.notice("Matched, but too many substrings")
                count = size_offsets // 3

            if count > 0:
                # Emit the untouched text preceding the match, then let
                # replace_obj write whatever replaces the match itself.
                match_start = rffi.cast(lltype.Signed, offsets[0])
                builder.append_slice(subject, start_offset, match_start)
                replace_obj.next_replace(builder, subject, count, offsets)
                limit -= 1
            elif count == _pcre.PCRE_ERROR_NOMATCH:
                if g_notempty == 0 or start_offset >= subject_len:
                    # Genuine end of matching: flush the tail and stop.
                    builder.append_slice(subject, start_offset, subject_len)
                    break
                # The failed attempt was only the PCRE_NOTEMPTY retry after
                # an empty match.  Copy one character and fake a match
                # spanning it so the bookkeeping below steps past it.
                next_offset = start_offset + pce.utf8size(subject,
                                                          start_offset)
                builder.append_slice(subject, start_offset, next_offset)
                offsets[0] = rffi.cast(rffi.INT, start_offset)
                offsets[1] = rffi.cast(rffi.INT, next_offset)
            else:
                handle_exec_error(interp, count)
                return None, -1

            # Mimic Perl's /g handling of empty matches: after one, retry at
            # the same position with PCRE_NOTEMPTY | PCRE_ANCHORED; if that
            # retry fails, the NOMATCH branch above advances one character.
            span_start = rffi.cast(lltype.Signed, offsets[0])
            span_stop = rffi.cast(lltype.Signed, offsets[1])
            if span_start == span_stop:
                g_notempty = _pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED
            else:
                g_notempty = 0

            # Continue right after the last (real or faked) match.
            start_offset = span_stop
        else:
            # limit reached 0: copy whatever is left of the subject.
            builder.append_slice(subject, start_offset, subject_len)
    finally:
        lltype.free(offsets, flavor='raw')
        rffi.free_charp(rawsubject)
    return space.newstr(builder.build()), original_limit - limit
def replace_impl(interp, pce, replace_obj, subject, limit=-1):
    """Perform a global regex replacement on ``subject``.

    ``replace_obj`` is prepared with ``setup(interp, pce)`` and afterwards
    receives one ``next_replace(builder, subject, count, offsets)`` call per
    match; everything outside the matches is copied through verbatim.

    ``limit`` caps the number of matches processed: it is decremented per
    match and processing stops at 0.  The default of -1 therefore means
    "no cap".

    Returns a pair ``(space.newstr(result), replacements_done)``.  When
    ``pcre_exec`` returns an error other than PCRE_ERROR_NOMATCH the error
    is passed to ``handle_exec_error`` and ``(None, -1)`` is returned.
    """
    replace_obj.setup(interp, pce)
    space = interp.space
    extra = pce.extra
    rffi.setintfield(extra, 'c_match_limit', interp.regexp_backtrack_limit)
    rffi.setintfield(extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    # One slot triple for the full match plus one per capture group.
    num_subpats = pce.capturecount + 1
    size_offsets = 3 * num_subpats

    total = len(subject)
    out = StringBuilder(total)

    # Raw allocations for the C call; both are freed in the finally block.
    rawsubject = rffi.str2charp(subject)
    offsets = lltype.malloc(rffi.INTP.TO, size_offsets, flavor='raw')
    try:
        exoptions = 0
        g_notempty = 0
        pos = 0
        original_limit = limit
        interp.regexp_error_code = PREG_NO_ERROR
        while True:
            if limit == 0:
                # Replacement budget used up: keep the rest untouched.
                out.append_slice(subject, pos, total)
                break

            count = _pcre.pcre_exec(pce.re, pce.extra, rawsubject, total,
                                    pos, exoptions | g_notempty,
                                    offsets, size_offsets)
            # The subject passed the UTF-8 check on the first call; do not
            # repeat it on later calls.
            exoptions |= _pcre.PCRE_NO_UTF8_CHECK

            if count == 0:
                # A match whose groups did not all fit into `offsets`.
                interp.notice("Matched, but too many substrings")
                count = size_offsets // 3

            if count == _pcre.PCRE_ERROR_NOMATCH:
                if g_notempty != 0 and pos < total:
                    # Only the non-empty retry after an empty match failed.
                    # Pass one character through and pretend it was matched
                    # so the code below moves the position forward.
                    step_to = pos + pce.utf8size(subject, pos)
                    out.append_slice(subject, pos, step_to)
                    offsets[0] = rffi.cast(rffi.INT, pos)
                    offsets[1] = rffi.cast(rffi.INT, step_to)
                else:
                    # No further matches: copy the tail and finish.
                    out.append_slice(subject, pos, total)
                    break
            elif count > 0:
                # Unmatched prefix first, then the replacement text.
                head_end = rffi.cast(lltype.Signed, offsets[0])
                out.append_slice(subject, pos, head_end)
                replace_obj.next_replace(out, subject, count, offsets)
                limit -= 1
            else:
                handle_exec_error(interp, count)
                return None, -1

            # Perl-style /g treatment of empty matches: retry once at the
            # same spot with PCRE_NOTEMPTY | PCRE_ANCHORED; a failure of
            # that retry is handled by the NOMATCH branch above.
            lo = rffi.cast(lltype.Signed, offsets[0])
            hi = rffi.cast(lltype.Signed, offsets[1])
            g_notempty = 0
            if hi == lo:
                g_notempty = _pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED

            # Resume immediately after the previous match.
            pos = hi
    finally:
        lltype.free(offsets, flavor='raw')
        rffi.free_charp(rawsubject)
    return space.newstr(out.build()), original_limit - limit
def match_impl(interp, pce, subject, w_matches, start_offset, mode, limit,
               flags):
    """Run the compiled pattern ``pce`` over ``subject`` and collect results.

    ``start_offset`` may be negative, in which case it counts from the end
    of ``subject`` (clamped at 0).  Matching repeats until ``limit`` matches
    have been counted, no further match is found, or ``pcre_exec`` fails.

    ``mode`` selects how results are gathered into the array stored in
    ``w_matches``:

    * MODE_MATCH: the groups of each match are added to one flat array.
    * MODE_PATTERN_ORDER: one array per group, padded with empty entries
      for groups missing from a match.
    * MODE_SET_ORDER: one sub-array per match.
    * MODE_SPLIT: the pieces of ``subject`` between matches, honouring
      PREG_SPLIT_NO_EMPTY and PREG_SPLIT_DELIM_CAPTURE in ``flags``.

    If ``w_matches`` is false the mode is forced to MODE_NO_SUBPAT and
    nothing is collected or stored.

    Returns ``space.newint(matched)`` while ``interp.regexp_error_code`` is
    still PREG_NO_ERROR, otherwise ``space.w_False``.
    """
    space = interp.space
    subject_len = len(subject)

    # A negative offset is relative to the end of the subject.
    if start_offset < 0:
        start_offset = max(subject_len + start_offset, 0)

    rffi.setintfield(pce.extra, 'c_match_limit',
                     interp.regexp_backtrack_limit)
    rffi.setintfield(pce.extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    # Offsets vector: three ints for the whole match and for each capture.
    num_subpats = pce.capturecount + 1
    size_offsets = num_subpats * 3

    # Pattern-order mode accumulates one result array per group.
    if mode != MODE_PATTERN_ORDER:
        match_sets = None
    else:
        match_sets = [space.new_array_from_list([])
                      for _ in range(num_subpats)]

    # Raw (non-GC) memory handed to pcre_exec; released in the finally.
    rawsubject = rffi.str2charp(subject)
    offsets = lltype.malloc(rffi.INTP.TO, size_offsets, flavor='raw')

    if w_matches:
        subpats = space.new_array_from_list([])
    else:
        # Nowhere to store results: only count the matches.
        mode = MODE_NO_SUBPAT
        subpats = None

    try:
        exoptions = 0
        g_notempty = 0
        matched = 0
        last_match = 0
        interp.regexp_error_code = PREG_NO_ERROR

        while matched != limit:
            count = _pcre.pcre_exec(pce.re, pce.extra, rawsubject,
                                    subject_len, start_offset,
                                    exoptions | g_notempty,
                                    offsets, size_offsets)
            # After the first call the subject is known to be valid UTF-8,
            # so later calls can skip that check.
            exoptions |= _pcre.PCRE_NO_UTF8_CHECK

            if count == 0:
                # Matched, but the offsets vector was too small.
                interp.notice("Matched, but too many substrings")
                count = size_offsets // 3

            if count > 0:
                matched += 1

                if mode == MODE_MATCH:
                    # Single match: every group goes into the flat array.
                    for i in range(count):
                        subpats = _add_result(space, subpats, subject,
                                              offsets, i, flags,
                                              pce.subpat_names[i])

                elif mode == MODE_PATTERN_ORDER:
                    # Global match, pattern order: group i goes to set i.
                    for i in range(count):
                        match_sets[i] = _add_result(space, match_sets[i],
                                                    subject, offsets, i,
                                                    flags)
                    # Groups beyond `count` were not reported for this
                    # match; pad their sets with an empty slice, added
                    # with flags=0.
                    for i in range(count, num_subpats):
                        match_sets[i] = _add_result_range(
                            space, match_sets[i], subject, 0, 0, flags=0)

                elif mode == MODE_SET_ORDER:
                    # Global match, set order: one sub-array per match.
                    sub1 = space.new_array_from_list([])
                    for i in range(count):
                        sub1 = _add_result(space, sub1, subject, offsets, i,
                                           flags, pce.subpat_names[i])
                    subpats = space.appenditem_maybe_inplace(subpats, sub1)

                elif mode == MODE_SPLIT:
                    next_head = rffi.cast(lltype.Signed, offsets[0])
                    no_empty = (flags & PREG_SPLIT_NO_EMPTY) != 0
                    if no_empty and next_head == last_match:
                        # An empty piece is dropped and must not count
                        # against the limit.
                        matched -= 1
                    else:
                        subpats = _add_result_range(space, subpats, subject,
                                                    last_match, next_head,
                                                    flags)
                    last_match = rffi.cast(lltype.Signed, offsets[1])

                    if flags & PREG_SPLIT_DELIM_CAPTURE:
                        # Also emit the captured groups of the delimiter.
                        for i in range(1, count):
                            start = rffi.cast(lltype.Signed,
                                              offsets[i << 1])
                            stop = rffi.cast(lltype.Signed,
                                             offsets[(i << 1) + 1])
                            if no_empty and start == stop:
                                continue
                            subpats = _add_result_range(space, subpats,
                                                        subject, start, stop,
                                                        flags)

            elif count == _pcre.PCRE_ERROR_NOMATCH:
                if g_notempty == 0 or start_offset >= subject_len:
                    # Genuine end of matching.
                    break
                # The failed attempt was only the PCRE_NOTEMPTY retry after
                # an empty match.  Step over one character and fake a match
                # spanning it so the bookkeeping below continues from there.
                offsets[0] = rffi.cast(rffi.INT, start_offset)
                start_offset += pce.utf8size(subject, start_offset)
                offsets[1] = rffi.cast(rffi.INT, start_offset)

            else:
                handle_exec_error(interp, count)
                break

            # Mimic Perl's /g handling of empty matches: after one, retry at
            # the same position with PCRE_NOTEMPTY | PCRE_ANCHORED; if that
            # retry fails, the NOMATCH branch above advances one character.
            span_start = rffi.cast(lltype.Signed, offsets[0])
            span_stop = rffi.cast(lltype.Signed, offsets[1])
            if span_start == span_stop:
                g_notempty = _pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED
            else:
                g_notempty = 0

            # Continue right after the last (real or faked) match.
            start_offset = span_stop
    finally:
        lltype.free(offsets, flavor='raw')
        rffi.free_charp(rawsubject)

    if subpats:
        if mode == MODE_PATTERN_ORDER:
            # Publish each per-group set, under its name (when the group
            # has one) and then under the next integer key.
            for i in range(num_subpats):
                match_set = match_sets[i]
                subpat_name = pce.subpat_names[i]
                if subpat_name is not None:
                    w_key = space.newstr(subpat_name)
                    subpats = space.packitem_maybe_inplace(subpats, w_key,
                                                           match_set)
                subpats = space.appenditem_maybe_inplace(subpats, match_set)
        elif mode == MODE_SPLIT:
            # The trailing piece starts at the end of the last real match;
            # start_offset may have moved further without another match.
            no_empty = (flags & PREG_SPLIT_NO_EMPTY) != 0
            if not no_empty or last_match < subject_len:
                subpats = _add_result_range(space, subpats, subject,
                                            last_match, subject_len, flags)
        w_matches.store(subpats, unique=True)

    # Report failure if an error code was recorded during matching.
    if interp.regexp_error_code != PREG_NO_ERROR:
        return space.w_False
    return space.newint(matched)
def match_impl(interp, pce, subject, w_matches, start_offset, mode, limit,
               flags):
    """Match ``pce`` against ``subject`` repeatedly and gather the results.

    The search starts at ``start_offset``; a negative value is taken
    relative to the end of ``subject`` and clamped to 0.  The loop ends
    when the match counter equals ``limit``, when PCRE reports no further
    match, or after ``handle_exec_error`` has been called for any other
    PCRE error.

    Result layout depends on ``mode`` (MODE_MATCH, MODE_PATTERN_ORDER,
    MODE_SET_ORDER or MODE_SPLIT).  With a false ``w_matches`` the mode
    becomes MODE_NO_SUBPAT and matches are merely counted; otherwise the
    collected array is written with ``w_matches.store(..., unique=True)``.

    Returns the match count wrapped by ``space.newint``, or
    ``space.w_False`` if ``interp.regexp_error_code`` is no longer
    PREG_NO_ERROR once matching has finished.
    """
    space = interp.space
    total = len(subject)

    # Negative offset counts from the end of the string.
    if start_offset < 0:
        start_offset += total
        if start_offset < 0:
            start_offset = 0

    extra = pce.extra
    rffi.setintfield(extra, 'c_match_limit', interp.regexp_backtrack_limit)
    rffi.setintfield(extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    # One slot triple for the full match plus one per capture group.
    num_subpats = pce.capturecount + 1
    size_offsets = 3 * num_subpats

    # Per-group accumulators, needed for pattern-order output only.
    match_sets = None
    if mode == MODE_PATTERN_ORDER:
        match_sets = [space.new_array_from_list([])
                      for _ in range(num_subpats)]

    # Raw allocations for the C call; both are freed in the finally block.
    rawsubject = rffi.str2charp(subject)
    offsets = lltype.malloc(rffi.INTP.TO, size_offsets, flavor='raw')

    subpats = None
    if w_matches:
        subpats = space.new_array_from_list([])
    else:
        # No output variable: results are not collected at all.
        mode = MODE_NO_SUBPAT

    try:
        exoptions = 0
        g_notempty = 0
        matched = 0
        last_match = 0
        interp.regexp_error_code = PREG_NO_ERROR

        while matched != limit:
            count = _pcre.pcre_exec(pce.re, pce.extra, rawsubject, total,
                                    start_offset, exoptions | g_notempty,
                                    offsets, size_offsets)
            # The subject passed the UTF-8 check on the first call; do not
            # repeat it on later calls.
            exoptions |= _pcre.PCRE_NO_UTF8_CHECK

            if count == 0:
                # A match whose groups did not all fit into `offsets`.
                interp.notice("Matched, but too many substrings")
                count = size_offsets // 3

            if count == _pcre.PCRE_ERROR_NOMATCH:
                if g_notempty != 0 and start_offset < total:
                    # Only the non-empty retry after an empty match failed.
                    # Advance by one character and record it as a fake
                    # match so the code below resumes from the new spot.
                    offsets[0] = rffi.cast(rffi.INT, start_offset)
                    start_offset += pce.utf8size(subject, start_offset)
                    offsets[1] = rffi.cast(rffi.INT, start_offset)
                else:
                    break

            elif count > 0:
                matched += 1

                if mode == MODE_MATCH:
                    # Flat list of the groups of this match.
                    for grp in range(count):
                        subpats = _add_result(space, subpats, subject,
                                              offsets, grp, flags,
                                              pce.subpat_names[grp])

                elif mode == MODE_PATTERN_ORDER:
                    # Append group `grp` of this match to accumulator `grp`.
                    for grp in range(count):
                        match_sets[grp] = _add_result(
                            space, match_sets[grp], subject, offsets, grp,
                            flags)
                    # Groups not reported for this match still get an
                    # entry: an empty slice, added with flags=0.
                    for grp in range(count, num_subpats):
                        match_sets[grp] = _add_result_range(
                            space, match_sets[grp], subject, 0, 0, flags=0)

                elif mode == MODE_SET_ORDER:
                    # Each match contributes its own array of groups.
                    w_set = space.new_array_from_list([])
                    for grp in range(count):
                        w_set = _add_result(space, w_set, subject, offsets,
                                            grp, flags,
                                            pce.subpat_names[grp])
                    subpats = space.appenditem_maybe_inplace(subpats, w_set)

                elif mode == MODE_SPLIT:
                    piece_end = rffi.cast(lltype.Signed, offsets[0])
                    skip_empty = (flags & PREG_SPLIT_NO_EMPTY) != 0
                    if skip_empty and piece_end == last_match:
                        # Dropped empty piece: undo the increment above so
                        # it does not use up the limit.
                        matched -= 1
                    else:
                        subpats = _add_result_range(space, subpats, subject,
                                                    last_match, piece_end,
                                                    flags)
                    last_match = rffi.cast(lltype.Signed, offsets[1])

                    if flags & PREG_SPLIT_DELIM_CAPTURE:
                        # Captured groups of the delimiter become pieces.
                        for grp in range(1, count):
                            lo = rffi.cast(lltype.Signed, offsets[2 * grp])
                            hi = rffi.cast(lltype.Signed,
                                           offsets[2 * grp + 1])
                            if skip_empty and lo == hi:
                                continue
                            subpats = _add_result_range(space, subpats,
                                                        subject, lo, hi,
                                                        flags)

            else:
                handle_exec_error(interp, count)
                break

            # Perl-style /g treatment of empty matches: retry once at the
            # same spot with PCRE_NOTEMPTY | PCRE_ANCHORED; a failure of
            # that retry is handled by the NOMATCH branch above.
            first = rffi.cast(lltype.Signed, offsets[0])
            after = rffi.cast(lltype.Signed, offsets[1])
            g_notempty = 0
            if after == first:
                g_notempty = _pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED

            # Resume immediately after the previous match.
            start_offset = after
    finally:
        lltype.free(offsets, flavor='raw')
        rffi.free_charp(rawsubject)

    if subpats:
        if mode == MODE_PATTERN_ORDER:
            # Move the accumulators into the output array: by name when the
            # group is named, and always under the next integer key.
            for grp in range(num_subpats):
                w_set = match_sets[grp]
                name = pce.subpat_names[grp]
                if name is not None:
                    subpats = space.packitem_maybe_inplace(
                        subpats, space.newstr(name), w_set)
                subpats = space.appenditem_maybe_inplace(subpats, w_set)
        elif mode == MODE_SPLIT:
            # The final piece begins where the last real match ended, even
            # if start_offset was advanced afterwards without a match.
            skip_empty = (flags & PREG_SPLIT_NO_EMPTY) != 0
            if not (skip_empty and last_match >= total):
                subpats = _add_result_range(space, subpats, subject,
                                            last_match, total, flags)
        w_matches.store(subpats, unique=True)

    # Did we encounter an error?
    if interp.regexp_error_code == PREG_NO_ERROR:
        w_result = space.newint(matched)
    else:
        w_result = space.w_False
    return w_result