def test_multi_group(self):
    """
    Checks trim_empty_matches against a regex with several capturing
    groups, one of them named (``cd``).

    The matches are filtered with the default arguments, with a pair of
    numeric groups, with a single numeric group and with the named group.
    """
    to_comparable = TrimEmptyMatchesTest.comparable_map

    haystack = ("A1B2C3D no match.'~ Awhat doByouCthink??D ABisCD ABCD"
                "AneverBCmindD __ ABCXD ABC")
    pattern = "A(.*?)B(.*?)C(?P<cd>.*?)D"

    # Materialize the matches once so every assertion below can reuse
    # them; an iterator would be exhausted after the first use.
    found = tuple(re.finditer(pattern, haystack))

    every_match = ("A1B2C3D",
                   "Awhat doByouCthink??D",
                   "ABisCD",
                   "ABCD",
                   "AneverBCmindD",
                   "ABCXD")

    # Check again if our regex works.
    self.assertEqual(to_comparable(found), every_match)

    # Each entry: (extra positional arguments, expected remaining matches).
    # An empty argument tuple exercises the default group selection.
    cases = (
        ((), every_match),
        (((1, 3),),
         ("A1B2C3D", "Awhat doByouCthink??D", "AneverBCmindD", "ABCXD")),
        (((2,),),
         ("A1B2C3D", "Awhat doByouCthink??D", "ABisCD")),
        ((("cd",),),
         ("A1B2C3D", "Awhat doByouCthink??D", "AneverBCmindD", "ABCXD")),
    )
    for extra_args, expected in cases:
        trimmed = trim_empty_matches(found, *extra_args)
        self.assertEqual(to_comparable(trimmed), expected)
def test_single_group(self):
    """
    Checks trim_empty_matches against a regex with exactly one capturing
    group.

    Covers the default arguments, filtering on group 1 only and filtering
    on groups 0 and 1 together (in both orders).
    """
    to_comparable = TrimEmptyMatchesTest.comparable_map

    haystack = "AHelloB s A B ABAB do what you want."
    pattern = "A(.*?)B"

    # Using the iterator itself would require invoking re.finditer again
    # and again before each assert, so the matches are stored in a tuple.
    found = tuple(re.finditer(pattern, haystack))

    every_match = ("AHelloB", "A B", "AB", "AB")

    # Ensure our regex is working as expected.
    self.assertEqual(to_comparable(found), every_match)

    # Each entry: (extra positional arguments, expected remaining matches).
    # The first entry uses the default mode, which checks group 0.
    cases = (
        ((), every_match),
        (((1,),), ("AHelloB", "A B")),
        (((0, 1),), every_match),
        (((1, 0),), every_match),
    )
    for extra_args, expected in cases:
        trimmed = trim_empty_matches(found, *extra_args)
        self.assertEqual(to_comparable(trimmed), expected)
def test_multi_group(self):
    """
    Checks trim_empty_matches against a regex with several capturing
    groups, one of them named (``cd``).

    The matches are filtered with the default arguments, with a pair of
    numeric groups, with a single numeric group and with the named group.
    """
    to_comparable = TrimEmptyMatchesTest.comparable_map

    haystack = ("A1B2C3D no match.'~ Awhat doByouCthink??D ABisCD ABCD"
                "AneverBCmindD __ ABCXD ABC")
    pattern = "A(.*?)B(.*?)C(?P<cd>.*?)D"

    # Materialize the matches once so every assertion below can reuse
    # them; an iterator would be exhausted after the first use.
    found = tuple(re.finditer(pattern, haystack))

    every_match = ("A1B2C3D",
                   "Awhat doByouCthink??D",
                   "ABisCD",
                   "ABCD",
                   "AneverBCmindD",
                   "ABCXD")

    # Check again if our regex works.
    self.assertEqual(to_comparable(found), every_match)

    # Each entry: (extra positional arguments, expected remaining matches).
    # An empty argument tuple exercises the default group selection.
    cases = (
        ((), every_match),
        (((1, 3),),
         ("A1B2C3D", "Awhat doByouCthink??D", "AneverBCmindD", "ABCXD")),
        (((2,),),
         ("A1B2C3D", "Awhat doByouCthink??D", "ABisCD")),
        ((("cd",),),
         ("A1B2C3D", "Awhat doByouCthink??D", "AneverBCmindD", "ABCXD")),
    )
    for extra_args, expected in cases:
        trimmed = trim_empty_matches(found, *extra_args)
        self.assertEqual(to_comparable(trimmed), expected)
def test_single_group(self):
    """
    Checks trim_empty_matches against a regex with exactly one capturing
    group.

    Covers the default arguments, filtering on group 1 only and filtering
    on groups 0 and 1 together (in both orders).
    """
    to_comparable = TrimEmptyMatchesTest.comparable_map

    haystack = "AHelloB s A B ABAB do what you want."
    pattern = "A(.*?)B"

    # Using the iterator itself would require invoking re.finditer again
    # and again before each assert, so the matches are stored in a tuple.
    found = tuple(re.finditer(pattern, haystack))

    every_match = ("AHelloB", "A B", "AB", "AB")

    # Ensure our regex is working as expected.
    self.assertEqual(to_comparable(found), every_match)

    # Each entry: (extra positional arguments, expected remaining matches).
    # The first entry uses the default mode, which checks group 0.
    cases = (
        ((), every_match),
        (((1,),), ("AHelloB", "A B")),
        (((0, 1),), every_match),
        (((1, 0),), every_match),
    )
    for extra_args, expected in cases:
        trimmed = trim_empty_matches(found, *extra_args)
        self.assertEqual(to_comparable(trimmed), expected)
def search_in_between(begin,
                      end,
                      string,
                      max_matches=0,
                      remove_empty_matches=False,
                      use_regex=False):
    """
    Finds the parts of a string that are enclosed between a begin- and an
    end-sequence. Enclosed newlines become part of the result, since the
    search runs with ``re.DOTALL``. Escape sequences are not handled.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end
                                 matching.
    :param string:               The string where to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Specifies whether to treat the begin and
                                 end patterns as regexes or simple strings.
    :return:                     An iterator returning InBetweenMatch
                                 objects that hold information about the
                                 matched begin, inside and end string.
    """
    if use_regex:
        # The begin pattern may contain capturing groups of its own. Their
        # count is needed to locate the groups that follow it.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped plain strings cannot contain capturing groups.
        begin = re.escape(begin)
        end = re.escape(end)
        begin_pattern_groups = 0

    # Group layout of the assembled regex:
    #   1                         the begin sequence
    #   begin_pattern_groups + 2  the lazily matched inside part
    #   begin_pattern_groups + 3  the end sequence
    inside_group = begin_pattern_groups + 2
    end_group = begin_pattern_groups + 3

    # Because the inside part is lazy (matches as few characters as
    # possible), the next occurring end sequence is the one that is matched.
    regex = "".join(("(", begin, ")(.*?)(", end, ")"))

    found = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group,))

    for match in limit(found, max_matches):
        yield InBetweenMatch.from_values(
            match.group(1),
            match.start(1),
            match.group(inside_group),
            match.start(inside_group),
            match.group(end_group),
            match.start(end_group))
def unescaped_search_in_between(begin,
                                end,
                                string,
                                max_matches=0,
                                remove_empty_matches=False,
                                use_regex=False):
    """
    Finds the parts of a string that are enclosed between a begin- and an
    end-sequence. Enclosed newlines become part of the result, since the
    search runs with ``re.DOTALL``. Escaped begin- and end-sequences are
    skipped, so only unescaped patterns delimit a match.

    .. warning::

        Using the backslash escape character in the begin- or
        end-sequences can make the function return strange results. The
        backslash can interfere with the escaping regex-sequence used
        internally to match the enclosed string.

    :param begin:                A regex pattern that defines where to start
                                 matching.
    :param end:                  A regex pattern that defines where to end
                                 matching.
    :param string:               The string where to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Specifies whether to treat the begin and
                                 end patterns as regexes or simple strings.
    :return:                     An iterator returning the objects built by
                                 ``InBetweenMatch.from_values`` from the
                                 matched begin, inside and end parts.
    """
    if use_regex:
        # The begin pattern may contain capturing groups of its own. Their
        # count is needed to locate the groups that follow it.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped plain strings cannot contain capturing groups.
        begin = re.escape(begin)
        end = re.escape(end)
        begin_pattern_groups = 0

    # Group layout of the assembled regex:
    #   1                         the begin sequence
    #   begin_pattern_groups + 2  the lazily matched inside part
    #   begin_pattern_groups + 3  backslash pairs right before the end
    #   begin_pattern_groups + 4  the end sequence
    inside_group = begin_pattern_groups + 2
    escapes_group = begin_pattern_groups + 3
    end_group = begin_pattern_groups + 4

    # Pieces of the regex:
    #   (?<!\\)(?:\\\\)*    Look-behind that rejects a single preceding
    #                       backslash, followed by any number of backslash
    #                       pairs. An even number of backslashes means the
    #                       following sequence is not escaped.
    #   (begin)             The begin sequence.
    #   (.*?)               The inside part, matched as lazily as possible.
    #   (?<!\\)((?:\\\\)*)  The same unescaping construct again, but this
    #                       time the backslash pairs are captured.
    #   (end)               The end sequence. Since the inside part is lazy,
    #                       the next occurring end sequence is matched.
    regex = "".join((r"(?<!\\)(?:\\\\)*(",
                     begin,
                     r")(.*?)(?<!\\)((?:\\\\)*)(",
                     end,
                     ")"))

    found = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group, escapes_group))

    for match in limit(found, max_matches):
        # The captured backslash pairs directly follow the inside part and
        # are reported together with it.
        inside = match.group(inside_group) + match.group(escapes_group)
        yield InBetweenMatch.from_values(
            match.group(1),
            match.start(1),
            inside,
            match.start(inside_group),
            match.group(end_group),
            match.start(end_group))
def search_in_between(begin,
                      end,
                      string,
                      max_matches=0,
                      remove_empty_matches=False,
                      use_regex=False):
    """
    Finds the parts of a string that are enclosed between a begin- and an
    end-sequence. Enclosed newlines become part of the result, since the
    search runs with ``re.DOTALL``. Escape sequences are not handled.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end
                                 matching.
    :param string:               The string where to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Specifies whether to treat the begin and
                                 end patterns as regexes or simple strings.
    :return:                     An iterator returning InBetweenMatch
                                 objects that hold information about the
                                 matched begin, inside and end string.
    """
    if use_regex:
        # The begin pattern may contain capturing groups of its own. Their
        # count is needed to locate the groups that follow it.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped plain strings cannot contain capturing groups.
        begin = re.escape(begin)
        end = re.escape(end)
        begin_pattern_groups = 0

    # Group layout of the assembled regex:
    #   1                         the begin sequence
    #   begin_pattern_groups + 2  the lazily matched inside part
    #   begin_pattern_groups + 3  the end sequence
    inside_group = begin_pattern_groups + 2
    end_group = begin_pattern_groups + 3

    # Because the inside part is lazy (matches as few characters as
    # possible), the next occurring end sequence is the one that is matched.
    regex = "".join(("(", begin, ")(.*?)(", end, ")"))

    found = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group,))

    for match in limit(found, max_matches):
        yield InBetweenMatch.from_values(
            match.group(1),
            match.start(1),
            match.group(inside_group),
            match.start(inside_group),
            match.group(end_group),
            match.start(end_group))
def unescaped_search_in_between(begin,
                                end,
                                string,
                                max_matches=0,
                                remove_empty_matches=False,
                                use_regex=False):
    """
    Finds the parts of a string that are enclosed between a begin- and an
    end-sequence. Enclosed newlines become part of the result, since the
    search runs with ``re.DOTALL``. Escaped begin- and end-sequences are
    skipped, so only unescaped patterns delimit a match.

    .. warning::

        Using the backslash escape character in the begin- or
        end-sequences can make the function return strange results. The
        backslash can interfere with the escaping regex-sequence used
        internally to match the enclosed string.

    :param begin:                A regex pattern that defines where to start
                                 matching.
    :param end:                  A regex pattern that defines where to end
                                 matching.
    :param string:               The string where to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Specifies whether to treat the begin and
                                 end patterns as regexes or simple strings.
    :return:                     An iterator returning the objects built by
                                 ``InBetweenMatch.from_values`` from the
                                 matched begin, inside and end parts.
    """
    if use_regex:
        # The begin pattern may contain capturing groups of its own. Their
        # count is needed to locate the groups that follow it.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped plain strings cannot contain capturing groups.
        begin = re.escape(begin)
        end = re.escape(end)
        begin_pattern_groups = 0

    # Group layout of the assembled regex:
    #   1                         the begin sequence
    #   begin_pattern_groups + 2  the lazily matched inside part
    #   begin_pattern_groups + 3  backslash pairs right before the end
    #   begin_pattern_groups + 4  the end sequence
    inside_group = begin_pattern_groups + 2
    escapes_group = begin_pattern_groups + 3
    end_group = begin_pattern_groups + 4

    # Pieces of the regex:
    #   (?<!\\)(?:\\\\)*    Look-behind that rejects a single preceding
    #                       backslash, followed by any number of backslash
    #                       pairs. An even number of backslashes means the
    #                       following sequence is not escaped.
    #   (begin)             The begin sequence.
    #   (.*?)               The inside part, matched as lazily as possible.
    #   (?<!\\)((?:\\\\)*)  The same unescaping construct again, but this
    #                       time the backslash pairs are captured.
    #   (end)               The end sequence. Since the inside part is lazy,
    #                       the next occurring end sequence is matched.
    regex = "".join((r"(?<!\\)(?:\\\\)*(",
                     begin,
                     r")(.*?)(?<!\\)((?:\\\\)*)(",
                     end,
                     ")"))

    found = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group, escapes_group))

    for match in limit(found, max_matches):
        # The captured backslash pairs directly follow the inside part and
        # are reported together with it.
        inside = match.group(inside_group) + match.group(escapes_group)
        yield InBetweenMatch.from_values(
            match.group(1),
            match.start(1),
            inside,
            match.start(inside_group),
            match.group(end_group),
            match.start(end_group))