def test_string_boundaries(self):
    # See http://bugs.python.org/issue10713
    self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), "abc")
    # There's a word boundary at the start of a string.
    self.assertTrue(re.match(r"\b", "abc"))
    # A non-empty string includes a non-boundary zero-length match.
    self.assertTrue(re.search(r"\B", "abc"))
    # There is no non-boundary match at the start of a string.
    self.assertFalse(re.match(r"\B", "abc"))
    # However, an empty string contains no word boundaries, and also no
    # non-boundaries.
    self.assertEqual(re.search(r"\B", ""), None)
    # This one is questionable and different from the perlre behaviour,
    # but describes current behavior.
    self.assertEqual(re.search(r"\b", ""), None)
    # A single word-character string has two boundaries, but no
    # non-boundary gaps.
    self.assertEqual(len(re.findall(r"\b", "a")), 2)
    self.assertEqual(len(re.findall(r"\B", "a")), 0)
    # If there are no words, there are no boundaries
    self.assertEqual(len(re.findall(r"\b", " ")), 0)
    # BUG FIX: this line duplicated the one above; cover a multi-space
    # string instead (matching CPython's test_re).
    self.assertEqual(len(re.findall(r"\b", "   ")), 0)
    # Can match around the whitespace.
    self.assertEqual(len(re.findall(r"\B", " ")), 2)
def test_search_star_plus(self):
    # Spans reported for '*' and '+' quantifiers by search() and match().
    # 'x*' matches the empty string immediately; 'x+' needs at least one 'x'.
    for pattern, text, span in [
        ('x*', 'axx', (0, 0)),
        ('x+', 'axx', (1, 3)),
    ]:
        found = re.search(pattern, text)
        self.assertEqual(found.span(0), span)
        self.assertEqual(found.span(), span)
    self.assertEqual(re.search('x', 'aaa'), None)
    # match() anchors at the start of the subject.
    for pattern, text, span in [
        ('a*', 'xxx', (0, 0)),
        ('x*', 'xxxa', (0, 3)),
    ]:
        anchored = re.match(pattern, text)
        self.assertEqual(anchored.span(0), span)
        self.assertEqual(anchored.span(), span)
    self.assertEqual(re.match('a+', 'xxx'), None)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    haystack = 'a' * size
    hit = re.search('$', haystack)
    self.assertIsNotNone(hit)
    # '$' matches the empty string at the very end of the subject,
    # so both indices must equal the full (possibly > 2**31) length.
    self.assertEqual(hit.start(), size)
    self.assertEqual(hit.end(), size)
def has_disallowed_pairs(self):
    """Return True when the first removed derivational prefix matches one
    of the disallowed prefix/suffix pair patterns, recording the match in
    self.removed['derivational_suffix'].

    Only runs when both a derivational prefix and suffix were removed.
    """
    # alpha = self.alpha
    patterns = ["^be[^r]i$", "^(k|s)e(i|kan)$", "^(di|me|te)[^krwylp]an$"]
    if self.removed['derivational_prefix'] != '' and self.removed[
            'derivational_suffix'] != '':
        prefix = self.removed['derivational_prefix'][0]
        for pattern in patterns:
            # BUG FIX: the original ran pcre.search twice per matching
            # pattern; search once and reuse the match object.
            match = pcre.search(pattern, prefix)
            if match:
                self.removed['derivational_suffix'] = match
                return True
    return False
def test_bug_418626(self):
    # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
    # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
    # pattern '*?' on a long string.
    cases = [
        ('.*?c', 10000 * 'ab' + 'cd', 20001),
        ('.*?cd', 5000 * 'ab' + 'c' + 5000 * 'ab' + 'cde', 20003),
        ('.*?cd', 20000 * 'abc' + 'de', 60001),
    ]
    for pattern, subject, expected_end in cases:
        self.assertEqual(re.match(pattern, subject).end(0), expected_end)
    # non-simple '*?' still used to hit the recursion limit, before the
    # non-recursive scheme was implemented.
    self.assertEqual(re.search('(a|b)*?c', 10000 * 'ab' + 'cd').end(0),
                     20001)
def delete_derivational_suffix(self, word):
    """Strip a derivational suffix (-i, -kan or -an) from *word*.

    Records the removed suffix in self.removed['derivational_suffix'],
    then returns the dictionary lookup result when the stripped stem is
    found, otherwise the stripped stem itself.
    """
    suffix_pattern = "(i|k?an)$"
    stem = word
    hit = pcre.search(suffix_pattern, stem)
    if hit:
        stem = pcre.sub(suffix_pattern, '', stem)
        self.removed['derivational_suffix'] = hit.group(0)
    found = self.lookup(stem)
    return found if found else stem
def check_rule_precedence(self, word):
    """Return True when *word* matches a precedence pattern, i.e. its
    suffix rule must be applied before the prefix rule.

    Patterns use PCRE named-group syntax ``(?<word>...)``; the literal
    stem 'ngalam' is explicitly exempted.
    """
    # alpha = self.alpha
    stem_class = self.alpha
    rules = (
        "^be(?<word>{})([^k]an|lah|kah)$".format(stem_class),
        "^(me|di|pe|te)(?<word>{})(i)$".format(stem_class),
        "^(k|s)e(?<word>{})(i|kan)$".format(stem_class),
        "^([pm]e[nm]|di[tmp])(?<word>ah|ak|er|el)an$",
    )
    for rule in rules:
        hit = pcre.search(rule, word)
        if hit and hit.group('word') != 'ngalam':
            return True
    return False
def delete_inflectional_suffix(self, word):
    """Strip inflectional suffixes (particles -lah/-kah/-tah/-pun and
    possessive pronouns -ku/-mu/-nya) from *word*.

    Each removal is recorded in self.removed under its category key.
    Returns the dictionary lookup result when the stripped stem is
    found, otherwise the stripped stem.
    """
    stem = word
    suffix_rules = {
        'particle': "([klt]ah|pun)$",
        'possessive_pronoun': "([km]u|nya)$"
    }
    # Particles are stripped before possessive pronouns (dict order).
    for category, rule in suffix_rules.items():
        hit = pcre.search(rule, stem)
        if hit:
            stem = pcre.sub(rule, '', stem)
            self.removed[category] = hit.group(0)
    found = self.lookup(stem)
    return found if found else stem
def test_search_coverage(self):
    # Exercise group(1) / group(0) on simple searches.
    # BUG FIX: patterns were non-raw strings; '\s' is an invalid escape
    # sequence (SyntaxWarning, and an error in future Pythons) -- use
    # raw string literals, which denote the identical pattern text.
    self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
    self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
def test_not_literal(self):
    # Negated character classes ([^a]) in a search.
    # BUG FIX: patterns were non-raw strings; '\s' is an invalid escape
    # sequence (SyntaxWarning, and an error in future Pythons) -- use
    # raw string literals, which denote the identical pattern text.
    self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
    self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
def test_special_escapes(self):
    # Word-boundary escapes \b / \B picking out the group next to 'x'.
    self.assertEqual(re.search(r"\b(b.)\b", "abcd abc bcd bx").group(1),
                     "bx")
    self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd").group(1),
                     "bx")
    # Same patterns under LOCALE and UNICODE flags.
    # NOTE(review): str patterns combined with re.LOCALE are rejected in
    # Python 3 -- this looks like Python 2 era code; confirm the target
    # interpreter before modernizing.
    self.assertEqual(re.search(r"\b(b.)\b", "abcd abc bcd bx",
                               re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd",
                               re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b", "abcd abc bcd bx",
                               re.UNICODE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd",
                               re.UNICODE).group(1), "bx")
    # Anchors: ^/$ with MULTILINE, and \A/\Z which ignore MULTILINE.
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
    # Repeat the boundary/anchor checks on unicode subjects (u'' literals
    # are significant on Python 2, where str and unicode differ).
    self.assertEqual(re.search(r"\b(b.)\b", u"abcd abc bcd bx").group(1),
                     "bx")
    self.assertEqual(re.search(r"\B(b.)\B", u"abc bcd bc abxd").group(1),
                     "bx")
    self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0),
                     "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0),
                     "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None)
    # Character-class escapes \d \D \w \W \s \S, plain and with flags.
    self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a").group(0),
                     "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a",
                               re.LOCALE).group(0), "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a",
                               re.UNICODE).group(0), "1aa! a")
def lookup(self, word):
    """Look *word* up in the dictionary table.

    Builds a LIKE query that also covers reduplicated forms
    ('main-main' -> 'main'), possible undashed compounds, and a '-k'
    variant when a 'kan' suffix was removed.

    Returns False for words shorter than 3 characters, the first
    matching row (also cached on self.found) on a hit, or None when
    nothing matched.  Increments self.total_lookup on every attempt.
    """
    if len(word) < 3:
        return False
    check = word
    check2 = ''
    query_string = ''
    # check repeating words like main-main
    match = pcre.search("^([a-z]+)-([a-z]+)$", check)
    if match:
        if match.group(1) == match.group(2):
            check = match.group(1)
            check2 = word
    if len(word) <= 6:
        query_string = "'{}'".format(check)
    else:
        # Longer words may be undashed compounds: try to split them into
        # two syllable groups and search for 'first second' as well.
        syllabel = "([bcdfghjklmnpqrstvwxyz]|sy)?([aiueo])(?U)([bcdfghjklmnpqrstvwxyz]|ng)?"
        reg = "^(?<first>aneka|({}{}))(?<second>{}{}(?U)({})*)$".format(
            syllabel, syllabel, syllabel, syllabel, syllabel)
        # notsure if true
        match = pcre.search(reg, word)
        if match:
            query_string = "'" + match.group('first') + ' ' + match.group(
                'second') + "' OR lemma LIKE '{}'".format(check)
        else:
            query_string = "'{}'".format(check)
    if check2 != '':
        query_string += " OR lemma LIKE '{}'".format(check2)
    # A removed 'kan' suffix may have swallowed a stem-final 'k'.
    if pcre.search(
            '[aiueo]$', word
    ) and self.removed['derivational_suffix'] == 'kan' and len(word) > 3:
        query_string += " OR lemma LIKE '{}k' ORDER BY pos DESC".format(
            check)
    query = self.database.cursor()
    # NOTE(review): the SQL is assembled by string formatting rather than
    # bound parameters -- acceptable only while inputs come from trusted
    # word lists; do not feed untrusted input through here.
    txt = "SELECT * FROM dictionary WHERE lemma LIKE {} LIMIT 1".format(
        query_string)
    try:
        query.execute(txt)
    except Exception:  # was a bare except; keep best-effort, narrow catch
        print('error happened')
    self.total_lookup += 1
    try:
        row = query.fetchall()
    except Exception:  # was a bare except
        row = ""
        print('empty data')
    if row:
        self.found = row[0]
        return self.found
    # Implicitly returns None when nothing was found (callers truth-test).
def delete_derivational_prefix(self, word):
    # Strip one derivational prefix from *word*.  Plain prefixes
    # (di-, ke-, se-) are removed directly; complex ones (be-, te-, me-,
    # pe-) are handled by numbered morphophonemic rules that may also
    # queue recoding candidates in self.recoding_tracker.
    # Returns the (possibly) stripped stem; returns the original word
    # when a complex prefix was detected but no rule fired.
    vowel = self.vowel
    consonant = self.consonant
    alpha = self.alpha  # NOTE(review): currently unused in this method
    result = word
    prefix_type = ""
    prefix = ""
    patterns = {'plain': "^(di|(k|s)e)", 'complex': "^(b|m|p|t)e"}
    # Words shorter than four letters cannot carry a prefix.
    if len(result) < 4:
        return result
    for key, pattern in patterns.items():
        match = re.match(pattern, result)
        if match:
            prefix_type = (key == 'plain')
            prefix = match[0]
            # Never strip a prefix that was already removed earlier.
            if self.removed[
                    'derivational_prefix'] != '' and prefix in self.removed[
                        'derivational_prefix']:
                return result
            self.recoding_tracker[match[0]] = ''
            if prefix_type:
                array = self.removed['derivational_prefix']
                # 'ke-' is not stripped after 'di-' unless the stem looks
                # like tawa/tahu.  NOTE(review): the array[0] != 'be' test
                # is redundant once array[0] == 'di' holds -- confirm.
                if prefix == 'ke' and array != '' and (
                        array[0] == 'di'
                        and not (pcre.search('(tawa|tahu)', result))
                        and array[0] != 'be'):
                    return result
                result = pcre.sub(pattern, '', result)
                self.complex_prefix_tracker[prefix] = {prefix: ''}
            else:
                modification = ""
                # 'be-' prefix rules
                # total rule = 5
                if prefix == 'be':
                    if self.removed['derivational_prefix'] != '':
                        array_key = list(
                            self.complex_prefix_tracker.keys())[
                                0]  # get first dict value
                        array = self.complex_prefix_tracker[array_key]
                        added_key = list(array.keys())[0]
                        added = array[added_key]
                        pp = added_key
                        # 'be-' may only follow these earlier prefixes.
                        if pp not in ['mem', 'pem', 'di', 'ke']:
                            return result
                    # rule 1
                    if pcre.search("^ber{}".format(vowel), result):
                        result = pcre.sub("^ber", '', result)
                        modification = {"ber": ''}
                        self.recoding_tracker[prefix] = {'be': ''}
                    # rule 2
                    elif pcre.search(
                            "^ber[bcdfghjklmnpqstvwxyz][a-z](?!er)",
                            result):
                        result = pcre.sub("^ber", '', result)
                        modification = {'ber': ""}
                    # rule 3
                    elif pcre.search(
                            "^ber[bcdfghjklmnpqstvwxyz][a-z]er{}".format(
                                vowel), result):
                        result = pcre.sub("^ber", '', result)
                        modification = {'ber': ""}
                    # rule 4
                    elif pcre.search("^belajar$", result):
                        result = pcre.sub("^bel", '', result)
                        modification = {'bel': ""}
                    # rule 5
                    elif pcre.search(
                            "^be[bcdfghjkmnpqstvwxyz]er{}".format(
                                consonant), result):
                        result = pcre.sub("^be", '', result)
                        modification = {'be': ""}
                    # unsuccessful
                    else:
                        del self.recoding_tracker[prefix]
                        return word
                # te- prefix rules
                # total rule : 5
                elif prefix == 'te':
                    if self.removed['derivational_prefix'] != '':
                        array_key = list(
                            self.complex_prefix_tracker.keys())[
                                0]  # get first dict value
                        array = self.complex_prefix_tracker[array_key]
                        added_key = list(array.keys())[0]
                        added = array[added_key]
                        pp = added_key
                        if pp != 'ke' and pp in [
                                'me', 'men', 'pen'
                        ] and not (pcre.search('tawa', result)):
                            return result
                    # rule 6
                    if pcre.search("^ter{}".format(vowel), result):
                        result = pcre.sub('^ter', '', result)
                        modification = {'ter': ''}
                        self.recoding_tracker[prefix] = {'te': ''}
                    # rule 7
                    elif pcre.search(
                            "^ter[bcdfghjklmnpqstvwxyz]er{}".format(vowel),
                            result):
                        result = pcre.sub('^ter', '', result)
                        modification = {'ter': ''}
                    # rule 8
                    elif pcre.search("^ter{}(?!er)".format(consonant),
                                     result):
                        result = pcre.sub('^ter', '', result)
                        modification = {'ter': ''}
                    # rule 9
                    elif pcre.search(
                            "^te[bcdfghjklmnpqstvwxyz]er{}".format(
                                consonant), result):
                        result = pcre.sub('^te', '', result)
                        modification = {'te': ''}
                    # rule 10
                    elif pcre.search(
                            "^ter[bcdfghjklmnpqstvwxyz]er{}".format(
                                consonant), result):
                        result = pcre.sub('^ter', '', result)
                        modification = {'ter': ''}
                    # unsuccessful
                    else:
                        del self.recoding_tracker[prefix]
                        return word
                # me- prefix rules
                # total rule = 10
                elif prefix == 'me':
                    # 'me-' is only stripped as the outermost prefix.
                    if self.removed['derivational_prefix'] != '':
                        return result
                    # rule 11
                    if pcre.search('^me[lrwy]{}'.format(vowel), result):
                        result = pcre.sub('^me', '', result)
                        modification = {'me': ''}
                    # rule 12
                    elif pcre.search('^mem[bfv]', result):
                        result = pcre.sub('^mem', '', result)
                        modification = {'mem': ''}
                    # rule 13
                    elif pcre.search('^mempe', result):
                        result = pcre.sub('^mem', '', result)
                        modification = {'mem': ''}
                    # rule 14
                    elif pcre.search("^mem(r?)[aiueo]", result):
                        match = pcre.search("^mem(r?)[aiueo]", result)
                        result = pcre.sub('^me', '', result)
                        modification = {'me{}'.format(match.group(1)): ''}
                        self.recoding_tracker[prefix] = {'mem': 'p'}
                    # rule 15
                    elif pcre.search('^men[cdsjz]', result):
                        result = pcre.sub('^men', '', result)
                        modification = {'men': ''}
                    # rule 16
                    elif pcre.search('^men{}'.format(vowel), result):
                        result = pcre.sub('^men', 't', result)
                        modification = {'men': 't'}
                        self.recoding_tracker[prefix] = {'me': ''}
                    # rule 17
                    elif pcre.search('^meng[ghqk]', result):
                        result = pcre.sub('^meng', '', result)
                        modification = {'meng': ''}
                    # rule 18
                    elif pcre.search('^meng({})'.format(vowel), result):
                        match = pcre.search('^meng({})'.format(vowel),
                                            result)
                        result = pcre.sub('^meng', '', result)
                        modification = {'meng': ''}
                        self.recoding_tracker[prefix] = {'meng1': 'k'}
                        self.recoding_tracker[prefix]['menge'] = ''
                    # rule 19
                    elif pcre.search('^meny{}'.format(vowel), result):
                        result = pcre.sub('^me', '', result)
                        modification = {'me': ''}
                        self.recoding_tracker[prefix] = {'meny': 's'}
                    # rule 20
                    elif pcre.search('^memp[abcdfghijklmnopqrstuvwxyz]',
                                     result):
                        result = pcre.sub('^mem', '', result)
                        modification = {'mem': ''}
                    # unsuccesful
                    else:
                        del self.recoding_tracker[prefix]
                        return word
                # pe- prefix rules
                # total rule = 15
                elif prefix == 'pe':
                    if self.removed['derivational_prefix'] != '':
                        array_key = list(
                            self.complex_prefix_tracker.keys())[
                                0]  # get first dict value
                        array = self.complex_prefix_tracker[array_key]
                        added_key = list(array.keys())[0]
                        added = array[added_key]
                        pp = added_key
                        if pp not in ['di', 'ber', 'mem', 'se', 'ke']:
                            return result
                    # rule 21
                    if pcre.search('^pe[wy]{}'.format(vowel), result):
                        result = pcre.sub('^pe', '', result)
                        modification = {'pe': ''}
                    # rule 22
                    elif pcre.search('^per{}'.format(vowel), result):
                        result = pcre.sub('^per', '', result)
                        modification = {'per': ''}
                        self.recoding_tracker[prefix] = {'pe': ''}
                    # rule 23
                    elif pcre.search(
                            '^per[bcdfghjklmnpqstvwxyz][a-z](?!er)',
                            result):
                        result = pcre.sub('^per', '', result)
                        modification = {'per': ''}
                    # rule 24
                    elif pcre.search(
                            '^per[bcdfghjklmnpqstvwxyz][a-z]er{}'.format(
                                vowel), result):
                        result = pcre.sub('^per', '', result)
                        modification = {'per': ''}
                    # rule 25
                    elif pcre.search('^pem[bfv]', result):
                        result = pcre.sub('^pem', '', result)
                        modification = {'pem': ''}
                    # rule 26
                    elif pcre.search('^pem(r?){}'.format(vowel), result):
                        result = pcre.sub('^pe', '', result)
                        modification = {'pe': ''}
                        self.recoding_tracker[prefix] = {'pem': 'p'}
                    # rule 27
                    elif pcre.search('^pen[cdjz]', result):
                        result = pcre.sub('^pen', '', result)
                        modification = {'pen': ''}
                    # rule 28
                    elif pcre.search('^pen{}'.format(vowel), result):
                        result = pcre.sub('^pen', 't', result)
                        modification = {'pen': 't'}
                        self.recoding_tracker[prefix] = {'pe': ''}
                    # rule 29
                    elif pcre.search('^peng{}'.format(consonant), result):
                        result = pcre.sub('^peng', '', result)
                        modification = {'peng': ''}
                    # rule 30
                    elif pcre.search('^peng({})'.format(vowel), result):
                        match = pcre.search('^peng({})'.format(vowel),
                                            result)
                        result = pcre.sub('^peng', '', result)
                        modification = {'peng': ''}
                        self.recoding_tracker[prefix] = {'peng1': 'k'}
                        self.recoding_tracker[prefix]['penge'] = ''
                    # rule 31
                    elif pcre.search('^peny{}'.format(vowel), result):
                        result = pcre.sub('^pe', '', result)
                        modification = {'pe': ''}
                        self.recoding_tracker[prefix] = {'peny': 's'}
                    # rule 32
                    elif pcre.search('^pel{}'.format(vowel), result):
                        # 'pelajar' is the lone lexicalized 'pel-' form.
                        if (result == 'pelajar'):
                            result = pcre.sub('^pel', '', result)
                            modification = {'pel': ''}
                        else:
                            result = pcre.sub("^pe", "", result)
                            modification = {'pe': ''}
                    # rule 33
                    elif pcre.search(
                            '^pe[bcdfghjkpqstvxz]er{}'.format(vowel),
                            result):
                        result = pcre.sub('^pe', '', result)
                        modification = {'pe': ''}
                    # rule 34
                    elif pcre.search('^pe[bcdfghjkpqstvxz](?!er)',
                                     result):
                        result = pcre.sub('^pe', '', result)
                        modification = {'pe': ''}
                    # rule 35
                    elif pcre.search(
                            '^pe[bcdfghjkpqstvxz]er{}'.format(consonant),
                            result):
                        result = pcre.sub('^pe', '', result)
                        modification = {'pe': ''}
                    # unsuccessful
                    else:
                        del self.recoding_tracker[prefix]
                        return word
                # Record how the complex prefix was rewritten; an empty
                # modification means no rule produced a replacement.
                if modification != "":
                    self.complex_prefix_tracker[prefix] = modification
                else:
                    return result
            # Bookkeeping shared by plain and complex prefixes: remember
            # the removed prefix and try a dictionary lookup on the stem.
            if self.removed['derivational_prefix'] == '':
                self.removed['derivational_prefix'] = []
            self.removed['derivational_prefix'].append(prefix)
            self.lookup(result)
            return result
    return result
def search(self, vhost, https=False, port=None):
    """Select the server block that nginx would use to serve *vhost*.

    Follows nginx's documented selection order: exact server_name,
    longest wildcard starting with an asterisk, longest wildcard ending
    with an asterisk, first matching regex ('~' names), then the
    default server.

    Returns the matching server object or None.  Raises ValueError when
    the vhost resolves to more than one IP.
    """
    ips = self.resolv(vhost)
    if len(ips) > 1:
        # BUG FIX: `raise "..."` raised a bare string, which is itself a
        # TypeError on Python 3; raise a real exception, same message.
        raise ValueError("Vhost on multiple IPS not supported.")
    if port is None:
        port = 443 if https else 80

    def _longest(cands):
        # Pick the server whose matched name contains the most dots.
        # Names with zero dots are never selected (preserves the
        # original seeding of the comparison at 0).
        best_dots = 0
        best = None
        for cand_srv, _name, ndots in cands:
            if ndots > best_dots:
                best_dots = ndots
                best = cand_srv
        return best

    candidates = []
    ip = ips[0]
    logging.debug("Pre-select vhost that can serve IP <%s> on %s", ip,
                  ("HTTPS" if https else 'HTTP'))
    for srv in self.servers:
        if srv.can_serve(ip, https, port):
            candidates.append(srv)

    logging.debug("1st pass: exact names")
    for srv in candidates:
        if vhost in srv.server_names:
            return srv

    logging.debug(
        "2nd pass: longest wildcard name starting with an asterisk")
    pass_candidates = []
    for srv in candidates:
        for srvname in srv.server_names:
            wildcardvhost = None
            if srvname.startswith("*."):
                wildcardvhost = srvname[2:]
            elif srvname.startswith("."):
                wildcardvhost = srvname[1:]
            if wildcardvhost and vhost.endswith(wildcardvhost):
                pass_candidates.append(
                    (srv, srvname, srvname.count('.')))
    if pass_candidates:
        return _longest(pass_candidates)

    logging.debug(
        "3rd pass: longest wildcard name ending with an asterisk")
    pass_candidates = []
    for srv in candidates:
        for srvname in srv.server_names:
            if srvname.endswith(".*"):
                wildcardvhost = srvname[:-2]
                if vhost.startswith(wildcardvhost):
                    pass_candidates.append(
                        (srv, srvname, srvname.count('.')))
    if pass_candidates:
        return _longest(pass_candidates)

    logging.debug(
        "4th pass: first matching regular expression (in order of "
        "appearance in a configuration file)")
    for srv in candidates:
        for srvname in srv.server_names:
            if srvname.startswith('~'):
                regex = srvname[1:]
                try:
                    if pcre.search(regex, vhost):
                        return srv
                except Exception:  # was a bare except
                    logging.debug("FAILED to compile PCRE '%s'", regex)

    logging.debug("5th pass: fallback to default vhost")
    for srv in candidates:
        if srv.is_default_server_name() or srv.is_default_server(
                ip, port):
            return srv
    return None