Exemplo n.º 1
0
 def test_bug_449000(self):
     # Test for sub() on escaped characters
     self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
                      'abc\ndef\n')
     self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), 'abc\ndef\n')
     self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), 'abc\ndef\n')
     self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 'abc\ndef\n')
Exemplo n.º 2
0
 def test_bug_449000(self):
     # Test for sub() on escaped characters
     self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
                      'abc\ndef\n')
     self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
                      'abc\ndef\n')
     self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
                      'abc\ndef\n')
     self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
                      'abc\ndef\n')
Exemplo n.º 3
0
    def delete_derivational_suffix(self, word):
        """Strip a trailing ``i``, ``an`` or ``kan`` from ``word``.

        When a suffix is found it is recorded in
        ``self.removed['derivational_suffix']`` and the shortened word is
        passed to ``self.lookup``; a truthy lookup result is returned in place
        of the shortened word.  Without a suffix, ``word`` comes back as is.
        """
        suffix_regex = "(i|k?an)$"
        hit = pcre.search(suffix_regex, word)

        if not hit:
            return word

        stem = pcre.sub(suffix_regex, '', word)
        self.removed['derivational_suffix'] = hit.group(0)
        lemma = self.lookup(stem)

        return lemma if lemma else stem
Exemplo n.º 4
0
 def test_bug_1140(self):
     # The result type of re.sub() must follow its inputs:
     #   re.sub(x, y, u'') returns u'', not '', and
     #   re.sub(x, y, '') returns '', not u''.
     # Also:
     #   re.sub(x, y, unicode(x)) returns unicode(y), and
     #   re.sub(x, y, str(x)) returns
     #       str(y) if isinstance(y, str) else unicode(y).
     def check(actual, expected_value, expected_type):
         # Value first, then exact type, exactly as two separate assertions.
         self.assertEqual(actual, expected_value)
         self.assertEqual(type(actual), expected_type)

     for x in 'x', u'x':
         for y in 'y', u'y':
             check(re.sub(x, y, u''), u'', unicode)
             check(re.sub(x, y, ''), '', str)
             check(re.sub(x, y, unicode(x)), y, unicode)
             check(re.sub(x, y, str(x)), y, type(y))
Exemplo n.º 5
0
    def delete_inflectional_suffix(self, word):
        """Strip a trailing particle and then a possessive pronoun from ``word``.

        The two suffix classes are tried in order on the progressively
        shortened word.  Each suffix that is removed is stored in
        ``self.removed`` under its class name, and the shortened word is given
        to ``self.lookup``; the first truthy lookup result is returned
        immediately.  Otherwise the shortened word is returned.
        """
        suffix_rules = (
            ('particle', "([klt]ah|pun)$"),
            ('possessive_pronoun', "([km]u|nya)$"),
        )
        stem = word

        for label, regex in suffix_rules:
            hit = pcre.search(regex, stem)

            if not hit:
                continue

            stem = pcre.sub(regex, '', stem)
            self.removed[label] = hit.group(0)
            lemma = self.lookup(stem)

            if lemma:
                return lemma

        return stem
Exemplo n.º 6
0
    def test_basic_re_sub(self):
        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                         '9.3 -3 24x100y')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                         '9.3 -3 23x99y')

        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

        s = r"\1\1"
        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
        self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')

        self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
                         '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

        self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
Exemplo n.º 7
0
 def test_bug_462270(self):
     # Test for empty sub() behaviour, see SF bug #462270
     self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
     self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
Exemplo n.º 8
0
 def test_bug_114660(self):
     self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
                      'hello there')
Exemplo n.º 9
0
 def test_qualified_re_sub(self):
     self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
     self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Exemplo n.º 10
0
    def test_sub_template_numeric_escape(self):
        # bug 776311 and friends
        self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
        self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
        self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
        self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
        self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
        self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
        self.assertEqual(re.sub('x', r'\117', 'x'), '\117')

        self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
        self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

        self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
        self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
        self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
        self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
        self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

        self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
        self.assertEqual(re.sub('x', r'\777', 'x'), '\377')

        self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
        self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
        self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'

        # in python2.3 (etc), these loop endlessly in sre_parser.py
        self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                         'xz8')
        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                         'xza')
Exemplo n.º 11
0
def execute(mode, code, input_str):
  """Run ``code`` under the one-character ``mode`` and print the outcome.

  Args:
    mode: Selects the operation:
      "l"  look ``input_str`` up in a table built by ``handle_table`` from
           ``;``-separated rows of ``&``-separated cells, falling back to
           the ``"?"`` entry;
      "f"  ``code % literal`` where the literal is parsed from ``input_str``;
      "F"  like "f", and the next input becomes the total printed length of
           the literal(s);
      "g"  print every string ``exrex.generate(code)`` yields, then stop;
      "h"  like "g" on ``code % input_str`` (input regex-escaped first);
      "p"  repeat the text between ``~...~`` (and ``%...%`` for a pair of
           counts) as many times as the parsed literal says;
      "P"  repeat the single character in front of each ``~``;
      "e"  translate ``input_str`` character by character through the table;
      "o"  print the first backtick-separated piece, continue with the rest;
      "s"  apply ``handle_subs`` to the input, continue with the rest;
      "d"  delete every match of each ``&``-separated pattern from the input;
      "S"  like "s" but prints the substituted input and continues with "";
      "i" / "I"  append the input to the code (directly / after a newline);
      anything else: the result is ``code`` itself.
    code: Program text for the selected mode.
    input_str: Current input text.

  Returns:
    None.  The result is printed through ``unescape``, unless it starts with
    a backtick, in which case it is executed recursively: its second
    character is the next mode, and the text after the first unescaped ``!``
    (or ``get_input(input_str)`` when there is none) is the next input.
  """

  def split_on(separator, text):
    # Split on `separator` unless the lookbehind sees an escaping backslash
    # (a backslash that itself follows a non-backslash character).
    return pcre.split(r"(?<![^\\]\\)" + separator, text)

  result = ""

  if mode == "l":
    rows = (split_on("&", row) for row in split_on(";", code))
    table = handle_table(rows)
    result = table[input_str] if input_str in table else table["?"]
  elif mode == "f":
    result = code % ast.literal_eval(input_str)
  elif mode == "F":
    literal = ast.literal_eval(input_str)
    result = code % literal
    if isinstance(literal, tuple):
      input_str = str(sum(len(str(x)) for x in literal))
    else:
      input_str = str(len(str(literal)))
  elif mode == "g":
    for string in exrex.generate(code):
      print(unescape(string))
    return # Generate is always terminal
  elif mode == "h":
    if isinstance(input_str, str):
      input_str = pcre.escape(input_str)
    for string in exrex.generate(code % input_str):
      print(unescape(string))
    return
  elif mode == "p":
    literal = ast.literal_eval(input_str)
    if isinstance(literal, int):
      result = pcre.sub(r"(?<![^\\]\\)~(.+?)(?<![^\\]\\)~", r"\1" * literal, code, flags=pcre.DOTALL)
    else:
      # NOTE(review): unlike the single-count branch, the inner "~" pattern
      # has no escape-aware lookbehind, so escaped tildes are still treated
      # as delimiters here -- confirm whether that is intended.
      tildes_expanded = pcre.sub(r"~(.+?)~", r"\1" * literal[0], code, flags=pcre.DOTALL)
      result = pcre.sub(r"(?<![^\\]\\)%(.+?)(?<![^\\]\\)%", r"\1" * literal[1], tildes_expanded, flags=pcre.DOTALL)
  elif mode == "P":
    result = pcre.sub(r"(.)(?<![^\\]\\)~", r"\1" * ast.literal_eval(input_str), code, flags=pcre.DOTALL)
  elif mode == "e":
    rows = (split_on("&", row) for row in split_on(";", code))
    table = handle_table(rows)

    # Fix: the loop used to iterate and index with an undefined name `i`,
    # which raised NameError; translate each character of the input instead.
    result = "".join(table[char] for char in input_str)
  elif mode == "o":
    pieces = split_on("`", code)
    print(unescape(pieces[0]))
    result = handle_pieces(pieces[1:], "")
  elif mode == "s":
    pieces = split_on("`", code)
    subs = split_on("&", pieces[0])

    input_str = handle_subs(input_str, subs)
    result = handle_pieces(pieces[1:], input_str)
  elif mode == "d":
    pieces = split_on("`", code)
    subs = split_on("&", pieces[0])

    for sub in subs:
      input_str = pcre.sub(sub, "", input_str)

    result = handle_pieces(pieces[1:], input_str)
  elif mode == "S":
    pieces = split_on("`", code)
    subs = split_on("&", pieces[0])

    output = unescape(handle_subs(input_str, subs))

    result = handle_pieces(pieces[1:], "")
    print(output)
  elif mode == "i":
    result = code + input_str
  elif mode == "I":
    result = code + "\n" + input_str
  else:
    result = code

  if result and result[0] == "`":
    # The result is itself a program: "`" + mode + code [+ "!" + input].
    input_pieces = split_on("!", result)
    if len(input_pieces) >= 2:
      execute(result[1], input_pieces[0][2:], "!".join(input_pieces[1:]))
    else:
      execute(result[1], result[2:], get_input(input_str))
  else:
    print(unescape(result))
Exemplo n.º 12
0
def remove_digits(input_str):
    """Return ``input_str`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', input_str)
Exemplo n.º 13
0
    def eat(self, word, backtrack_step=False):
        """Run the stemming steps on ``word`` and return the lemma if found.

        ``word`` is first passed to ``self.lookup``; a truthy result is
        returned straight away (and, on a top-level call, ``self.error`` is
        set to ``'input_is_lemma'``).  Otherwise a sequence of numbered steps
        is run, each working on the output of the previous one:

        * 3 -- ``delete_inflectional_suffix``
        * 4 -- ``delete_derivational_suffix``
        * 5 -- up to three rounds of ``delete_derivational_prefix``
        * 6 -- ``recode``
        * 7 -- backtracking: the tracked prefixes are put back and the removed
          suffixes are re-appended one at a time, re-running steps 5 and 6
          through a recursive call after each change.

        The step order depends on ``self.check_rule_precedence(word)``; a
        backtrack call runs only steps 5 and 6.

        Args:
            word: The word to process.
            backtrack_step: True for the recursive calls made by step 7.

        Returns:
            The lookup result, or ``self.found`` as soon as it becomes truthy,
            otherwise ``word`` unchanged (a top-level call then sets
            ``self.error`` to ``'lemma_not_found'`` unless an error is
            already recorded).
        """
        result = word
        temp = self.lookup(word)

        if temp:
            # Fix: a lookup hit during backtracking used to fall through to
            # the step loop with `steps` unbound (UnboundLocalError).  The hit
            # is now returned in both cases; only the error flag is reserved
            # for top-level calls.
            if not backtrack_step:
                self.error = 'input_is_lemma'
            return temp

        # Called unconditionally, as before, even though a backtrack call
        # overrides the outcome.
        steps = self.check_rule_precedence(word)

        if backtrack_step:
            steps = [5, 6]
        elif steps:
            steps = [5, 6, 3, 4, 7]
        else:
            steps = [3, 4, 5, 6, 7]

        for step in steps:
            if step == 3:
                temp = self.delete_inflectional_suffix(result)

            elif step == 4:
                temp = self.delete_derivational_suffix(result)

            elif step == 5:
                temp = result

                for i in range(3):
                    previous = temp
                    temp = self.delete_derivational_prefix(temp)

                    # Fix: the length test was written `len(x > 3)`, which
                    # compares a str with an int (TypeError); the comparison
                    # belongs outside the len() call.
                    if ((i == 0 and self.has_disallowed_pairs()) or self.found
                            or temp == previous or
                        (isinstance(self.removed['derivational_prefix'], str)
                         and len(self.removed['derivational_prefix']) > 3)):
                        break

            elif step == 6:
                temp = self.recode(result)

            elif step == 7:
                prefixes = self.complex_prefix_tracker
                res = temp
                temp = ""

                # Rebuild the prefix part from the tracker: each entry maps
                # the text that was removed to the text that was added.
                for changes in prefixes.values():
                    changes_first_key = list(changes.keys())[0]

                    prefix_added = changes[changes_first_key]
                    prefix_removed = changes_first_key

                    if prefix_added != '':
                        temp = pcre.sub('^{}'.format(prefix_added),
                                        prefix_removed, temp)

                    else:
                        temp = temp + prefix_removed

                self.removed['derivational_prefix'] = ''
                self.complex_prefix_tracker = {}
                temp = temp + res
                self.eat(temp, True)

                if self.found:
                    return self.found

                # return deriv suffix
                if not self.found and self.removed['derivational_suffix'] != '':

                    if self.removed['derivational_suffix'] == 'kan':
                        # Try restoring only the 'k' before the full 'kan'.
                        temp = temp + 'k'
                        self.removed['derivational_prefix'] = ''
                        self.complex_prefix_tracker = {}
                        self.eat(temp, True)

                        if self.found:
                            return self.found

                        temp = temp + 'an'

                    else:
                        temp = temp + self.removed['derivational_suffix']

                    self.removed['derivational_prefix'] = ''
                    self.complex_prefix_tracker = {}
                    self.eat(temp, True)

                # return possessive pronoun
                if not self.found and self.removed['possessive_pronoun'] != '':
                    temp = temp + self.removed['possessive_pronoun']
                    self.removed['derivational_prefix'] = ''
                    self.complex_prefix_tracker = {}
                    self.eat(temp, True)

                    if self.found:
                        return self.found

                # return particle
                if not self.found and self.removed['particle'] != '':
                    temp = temp + self.removed['particle']
                    self.removed['derivational_prefix'] = ''
                    self.complex_prefix_tracker = {}
                    self.eat(temp, True)

                    if self.found:
                        return self.found

            if self.found:
                return self.found

            result = temp

        if not backtrack_step and not self.error:
            self.error = 'lemma_not_found'

        return word
Exemplo n.º 14
0
    def recode(self, word):
        """Retry prefix removal using the alternative recodings on record.

        For every entry in ``self.complex_prefix_tracker`` the removed prefix
        is put back onto the working word.  If ``self.recoding_tracker`` holds
        alternatives for that prefix, each one is applied in turn (digits in
        the recorded key are dropped first) and the candidate is checked with
        ``self.lookup``.  A failed candidate is still run through up to three
        rounds of ``delete_derivational_prefix``; any prefixes those rounds
        registered are rolled back before the next alternative is tried.

        Returns:
            The first candidate accepted by ``self.lookup`` (the tracker entry
            is updated to the recoding that worked), or the word produced by
            prefix removal once ``self.found`` is truthy, otherwise ``word``
            unchanged.
        """
        result = word
        prefixes = self.complex_prefix_tracker

        # the code below is different from the original code due to how python insert values to dict
        # Iterating a snapshot keeps the loop independent of the temporary
        # entries added and rolled back further down.
        for prefix, changes in list(prefixes.items()):
            recodings = self.recoding_tracker[prefix]
            prefix_removed = list(changes.keys())[0]
            prefix_added = changes[prefix_removed]

            # Undo the recorded change so `result` carries the prefix again.
            if prefix_added != '':
                result = pcre.sub('^{}'.format(prefix_added), prefix_removed,
                                  result)
            else:
                result = prefix_removed + result

            if recodings == '':
                continue

            for raw_removed, added in recodings.items():
                removed = pcre.sub("[0-9]+", "", raw_removed)
                replacement = added if added else ""
                temp = pcre.sub('^{}'.format(removed), replacement, result)

                if self.lookup(temp):
                    self.complex_prefix_tracker[prefix] = {removed: added}
                    return temp

                record = temp
                before = len(self.complex_prefix_tracker)

                for i in range(3):
                    previous = record
                    record = self.delete_derivational_prefix(record)

                    if ((i == 0 and self.has_disallowed_pairs())
                            or record == previous
                            or len(self.removed['derivational_prefix']) > 3):
                        break
                    elif self.found:
                        return record

                # Roll back whatever the rounds above registered.
                # Fix: the removed-prefix list used to be trimmed one index at
                # a time with an index that kept growing while the list
                # shrank, so rolling back more than one entry raised
                # IndexError or removed the wrong item.  Both structures are
                # now cut back to their previous length in one go.
                stale_keys = list(self.complex_prefix_tracker)[before:]
                if stale_keys:
                    for key in stale_keys:
                        del self.complex_prefix_tracker[key]
                    del self.removed['derivational_prefix'][
                        before:before + len(stale_keys)]

            # disabling this for now
            # if temp != "":
            #     result = temp

        return word
Exemplo n.º 15
0
    def delete_derivational_prefix(self, word):
        """Try to strip one derivational prefix from ``word``.

        Words shorter than four characters are returned untouched.  Otherwise
        the word is matched against two prefix families, in this order:

        * ``'plain'``   -- ``di``, ``ke``, ``se``: the prefix is simply cut off;
        * ``'complex'`` -- ``be``, ``me``, ``pe``, ``te``: a numbered chain of
          rules (1-35 in the comments below) decides how much text to cut and
          whether a letter has to be put in its place.  The first matching
          rule of the chain wins.

        Bookkeeping done on success:

        * ``self.complex_prefix_tracker[prefix]`` maps the removed text to the
          text inserted in its place (``''`` when nothing was inserted);
        * ``self.recoding_tracker[prefix]`` is ``''`` or a dict of alternative
          rewrites, filled in by some rules;
        * the two-letter prefix is appended to
          ``self.removed['derivational_prefix']`` (created as a list on first
          use) and ``self.lookup`` is called on the shortened word; its
          return value is not used here.

        Returns:
            The shortened word, or the input unchanged when no prefix matches,
            the prefix was already removed earlier, a precedence check rejects
            the removal, or no rule of the matching chain applies.
        """
        # Regex fragments supplied by the instance; they are interpolated into
        # the rule patterns below.  `alpha` is read but not used in this
        # method.
        vowel = self.vowel
        consonant = self.consonant
        alpha = self.alpha
        result = word
        prefix_type = ""
        prefix = ""

        patterns = {'plain': "^(di|(k|s)e)", 'complex': "^(b|m|p|t)e"}

        if len(result) < 4:
            return result

        for key, pattern in patterns.items():
            match = re.match(pattern, result)

            if match:
                # True for the 'plain' family, False for the 'complex' one.
                prefix_type = (key == 'plain')
                prefix = match[0]

                # The same prefix is never removed twice.
                if self.removed[
                        'derivational_prefix'] != '' and prefix in self.removed[
                            'derivational_prefix']:
                    return result

                self.recoding_tracker[match[0]] = ''

                if prefix_type:
                    array = self.removed['derivational_prefix']

                    # 'ke' is not removed after an earlier 'di' unless the
                    # word contains 'tawa' or 'tahu'.
                    if prefix == 'ke' and array != '' and (
                            array[0] == 'di'
                            and not (pcre.search('(tawa|tahu)', result))
                            and array[0] != 'be'):
                        return result

                    result = pcre.sub(pattern, '', result)

                    self.complex_prefix_tracker[prefix] = {prefix: ''}

                else:
                    # Stays "" until one of the rules below fills it with a
                    # {removed_text: inserted_text} dict.
                    modification = ""

                    #  'be-' prefix rules
                    #   total rule = 5

                    if prefix == 'be':

                        # When a prefix was removed earlier, 'be' may only
                        # follow one of the listed ones.  The check looks at
                        # the first key of the first tracker entry; `added`
                        # is assigned but not used.
                        if self.removed['derivational_prefix'] != '':

                            array_key = list(
                                self.complex_prefix_tracker.keys())[
                                    0]  # get first dict value
                            array = self.complex_prefix_tracker[array_key]

                            added_key = list(array.keys())[0]
                            added = array[added_key]
                            pp = added_key

                            if pp not in ['mem', 'pem', 'di', 'ke']:
                                return result

                        # rule 1

                        if pcre.search("^ber{}".format(vowel), result):

                            result = pcre.sub("^ber", '', result)
                            modification = {"ber": ''}
                            self.recoding_tracker[prefix] = {'be': ''}

                        # rule 2
                        elif pcre.search(
                                "^ber[bcdfghjklmnpqstvwxyz][a-z](?!er)",
                                result):

                            result = pcre.sub("^ber", '', result)
                            modification = {'ber': ""}

                        # rule 3
                        elif pcre.search(
                                "^ber[bcdfghjklmnpqstvwxyz][a-z]er{}".format(
                                    vowel), result):

                            result = pcre.sub("^ber", '', result)
                            modification = {'ber': ""}

                        # rule 4
                        elif pcre.search("^belajar$", result):

                            result = pcre.sub("^bel", '', result)
                            modification = {'bel': ""}

                        # rule 5
                        elif pcre.search(
                                "^be[bcdfghjkmnpqstvwxyz]er{}".format(
                                    consonant), result):

                            result = pcre.sub("^be", '', result)
                            modification = {'be': ""}

                        # unsuccessful: drop the tracker entry created above
                        # and hand back the original word
                        else:
                            del self.recoding_tracker[prefix]
                            return word

                    # te- prefix rules
                    # total rule : 5

                    elif prefix == 'te':

                        # Precedence check against the first tracker entry;
                        # `added` is assigned but not used.
                        if self.removed['derivational_prefix'] != '':
                            array_key = list(
                                self.complex_prefix_tracker.keys())[
                                    0]  # get first dict value
                            array = self.complex_prefix_tracker[array_key]

                            added_key = list(array.keys())[0]
                            added = array[added_key]
                            pp = added_key

                            if pp != 'ke' and pp in [
                                    'me', 'men', 'pen'
                            ] and not (pcre.search('tawa', result)):
                                return result

                        # rule 6
                        if pcre.search("^ter{}".format(vowel), result):

                            result = pcre.sub('^ter', '', result)
                            modification = {'ter': ''}
                            self.recoding_tracker[prefix] = {'te': ''}

                        # rule 7

                        elif pcre.search(
                                "^ter[bcdfghjklmnpqstvwxyz]er{}".format(vowel),
                                result):

                            result = pcre.sub('^ter', '', result)
                            modification = {'ter': ''}

                        # rule 8
                        elif pcre.search("^ter{}(?!er)".format(consonant),
                                         result):

                            result = pcre.sub('^ter', '', result)
                            modification = {'ter': ''}

                        # rule 9
                        elif pcre.search(
                                "^te[bcdfghjklmnpqstvwxyz]er{}".format(
                                    consonant), result):

                            result = pcre.sub('^te', '', result)
                            modification = {'te': ''}

                        # rule 10
                        elif pcre.search(
                                "^ter[bcdfghjklmnpqstvwxyz]er{}".format(
                                    consonant), result):

                            result = pcre.sub('^ter', '', result)
                            modification = {'ter': ''}

                        # unsuccessful

                        else:
                            del self.recoding_tracker[prefix]
                            return word

                    # me- prefix rules
                    # total rule = 10

                    elif prefix == 'me':

                        # 'me' is only removed as the first prefix.
                        if self.removed['derivational_prefix'] != '':
                            return result

                        # rule 11
                        if pcre.search('^me[lrwy]{}'.format(vowel), result):

                            result = pcre.sub('^me', '', result)
                            modification = {'me': ''}

                        # rule 12

                        elif pcre.search('^mem[bfv]', result):

                            result = pcre.sub('^mem', '', result)
                            modification = {'mem': ''}

                        # rule 13
                        elif pcre.search('^mempe', result):

                            result = pcre.sub('^mem', '', result)
                            modification = {'mem': ''}

                        # rule 14: only 'me' is cut; the optional 'r' captured
                        # by the pattern becomes part of the tracker key
                        elif pcre.search("^mem(r?)[aiueo]", result):
                            match = pcre.search("^mem(r?)[aiueo]", result)
                            result = pcre.sub('^me', '', result)
                            modification = {'me{}'.format(match.group(1)): ''}
                            self.recoding_tracker[prefix] = {'mem': 'p'}

                        # rule 15

                        elif pcre.search('^men[cdsjz]', result):

                            result = pcre.sub('^men', '', result)
                            modification = {'men': ''}

                        # rule 16: 'men' is replaced by 't'

                        elif pcre.search('^men{}'.format(vowel), result):

                            result = pcre.sub('^men', 't', result)
                            modification = {'men': 't'}
                            self.recoding_tracker[prefix] = {'me': ''}

                        # rule 17

                        elif pcre.search('^meng[ghqk]', result):

                            result = pcre.sub('^meng', '', result)
                            modification = {'meng': ''}

                        # rule 18 (`match` is reassigned here but not used
                        # afterwards)

                        elif pcre.search('^meng({})'.format(vowel), result):
                            match = pcre.search('^meng({})'.format(vowel),
                                                result)
                            result = pcre.sub('^meng', '', result)
                            modification = {'meng': ''}

                            self.recoding_tracker[prefix] = {'meng1': 'k'}
                            self.recoding_tracker[prefix]['menge'] = ''

                        # rule 19
                        elif pcre.search('^meny{}'.format(vowel), result):

                            result = pcre.sub('^me', '', result)
                            modification = {'me': ''}
                            self.recoding_tracker[prefix] = {'meny': 's'}

                        # rule 20
                        elif pcre.search('^memp[abcdfghijklmnopqrstuvwxyz]',
                                         result):

                            result = pcre.sub('^mem', '', result)
                            modification = {'mem': ''}

                        # unsuccessful
                        else:
                            del self.recoding_tracker[prefix]
                            return word

                    # pe- prefix rules
                    # total rule = 15

                    elif prefix == 'pe':

                        # Precedence check against the first tracker entry;
                        # `added` is assigned but not used.
                        if self.removed['derivational_prefix'] != '':
                            array_key = list(
                                self.complex_prefix_tracker.keys())[
                                    0]  # get first dict value
                            array = self.complex_prefix_tracker[array_key]

                            added_key = list(array.keys())[0]
                            added = array[added_key]
                            pp = added_key

                            if pp not in ['di', 'ber', 'mem', 'se', 'ke']:
                                return result

                        # rule 21
                        if pcre.search('^pe[wy]{}'.format(vowel), result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}

                        # rule 22
                        elif pcre.search('^per{}'.format(vowel), result):

                            result = pcre.sub('^per', '', result)
                            modification = {'per': ''}
                            self.recoding_tracker[prefix] = {'pe': ''}

                        # rule 23
                        elif pcre.search(
                                '^per[bcdfghjklmnpqstvwxyz][a-z](?!er)',
                                result):

                            result = pcre.sub('^per', '', result)
                            modification = {'per': ''}

                        # rule 24
                        elif pcre.search(
                                '^per[bcdfghjklmnpqstvwxyz][a-z]er{}'.format(
                                    vowel), result):

                            result = pcre.sub('^per', '', result)
                            modification = {'per': ''}

                        # rule 25
                        elif pcre.search('^pem[bfv]', result):

                            result = pcre.sub('^pem', '', result)
                            modification = {'pem': ''}

                        # rule 26
                        elif pcre.search('^pem(r?){}'.format(vowel), result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}
                            self.recoding_tracker[prefix] = {'pem': 'p'}

                        # rule 27
                        elif pcre.search('^pen[cdjz]', result):

                            result = pcre.sub('^pen', '', result)
                            modification = {'pen': ''}

                        # rule 28: 'pen' is replaced by 't'
                        elif pcre.search('^pen{}'.format(vowel), result):

                            result = pcre.sub('^pen', 't', result)
                            modification = {'pen': 't'}
                            self.recoding_tracker[prefix] = {'pe': ''}

                        # rule 29
                        elif pcre.search('^peng{}'.format(consonant), result):

                            result = pcre.sub('^peng', '', result)
                            modification = {'peng': ''}

                        # rule 30 (`match` is reassigned here but not used
                        # afterwards)
                        elif pcre.search('^peng({})'.format(vowel), result):
                            match = pcre.search('^peng({})'.format(vowel),
                                                result)
                            result = pcre.sub('^peng', '', result)
                            modification = {'peng': ''}

                            self.recoding_tracker[prefix] = {'peng1': 'k'}
                            self.recoding_tracker[prefix]['penge'] = ''

                        # rule 31
                        elif pcre.search('^peny{}'.format(vowel), result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}
                            self.recoding_tracker[prefix] = {'peny': 's'}

                        # rule 32: 'pelajar' loses 'pel', every other match
                        # only loses 'pe'
                        elif pcre.search('^pel{}'.format(vowel), result):

                            if (result == 'pelajar'):
                                result = pcre.sub('^pel', '', result)
                                modification = {'pel': ''}
                            else:
                                result = pcre.sub("^pe", "", result)
                                modification = {'pe': ''}

                        # rule 33
                        elif pcre.search(
                                '^pe[bcdfghjkpqstvxz]er{}'.format(vowel),
                                result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}

                        # rule 34
                        elif pcre.search('^pe[bcdfghjkpqstvxz](?!er)', result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}

                        # rule 35
                        elif pcre.search(
                                '^pe[bcdfghjkpqstvxz]er{}'.format(consonant),
                                result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}

                        # unsuccessful
                        else:
                            del self.recoding_tracker[prefix]
                            return word

                    # Record what the winning rule did; without a recorded
                    # modification the word is returned as it stands.
                    if modification != "":
                        self.complex_prefix_tracker[prefix] = modification
                    else:
                        return result

                # The removed-prefix record starts out as '' and becomes a
                # list with the first successful removal.
                if self.removed['derivational_prefix'] == '':
                    self.removed['derivational_prefix'] = []

                self.removed['derivational_prefix'].append(prefix)
                # Return value ignored; callers inspect self.found instead
                # (presumably set by lookup -- confirm against its definition).
                self.lookup(result)
                return result

        return result
Exemplo n.º 16
0
 def test_bug_449964(self):
     # fails for group followed by other escape
     self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
                      'xx\bxx\b')
Exemplo n.º 17
0
def _split_unescaped(delimiter, text):
    """Split *text* on *delimiter* unless it follows a non-backslash plus a backslash."""
    return pcre.split(r"(?<![^\\]\\)" + delimiter, text)


def _parse_table(code):
    """Build the lookup table encoded as ``key&key&value;key&value;...`` in *code*."""
    table = {}
    for row in _split_unescaped(";", code):
        # Every field but the last is a key; the last field is their value.
        *keys, value = _split_unescaped("&", row)
        table.update(dict.fromkeys(keys, value))
    return table


def _print_unescaped(text):
    """Print *text* after interpreting its backslash escape sequences."""
    print(text.encode("utf-8").decode("unicode-escape"))


def execute(mode, code, input_str):
    """Run one step selected by ``mode`` and print or chain its result.

    Modes, as implemented below:

    * ``"l"`` -- look ``input_str`` up in the table encoded by ``code``,
      falling back to the table's ``"?"`` entry.
    * ``"f"`` -- ``code % ast.literal_eval(input_str)``.
    * ``"F"`` -- like ``"f"``, and additionally replace ``input_str`` with
      the total string length of the interpolated value(s).
    * ``"g"`` -- print every string yielded by ``exrex.generate(code)``
      and stop.
    * ``"h"`` -- like ``"g"``, after interpolating the regex-escaped
      ``input_str`` into ``code``.
    * ``"p"`` -- repeat each ``~...~`` section of ``code``; for a non-int
      literal also repeat each ``%...%`` section (counts taken from
      ``literal[0]`` and ``literal[1]``).
    * ``"P"`` -- repeat the single character in front of each ``~``.
    * ``"e"`` -- translate every character of ``input_str`` through the
      table encoded by ``code`` and concatenate the results.
    * ``"o"`` -- print the part of ``code`` before the first backtick and
      continue with the remainder.
    * ``"s"`` -- apply pattern/replacement pairs to ``input_str``.
    * ``"d"`` -- delete every match of each pattern from ``input_str``.
    * ``"S"`` -- like ``"s"``, but print the substituted text and keep
      ``input_str`` unchanged.
    * ``"i"`` / ``"I"`` -- concatenate ``code`` and ``input_str`` (``"I"``
      inserts a newline between them).
    * anything else -- the result is ``code`` itself.

    If the result starts with a backtick, its second character is used as
    the next mode and ``execute`` recurses; otherwise the result is printed.

    Args:
        mode: Single-character mode selector.
        code: Mode-specific program text.
        input_str: Input for this step.

    Returns:
        None. All output is written with ``print``.

    Raises:
        KeyError: In modes ``"l"`` / ``"e"`` when a needed table entry
            (including the ``"?"`` fallback) is missing.
        IndexError: In modes ``"s"`` / ``"S"`` when the substitution list
            has an odd number of fields.
    """
    result = ""

    if mode == "l":
        table = _parse_table(code)
        # Only touch the "?" fallback when the key is really missing.
        result = table[input_str] if input_str in table else table["?"]
    elif mode == "f":
        result = code % ast.literal_eval(input_str)
    elif mode == "F":
        literal = ast.literal_eval(input_str)
        result = code % literal
        if isinstance(literal, tuple):
            input_str = str(sum(len(str(item)) for item in literal))
        else:
            input_str = str(len(str(literal)))
    elif mode == "g":
        for string in exrex.generate(code):
            _print_unescaped(string)
        return  # Generate is always terminal
    elif mode == "h":
        if isinstance(input_str, str):
            input_str = pcre.escape(input_str)
        for string in exrex.generate(code % input_str):
            _print_unescaped(string)
        return
    elif mode == "p":
        literal = ast.literal_eval(input_str)
        if isinstance(literal, int):
            result = pcre.sub(r"(?<![^\\]\\)~(.+?)(?<![^\\]\\)~",
                              r"\1" * literal, code, flags=pcre.DOTALL)
        else:
            # NOTE(review): unlike the int branch, the "~" pattern here has
            # no escape lookbehind; kept as-is -- confirm whether intended.
            expanded = pcre.sub(r"~(.+?)~", r"\1" * literal[0], code,
                                flags=pcre.DOTALL)
            result = pcre.sub(r"(?<![^\\]\\)%(.+?)(?<![^\\]\\)%",
                              r"\1" * literal[1], expanded,
                              flags=pcre.DOTALL)
    elif mode == "P":
        result = pcre.sub(r"(.)(?<![^\\]\\)~",
                          r"\1" * ast.literal_eval(input_str), code,
                          flags=pcre.DOTALL)
    elif mode == "e":
        table = _parse_table(code)
        # Translate character by character (the previous code referenced an
        # undefined name here and could never run).
        result = "".join(table[char] for char in input_str)
    elif mode == "o":
        pieces = _split_unescaped("`", code)
        _print_unescaped(pieces[0])
        # NOTE(review): with no backtick in ``code`` this yields a bare "`",
        # which makes the chaining step below fail on ``result[1]``.
        result = "`" + "`".join(pieces[1:])
    elif mode == "s":
        pieces = _split_unescaped("`", code)
        subs = _split_unescaped("&", pieces[0])

        for index in range(0, len(subs), 2):
            input_str = pcre.sub(subs[index], subs[index + 1], input_str)

        if len(pieces) > 1:
            result = "`" + "`".join(pieces[1:])
        else:
            result = input_str
    elif mode == "d":
        pieces = _split_unescaped("`", code)

        for pattern in _split_unescaped("&", pieces[0]):
            input_str = pcre.sub(pattern, "", input_str)

        if len(pieces) > 1:
            result = "`" + "`".join(pieces[1:])
        else:
            result = input_str
    elif mode == "S":
        pieces = _split_unescaped("`", code)
        subs = _split_unescaped("&", pieces[0])
        output = input_str
        for index in range(0, len(subs), 2):
            output = pcre.sub(subs[index], subs[index + 1], output)
        if len(pieces) > 1:
            result = "`" + "`".join(pieces[1:])
        else:
            result = ""
        _print_unescaped(output)
    elif mode == "i":
        result = code + input_str
    elif mode == "I":
        result = code + "\n" + input_str
    else:
        result = code

    if result.startswith("`"):
        # A leading backtick chains into another step: result[1] is the next
        # mode, and an unescaped "!" separates the next code from inline input.
        input_pieces = _split_unescaped("!", result)
        if len(input_pieces) >= 2:
            execute(result[1], input_pieces[0][2:],
                    "!".join(input_pieces[1:]))
        else:
            execute(result[1], result[2:], get_input(input_str))
    else:
        _print_unescaped(result)
Exemplo n.º 18
0
def handle_subs(string, subs):
    """Apply consecutive (pattern, replacement) pairs from ``subs`` to ``string``.

    ``subs`` is a flat sequence laid out as
    ``[pattern0, replacement0, pattern1, replacement1, ...]``; the pairs are
    applied in order, each one operating on the output of the previous one.
    A trailing pattern without a replacement raises ``IndexError``.
    """
    cursor = 0
    total = len(subs)
    while cursor < total:
        pattern = subs[cursor]
        replacement = subs[cursor + 1]
        string = pcre.sub(pattern, replacement, string)
        cursor += 2
    return string