示例#1
0
 def parse_named_character_class(self):
     """
     Parse a named character class ([:...:]) subexpression from the string at its current index.

     Expects ':' at the current index, reads a run of ASCII letters as the
     class name, then expects the closing ':' and ']'. When
     self.is_unicode_defaults is set the class is resolved through
     UnicodeQuery; otherwise the ASCII table below is used.
     @return: a CoverageSet object covering all the characters covered by the named class
     @raise RegexParserExceptionInternal: if the class name is not recognized
     """
     # ASCII definitions as (first, last) code point pairs. Single characters
     # are written as (c, c), so both ends of a pair belong to the range.
     classes = {
         u"alnum": [Parser.lowercase, Parser.uppercase, Parser.digits],
         u"word": [Parser.lowercase, Parser.uppercase, Parser.digits, Parser.underscore],
         u"alpha": [Parser.uppercase, Parser.lowercase],
         u"blank": [Parser.space, Parser.tab],
         # DEL (127) is a control character, so it is listed under cntrl and
         # left out of graph and print, which both stop at '~' (126).
         u"cntrl": [(0, 31), (127, 127)],
         u"digit": [Parser.digits],
         u"graph": [(33, 126)],
         u"lower": [Parser.lowercase],
         u"print": [(32, 126)],
         u"punct": [(ord(i), ord(i)) for i in "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"],
         u"space": [(ord(i), ord(i)) for i in string.whitespace],
         u"xdigit": [Parser.digits, (ord('a'), ord('f')), (ord('A'), ord('F'))],
         u"upper": [Parser.uppercase]
     }
     # Unicode definitions. The False list holds (name, value) queries whose
     # results are unioned together; the True list holds queries whose results
     # are removed from the range 1..0x10ffff before that remainder is
     # unioned in as well.
     unicode_classes = {
         u"alnum": {False: [('alpha', None), ('digit', None)], True: []},
         u"word": {False: [('alpha', None), ('gc', 'Mark'), ('digit', None), ('gc', 'Connector_Punctuation'), ('Join_Control', None)], True: []},
         u"alpha": {False: [('alpha', None)], True: []},
         u"blank": {False: [('gc', 'Space_Separator'), ('na1', 'CHARACTER TABULATION')], True: []},
         u"cntrl": {False: [('gc', 'Control')], True: []},
         u"digit": {False: [('gc', 'Decimal_Number')], True: []},
         u"graph": {False: [], True: [('space', None), ('gc', 'Control'), ('gc', 'Surrogate'), ('gc', 'Unassigned')]},
         u"lower": {False: [('Lowercase', None)], True: []},
         u"print": {False: [('gc', 'Space_Separator'), ('na1', 'CHARACTER TABULATION')], True: [('space', None), ('gc', 'Control'), ('gc', 'Surrogate'), ('gc', 'Unassigned')]},
         u"punct": {False: [('gc', 'Punctuation')], True: []},
         u"space": {False: [('Whitespace', None)], True: []},
         u"xdigit": {False: [('gc', 'Decimal_Number'), ('Hex_Digit', None)], True: []},
         u"upper": {False: [('Uppercase', None)], True: []}
     }

     self.expect(u':')
     # Collect the letters first and join once instead of growing a string.
     letters = []
     while self.next_is(string.ascii_letters):
         letters.append(self.get_next())
     class_name = "".join(letters)
     self.expect(":")
     self.expect("]")

     if self.is_unicode_defaults and class_name in unicode_classes:
         class_queries = unicode_classes[class_name]
         instance = UnicodeQuery.instance(self.unicode_db)
         coverage = CoverageSet()
         for k, v in class_queries[False]:
             coverage.update(instance.query(k, v))
         if class_queries[True]:
             # Start from the full range and carve out every excluded query.
             inverted_coverage = CoverageSet([(1, 0x10ffff)])
             for k, v in class_queries[True]:
                 inverted_coverage.difference_update(instance.query(k, v))
             coverage.update(inverted_coverage)
         return coverage

     elif not self.is_unicode_defaults and class_name in classes:
         return CoverageSet(classes[class_name])
     else:
         raise RegexParserExceptionInternal("Character class '%s' not recognized" % class_name)
示例#2
0
 def parse_unicode_subexpression(self):
     """
     Parse one unicode property term: a name, optionally followed by a value.

     A unicode word is read as the property name. If get_next_if(u':=')
     reports a match, a second unicode word is read as the value; otherwise
     the value is None. The pair is then passed to UnicodeQuery.
     @returns: A CoverageSet object containing the values covered by the term
     """
     property_name = self.parse_unicode_word()
     property_value = self.parse_unicode_word() if self.get_next_if(u':=') else None
     return UnicodeQuery.instance(self.unicode_db).query(property_name, property_value)
示例#3
0
 def parse_unicode_name(self):
     """
     Parse a unicode "{...}" expression that looks characters up by name.

     Expects "{" at the current index, reads one unicode word, and runs it
     as an 'na' query through UnicodeQuery for self.unicode_db.
     @returns: A CoverageSet object containing all characters covered by
         the specified unicode query
     @raise ValueError: if the query result is empty
     """
     self.expect("{")
     character_name = self.parse_unicode_word()
     query = UnicodeQuery.instance(self.unicode_db)
     matches = query.query('na', character_name)
     if not matches.empty():
         return matches
     raise ValueError("Name '{name}' not found".format(name=character_name))
示例#4
0
 def parse_unicode_subexpression(self):
     """
     Parse one unicode property term: a name, optionally followed by a value.

     A unicode word is read as the property name. If get_next_if(u':=')
     reports a match, a second unicode word is read as the value; otherwise
     the value is None. The pair is then passed to UnicodeQuery.
     @returns: A CoverageSet object containing the values covered by the term
     """
     property_name = self.parse_unicode_word()
     property_value = self.parse_unicode_word() if self.get_next_if(u':=') else None
     return UnicodeQuery.instance(self.unicode_db).query(property_name, property_value)
示例#5
0
 def parse_unicode_name(self):
     """
     Parse a unicode "{...}" expression that looks characters up by name.

     Expects "{" at the current index, reads one unicode word, and runs it
     as an 'na' query through UnicodeQuery for self.unicode_db.
     @returns: A CoverageSet object containing all characters covered by
         the specified unicode query
     @raise ValueError: if the query result is empty
     """
     self.expect("{")
     character_name = self.parse_unicode_word()
     query = UnicodeQuery.instance(self.unicode_db)
     matches = query.query('na', character_name)
     if not matches.empty():
         return matches
     raise ValueError("Name '{name}' not found".format(name=character_name))
示例#6
0
    def parse_named_character_class(self):
        """
        Parse a named character class ([:...:]) subexpression from the string at its current index.

        Expects ':' at the current index, reads a run of ASCII letters as the
        class name, then expects the closing ':' and ']'. When
        self.is_unicode_defaults is set the class is resolved through
        UnicodeQuery; otherwise the ASCII table below is used.
        @return: a CoverageSet object covering all the characters covered by the named class
        @raise RegexParserExceptionInternal: if the class name is not recognized
        """
        # ASCII definitions as (first, last) code point pairs. Single
        # characters are written as (c, c), so both ends of a pair belong to
        # the range.
        classes = {
            u"alnum": [Parser.lowercase, Parser.uppercase, Parser.digits],
            u"word": [
                Parser.lowercase, Parser.uppercase, Parser.digits,
                Parser.underscore
            ],
            u"alpha": [Parser.uppercase, Parser.lowercase],
            u"blank": [Parser.space, Parser.tab],
            # DEL (127) is a control character, so it is listed under cntrl
            # and left out of graph and print, which both stop at '~' (126).
            u"cntrl": [(0, 31), (127, 127)],
            u"digit": [Parser.digits],
            u"graph": [(33, 126)],
            u"lower": [Parser.lowercase],
            u"print": [(32, 126)],
            u"punct":
            [(ord(i), ord(i)) for i in "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"],
            u"space": [(ord(i), ord(i)) for i in string.whitespace],
            u"xdigit":
            [Parser.digits, (ord('a'), ord('f')), (ord('A'), ord('F'))],
            u"upper": [Parser.uppercase]
        }
        # Unicode definitions. The False list holds (name, value) queries
        # whose results are unioned together; the True list holds queries
        # whose results are removed from the range 1..0x10ffff before that
        # remainder is unioned in as well.
        unicode_classes = {
            u"alnum": {
                False: [('alpha', None), ('digit', None)],
                True: []
            },
            u"word": {
                False: [('alpha', None), ('gc', 'Mark'), ('digit', None),
                        ('gc', 'Connector_Punctuation'),
                        ('Join_Control', None)],
                True: []
            },
            u"alpha": {
                False: [('alpha', None)],
                True: []
            },
            u"blank": {
                False: [('gc', 'Space_Separator'),
                        ('na1', 'CHARACTER TABULATION')],
                True: []
            },
            u"cntrl": {
                False: [('gc', 'Control')],
                True: []
            },
            u"digit": {
                False: [('gc', 'Decimal_Number')],
                True: []
            },
            u"graph": {
                False: [],
                True: [('space', None), ('gc', 'Control'), ('gc', 'Surrogate'),
                       ('gc', 'Unassigned')]
            },
            u"lower": {
                False: [('Lowercase', None)],
                True: []
            },
            u"print": {
                False: [('gc', 'Space_Separator'),
                        ('na1', 'CHARACTER TABULATION')],
                True: [('space', None), ('gc', 'Control'), ('gc', 'Surrogate'),
                       ('gc', 'Unassigned')]
            },
            u"punct": {
                False: [('gc', 'Punctuation')],
                True: []
            },
            u"space": {
                False: [('Whitespace', None)],
                True: []
            },
            u"xdigit": {
                False: [('gc', 'Decimal_Number'), ('Hex_Digit', None)],
                True: []
            },
            u"upper": {
                False: [('Uppercase', None)],
                True: []
            }
        }

        self.expect(u':')
        # Collect the letters first and join once instead of growing a string.
        letters = []
        while self.next_is(string.ascii_letters):
            letters.append(self.get_next())
        class_name = "".join(letters)
        self.expect(":")
        self.expect("]")

        if self.is_unicode_defaults and class_name in unicode_classes:
            class_queries = unicode_classes[class_name]
            instance = UnicodeQuery.instance(self.unicode_db)
            coverage = CoverageSet()
            for k, v in class_queries[False]:
                coverage.update(instance.query(k, v))
            if class_queries[True]:
                # Start from the full range and carve out every excluded query.
                inverted_coverage = CoverageSet([(1, 0x10ffff)])
                for k, v in class_queries[True]:
                    inverted_coverage.difference_update(instance.query(k, v))
                coverage.update(inverted_coverage)
            return coverage

        elif not self.is_unicode_defaults and class_name in classes:
            return CoverageSet(classes[class_name])
        else:
            raise RegexParserExceptionInternal(
                "Character class '%s' not recognized" % class_name)