def parse_named_character_class(self):
    """
    Parse a named character class ([:...:]) subexpression from the string
    at its current index.

    This method consumes the leading ':', a run of ASCII letters forming
    the class name, and then the closing ':' and ']'. The opening '[' is
    not consumed here.

    When self.is_unicode_defaults is true the class is resolved through
    UnicodeQuery property lookups; otherwise a fixed table of ASCII ranges
    is used.

    @return: a CoverageSet object covering all the characters covered by
        the named class
    @raise RegexParserExceptionInternal: if the class name is not present
        in the table selected by self.is_unicode_defaults
    """
    # ASCII definitions. Ranges appear to be inclusive (min, max) pairs,
    # since single characters are written as (c, c). Following the POSIX
    # locale, DEL (127) is a control character, so it belongs to "cntrl"
    # and is excluded from "graph" and "print".
    ascii_classes = {
        u"alnum": [Parser.lowercase, Parser.uppercase, Parser.digits],
        u"word": [Parser.lowercase, Parser.uppercase, Parser.digits,
                  Parser.underscore],
        u"alpha": [Parser.uppercase, Parser.lowercase],
        u"blank": [Parser.space, Parser.tab],
        u"cntrl": [(0, 31), (127, 127)],
        u"digit": [Parser.digits],
        u"graph": [(33, 126)],
        u"lower": [Parser.lowercase],
        u"print": [(32, 126)],
        u"punct": [(ord(i), ord(i))
                   for i in "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"],
        u"space": [(ord(i), ord(i)) for i in string.whitespace],
        u"xdigit": [Parser.digits, (ord('a'), ord('f')),
                    (ord('A'), ord('F'))],
        u"upper": [Parser.uppercase],
    }

    # Unicode definitions as (included, excluded) lists of (name, value)
    # property queries. "included" queries are unioned together;
    # "excluded" queries are removed from the code point universe and the
    # remainder is unioned in as well.
    # NOTE(review): UTS #18 Annex C defines print as graph + blank minus
    # cntrl; this table adds CHARACTER TABULATION (a control character)
    # without removing it again - confirm whether that is intended.
    unicode_classes = {
        u"alnum": ([('alpha', None), ('digit', None)], []),
        u"word": ([('alpha', None), ('gc', 'Mark'), ('digit', None),
                   ('gc', 'Connector_Punctuation'),
                   ('Join_Control', None)], []),
        u"alpha": ([('alpha', None)], []),
        u"blank": ([('gc', 'Space_Separator'),
                    ('na1', 'CHARACTER TABULATION')], []),
        u"cntrl": ([('gc', 'Control')], []),
        u"digit": ([('gc', 'Decimal_Number')], []),
        u"graph": ([], [('space', None), ('gc', 'Control'),
                        ('gc', 'Surrogate'), ('gc', 'Unassigned')]),
        u"lower": ([('Lowercase', None)], []),
        u"print": ([('gc', 'Space_Separator'),
                    ('na1', 'CHARACTER TABULATION')],
                   [('space', None), ('gc', 'Control'),
                    ('gc', 'Surrogate'), ('gc', 'Unassigned')]),
        u"punct": ([('gc', 'Punctuation')], []),
        u"space": ([('Whitespace', None)], []),
        u"xdigit": ([('gc', 'Decimal_Number'), ('Hex_Digit', None)], []),
        u"upper": ([('Uppercase', None)], []),
    }

    # Read ":name:]" from the input.
    self.expect(u':')
    letters = []
    while self.next_is(string.ascii_letters):
        letters.append(self.get_next())
    class_name = "".join(letters)
    self.expect(":")
    self.expect("]")

    use_unicode = self.is_unicode_defaults
    table = unicode_classes if use_unicode else ascii_classes
    if class_name not in table:
        raise RegexParserExceptionInternal(
            "Character class '%s' not recognized" % class_name)

    if not use_unicode:
        return CoverageSet(ascii_classes[class_name])

    included, excluded = unicode_classes[class_name]
    database = UnicodeQuery.instance(self.unicode_db)
    coverage = CoverageSet()
    for prop, value in included:
        coverage.update(database.query(prop, value))
    if excluded:
        # The universe starts at U+0001, so U+0000 is never added by the
        # complement step.
        complement = CoverageSet([(1, 0x10ffff)])
        for prop, value in excluded:
            complement.difference_update(database.query(prop, value))
        coverage.update(complement)
    return coverage
def parse_unicode_subexpression(self):
    """
    Parse a single "NAME=VALUE" term for a unicode property.

    A word is always parsed as the property name. If
    self.get_next_if(u':=') then reports a match, a second word is parsed
    as the property value; otherwise the value is None. The name/value
    pair is looked up through UnicodeQuery using self.unicode_db.

    @return: A CoverageSet object containing the values covered by the term
    """
    property_name = self.parse_unicode_word()
    property_value = (
        self.parse_unicode_word() if self.get_next_if(u':=') else None)
    return UnicodeQuery.instance(self.unicode_db).query(
        property_name, property_value)
def parse_unicode_name(self):
    """
    Parse a unicode "{...}" expression for querying unicode names.

    The opening "{" is consumed, then a word is parsed and looked up as an
    'na' query through UnicodeQuery using self.unicode_db.

    NOTE(review): the closing "}" is not consumed here - presumably the
    caller handles it; confirm.

    @return: A CoverageSet object containing all characters covered by the
        specified unicode query
    @raise ValueError: if the query result is empty
    """
    self.expect("{")
    character_name = self.parse_unicode_word()
    matches = UnicodeQuery.instance(self.unicode_db).query(
        'na', character_name)
    if not matches.empty():
        return matches
    raise ValueError(
        "Name '{name}' not found".format(name=character_name))
def parse_named_character_class(self):
    """
    Parse a named character class ([:...:]) subexpression from the string
    at its current index.

    This method consumes the leading ':', a run of ASCII letters forming
    the class name, and then the closing ':' and ']'. The opening '[' is
    not consumed here.

    When self.is_unicode_defaults is true the class is resolved through
    UnicodeQuery property lookups; otherwise a fixed table of ASCII ranges
    is used.

    @return: a CoverageSet object covering all the characters covered by
        the named class
    @raise RegexParserExceptionInternal: if the class name is not present
        in the table selected by self.is_unicode_defaults
    """
    # ASCII definitions. Ranges appear to be inclusive (min, max) pairs,
    # since single characters are written as (c, c). Following the POSIX
    # locale, DEL (127) is a control character, so it belongs to "cntrl"
    # and is excluded from "graph" and "print".
    ascii_classes = {
        u"alnum": [Parser.lowercase, Parser.uppercase, Parser.digits],
        u"word": [Parser.lowercase, Parser.uppercase, Parser.digits,
                  Parser.underscore],
        u"alpha": [Parser.uppercase, Parser.lowercase],
        u"blank": [Parser.space, Parser.tab],
        u"cntrl": [(0, 31), (127, 127)],
        u"digit": [Parser.digits],
        u"graph": [(33, 126)],
        u"lower": [Parser.lowercase],
        u"print": [(32, 126)],
        u"punct": [(ord(i), ord(i))
                   for i in "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"],
        u"space": [(ord(i), ord(i)) for i in string.whitespace],
        u"xdigit": [Parser.digits, (ord('a'), ord('f')),
                    (ord('A'), ord('F'))],
        u"upper": [Parser.uppercase],
    }

    # Unicode definitions as (included, excluded) lists of (name, value)
    # property queries. "included" queries are unioned together;
    # "excluded" queries are removed from the code point universe and the
    # remainder is unioned in as well.
    # NOTE(review): UTS #18 Annex C defines print as graph + blank minus
    # cntrl; this table adds CHARACTER TABULATION (a control character)
    # without removing it again - confirm whether that is intended.
    unicode_classes = {
        u"alnum": ([('alpha', None), ('digit', None)], []),
        u"word": ([('alpha', None), ('gc', 'Mark'), ('digit', None),
                   ('gc', 'Connector_Punctuation'),
                   ('Join_Control', None)], []),
        u"alpha": ([('alpha', None)], []),
        u"blank": ([('gc', 'Space_Separator'),
                    ('na1', 'CHARACTER TABULATION')], []),
        u"cntrl": ([('gc', 'Control')], []),
        u"digit": ([('gc', 'Decimal_Number')], []),
        u"graph": ([], [('space', None), ('gc', 'Control'),
                        ('gc', 'Surrogate'), ('gc', 'Unassigned')]),
        u"lower": ([('Lowercase', None)], []),
        u"print": ([('gc', 'Space_Separator'),
                    ('na1', 'CHARACTER TABULATION')],
                   [('space', None), ('gc', 'Control'),
                    ('gc', 'Surrogate'), ('gc', 'Unassigned')]),
        u"punct": ([('gc', 'Punctuation')], []),
        u"space": ([('Whitespace', None)], []),
        u"xdigit": ([('gc', 'Decimal_Number'), ('Hex_Digit', None)], []),
        u"upper": ([('Uppercase', None)], []),
    }

    # Read ":name:]" from the input.
    self.expect(u':')
    letters = []
    while self.next_is(string.ascii_letters):
        letters.append(self.get_next())
    class_name = "".join(letters)
    self.expect(":")
    self.expect("]")

    use_unicode = self.is_unicode_defaults
    table = unicode_classes if use_unicode else ascii_classes
    if class_name not in table:
        raise RegexParserExceptionInternal(
            "Character class '%s' not recognized" % class_name)

    if not use_unicode:
        return CoverageSet(ascii_classes[class_name])

    included, excluded = unicode_classes[class_name]
    database = UnicodeQuery.instance(self.unicode_db)
    coverage = CoverageSet()
    for prop, value in included:
        coverage.update(database.query(prop, value))
    if excluded:
        # The universe starts at U+0001, so U+0000 is never added by the
        # complement step.
        complement = CoverageSet([(1, 0x10ffff)])
        for prop, value in excluded:
            complement.difference_update(database.query(prop, value))
        coverage.update(complement)
    return coverage