示例#1
0
 def parse_named_character_class(self):
     """
     Parse a named character class ([:...:]) subexpression from the string at its current index.
     The index must be at the leading ":"; the class name and the closing ":]" are consumed.
     @return: a CoverageSet object covering all the characters covered by the named class
     @raise RegexParserExceptionInternal: if the class name is not recognized
     """
     # ASCII definitions, used when unicode defaults are disabled
     ascii_classes = {
         u"alnum": [Parser.lowercase, Parser.uppercase, Parser.digits],
         u"word": [Parser.lowercase, Parser.uppercase, Parser.digits, Parser.underscore],
         u"alpha": [Parser.uppercase, Parser.lowercase],
         u"blank": [Parser.space, Parser.tab],
         # DEL (127) is a control character: it belongs to cntrl, not to graph or print
         u"cntrl": [(0, 31), (127, 127)],
         u"digit": [Parser.digits],
         u"graph": [(33, 126)],
         u"lower": [Parser.lowercase],
         u"print": [(32, 126)],
         u"punct": [(ord(i), ord(i)) for i in "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"],
         u"space": [(ord(i), ord(i)) for i in string.whitespace],
         u"xdigit": [Parser.digits, (ord('a'), ord('f')), (ord('A'), ord('F'))],
         u"upper": [Parser.uppercase]
     }
     # Unicode definitions as (included queries, excluded queries). The class is the union of
     # every included query plus, when exclusions exist, everything outside the excluded queries.
     blank_queries = [('gc', 'Space_Separator'), ('na1', 'CHARACTER TABULATION')]
     non_graphic_queries = [('space', None), ('gc', 'Control'), ('gc', 'Surrogate'), ('gc', 'Unassigned')]
     unicode_classes = {
         u"alnum": ([('alpha', None), ('digit', None)], []),
         u"word": ([('alpha', None), ('gc', 'Mark'), ('digit', None),
                    ('gc', 'Connector_Punctuation'), ('Join_Control', None)], []),
         u"alpha": ([('alpha', None)], []),
         u"blank": (blank_queries, []),
         u"cntrl": ([('gc', 'Control')], []),
         u"digit": ([('gc', 'Decimal_Number')], []),
         u"graph": ([], non_graphic_queries),
         u"lower": ([('Lowercase', None)], []),
         u"print": (blank_queries, non_graphic_queries),
         u"punct": ([('gc', 'Punctuation')], []),
         u"space": ([('Whitespace', None)], []),
         u"xdigit": ([('gc', 'Decimal_Number'), ('Hex_Digit', None)], []),
         u"upper": ([('Uppercase', None)], [])
     }

     self.expect(u':')
     letters = []
     while self.next_is(string.ascii_letters):
         letters.append(self.get_next())
     class_name = "".join(letters)
     self.expect(":")
     self.expect("]")

     if self.is_unicode_defaults:
         if class_name in unicode_classes:
             included, excluded = unicode_classes[class_name]
             query = UnicodeQuery.instance(self.unicode_db).query
             coverage = CoverageSet()
             for prop, value in included:
                 coverage.update(query(prop, value))
             if excluded:
                 # Start from every code point and carve out the excluded properties
                 complement = CoverageSet([(1, 0x10ffff)])
                 for prop, value in excluded:
                     complement.difference_update(query(prop, value))
                 coverage.update(complement)
             return coverage
     elif class_name in ascii_classes:
         return CoverageSet(ascii_classes[class_name])

     raise RegexParserExceptionInternal("Character class '%s' not recognized" % class_name)
示例#2
0
 def parse_unicode_subexpression(self):
     """
     Parse one unicode property term: a property name, optionally followed
     by a ":" or "=" separator and a property value.
     @returns: A CoverageSet object containing the values covered by the term
     """
     property_name = self.parse_unicode_word()
     # A value is only present when a separator follows the name
     property_value = self.parse_unicode_word() if self.get_next_if(u':=') else None
     return UnicodeQuery.instance(self.unicode_db).query(property_name, property_value)
示例#3
0
 def parse_unicode_name(self):
     """
     Parse a unicode "{...}" expression for querying unicode names.
     Consumes the opening brace, the name and the closing brace.
     @returns: A CoverageSet object containing all characters covered by
         the specified unicode query
     @raise ValueError: if the query for the name returns an empty coverage
     """
     self.expect("{")
     name = self.parse_unicode_word()
     # The closing brace must be consumed here; otherwise it is left in the
     # stream and the next literal parse rejects it as a stray "}"
     self.expect("}")
     coverage = UnicodeQuery.instance(self.unicode_db).query('na', name)
     if coverage.empty():
         raise ValueError("Name '{name}' not found".format(name=name))
     return coverage
示例#4
0
 def parse_unicode_subexpression(self):
     """
     Parse one unicode property term: a property name, optionally followed
     by a ":" or "=" separator and a property value.
     @returns: A CoverageSet object containing the values covered by the term
     """
     property_name = self.parse_unicode_word()
     # A value is only present when a separator follows the name
     property_value = self.parse_unicode_word() if self.get_next_if(u':=') else None
     return UnicodeQuery.instance(self.unicode_db).query(property_name, property_value)
示例#5
0
 def parse_unicode_name(self):
     """
     Parse a unicode "{...}" expression for querying unicode names.
     Consumes the opening brace, the name and the closing brace.
     @returns: A CoverageSet object containing all characters covered by
         the specified unicode query
     @raise ValueError: if the query for the name returns an empty coverage
     """
     self.expect("{")
     name = self.parse_unicode_word()
     # The closing brace must be consumed here; otherwise it is left in the
     # stream and the next literal parse rejects it as a stray "}"
     self.expect("}")
     coverage = UnicodeQuery.instance(self.unicode_db).query('na', name)
     if coverage.empty():
         raise ValueError("Name '{name}' not found".format(name=name))
     return coverage
示例#6
0
    def parse_named_character_class(self):
        """
        Parse a named character class ([:...:]) subexpression from the string at its current index.
        The index must be at the leading ":"; the class name and the closing ":]" are consumed.
        @return: a CoverageSet object covering all the characters covered by the named class
        @raise RegexParserExceptionInternal: if the class name is not recognized
        """
        # ASCII definitions, used when unicode defaults are disabled
        ascii_classes = {
            u"alnum": [Parser.lowercase, Parser.uppercase, Parser.digits],
            u"word": [
                Parser.lowercase, Parser.uppercase, Parser.digits,
                Parser.underscore
            ],
            u"alpha": [Parser.uppercase, Parser.lowercase],
            u"blank": [Parser.space, Parser.tab],
            # DEL (127) is a control character: it belongs to cntrl, not to graph or print
            u"cntrl": [(0, 31), (127, 127)],
            u"digit": [Parser.digits],
            u"graph": [(33, 126)],
            u"lower": [Parser.lowercase],
            u"print": [(32, 126)],
            u"punct":
            [(ord(i), ord(i)) for i in "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"],
            u"space": [(ord(i), ord(i)) for i in string.whitespace],
            u"xdigit":
            [Parser.digits, (ord('a'), ord('f')), (ord('A'), ord('F'))],
            u"upper": [Parser.uppercase]
        }
        # Unicode definitions as (included queries, excluded queries). The class is the union of
        # every included query plus, when exclusions exist, everything outside the excluded queries.
        blank_queries = [('gc', 'Space_Separator'),
                         ('na1', 'CHARACTER TABULATION')]
        non_graphic_queries = [('space', None), ('gc', 'Control'),
                               ('gc', 'Surrogate'), ('gc', 'Unassigned')]
        unicode_classes = {
            u"alnum": ([('alpha', None), ('digit', None)], []),
            u"word": ([('alpha', None), ('gc', 'Mark'), ('digit', None),
                       ('gc', 'Connector_Punctuation'),
                       ('Join_Control', None)], []),
            u"alpha": ([('alpha', None)], []),
            u"blank": (blank_queries, []),
            u"cntrl": ([('gc', 'Control')], []),
            u"digit": ([('gc', 'Decimal_Number')], []),
            u"graph": ([], non_graphic_queries),
            u"lower": ([('Lowercase', None)], []),
            u"print": (blank_queries, non_graphic_queries),
            u"punct": ([('gc', 'Punctuation')], []),
            u"space": ([('Whitespace', None)], []),
            u"xdigit": ([('gc', 'Decimal_Number'), ('Hex_Digit', None)], []),
            u"upper": ([('Uppercase', None)], [])
        }

        self.expect(u':')
        letters = []
        while self.next_is(string.ascii_letters):
            letters.append(self.get_next())
        class_name = "".join(letters)
        self.expect(":")
        self.expect("]")

        if self.is_unicode_defaults:
            if class_name in unicode_classes:
                included, excluded = unicode_classes[class_name]
                query = UnicodeQuery.instance(self.unicode_db).query
                coverage = CoverageSet()
                for prop, value in included:
                    coverage.update(query(prop, value))
                if excluded:
                    # Start from every code point and carve out the excluded properties
                    complement = CoverageSet([(1, 0x10ffff)])
                    for prop, value in excluded:
                        complement.difference_update(query(prop, value))
                    coverage.update(complement)
                return coverage
        elif class_name in ascii_classes:
            return CoverageSet(ascii_classes[class_name])

        raise RegexParserExceptionInternal(
            "Character class '%s' not recognized" % class_name)
示例#7
0
class Parser(object):
    """
    Converts a string containing a regular expression into a visitable regular expression object.
    """
    # Character ranges, each stored as a (first code point, last code point) pair
    lowercase = (ord('a'), ord('z'))
    uppercase = (ord('A'), ord('Z'))
    digits = (ord('0'), ord('9'))
    underscore = (ord('_'), ord('_'))
    tab = (9, 9)
    line_feed = (10, 10)
    vertical_tab = (11, 11)
    form_feed = (12, 12)
    carriage_return = (13, 13)
    # The doubled parentheses are redundant; this is the same kind of pair as the ranges above
    space = ((ord(' '), ord(' ')))
    # Characters that parse_literal rejects when they appear unescaped
    special = u'(){}[].^$*+-~&?|:'
    # Closing delimiters; parse_literal reports an unescaped one as a missing character
    closing = u')}]'
    # Evaluated once when the class body runs; __init__ raises ValueError if this is None
    # and no database is passed to the constructor
    unicode_db = UnicodeQuery.find_db()
    # Characters that form a character class set operator when doubled (see next_is_set_operator)
    set_operators = "-&~|"

    def __init__(self,
                 text,
                 is_case_insensitive=False,
                 is_unicode_defaults=False,
                 is_literal=False,
                 unicode_db=None):
        """
        Set up a parser positioned at the start of a pattern.
        @param text: string containing the regular expression
        @param is_case_insensitive: boolean which is true if the regular expression should be case insensitive
        @param is_unicode_defaults: boolean which is true if named character classes should be
            resolved through unicode property queries instead of the ASCII tables
        @param is_literal: boolean stored on the instance (not read by the constructor itself)
        @param unicode_db: location of the folder containing the JSON unicode property data files
        @raise ValueError: if no database is given and the class-level default is None
        """
        self.text = text
        self.index = 0
        self.is_case_insensitive = is_case_insensitive
        self.is_unicode_defaults = is_unicode_defaults
        self.is_literal = is_literal
        if unicode_db is None:
            # Fall back to the class-level database, which must have been located
            if Parser.unicode_db is None:
                raise ValueError(
                    "Unicode database not found and no database provided")
        else:
            self.unicode_db = unicode_db

    def parse(self):
        """
        Parse the string from its current index and return a regular expression
        @return: a visitable regular expression object from the Regex package
        @raise RegexParserExpected: if text remains after the top-level expression
        """
        regex_object = self.parse_alternation()
        if self.index < len(self.text):
            # Raise directly: expect() would treat the message as a set of acceptable
            # characters and could silently consume a leftover character instead
            raise RegexParserExpected(u'end of pattern', self.text, self.index)
        return regex_object

    def parse_alternation(self):
        """
        Parse one or more concatenations separated by "|" starting at the current index.
        @return: a Regex.Alternation when there are several alternatives, otherwise the
            single parsed expression itself.
        """
        alternatives = []
        while True:
            alternatives.append(self.parse_concatenation())
            if not self.get_next_if(u'|'):
                break
        if len(alternatives) == 1:
            return alternatives[0]
        return Regex.Alternation(alternatives)

    def parse_concatenation(self):
        """
        Parse a run of qualified expressions starting at the current index, stopping at
        the end of the string, a "|" or a ")".
        @return: a Regex.Concatenation when there are several terms, otherwise the
            single parsed expression itself.
        """
        terms = [self.parse_qualified()]
        while self.next_is_not(u'|)'):
            terms.append(self.parse_qualified())
        return terms[0] if len(terms) == 1 else Regex.Concatenation(terms)

    def parse_qualified(self):
        """
        Parse a single expression together with an optional quantifier ("*", "+", "?" or "{min,max}").
        @return: a Regex.Repetition when a quantifier follows, otherwise the parsed expression itself.
        """
        child = self.parse_character()
        quantifiers = (
            (u'*', 0, Regex.Repetition.Infinity),
            (u'+', 1, Regex.Repetition.Infinity),
            (u'?', 0, 1),
        )
        for symbol, minimum, maximum in quantifiers:
            if self.get_next_if(symbol):
                return Regex.Repetition(child, minimum, maximum)
        # A "{" starts a repetition only if the character after it cannot begin a
        # variable name; otherwise it is a variable reference following this expression
        if self.next_is(u'{') and self.nth_next_is_not(
                2, string.ascii_letters + '.'):
            self.get_next()
            return self.parse_repetition(child)
        return child

    def parse_character(self):
        """
        Parse a character class, a parenthesized sub-expression, a variable reference or a
        literal, depending on the character at the current index.
        @return: a visitable regular expression object from the Regex package.
        """
        if self.get_next_if(u'['):
            return self.parse_character_class()
        if self.get_next_if(u'('):
            group = self.parse_alternation()
            self.expect(u')')
            return group
        if self.get_next_if(u'{'):
            return self.parse_variable()
        return self.parse_literal()

    def parse_literal(self, suppress_case_insensitive=False):
        """
        Parse a single character, "." or backslash escape from the string at its current index.
        Supported escapes: \\w \\W \\r \\n \\t \\s \\d \\v \\f, \\xHH, \\uHHHH, \\UHHHHHH,
        \\p{...} and \\P{...} (unicode properties), \\N{...} (unicode name). Any other
        escaped character is taken literally.
        @param suppress_case_insensitive: if true, never expand a character into both of its cases
        @return: a Regex.Literal or Regex.LiteralExcept object representing the character.
        @raise RegexParserExpected: if an unescaped closing delimiter is found
        @raise RegexParserInvalidCharacter: if an unescaped special character is found
        """
        fold_case = self.is_case_insensitive and not suppress_case_insensitive

        def make_literal(character):
            # Cover both cases of the character when case folding applies and the cases differ
            if fold_case:
                lowercase = character.lower()
                uppercase = character.upper()
                if lowercase != uppercase:
                    return Regex.Literal([(ord(lowercase), ord(lowercase)),
                                          (ord(uppercase), ord(uppercase))])
            return Regex.Literal([(ord(character), ord(character))])

        if self.get_next_if(u'.'):
            return Regex.Literal([(1, 0x10FFFF)])

        if not self.get_next_if(u'\\'):
            character = self.get_next()
            if character in Parser.closing:
                raise RegexParserExpected("character", self.text,
                                          self.index - 1)
            if character in Parser.special:
                raise RegexParserInvalidCharacter(character)
            return make_literal(character)

        # Word characters include digits, matching the [:word:] named class
        word_ranges = [
            Parser.lowercase, Parser.uppercase, Parser.digits,
            Parser.underscore
        ]
        if self.get_next_if(u'w'):
            return Regex.Literal(word_ranges)
        if self.get_next_if(u'W'):
            return Regex.LiteralExcept(word_ranges)

        # Escapes that stand for one fixed character range
        single_range_escapes = (
            (u'r', Parser.carriage_return),
            (u'n', Parser.line_feed),
            (u't', Parser.tab),
            (u's', Parser.space),
            (u'd', Parser.digits),
            (u'v', Parser.vertical_tab),
            (u'f', Parser.form_feed),
        )
        for escape, character_range in single_range_escapes:
            if self.get_next_if(escape):
                return Regex.Literal([character_range])

        # Escapes followed by a fixed number of hexadecimal digits
        for escape, num_digits in ((u'x', 2), (u'u', 4), (u'U', 6)):
            if self.get_next_if(escape):
                codepoint = self.parse_hex_digits(num_digits)
                return Regex.Literal([(codepoint, codepoint)])

        if self.get_next_if(u'p'):
            return Regex.Literal(self.parse_unicode_expression())
        if self.get_next_if(u'N'):
            return Regex.Literal(self.parse_unicode_name())
        if self.get_next_if(u'P'):
            return Regex.LiteralExcept(self.parse_unicode_expression())

        # Any other escaped character stands for itself
        return make_literal(self.get_next())

    def parse_hex_digits(self, num_digits):
        """
        Read a fixed number of hexadecimal digits from the string at its current index.
        @param num_digits: how many hexadecimal digits to read
        @return: the integer value of the digits
        @raise RegexParserUnicodeCodepointOutOfRange: if the value exceeds 0x10ffff
        """
        digits = []
        for _ in range(num_digits):
            if not self.next_is(string.hexdigits):
                # Not a hex digit, so this call reports the error
                self.expect(string.hexdigits, name='hexadecimal digit')
            digits.append(self.get_next())
        codepoint = int(''.join(digits), 16)
        if codepoint > 0x10ffff:
            raise RegexParserUnicodeCodepointOutOfRange(codepoint)
        return codepoint

    def parse_variable(self):
        """
        Parse a variable reference (ASCII letters and dots, terminated by "}") from the
        string at its current index.
        @return: a Regex.Variable object representing the variable instance
        """
        name_characters = []
        while self.next_is(string.ascii_letters + '.'):
            name_characters.append(self.get_next())
        self.expect("}")
        return Regex.Variable(''.join(name_characters))

    def parse_repetition(self, child):
        """
        Parse the "min,max}" remainder of a repetition from the string at its current index.
        @param child: a visitable object from the Regex package containing the repeated expression.
        @return: a Regex.Repetition object representing the repetition.
        @raise RegexParserExceptionInternal: if the minimum is larger than the maximum
        """
        minimum = self.parse_integer()
        self.expect(u',')
        maximum = self.parse_integer()
        self.expect(u'}')
        if minimum <= maximum:
            return Regex.Repetition(child, minimum, maximum)
        raise RegexParserExceptionInternal(
            "Minimum repetition (%d) cannot be larger than maximum repetition (%d)."
            % (minimum, maximum))

    def parse_integer(self):
        """
        Parse an integer from the string at its current index.
        @return: the integer parsed from the string.
        @raise RegexParserExpected: if the current character is not a decimal digit
        """
        if not self.next_is(string.digits):
            # Raise directly: expect(u'integer') would accept any of the letters
            # i, n, t, e, g, r and then fail in int('') with a ValueError
            raise RegexParserExpected(u'integer', self.text, self.index)
        digits = []
        while self.next_is(string.digits):
            digits.append(self.get_next())
        return int(''.join(digits))

    def expect(self, character, name=None):
        """
        Consume the character at the current index, or raise if it is not an accepted value.
        @param character: a string of the values that the current character may take.
        @param name: description to report in the exception instead of the accepted characters.
        @raise RegexParserExpected: if the current character is not accepted or the string has ended
        """
        if self.next_is(character):
            self.index += 1
            return
        expected = unicode(character) if name is None else name
        raise RegexParserExpected(expected, self.text, self.index)

    def get_next(self):
        """
        Return the character at the current index and advance past it. Raises through
        expect() when the index is already at the end of the pattern.
        @return: String containing the next character
        """
        if self.index >= len(self.text):
            # At end of pattern, so this call raises
            self.expect('character')
        position = self.index
        self.index = position + 1
        return unicode(self.text[position])

    def get_next_if(self, characters):
        """
        Advance the stream only if the next character is one of a set of characters
        @param characters: A string containing the characters for which to check
        @return: True if a character was consumed, False otherwise
        """
        matched = self.next_is(characters)
        if matched:
            self.get_next()
        return matched

    def next_is(self, characters):
        """
        Check whether the character at the current index is one of a set of characters.
        @param characters: a string containing the accepted characters.
        @return: True if the next character is in the set, False otherwise or if at end of string
        """
        lookahead = 1
        return self.nth_next_is(lookahead, characters)

    def nth_next_is(self, n, characters):
        """
        Look ahead n characters (1 is the character at the current index) and check whether
        that character is one of a set of characters.
        @param n: how many characters to look ahead. Must be greater than 0
        @param characters: a string containing the accepted characters.
        @return: True if the character is in the set, False otherwise, if n is not positive,
            or if the lookahead runs past the end of the string
        """
        if n <= 0:
            return False
        position = self.index + n - 1
        if position >= len(self.text):
            return False
        return self.text[position] in characters

    def next_is_not(self, characters):
        """
        Check that a next character exists and is not one of a set of characters.
        @param characters: a string containing the characters the next character must not be.
        @return: True if the next character is not in the set, False otherwise or if at end of string
        """
        if self.index >= len(self.text):
            return False
        return self.text[self.index] not in characters

    def nth_next_is_not(self, n, characters):
        """
        Look ahead n characters (1 is the character at the current index) and check that the
        character exists and is not one of a set of characters.
        @param n: how many characters to look ahead. Must be greater than 0
        @param characters: a string containing the characters that character must not be.
        @return: True if the character is not in the set, False otherwise, if n is not positive,
            or if the lookahead runs past the end of the string
        """
        if n <= 0:
            return False
        position = self.index + n - 1
        if position >= len(self.text):
            return False
        return self.text[position] not in characters

    def next_is_set_operator(self):
        """
        Check whether the next two characters are the same set operator character (e.g. "--" or "&&").
        @return: True if a doubled set operator starts at the current index, False otherwise
        """
        if not self.next_is(Parser.set_operators):
            return False
        operator = self.text[self.index]
        return self.nth_next_is(2, operator)

    def parse_character_class(self):
        """
        Parse the remainder of a character class ([...]) expression from the string at its current index.
        @return: a Regex.Literal object built from the ranges of the parsed class
        """
        coverage = self.parse_character_class_expression()
        return Regex.Literal(list(coverage))

    def parse_named_character_class(self):
        """
        Parse a named character class ([:...:]) subexpression from the string at its current index.
        The index must be at the leading ":"; the class name and the closing ":]" are consumed.
        @return: a CoverageSet object covering all the characters covered by the named class
        @raise RegexParserExceptionInternal: if the class name is not recognized
        """
        # ASCII definitions, used when unicode defaults are disabled
        ascii_classes = {
            u"alnum": [Parser.lowercase, Parser.uppercase, Parser.digits],
            u"word": [
                Parser.lowercase, Parser.uppercase, Parser.digits,
                Parser.underscore
            ],
            u"alpha": [Parser.uppercase, Parser.lowercase],
            u"blank": [Parser.space, Parser.tab],
            # DEL (127) is a control character: it belongs to cntrl, not to graph or print
            u"cntrl": [(0, 31), (127, 127)],
            u"digit": [Parser.digits],
            u"graph": [(33, 126)],
            u"lower": [Parser.lowercase],
            u"print": [(32, 126)],
            u"punct":
            [(ord(i), ord(i)) for i in "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"],
            u"space": [(ord(i), ord(i)) for i in string.whitespace],
            u"xdigit":
            [Parser.digits, (ord('a'), ord('f')), (ord('A'), ord('F'))],
            u"upper": [Parser.uppercase]
        }
        # Unicode definitions as (included queries, excluded queries). The class is the union of
        # every included query plus, when exclusions exist, everything outside the excluded queries.
        blank_queries = [('gc', 'Space_Separator'),
                         ('na1', 'CHARACTER TABULATION')]
        non_graphic_queries = [('space', None), ('gc', 'Control'),
                               ('gc', 'Surrogate'), ('gc', 'Unassigned')]
        unicode_classes = {
            u"alnum": ([('alpha', None), ('digit', None)], []),
            u"word": ([('alpha', None), ('gc', 'Mark'), ('digit', None),
                       ('gc', 'Connector_Punctuation'),
                       ('Join_Control', None)], []),
            u"alpha": ([('alpha', None)], []),
            u"blank": (blank_queries, []),
            u"cntrl": ([('gc', 'Control')], []),
            u"digit": ([('gc', 'Decimal_Number')], []),
            u"graph": ([], non_graphic_queries),
            u"lower": ([('Lowercase', None)], []),
            u"print": (blank_queries, non_graphic_queries),
            u"punct": ([('gc', 'Punctuation')], []),
            u"space": ([('Whitespace', None)], []),
            u"xdigit": ([('gc', 'Decimal_Number'), ('Hex_Digit', None)], []),
            u"upper": ([('Uppercase', None)], [])
        }

        self.expect(u':')
        letters = []
        while self.next_is(string.ascii_letters):
            letters.append(self.get_next())
        class_name = "".join(letters)
        self.expect(":")
        self.expect("]")

        if self.is_unicode_defaults:
            if class_name in unicode_classes:
                included, excluded = unicode_classes[class_name]
                query = UnicodeQuery.instance(self.unicode_db).query
                coverage = CoverageSet()
                for prop, value in included:
                    coverage.update(query(prop, value))
                if excluded:
                    # Start from every code point and carve out the excluded properties
                    complement = CoverageSet([(1, 0x10ffff)])
                    for prop, value in excluded:
                        complement.difference_update(query(prop, value))
                    coverage.update(complement)
                return coverage
        elif class_name in ascii_classes:
            return CoverageSet(ascii_classes[class_name])

        raise RegexParserExceptionInternal(
            "Character class '%s' not recognized" % class_name)

    def parse_character_class_subexpression(self):
        """
        Parse what follows a "[" inside a character class: either a named character
        class "[:...:]" or a nested group "[...]".
        @returns: a CoverageSet object containing the characters in the sub-expression
        """
        if self.next_is(':'):
            return self.parse_named_character_class()
        return self.parse_character_class_expression()

    def parse_character_class_character(self):
        """
        Parse a single literal, but accommodate sub-expressions such as "[:...:]" or "[...]"
        @return: a CoverageSet object containing the characters that were parsed
        """
        if not self.get_next_if(u'['):
            # Case folding is suppressed here and handled by the range parser instead
            return self.parse_literal(True).characters
        return self.parse_character_class_subexpression()

    def parse_character_class_range(self):
        """
        Parse a single term or a range (e.g. a-z) expression from the string at its current index
        @return: a CoverageSet object containing all the characters in the term or range
        @raise RegexParserInvalidCharacterRange: if an endpoint is not a single range or
            the start is larger than the end
        """
        start_literal = self.parse_character_class_character()
        # A "-" followed by another "-" is the set difference operator, not a range
        if not self.next_is('-') or self.nth_next_is(2, '-'):
            return start_literal
        self.get_next()
        end_literal = self.parse_character_class_character()

        # Range must be two single characters with the end having a larger value than the start
        if len(start_literal) > 1 or len(end_literal) > 1:
            raise RegexParserInvalidCharacterRange(start_literal, end_literal)
        start_ordinal = next(iter(start_literal))[0]
        end_ordinal = next(iter(end_literal))[0]
        if start_ordinal > end_ordinal:
            raise RegexParserInvalidCharacterRange(unichr(start_ordinal),
                                                   unichr(end_ordinal))

        # The range as written is always covered, so characters between the endpoints
        # that have no case (e.g. "[" in A-z) are not lost when folding case
        coverage = CoverageSet([(start_ordinal, end_ordinal)])
        if self.is_case_insensitive:
            start_character = unichr(start_ordinal)
            end_character = unichr(end_ordinal)
            folded_ranges = (
                (ord(start_character.lower()), ord(end_character.lower())),
                (ord(start_character.upper()), ord(end_character.upper())),
            )
            for folded_start, folded_end in folded_ranges:
                # Folding only one endpoint's case can invert the range (e.g. Z-a); skip those
                if folded_start <= folded_end:
                    coverage.update(CoverageSet([(folded_start, folded_end)]))
        return coverage

    def parse_character_class_terms(self):
        """
        Parse a series of terms within a character class ("a-zbc", etc), stopping at "]",
        a line break, the end of the string or a set operator.
        @returns: the union of their coverage
        """
        union = CoverageSet()
        while True:
            union.update(self.parse_character_class_range())
            if not self.next_is_not(']\r\n') or self.next_is_set_operator():
                break
        return union

    def parse_character_class_expression(self):
        """
        Parses the text after the left bracket of a "[...]" expression: an optional "^",
        then groups of terms combined left to right by doubled set operators, then "]".
        @returns: a CoverageSet object covering all the characters covered by the character class.
        """
        is_negated = self.get_next_if(u'^')

        result = self.parse_character_class_terms()
        while self.next_is_set_operator():
            operator = self.get_next()
            # Operators are written doubled; skip the second character
            self.get_next()
            if self.next_is(u'])}'):
                raise RegexParserExpected("expression term", self.text,
                                          self.index)
            operand = self.parse_character_class_terms()
            if operator == u'|':
                result.update(operand)
            elif operator == u'&':
                result.intersection_update(operand)
            elif operator == u'-':
                result.difference_update(operand)
            elif operator == u'~':
                # Symmetric difference: the union minus the common part
                common = CoverageSet.intersection(result, operand)
                result.update(operand)
                result.difference_update(common)
        self.expect("]")

        if not is_negated:
            return result

        complement = CoverageSet()
        complement.add(1, 0x10FFFF)
        complement.difference_update(result)
        return complement

    def parse_unicode_expression(self):
        """
        Parse a unicode "{...}" expression made of one or more property terms separated by "|".
        @returns: A CoverageSet object containing the union of the characters covered by
            each term.
        """
        self.expect("{")
        union = self.parse_unicode_subexpression()
        while True:
            if not self.get_next_if(u'|'):
                break
            union.update(self.parse_unicode_subexpression())
        self.expect("}")
        return union

    def parse_unicode_name(self):
        """
        Parse a unicode "{...}" expression for querying unicode names.
        Consumes the opening brace, the name and the closing brace.
        @returns: A CoverageSet object containing all characters covered by
            the specified unicode query
        @raise ValueError: if the query for the name returns an empty coverage
        """
        self.expect("{")
        name = self.parse_unicode_word()
        # The closing brace must be consumed here; otherwise it is left in the
        # stream and the next literal parse rejects it as a stray "}"
        self.expect("}")
        coverage = UnicodeQuery.instance(self.unicode_db).query('na', name)
        if coverage.empty():
            raise ValueError("Name '{name}' not found".format(name=name))
        return coverage

    def parse_unicode_subexpression(self):
        """
        Parse one unicode property term: a property name, optionally followed
        by a ":" or "=" separator and a property value.
        @returns: A CoverageSet object containing the values covered by the term
        """
        property_name = self.parse_unicode_word()
        # A value is only present when a separator follows the name
        property_value = self.parse_unicode_word() if self.get_next_if(u':=') else None
        return UnicodeQuery.instance(self.unicode_db).query(property_name, property_value)

    def parse_unicode_word(self):
        """
        Scan a run of ASCII letters, dropping any spaces, tabs, hyphens and underscores
        found along the way.
        @return: the filtered name that was parsed out.
        @raise RegexParserExpected: if no letter was found
        """
        letters = []
        while True:
            if self.next_is(' \t-_'):
                # Separator characters are consumed but not kept
                self.get_next()
                continue
            if not self.next_is(string.ascii_letters):
                break
            letters.append(self.get_next())
        if not letters:
            raise RegexParserExpected("string", self.text, self.index)
        return ''.join(letters)