예제 #1
0
def read_character_code(fh):
    # NOTE: This function is tested with the regeression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()
    
    start = fh.read(1)
    if start == "":  
        seek(pos); return -1

    elif start == "'": 
        # read an utf-8 char an get the token-id
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        if character_code == 0xFF:
            error_msg("Missing utf8-character for definition of character code by character.", fh)

        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.", fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C": seek(pos); return -1
        # read Unicode Name 
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "": seek(pos); return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            verify_word_in_list(ucs_name, ucs_property_db["Name"].code_point_db,
                                "The string %s\ndoes not identify a known unicode character." % ucs_name, 
                                fh)
        elif type(character_code) not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name, fh) 
        return character_code

    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code != None: return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1               
예제 #2
0
def do_shortcut(stream, ShortcutLetter, PropertyAlias):
    """Name property shortcut '\ShortcutLetter{...}' which is a shortcut
       for '\P{PropertyAlias=...}'.
    
       Parse an expression of the form '\N{CHARACTER NAME}'
       and return the related character set of characters that 
       match the given name. Wildcards in are allowed.
    """
    content = __parse_property_expression(stream, ShortcutLetter, EqualConditionPossibleF=False)
    # if len(content) != 1 then an exception is thrown

    property_value = content[0]

    result = ucs_property_db.get_character_set(PropertyAlias, property_value)

    if type(result) == str:
        raise RegularExpressionException(result)

    return result
예제 #3
0
def do(stream):
    """Property expression: '\P{...}'
    
       Parse an expression of the forms:

       '\P{property = value}' or '\P{binary_property}'

        and return the related character set.
    """
    content = __parse_property_expression(stream, "P")
    # if len(content) < 1 or > 2 then an exception is thrown

    property_name = content[0]
    if len(content) == 1: property_value = None
    else:                 property_value = content[1]

    result = ucs_property_db.get_character_set(property_name, property_value)

    if type(result) == str:
        raise RegularExpressionException(result)

    return result
예제 #4
0
def read_character_code(fh):
    # NOTE: This function is tested with the regeression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()
    
    start = fh.read(1)
    if start == "":  
        seek(pos); return -1

    elif start == "'": 
        # read an utf-8 char an get the token-id
        # Example: '+'
        character_code = __read_one_utf8_code_from_stream(fh)
        if character_code == 0xFF:
            error_msg("Missing utf8-character for definition of character code by character.", fh)

        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.", fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C": seek(pos); return -1
        # read Unicode Name 
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = read_identifier(fh)
        if ucs_name == "": seek(pos); return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            error_msg("%s does not identify a known unicode character." % ucs_name, fh) 
        if type(character_code) not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name, fh) 
        return character_code

    second = fh.read(1)
    if start == "0" and second.isdigit() == False:
        base = second
        if base not in ["x", "o", "b"]: 
            error_msg("Number base '0%s' is unknown, please use '0x' for hexidecimal,\n" % base + \
                      "'0o' for octal, or '0b' for binary.", fh)
        number_txt = read_integer(fh)
        if number_txt == "":
            error_msg("Missing integer number after '0%s'" % base, fh)
        try: 
            if   base == "x": character_code = int("0x" + number_txt, 16) 
            elif base == "o": character_code = int(number_txt, 8) 
            elif base == "b": 
                character_code = 0
                for letter in number_txt:
                    character_code = character_code << 1
                    if   letter == "1":   character_code += 1
                    elif letter != "0":
                        error_msg("Letter '%s' not permitted in binary number (something start with '0b')" % letter, fh)
            else:
                # A normal integer number (starting with '0' though)
                character_code = int(base + number_text)
        except:
            error_msg("The string '%s' is not appropriate for number base '0%s'." % (number_txt, base), fh)

        return character_code

    elif start.isdigit():
        fh.seek(-2, 1) # undo 'start' and 'second'
        # All that remains is that it is a 'normal' integer
        number_txt = read_integer(fh)

        if number_txt == "": fh.seek(pos); return -1
        
        try:    return int(number_txt)
        except: error_msg("The string '%s' is not appropriate for number base '10'." % number_txt, fh)

    else:
        # Try to interpret it as something else ...
        fh.seek(pos); return -1               
예제 #5
0
def read_character_code(fh):
    # NOTE: This function is tested with the regeression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        seek(pos)
        return -1

    elif start == "'":
        # read an utf-8 char an get the token-id
        # Example: '+'
        character_code = __read_one_utf8_code_from_stream(fh)
        if character_code == 0xFF:
            error_msg(
                "Missing utf8-character for definition of character code by character.",
                fh)

        elif fh.read(1) != '\'':
            error_msg(
                "Missing closing ' for definition of character code by character.",
                fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C":
            seek(pos)
            return -1
        # read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = read_identifier(fh)
        if ucs_name == "":
            seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            error_msg(
                "%s does not identify a known unicode character." % ucs_name,
                fh)
        if type(character_code) not in [int, long]:
            error_msg(
                "%s relates to more than one character in unicode database." %
                ucs_name, fh)
        return character_code

    second = fh.read(1)
    if start == "0" and second.isdigit() == False:
        base = second
        if base not in ["x", "o", "b"]:
            error_msg("Number base '0%s' is unknown, please use '0x' for hexidecimal,\n" % base + \
                      "'0o' for octal, or '0b' for binary.", fh)
        number_txt = read_integer(fh)
        if number_txt == "":
            error_msg("Missing integer number after '0%s'" % base, fh)
        try:
            if base == "x": character_code = int("0x" + number_txt, 16)
            elif base == "o": character_code = int(number_txt, 8)
            elif base == "b":
                character_code = 0
                for letter in number_txt:
                    character_code = character_code << 1
                    if letter == "1": character_code += 1
                    elif letter != "0":
                        error_msg(
                            "Letter '%s' not permitted in binary number (something start with '0b')"
                            % letter, fh)
            else:
                # A normal integer number (starting with '0' though)
                character_code = int(base + number_text)
        except:
            error_msg(
                "The string '%s' is not appropriate for number base '0%s'." %
                (number_txt, base), fh)

        return character_code

    elif start.isdigit():
        fh.seek(-2, 1)  # undo 'start' and 'second'
        # All that remains is that it is a 'normal' integer
        number_txt = read_integer(fh)

        if number_txt == "":
            fh.seek(pos)
            return -1

        try:
            return int(number_txt)
        except:
            error_msg(
                "The string '%s' is not appropriate for number base '10'." %
                number_txt, fh)

    else:
        # Try to interpret it as something else ...
        fh.seek(pos)
        return -1
예제 #6
0
def read_character_code(fh):
    # NOTE: This function is tested with the regeression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        seek(pos)
        return -1

    elif start == "'":
        # read an utf-8 char an get the token-id
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(
                fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        if character_code == 0xFF:
            error_msg(
                "Missing utf8-character for definition of character code by character.",
                fh)

        elif fh.read(1) != '\'':
            error_msg(
                "Missing closing ' for definition of character code by character.",
                fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C":
            seek(pos)
            return -1
        # read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "":
            seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            verify_word_in_list(
                ucs_name, ucs_property_db["Name"].code_point_db,
                "The string %s\ndoes not identify a known unicode character." %
                ucs_name, fh)
        elif type(character_code) not in [int, long]:
            error_msg(
                "%s relates to more than one character in unicode database." %
                ucs_name, fh)
        return character_code

    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code != None: return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1