def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges
       mentioned.

       sh -- stream object whose class is named 'StringIO' or 'file'. This
             function never consumes an opening '['; the first thing it
             looks at is the optional negation sign '^'. It reads up to and
             including the closing ']'.

       RETURNS: 'tracker.match_set', or its inverse if the expression
                started with '^'.

       RAISES:  RegularExpressionException on a misplaced '-', on a range
                without an end character, or if the stream ends (reader
                reports 0xFF) before ']' was found.
    """
    assert sh.__class__.__name__ == "StringIO" \
        or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Consume 'letter' if it is the next character; otherwise put the
        # stream back to where it was.
        position = stream.tell()
        if stream.read(1) == letter:
            return True
        stream.seek(position)
        return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()
    if __check_letter(sh, "^"):
        tracker.negation_f = True

    char_code = None
    while char_code != 0xFF:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code == 0xFF:
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            if char_code_2 in [0xFF, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # It is the character *behind* the range operator that must
                # not be a plain '-'. The range start may legitimately be a
                # '-' that came from a backslash sequence.
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f:
        return tracker.match_set.inverse()
    else:
        return tracker.match_set
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges
       mentioned.

       sh -- stream object whose class is named 'StringIO' or 'file'. This
             function never consumes an opening '['; the first thing it
             looks at is the optional negation sign '^'. It reads up to and
             including the closing ']'.

       RETURNS: 'tracker.match_set', or its inverse if the expression
                started with '^'.

       RAISES:  RegularExpressionException on a misplaced '-', on a range
                without an end character, or if the stream ends (reader
                reports 0xFF) before ']' was found.
    """
    assert sh.__class__.__name__ == "StringIO" \
        or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Consume 'letter' if it is the next character; otherwise put the
        # stream back to where it was.
        position = stream.tell()
        if stream.read(1) == letter:
            return True
        stream.seek(position)
        return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()
    if __check_letter(sh, "^"):
        tracker.negation_f = True

    char_code = None
    while char_code != 0xFF:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        if char_code == ord("-"):
            raise RegularExpressionException("Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code == 0xFF:
            raise RegularExpressionException("Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            if char_code_2 in [0xFF, ord(']')]:
                raise RegularExpressionException("Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # The character *behind* the range operator is the one that
                # must not be a plain '-'; 'char_code' was already checked
                # at the top of the loop.
                raise RegularExpressionException("Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f:
        return tracker.match_set.inverse()
    else:
        return tracker.match_set
def get_character_code_sequence(sh):
    """Reads the character codes of a quoted string up to the closing '"'.

       sh -- stream object whose class is named 'StringIO' or 'file'.

       A backslash hands control to 'snap_backslashed_character.do()'; the
       code it delivers is stored like any other character (so a
       backslashed quote does not terminate the string).

       RETURNS: list of character codes in the order they were read. The
                terminating '"' is consumed but not part of the list.

       RAISES:  RegularExpressionException if the reader reports 0xFF
                before the closing quote, or if a backslash sequence
                could not be identified (result is None).
    """
    stream_type_name = sh.__class__.__name__
    assert stream_type_name == "StringIO" or stream_type_name == "file"

    QUOTE = ord('"')
    BACKSLASH = ord("\\")

    codes = []
    while True:
        code = utf8.__read_one_utf8_code_from_stream(sh)

        if code == 0xFF:
            raise RegularExpressionException(
                "End of file reached while parsing quoted string.")
        if code == QUOTE:
            # unescaped quote => end of the string
            break
        if code == BACKSLASH:
            code = snap_backslashed_character.do(sh)
            if code is None:
                raise RegularExpressionException(
                    "Unidentified backslash-sequence in quoted string.")

        codes.append(code)

    return codes
def snap_non_control_character(stream, PatternDict):
    """Reads one UTF8 character from 'stream' and builds a state machine
       with a single transition on that character from the init state to
       an acceptance state.

       stream      -- stream to read the character from.
       PatternDict -- not used inside this function.

       RETURNS: whatever '__debug_exit()' delivers for the state machine.

       If the reader reports 0xFF, 'error_msg()' is called.
       NOTE(review): error_msg() is presumably fatal; if it returned, the
       transition would be built on 0xFF -- confirm.
    """
    __debug_entry("non-control characters", stream)

    # (*) read first character
    code_point = utf8.__read_one_utf8_code_from_stream(stream)
    if code_point == 0xFF:
        error_msg("Character could not be interpreted as UTF8 code or End of File reached prematurely.",
                  stream)

    machine = StateMachine()
    start_index = machine.init_state_index
    machine.add_transition(start_index, code_point, AcceptanceF=True)

    return __debug_exit(machine, stream)
def read_character_code(fh):
    """Tries to read a character code definition from 'fh'.

       Three notations are tried in this order:

         'x'      -- a character in single quotes (optionally backslashed),
         UC NAME  -- 'UC' followed by a unicode character name,
         integer  -- whatever 'read_integer()' accepts.

       RETURNS: the character code, or -1 if none of the notations applies.
                In the -1 case the stream is put back to the position it
                had when the function was entered.

       Syntax errors inside a recognized notation are reported through
       'error_msg()' / 'verify_word_in_list()'.
    """
    # NOTE: This function is tested with the regression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        # 'seek' is a method of the stream, there is no free function of that name.
        fh.seek(pos)
        return -1

    elif start == "'":
        # read an utf-8 char an get the token-id
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        # The closing quote is required for both of the above cases.
        if character_code == 0xFF:
            error_msg("Missing utf8-character for definition of character code by character.", fh)
        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.", fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C":
            fh.seek(pos)
            return -1
        # read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "":
            fh.seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            # A string result is treated as 'name not found'.
            verify_word_in_list(ucs_name, ucs_property_db["Name"].code_point_db,
                                "The string %s\ndoes not identify a known unicode character." % ucs_name,
                                fh)
        elif type(character_code) not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name, fh)
        return character_code

    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code is not None:
        return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1
def get_character_code_sequence(sh):
    """Reads the character codes of a quoted string up to the closing '"'.

       sh -- stream object whose class is named 'StringIO' or 'file'.

       A backslash hands control to 'snap_backslashed_character.do()'; the
       code it delivers is stored like any other character (so a
       backslashed quote does not terminate the string).

       RETURNS: list of character codes in the order they were read. The
                terminating '"' is consumed but not part of the list.

       RAISES:  RegularExpressionException if the reader reports 0xFF
                before the closing quote, or if a backslash sequence
                could not be identified (result is None).
    """
    stream_type_name = sh.__class__.__name__
    assert stream_type_name == "StringIO" or stream_type_name == "file"

    QUOTE = ord('"')
    BACKSLASH = ord("\\")

    codes = []
    while True:
        code = utf8.__read_one_utf8_code_from_stream(sh)

        if code == 0xFF:
            raise RegularExpressionException("End of file reached while parsing quoted string.")
        if code == QUOTE:
            # unescaped quote => end of the string
            break
        if code == BACKSLASH:
            code = snap_backslashed_character.do(sh)
            if code is None:
                raise RegularExpressionException("Unidentified backslash-sequence in quoted string.")

        codes.append(code)

    return codes
def read_character_code(fh):
    """Tries to read a character code definition from 'fh'.

       Recognized notations:

         'x'          -- a character in single quotes,
         UC NAME      -- 'UC' followed by a unicode character name,
         0x.. 0o.. 0b.. -- integer in hexadecimal, octal or binary notation,
         digits       -- decimal integer.

       RETURNS: the character code, or -1 if none of the notations applies.
                In the -1 case the stream is put back to the position it
                had when the function was entered.

       Syntax errors inside a recognized notation are reported through
       'error_msg()'.
       NOTE(review): error_msg() is presumably fatal (does not return);
       the code below relies on that in several places -- confirm.
    """
    # NOTE: This function is tested with the regression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        # 'seek' is a method of the stream, there is no free function of that name.
        fh.seek(pos)
        return -1

    elif start == "'":
        # read an utf-8 char an get the token-id
        # Example: '+'
        character_code = __read_one_utf8_code_from_stream(fh)
        if character_code == 0xFF:
            error_msg("Missing utf8-character for definition of character code by character.", fh)
        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.", fh)
        return character_code

    if start == "U":
        if fh.read(1) != "C":
            fh.seek(pos)
            return -1
        # read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = read_identifier(fh)
        if ucs_name == "":
            fh.seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            error_msg("%s does not identify a known unicode character." % ucs_name, fh)
        if type(character_code) not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name, fh)
        return character_code

    second = fh.read(1)
    if start == "0" and second.isdigit() == False:
        base = second
        if base not in ["x", "o", "b"]:
            error_msg("Number base '0%s' is unknown, please use '0x' for hexidecimal,\n" % base + \
                      "'0o' for octal, or '0b' for binary.", fh)
        number_txt = read_integer(fh)
        if number_txt == "":
            error_msg("Missing integer number after '0%s'" % base, fh)
        try:
            if base == "x":
                character_code = int("0x" + number_txt, 16)
            elif base == "o":
                character_code = int(number_txt, 8)
            elif base == "b":
                character_code = 0
                for letter in number_txt:
                    character_code = character_code << 1
                    if letter == "1":
                        character_code += 1
                    elif letter != "0":
                        error_msg("Letter '%s' not permitted in binary number (something start with '0b')" % letter, fh)
            else:
                # A normal integer number (starting with '0' though)
                character_code = int(base + number_txt)
        except (ValueError, TypeError):
            # Only conversion failures are translated into this message.
            # Anything else (in particular whatever error_msg() raises
            # inside the 'try') must pass through untouched.
            error_msg("The string '%s' is not appropriate for number base '0%s'." % (number_txt, base), fh)

        return character_code

    elif start.isdigit():
        # Undo 'start' and 'second'. An absolute seek is used because
        # 'second' is empty if the digit was the last character of the
        # stream; stepping back two characters would then go too far.
        fh.seek(pos)
        # All that remains is that it is a 'normal' integer
        number_txt = read_integer(fh)

        if number_txt == "":
            fh.seek(pos)
            return -1

        try:
            return int(number_txt)
        except (ValueError, TypeError):
            error_msg("The string '%s' is not appropriate for number base '10'." % number_txt, fh)

    else:
        # Try to interpret it as something else ...
        fh.seek(pos)
        return -1
def read_character_code(fh):
    """Tries to read a character code definition from 'fh'.

       Recognized notations:

         'x'          -- a character in single quotes,
         UC NAME      -- 'UC' followed by a unicode character name,
         0x.. 0o.. 0b.. -- integer in hexadecimal, octal or binary notation,
         digits       -- decimal integer.

       RETURNS: the character code, or -1 if none of the notations applies.
                In the -1 case the stream is put back to the position it
                had when the function was entered.

       Syntax errors inside a recognized notation are reported through
       'error_msg()'.
       NOTE(review): error_msg() is presumably fatal (does not return);
       the code below relies on that in several places -- confirm.
    """
    # NOTE: This function is tested with the regression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        # 'seek' is a method of the stream, there is no free function of that name.
        fh.seek(pos)
        return -1

    elif start == "'":
        # read an utf-8 char an get the token-id
        # Example: '+'
        character_code = __read_one_utf8_code_from_stream(fh)
        if character_code == 0xFF:
            error_msg(
                "Missing utf8-character for definition of character code by character.",
                fh)
        elif fh.read(1) != '\'':
            error_msg(
                "Missing closing ' for definition of character code by character.",
                fh)
        return character_code

    if start == "U":
        if fh.read(1) != "C":
            fh.seek(pos)
            return -1
        # read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = read_identifier(fh)
        if ucs_name == "":
            fh.seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            error_msg(
                "%s does not identify a known unicode character." % ucs_name,
                fh)
        if type(character_code) not in [int, long]:
            error_msg(
                "%s relates to more than one character in unicode database."
                % ucs_name, fh)
        return character_code

    second = fh.read(1)
    if start == "0" and second.isdigit() == False:
        base = second
        if base not in ["x", "o", "b"]:
            error_msg("Number base '0%s' is unknown, please use '0x' for hexidecimal,\n" % base + \
                      "'0o' for octal, or '0b' for binary.", fh)
        number_txt = read_integer(fh)
        if number_txt == "":
            error_msg("Missing integer number after '0%s'" % base, fh)
        try:
            if base == "x":
                character_code = int("0x" + number_txt, 16)
            elif base == "o":
                character_code = int(number_txt, 8)
            elif base == "b":
                character_code = 0
                for letter in number_txt:
                    character_code = character_code << 1
                    if letter == "1":
                        character_code += 1
                    elif letter != "0":
                        error_msg(
                            "Letter '%s' not permitted in binary number (something start with '0b')"
                            % letter, fh)
            else:
                # A normal integer number (starting with '0' though)
                character_code = int(base + number_txt)
        except (ValueError, TypeError):
            # Only conversion failures are translated into this message.
            # Anything else (in particular whatever error_msg() raises
            # inside the 'try') must pass through untouched.
            error_msg(
                "The string '%s' is not appropriate for number base '0%s'."
                % (number_txt, base), fh)

        return character_code

    elif start.isdigit():
        # Undo 'start' and 'second'. An absolute seek is used because
        # 'second' is empty if the digit was the last character of the
        # stream; stepping back two characters would then go too far.
        fh.seek(pos)
        # All that remains is that it is a 'normal' integer
        number_txt = read_integer(fh)

        if number_txt == "":
            fh.seek(pos)
            return -1

        try:
            return int(number_txt)
        except (ValueError, TypeError):
            error_msg(
                "The string '%s' is not appropriate for number base '10'."
                % number_txt, fh)

    else:
        # Try to interpret it as something else ...
        fh.seek(pos)
        return -1
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges
       mentioned.

       sh -- stream object whose class is named 'StringIO' or 'file'. This
             function never consumes an opening '['; the first thing it
             looks at is the optional negation sign '^'. It reads up to and
             including the closing ']'.

       RETURNS: 'tracker.match_set'. If the expression started with '^' the
                inverse is returned instead, cut down to the interval
                [0, Setup.get_character_value_limit()) whenever that limit
                differs from sys.maxint.

       RAISES:  RegularExpressionException on a misplaced '-', on a range
                without an end character, on a range with identical start
                and end, or if the stream ends (reader reports 0xFF)
                before ']' was found.
    """
    assert sh.__class__.__name__ == "StringIO" \
        or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Consume 'letter' if it is the next character; otherwise put the
        # stream back to where it was.
        position = stream.tell()
        if stream.read(1) == letter:
            return True
        stream.seek(position)
        return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()
    if __check_letter(sh, "^"):
        tracker.negation_f = True

    char_code = None
    # Checks for " appearing twice. Some users did use constructs such
    # as "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()
    while char_code != 0xFF:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException("Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code == 0xFF:
            raise RegularExpressionException("Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [0xFF, ord(']')]:
                raise RegularExpressionException("Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # The character *behind* the range operator is the one that
                # must not be a plain '-'; 'char_code' was already checked
                # at the top of the loop.
                raise RegularExpressionException("Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")

            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f:
        result = tracker.match_set.inverse()
        if Setup.get_character_value_limit() != sys.maxint:
            result.intersect_with(Interval(0, Setup.get_character_value_limit()))
        return result
    else:
        return tracker.match_set
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges
       mentioned.

       sh -- stream object whose class is named 'StringIO' or 'file'. This
             function never consumes an opening '['; the first thing it
             looks at is the optional negation sign '^'. It reads up to and
             including the closing ']'.

       RETURNS: 'tracker.match_set'. If the expression started with '^' the
                inverse is returned instead, cut down to the interval
                [0, Setup.get_character_value_limit()) whenever that limit
                differs from sys.maxint.

       RAISES:  RegularExpressionException on a misplaced '-', on a range
                without an end character, on a range with identical start
                and end, or if the stream ends (reader reports 0xFF)
                before ']' was found.
    """
    assert sh.__class__.__name__ == "StringIO" \
        or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Consume 'letter' if it is the next character; otherwise put the
        # stream back to where it was.
        position = stream.tell()
        if stream.read(1) == letter:
            return True
        stream.seek(position)
        return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()
    if __check_letter(sh, "^"):
        tracker.negation_f = True

    char_code = None
    # Checks for " appearing twice. Some users did use constructs such
    # as "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()
    while char_code != 0xFF:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code == 0xFF:
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [0xFF, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # The character *behind* the range operator is the one that
                # must not be a plain '-'; 'char_code' was already checked
                # at the top of the loop.
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")

            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f:
        result = tracker.match_set.inverse()
        if Setup.get_character_value_limit() != sys.maxint:
            result.intersect_with(
                Interval(0, Setup.get_character_value_limit()))
        return result
    else:
        return tracker.match_set
def read_character_code(fh):
    """Tries to read a character code definition from 'fh'.

       Three notations are tried in this order:

         'x'      -- a character in single quotes (optionally backslashed),
         UC NAME  -- 'UC' followed by a unicode character name,
         integer  -- whatever 'read_integer()' accepts.

       RETURNS: the character code, or -1 if none of the notations applies.
                In the -1 case the stream is put back to the position it
                had when the function was entered.

       Syntax errors inside a recognized notation are reported through
       'error_msg()' / 'verify_word_in_list()'.
    """
    # NOTE: This function is tested with the regression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        # 'seek' is a method of the stream, there is no free function of that name.
        fh.seek(pos)
        return -1

    elif start == "'":
        # read an utf-8 char an get the token-id
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(
                fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        # The closing quote is required for both of the above cases.
        if character_code == 0xFF:
            error_msg(
                "Missing utf8-character for definition of character code by character.",
                fh)
        elif fh.read(1) != '\'':
            error_msg(
                "Missing closing ' for definition of character code by character.",
                fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C":
            fh.seek(pos)
            return -1
        # read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "":
            fh.seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            # A string result is treated as 'name not found'.
            verify_word_in_list(
                ucs_name, ucs_property_db["Name"].code_point_db,
                "The string %s\ndoes not identify a known unicode character."
                % ucs_name, fh)
        elif type(character_code) not in [int, long]:
            error_msg(
                "%s relates to more than one character in unicode database."
                % ucs_name, fh)
        return character_code

    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code is not None:
        return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1
def snap_non_control_characters(stream):
    """Snaps a chain of 'non-control characters' (UTF8 encoded, i.e. one
       character may consist of more than one byte) from the stream and
       builds a state machine that matches the chain character by
       character.

       Characters are **concatenated**, but **repetition** has precedence
       over concatenation. So after each character it is checked whether
       a repetition operator ('*', '+', '?', '{..}') follows. If so, the
       repeated character is appended and the function returns right
       away; the remaining characters are left for the next call.

       The chain ends at the first control character or whitespace (the
       stream is stepped back by one so that it can be read again) or
       when the reader reports 0xFF.

       RETURNS: whatever '__debug_exit()' delivers for the state machine.

       RAISES:  RegularExpressionException if a backslash is followed by
                something that is neither a property set nor a known
                backslashed character.
    """
    __debug_entry("non-control characters", stream)

    REPETITION_OPERATORS = [ord("+"), ord("*"), ord("?"), ord("{")]

    machine = StateMachine()
    tail_index = machine.init_state_index

    # (*) read first character
    rewind_position = stream.tell()
    current = utf8.__read_one_utf8_code_from_stream(stream)
    while current != 0xFF:
        # (1) Control characters end the chain. This must happen **before**
        #     backslashed characters are interpreted, because a backslashed
        #     character can be a whitespace (for example '\n').
        #     (only codes below 0xFF are handed to 'chr()')
        if current < 0xFF:
            letter = chr(current)
            if letter in CONTROL_CHARACTERS or letter.isspace():
                stream.seek(-1, 1)
                break

        # (2) Determine what the transition triggers on.
        trigger = current
        if current == ord('\\'):
            stream.seek(-1, 1)
            trigger = character_set_expression.snap_property_set(stream)
            # NOTE(review): equality test against None kept as it is;
            #               the type of a property set is not visible here.
            if trigger == None:
                # snap_property_set() leaves stream right before '\\'
                stream.seek(1, 1)
                current = snap_backslashed_character.do(stream)
                if current is None:
                    raise RegularExpressionException("Backslash followed by unrecognized character code.")
                trigger = current

        # (3) Look one character ahead.
        rewind_position = stream.tell()
        lookahead = utf8.__read_one_utf8_code_from_stream(stream)

        if lookahead not in REPETITION_OPERATORS:
            # (*) plain concatenation: new transition from the current end
            #     of the chain to a new state.
            tail_index = machine.add_transition(tail_index, trigger)
            current = lookahead
            continue

        # (*) Repetition (has precedence over concatenation):
        #     -- state machine that consists of a single transition
        single = StateMachine()
        single.add_transition(single.init_state_index, trigger, AcceptanceF=True)
        #     -- let the repetition parser see the operator again
        stream.seek(rewind_position)
        repeated = __snap_repetition_range(single, stream)
        #     -- append (last state must be acceptance for concatenation)
        machine.states[tail_index].set_acceptance()
        machine = sequentialize.do([machine, repeated],
                                   MountToFirstStateMachineF=True)
        # As soon as there is repetition there might be more than one
        # acceptance state, so simple concatenation via 'add_transition'
        # fails. Return; the remaining characters are treated at the next
        # call to this function.
        return __debug_exit(machine, stream)

    # last character in the chain triggers an 'acceptance state'
    machine.states[tail_index].set_acceptance()

    return __debug_exit(machine, stream)
def snap_non_control_characters(stream):
    """Snaps a chain of 'non-control characters' (UTF8 encoded, i.e. one
       character may consist of more than one byte) from the stream and
       builds a state machine that matches the chain character by
       character.

       Characters are **concatenated**, but **repetition** has precedence
       over concatenation. So after each character it is checked whether
       a repetition operator ('*', '+', '?', '{..}') follows. If so, the
       repeated character is appended and the function returns right
       away; the remaining characters are left for the next call.

       The chain ends at the first control character or whitespace (the
       stream is stepped back by one so that it can be read again) or
       when the reader reports 0xFF.

       RETURNS: whatever '__debug_exit()' delivers for the state machine.

       RAISES:  RegularExpressionException if a backslash is followed by
                something that is neither a property set nor a known
                backslashed character.
    """
    __debug_entry("non-control characters", stream)

    REPETITION_OPERATORS = [ord("+"), ord("*"), ord("?"), ord("{")]

    machine = StateMachine()
    tail_index = machine.init_state_index

    # (*) read first character
    rewind_position = stream.tell()
    current = utf8.__read_one_utf8_code_from_stream(stream)
    while current != 0xFF:
        # (1) Control characters end the chain. This must happen **before**
        #     backslashed characters are interpreted, because a backslashed
        #     character can be a whitespace (for example '\n').
        #     (only codes below 0xFF are handed to 'chr()')
        if current < 0xFF:
            letter = chr(current)
            if letter in CONTROL_CHARACTERS or letter.isspace():
                stream.seek(-1, 1)
                break

        # (2) Determine what the transition triggers on.
        trigger = current
        if current == ord('\\'):
            stream.seek(-1, 1)
            trigger = character_set_expression.snap_property_set(stream)
            # NOTE(review): equality test against None kept as it is;
            #               the type of a property set is not visible here.
            if trigger == None:
                # snap_property_set() leaves stream right before '\\'
                stream.seek(1, 1)
                current = snap_backslashed_character.do(stream)
                if current is None:
                    raise RegularExpressionException(
                        "Backslash followed by unrecognized character code.")
                trigger = current

        # (3) Look one character ahead.
        rewind_position = stream.tell()
        lookahead = utf8.__read_one_utf8_code_from_stream(stream)

        if lookahead not in REPETITION_OPERATORS:
            # (*) plain concatenation: new transition from the current end
            #     of the chain to a new state.
            tail_index = machine.add_transition(tail_index, trigger)
            current = lookahead
            continue

        # (*) Repetition (has precedence over concatenation):
        #     -- state machine that consists of a single transition
        single = StateMachine()
        single.add_transition(single.init_state_index, trigger,
                              AcceptanceF=True)
        #     -- let the repetition parser see the operator again
        stream.seek(rewind_position)
        repeated = __snap_repetition_range(single, stream)
        #     -- append (last state must be acceptance for concatenation)
        machine.states[tail_index].set_acceptance()
        machine = sequentialize.do([machine, repeated],
                                   MountToFirstStateMachineF=True)
        # As soon as there is repetition there might be more than one
        # acceptance state, so simple concatenation via 'add_transition'
        # fails. Return; the remaining characters are treated at the next
        # call to this function.
        return __debug_exit(machine, stream)

    # last character in the chain triggers an 'acceptance state'
    machine.states[tail_index].set_acceptance()

    return __debug_exit(machine, stream)