def __parse_section(fh, descriptor, already_defined_list):
    global token_type_code_fragment_db
    assert type(already_defined_list) == list

    SubsectionList = ["name", "file_name", "standard", "distinct", "union",
                      "inheritable", "noid"] \
                     + token_type_code_fragment_db.keys()

    position = fh.tell()
    skip_whitespace(fh)
    word = read_identifier(fh)
    if word == "":
        fh.seek(position)
        if check(fh, "}"):
            fh.seek(position)
            return False
        error_msg("Missing token_type section ('standard', 'distinct', or 'union').", fh)

    verify_word_in_list(word, SubsectionList,
                        "Subsection '%s' not allowed in token_type section." % word, fh)

    if word == "name":
        if not check(fh, "="):
            error_msg("Missing '=' in token_type 'name' specification.", fh)
        descriptor.class_name, descriptor.name_space, descriptor.class_name_safe = \
            read_namespaced_name(fh, "token_type")
        if not check(fh, ";"):
            error_msg("Missing terminating ';' in token_type 'name' specification.", fh)

    elif word == "inheritable":
        descriptor.open_for_derivation_f = True
        check_or_die(fh, ";")

    elif word == "noid":
        descriptor.token_contains_token_id_f = False
        check_or_die(fh, ";")

    elif word == "file_name":
        if not check(fh, "="):
            error_msg("Missing '=' in token_type 'file_name' specification.", fh)
        descriptor.set_file_name(read_until_letter(fh, ";"))
        if not check(fh, ";"):
            error_msg("Missing terminating ';' in token_type 'file_name' specification.", fh)

    elif word in ["standard", "distinct", "union"]:
        if   word == "standard": parse_standard_members(fh, word, descriptor, already_defined_list)
        elif word == "distinct": parse_distinct_members(fh, word, descriptor, already_defined_list)
        elif word == "union":    parse_union_members(fh, word, descriptor, already_defined_list)

        if not check(fh, "}"):
            fh.seek(position)
            error_msg("Missing closing '}' at end of token_type section '%s'." % word, fh)

    elif word in token_type_code_fragment_db.keys():
        fragment = code_fragment.parse(fh, word, AllowBriefTokenSenderF=False)
        descriptor.__dict__[word] = fragment

    else:
        assert False, "This code section should not be reachable because 'word'\n" + \
                      "was checked to fit in one of the 'elif' cases."

    return True
def __parse_event(new_mode, fh, word):
    pos = fh.tell()

    # Allow '<<EOF>>' and '<<FAIL>>' out of respect for classical tools like 'lex'
    if   word == "<<EOF>>":  word = "on_end_of_stream"
    elif word == "<<FAIL>>": word = "on_failure"
    elif word in blackboard.all_section_title_list:
        error_msg("Pattern '%s' is a quex section title. Has the closing '}' of mode %s\n" % (word, new_mode.name) \
                  + "been forgotten? Else use quotes, i.e. \"%s\"." % word, fh)
    elif len(word) < 3 or word[:3] != "on_":
        return False

    comment = "Unknown event handler '%s'.\n" % word + \
              "Note that any pattern starting with 'on_' is considered an event handler.\n" + \
              "Use double quotes to bracket patterns that start with 'on_'."

    __general_validate(fh, new_mode, word, pos)
    verify_word_in_list(word, event_handler_db.keys(), comment, fh)
    __validate_required_token_policy_queue(word, fh, pos)

    continue_f = True
    if word == "on_end_of_stream":
        # When a termination token is sent, no other token shall follow.
        # => Enforce a return from the analyzer! Do not allow CONTINUE!
        continue_f = False

    new_mode.events[word] = code_fragment.parse(fh,
                                                "%s::%s event handler" % (new_mode.name, word),
                                                ContinueF=continue_f)
    return True
def parse(fh, new_mode):
    source_reference = SourceRef.from_FileHandle(fh)
    identifier       = read_option_start(fh)
    if identifier is None:
        return False

    verify_word_in_list(identifier, mode_option_info_db.keys(),
                        "mode option", fh.name, get_current_line_info_number(fh))

    if identifier == "skip":
        value = __parse_skip_option(fh, new_mode, identifier)

    elif identifier in ["skip_range", "skip_nested_range"]:
        value = __parse_range_skipper_option(fh, identifier, new_mode)

    elif identifier == "indentation":
        value = counter.parse_indentation(fh)
        value.set_containing_mode_name(new_mode.name)
        blackboard.required_support_indentation_count_set()

    elif identifier == "counter":
        value = counter.parse_line_column_counter(fh)

    elif identifier in ("entry", "exit", "restrict"):
        value = read_option_value(fh, ListF=True)   # A 'list' of strings

    else:
        value = read_option_value(fh)               # A single string

    # Finally, set the option
    new_mode.option_db.enter(identifier, value, source_reference, new_mode.name)
    return True
def __validate_definition(TheCodeFragment, NameStr, AlreadyMentionedList, StandardMembersF):
    FileName = TheCodeFragment.sr.file_name
    LineN    = TheCodeFragment.sr.line_n

    if StandardMembersF:
        verify_word_in_list(NameStr, TokenType_StandardMemberList,
                            "Member name '%s' not allowed in token_type section 'standard'." % NameStr,
                            FileName, LineN)

        # Standard members are all of numeric type
        if    TheCodeFragment.contains_string(Lng.Match_string) \
           or TheCodeFragment.contains_string(Lng.Match_vector) \
           or TheCodeFragment.contains_string(Lng.Match_map):
            type_str = TheCodeFragment.get_text()
            error_msg("Numeric type required.\n"
                      "Example: <token_id: uint16_t>, Found: '%s'\n" % type_str,
                      FileName, LineN)
    else:
        if NameStr in TokenType_StandardMemberList:
            error_msg("Member '%s' only allowed in 'standard' section." % NameStr,
                      FileName, LineN)

    for candidate in AlreadyMentionedList:
        if candidate[0] != NameStr:
            continue
        error_msg("Token type member name '%s' defined twice." % NameStr,
                  FileName, LineN, DontExitF=True)
        error_msg("Previously defined here.",
                  candidate[1].sr.file_name, candidate[1].sr.line_n)
def __start_mode(applicable_mode_name_list, mode_name_list):
    """If more than one mode is defined, then an explicit definition
       'start = mode' is required.
    """
    assert len(applicable_mode_name_list) != 0

    start_mode = blackboard.initial_mode.get_pure_code()
    if start_mode == "":
        # Choose an applicable mode as start mode
        start_mode              = applicable_mode_name_list[0]
        blackboard.initial_mode = CodeFragment(start_mode)
        if len(applicable_mode_name_list) > 1:
            error_msg("No initial mode defined via 'start' while more than one applicable mode exists.\n" + \
                      "Use for example 'start = %s;' in the quex source file to define an initial mode." \
                      % start_mode)
        # This branch: start mode is applicable and present
    else:
        FileName = blackboard.initial_mode.filename
        LineN    = blackboard.initial_mode.line_n
        # Start mode present and applicable?
        verify_word_in_list(start_mode, mode_name_list,
                            "Start mode '%s' is not defined." % start_mode,
                            FileName, LineN)
        verify_word_in_list(start_mode, applicable_mode_name_list,
                            "Start mode '%s' is inheritable only and cannot be instantiated." % start_mode,
                            FileName, LineN)
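# A minimal standalone sketch of the implicit start-mode choice above (toy
# function; 'resolve_start_mode' is an illustrative name, not quex API):
def resolve_start_mode(explicit_start, applicable_mode_names):
    """If no 'start = ...;' was given, the first applicable mode is taken;
    more than one candidate is then an error."""
    if explicit_start == "":
        if len(applicable_mode_names) > 1:
            raise ValueError("No initial mode defined via 'start' while more "
                             "than one applicable mode exists.")
        return applicable_mode_names[0]
    return explicit_start

# resolve_start_mode("", ["PROGRAM"])                   => "PROGRAM"
# resolve_start_mode("COMMENT", ["PROGRAM", "COMMENT"]) => "COMMENT"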
def __validate_definition(TypeCodeFragment, NameStr, AlreadyMentionedList, StandardMembersF):
    FileName = TypeCodeFragment.filename
    LineN    = TypeCodeFragment.line_n

    if StandardMembersF:
        verify_word_in_list(NameStr, TokenType_StandardMemberList,
                            "Member name '%s' not allowed in token_type section 'standard'." % NameStr,
                            FileName, LineN)

        # Standard members are all of numeric type
        TypeStr = TypeCodeFragment.get_pure_code()
        if    TypeStr.find("string") != -1 \
           or TypeStr.find("vector") != -1 \
           or TypeStr.find("map")    != -1:
            error_msg("Numeric type required.\n"
                      "Example: <token_id: uint16_t>, Found: '%s'\n" % TypeStr,
                      FileName, LineN)
    else:
        if NameStr in TokenType_StandardMemberList:
            error_msg("Member '%s' only allowed in 'standard' section." % NameStr,
                      FileName, LineN)

    for candidate in AlreadyMentionedList:
        if candidate[0] != NameStr:
            continue
        error_msg("Token type member name '%s' defined twice." % NameStr,
                  FileName, LineN, DontExitF=True)
        error_msg("Previously defined here.",
                  candidate[1].filename, candidate[1].line_n)
def snap_replacement(stream, PatternDict, StateMachineF=True):
    """Snaps a predefined pattern from the input string and returns the
       resulting state machine.
    """
    skip_whitespace(stream)
    pattern_name = read_identifier(stream)
    if pattern_name == "":
        raise RegularExpressionException("Pattern replacement expression misses identifier after '{'.")
    skip_whitespace(stream)
    if not check(stream, "}"):
        raise RegularExpressionException("Pattern replacement expression misses closing '}' after '%s'." \
                                         % pattern_name)

    verify_word_in_list(pattern_name, PatternDict.keys(),
                        "Specifier '%s' not found in any preceding 'define { ... }' section." % pattern_name,
                        stream)

    reference = PatternDict[pattern_name]
    assert reference.__class__.__name__ == "PatternShorthand"

    # The replacement may be a state machine or a number set
    if StateMachineF:
        # Get a cloned version of the state machine
        state_machine = reference.get_state_machine()
        assert isinstance(state_machine, StateMachine)

        # It is essential that state machines defined as patterns do not have
        # origins. Otherwise, the optimization of patterns that contain pattern
        # replacements might get confused and cannot find all optimizations.
        assert state_machine.has_origins() == False

        # A state machine that contains pre- or post-conditions cannot be part
        # of a replacement. The addition of new post-contexts would mess up the pattern.
        ## if state_machine.has_pre_or_post_context():
        ##     error_msg("Pre- or post-conditioned pattern was used in replacement.\n" + \
        ##               "Quex's regular expression grammar does not allow this.", stream)

        return state_machine

    else:
        # Get a cloned version of the character set
        character_set = reference.get_character_set()
        if character_set is None:
            error_msg("Replacement in character set expression must be a character set.\n"
                      "Specifier '%s' relates to a pattern state machine." % pattern_name, stream)

        if character_set.is_empty():
            error_msg("Referenced character set '%s' is empty.\nAborted." % pattern_name, stream)

        return character_set
def __determine_base_mode_sequence(self, ModeDescr, InheritancePath, base_mode_sequence):
    """Determine the sequence of base modes. The type of sequencing also
       determines the pattern precedence. The 'depth-first' scheme is chosen
       here. For example a mode hierarchy of

                                   A
                                 /   \ 
                                B     C
                               / \   / \ 
                              D   E F   G

       results in a sequence: (A, B, D, E, C, F, G).reverse()

       => That is, the mode itself is base_mode_sequence[-1]

       => Patterns and event handlers of 'E' have precedence over 'C' because
          they are the children of a preceding base mode.

       This function detects circular inheritance.

       __dive -- this keyword was inserted for the sole purpose of signalling
                 that here is a case of recursion, which may be solved later
                 on by a TreeWalker.
    """
    if ModeDescr.name in InheritancePath:
        msg = "mode '%s'\n" % InheritancePath[0]
        for mode_name in InheritancePath[InheritancePath.index(ModeDescr.name) + 1:]:
            msg += "   inherits mode '%s'\n" % mode_name
        msg += "   inherits mode '%s'" % ModeDescr.name
        error_msg("circular inheritance detected:\n" + msg,
                  ModeDescr.sr.file_name, ModeDescr.sr.line_n)

    base_mode_name_list_reversed = deepcopy(ModeDescr.derived_from_list)
    #base_mode_name_list_reversed.reverse()
    for name in base_mode_name_list_reversed:
        # -- does the mode exist?
        verify_word_in_list(name, blackboard.mode_description_db.keys(),
                            "Mode '%s' inherits mode '%s' which does not exist." % (ModeDescr.name, name),
                            ModeDescr.sr.file_name, ModeDescr.sr.line_n)

        if name in map(lambda m: m.name, base_mode_sequence):
            continue

        # -- grab the mode description
        mode_descr = blackboard.mode_description_db[name]
        self.__determine_base_mode_sequence(mode_descr,
                                            InheritancePath + [ModeDescr.name],
                                            base_mode_sequence)

    base_mode_sequence.append(ModeDescr)
    return base_mode_sequence
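# A minimal standalone model of the traversal above (toy classes; 'ModeStub'
# and 'linearize' are illustrative names, not quex internals). It shows that
# the recursion appends in post-order, so the mode itself ends up last:
class ModeStub:
    def __init__(self, name, bases=()):
        self.name  = name
        self.bases = list(bases)

def linearize(db, mode, path=(), out=None):
    if out is None: out = []
    if mode.name in path:
        raise ValueError("circular inheritance: %s" % (path + (mode.name,),))
    for name in mode.bases:
        if name in [m.name for m in out]: continue
        linearize(db, db[name], path + (mode.name,), out)
    out.append(mode)
    return out

# For the hierarchy from the docstring (A derives from B and C, etc.):
#   db = dict((n, ModeStub(n, b)) for n, b in
#             [("A", ["B", "C"]), ("B", ["D", "E"]), ("C", ["F", "G"]),
#              ("D", []), ("E", []), ("F", []), ("G", [])])
#   [m.name for m in linearize(db, db["A"])]  =>  ['D', 'E', 'B', 'F', 'G', 'C', 'A']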
def __get_distinct_codec_name_for_alias(CodecAlias, FH=-1, LineN=None):
    """Arguments FH and LineN correspond to the arguments of error_msg."""
    assert len(CodecAlias) != 0

    for record in get_codec_list_db():
        if CodecAlias in record[1] or CodecAlias == record[0]:
            return record[0]

    verify_word_in_list(CodecAlias, get_supported_codec_list(),
                        "Character encoding '%s' unknown to current version of quex." % CodecAlias,
                        FH, LineN)
def _get_distinct_codec_name_for_alias(CodecAlias, FH=-1, LineN=None):
    """Arguments FH and LineN correspond to the arguments of error_msg."""
    assert len(CodecAlias) != 0

    for record in parser.get_codec_list_db():
        if CodecAlias in record[1] or CodecAlias == record[0]:
            return record[0]

    verify_word_in_list(CodecAlias, get_supported_codec_list(),
                        "Character encoding '%s' unknown to current version of quex." % CodecAlias,
                        FH, LineN)
def get_codecs_for_language(Language):
    result = []
    for record in parser.get_codec_list_db():
        codec = record[0]
        if codec not in get_supported_codec_list():
            continue
        if Language in record[2]:
            result.append(record[0])

    if len(result) == 0:
        verify_word_in_list(Language, get_supported_language_list(),
                            "No codec found for language '%s'." % Language)
    return result
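# Hedged sketch of the codec-list records the functions above iterate over:
# each record is assumed to be (canonical_name, alias_list, language_list).
# The toy table and 'resolve_codec' helper are illustrative only, not quex's
# actual database:
_TOY_CODEC_DB = [
    ("iso8859_1", ["latin1", "iso-8859-1"], ["Western Europe"]),
    ("cp932",     ["ms932", "mskanji"],     ["Japanese"]),
]

def resolve_codec(alias):
    """Mimics __get_distinct_codec_name_for_alias(): an alias (or the
    canonical name itself) maps to the canonical codec name."""
    for canonical, aliases, languages in _TOY_CODEC_DB:
        if alias == canonical or alias in aliases:
            return canonical
    raise LookupError("Character encoding '%s' unknown." % alias)

# resolve_codec("latin1") => "iso8859_1"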
def snap_set_term(stream, PatternDict):
    global special_character_set_db

    __debug_entry("set_term", stream)

    operation_list     = ["union", "intersection", "difference", "inverse"]
    character_set_list = special_character_set_db.keys()

    skip_whitespace(stream)
    position = stream.tell()

    # If there is no following '(', then enter the 'snap_expression' block below
    word = read_identifier(stream)

    if word in operation_list:
        set_list = snap_set_list(stream, word, PatternDict)
        # If an error occurs during set_list parsing, an exception about the
        # syntax error is thrown.

        L      = len(set_list)
        result = set_list[0]

        if word == "inverse":
            # The inverse of multiple sets is the inverse of their union.
            if L > 1:
                for character_set in set_list[1:]:
                    result.unite_with(character_set)
            return __debug_exit(result.get_complement(Setup.buffer_codec.source_set), stream)

        if L < 2:
            raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                             "two sets to operate on them.")

        if   word == "union":
            for character_set in set_list[1:]:
                result.unite_with(character_set)
        elif word == "intersection":
            for character_set in set_list[1:]:
                result.intersect_with(character_set)
        elif word == "difference":
            for character_set in set_list[1:]:
                result.subtract(character_set)

    elif word in character_set_list:
        reg_expr = special_character_set_db[word]
        result   = traditional_character_set.do_string(reg_expr)

    elif word != "":
        verify_word_in_list(word, character_set_list + operation_list,
                            "Unknown keyword '%s'." % word, stream)

    else:
        stream.seek(position)
        result = snap_set_expression(stream, PatternDict)

    return __debug_exit(result, stream)
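# The operation fold above has straightforward set semantics. A hedged sketch
# using plain Python sets (illustrative only; quex's NumberSet offers
# unite_with/intersect_with/subtract instead):
def fold_set_operation(word, set_list, universe):
    result = set(set_list[0])
    if word == "inverse":
        for s in set_list[1:]: result |= s    # inverse of many = inverse of their union
        return universe - result
    if   word == "union":
        for s in set_list[1:]: result |= s
    elif word == "intersection":
        for s in set_list[1:]: result &= s
    elif word == "difference":
        for s in set_list[1:]: result -= s
    return result

# fold_set_operation("difference", [{1, 2, 3}, {2}, {3}], set(range(10))) => {1}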
def __entry_exit_transitions(mode, mode_name_list):
    FileName = mode.filename
    LineN    = mode.line_n

    for mode_name in mode.options["exit"]:

        verify_word_in_list(mode_name, mode_name_list,
                            "Mode '%s' has an exit to\nmode '%s' but no such mode exists." % \
                            (mode.name, mode_name), FileName, LineN)

        that_mode = blackboard.mode_db[mode_name]

        # Other mode allows all entries => don't worry.
        if len(that_mode.options["entry"]) == 0:
            continue

        # Other mode restricts the entries from other modes
        # => check if this mode or one of the base modes can enter
        for base_mode in mode.get_base_mode_sequence():
            if base_mode.name in that_mode.options["entry"]:
                break
        else:
            error_msg("Mode '%s' has an exit to mode '%s' but" % (mode.name, mode_name),
                      FileName, LineN, DontExitF=True, WarningF=False)
            error_msg("mode '%s' has no entry for mode '%s'\n" % (mode_name, mode.name) + \
                      "or any of its base modes.",
                      that_mode.filename, that_mode.line_n)

    for mode_name in mode.options["entry"]:
        # Does that mode exist?
        verify_word_in_list(mode_name, mode_name_list,
                            "Mode '%s' allows entry from\nmode '%s' but no such mode exists." % \
                            (mode.name, mode_name), FileName, LineN)

        that_mode = blackboard.mode_db[mode_name]

        # Other mode allows all exits => don't worry.
        if len(that_mode.options["exit"]) == 0:
            continue

        # Other mode restricts the exits to other modes
        # => check if this mode or one of the base modes can be reached
        for base_mode in mode.get_base_mode_sequence():
            if base_mode.name in that_mode.options["exit"]:
                break
        else:
            error_msg("Mode '%s' has an entry for mode '%s' but" % (mode.name, mode_name),
                      FileName, LineN, DontExitF=True, WarningF=False)
            error_msg("mode '%s' has no exit to mode '%s'\n" % (mode_name, mode.name) + \
                      "or any of its base modes.",
                      that_mode.filename, that_mode.line_n)
def read_character_code(fh):
    # NOTE: This function is tested with the regression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos   = fh.tell()
    start = fh.read(1)

    if start == "":
        fh.seek(pos)
        return -1

    elif start == "'":
        # Read a utf-8 character and get the token-id
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if the backslashed
            # character is nonsense.
            character_code = snap_backslashed_character.do(fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        if character_code is None:
            error_msg("Missing utf8-character for definition of character code by character.", fh)
        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.", fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C":
            fh.seek(pos)
            return -1
        # Read a Unicode name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "":
            fh.seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size of
        # the set is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            verify_word_in_list(ucs_name, ucs_property_db["Name"].code_point_db,
                                "The string %s\ndoes not identify a known unicode character." % ucs_name,
                                fh)
        elif type(character_code) not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name, fh)
        return character_code

    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code is not None:
        return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1
def __determine_base_mode_sequence(self, ModeDescr, InheritancePath):
    """Determine the sequence of base modes. The type of sequencing also
       determines the pattern precedence. The 'depth-first' scheme is chosen
       here. For example a mode hierarchy of

                                   A
                                 /   \ 
                                B     C
                               / \   / \ 
                              D   E F   G

       results in a sequence: (A, B, D, E, C, F, G).reverse()

       This means that patterns and event handlers of 'E' have precedence over
       'C' because they are the children of a preceding base mode.

       This function detects circular inheritance.
    """
    if ModeDescr.name in InheritancePath:
        msg = "mode '%s'\n" % InheritancePath[0]
        for mode_name in InheritancePath[InheritancePath.index(ModeDescr.name) + 1:]:
            msg += "   inherits mode '%s'\n" % mode_name
        msg += "   inherits mode '%s'" % ModeDescr.name
        error_msg("circular inheritance detected:\n" + msg,
                  ModeDescr.filename, ModeDescr.line_n)

    base_mode_name_list_reversed = deepcopy(ModeDescr.base_modes)
    #base_mode_name_list_reversed.reverse()
    for name in base_mode_name_list_reversed:
        # -- does the mode exist?
        verify_word_in_list(name, mode_description_db.keys(),
                            "Mode '%s' inherits mode '%s' which does not exist." % (ModeDescr.name, name),
                            ModeDescr.filename, ModeDescr.line_n)

        if name in map(lambda m: m.name, self.__base_mode_sequence):
            continue

        # -- grab the mode description
        mode_descr = mode_description_db[name]
        self.__determine_base_mode_sequence(mode_descr, InheritancePath + [ModeDescr.name])

    self.__base_mode_sequence.append(ModeDescr)
    return self.__base_mode_sequence
def __parse_definition_head(fh, result):

    if check(fh, "\\default"):
        error_msg("'\\default' has been replaced by keyword '\\else' since quex 0.64.9!", fh)
    elif check(fh, "\\else"):
        pattern = None
    else:
        pattern = regular_expression.parse(fh)

    skip_whitespace(fh)
    check_or_die(fh, "=>", " after character set definition.")
    skip_whitespace(fh)

    identifier = read_identifier(fh, OnMissingStr="Missing identifier for indentation element definition.")
    verify_word_in_list(identifier, result.identifier_list,
                        "Unrecognized specifier '%s'." % identifier, fh)
    skip_whitespace(fh)

    return pattern, identifier, SourceRef.from_FileHandle(fh)
def __start_mode(implemented_mode_name_list, mode_name_list):
    """If more than one mode is defined, then an explicit definition
       'start = mode' is required.
    """
    assert len(implemented_mode_name_list) != 0
    assert blackboard.initial_mode is not None

    start_mode = blackboard.initial_mode.get_pure_text()
    FileName   = blackboard.initial_mode.sr.file_name
    LineN      = blackboard.initial_mode.sr.line_n

    # Start mode present and applicable?
    verify_word_in_list(start_mode, mode_name_list,
                        "Start mode '%s' is not defined." % start_mode,
                        FileName, LineN)
    verify_word_in_list(start_mode, implemented_mode_name_list,
                        "Start mode '%s' is inheritable only and cannot be instantiated." % start_mode,
                        FileName, LineN)
def __perform_setup(command_line, argv):
    """RETURN: True,  if the process needs to be started.
               False, if the job is done.
    """
    global setup

    # (*) Classes and their namespace
    __setup_analyzer_class(setup)
    __setup_token_class(setup)
    __setup_token_id_prefix(setup)
    __setup_lexeme_null(setup)       # Requires 'token_class_name_space'

    # (*) Output programming language
    setup.language = setup.language.upper()
    verify_word_in_list(setup.language,
                        quex_core_engine_generator_languages_db.keys(),
                        "Programming language '%s' is not supported." % setup.language)
    setup.language_db  = quex_core_engine_generator_languages_db[setup.language]
    setup.extension_db = global_extension_db[setup.language]

    # Is the output file naming scheme provided by the extension database?
    # (Validation must happen immediately.)
    if setup.extension_db.has_key(setup.output_file_naming_scheme) == False:
        error_msg("File extension scheme '%s' is not provided for language '%s'.\n" \
                  % (setup.output_file_naming_scheme, setup.language) + \
                  "Available schemes are: %s." % repr(setup.extension_db.keys())[1:-1])

    # Before file names can be prepared, determine the output directory.
    # If 'source packaging' is enabled and no output directory is specified,
    # then take the directory of the source packaging.
    if setup.source_package_directory != "" and setup.output_directory == "":
        setup.output_directory = setup.source_package_directory

    if setup.buffer_codec in ["utf8", "utf16"]:
        setup.buffer_codec_transformation_info = setup.buffer_codec + "-state-split"

    elif setup.buffer_codec_file != "":
        try:
            setup.buffer_codec = os.path.splitext(os.path.basename(setup.buffer_codec_file))[0]
        except:
            error_msg("cannot interpret string following '--codec-file'")
        setup.buffer_codec_transformation_info = \
            codec_db.get_codec_transformation_info(FileName=setup.buffer_codec_file)

    elif setup.buffer_codec != "unicode":
        setup.buffer_codec_transformation_info = \
            codec_db.get_codec_transformation_info(setup.buffer_codec)

    if setup.buffer_codec != "unicode":
        setup.buffer_element_size_irrelevant = True

    # (*) Output files
    if setup.language not in ["DOT"]:
        prepare_file_names(setup)

    if setup.buffer_byte_order == "<system>":
        setup.buffer_byte_order                      = sys.byteorder
        setup.byte_order_is_that_of_current_system_f = True
    else:
        setup.byte_order_is_that_of_current_system_f = False

    if setup.buffer_element_size == "wchar_t":
        error_msg("Since Quex version 0.53.5, 'wchar_t' can no longer be specified\n"
                  "with option '--buffer-element-size' or '-bes'. Please, specify\n"
                  "'--buffer-element-type wchar_t' or '--bet'.")
    if setup.buffer_element_type == "wchar_t":
        setup.converter_ucs_coding_name = "WCHAR_T"

    make_numbers(setup)

    # (*) Determine buffer element type and size (in bytes)
    if setup.buffer_element_size == -1:
        if global_character_type_db.has_key(setup.buffer_element_type):
            setup.buffer_element_size = global_character_type_db[setup.buffer_element_type][3]
        elif setup.buffer_element_type == "":
            setup.buffer_element_size = 1
        else:
            # The buffer element type is defined, but here we know that it is
            # 'unknown' and Quex cannot know its size on its own.
            setup.buffer_element_size = -1

    if setup.buffer_element_type == "":
        if setup.buffer_element_size in [1, 2, 4]:
            setup.buffer_element_type = {
                1: "uint8_t", 2: "uint16_t", 4: "uint32_t",
            }[setup.buffer_element_size]
        elif setup.buffer_element_size == -1:
            pass
        else:
            error_msg("Buffer element type cannot be determined for size '%i' which\n" \
                      % setup.buffer_element_size + \
                      "has been specified by '-b' or '--buffer-element-size'.")

    setup.converter_f = False
    if setup.converter_iconv_f or setup.converter_icu_f:
        setup.converter_f = True

    # The only case where no converter helper is required is when ASCII
    # (Unicode restricted to [0, FF]) is used.
    setup.converter_helper_required_f = True
    if setup.converter_f == False and setup.buffer_element_size == 1 and setup.buffer_codec == "unicode":
        setup.converter_helper_required_f = False

    validation.do(setup, command_line, argv)

    if setup.converter_ucs_coding_name == "":
        if global_character_type_db.has_key(setup.buffer_element_type):
            if setup.buffer_byte_order == "little":
                index = 1
            else:
                index = 2
            setup.converter_ucs_coding_name = \
                global_character_type_db[setup.buffer_element_type][index]

    if setup.token_id_foreign_definition_file != "":
        CommentDelimiterList = [["//", "\n"], ["/*", "*/"]]
        # Regular expression to find '#include <something>' and extract the
        # 'something' in a 'group'. Note that '(' ')' cause the storage of
        # parts of the match.
        IncludeRE = "#[ \t]*include[ \t]*[\"<]([^\">]+)[\">]"
        #
        parse_token_id_file(setup.token_id_foreign_definition_file,
                            setup.token_id_prefix,
                            CommentDelimiterList, IncludeRE)
        if setup.token_id_prefix_plain != setup.token_id_prefix:
            # The 'plain', namespace-less token indices are also supported
            parse_token_id_file(setup.token_id_foreign_definition_file,
                                setup.token_id_prefix_plain,
                                CommentDelimiterList, IncludeRE)

    # (*) Compression Types
    compression_type_list = []
    for name, ctype in [("compression_template_f",         E_Compression.TEMPLATE),
                        ("compression_template_uniform_f", E_Compression.TEMPLATE_UNIFORM),
                        ("compression_path_f",             E_Compression.PATH),
                        ("compression_path_uniform_f",     E_Compression.PATH_UNIFORM)]:
        if command_line_args_defined(command_line, name):
            compression_type_list.append((command_line_arg_position(name), ctype))
    compression_type_list.sort(key=itemgetter(0))
    setup.compression_type_list = map(lambda x: x[1], compression_type_list)

    # (*) return setup ___________________________________________________________________
    return True
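# Hedged sketch of the mutual type/size inference above (toy table; the real
# 'global_character_type_db' rows also carry byte-order coding names):
_TOY_CHAR_TYPE_DB = {"uint8_t": 1, "uint16_t": 2, "uint32_t": 4}

def infer_element_type_and_size(element_type, element_size):
    """-1 / "" mean 'unspecified'; each value may be derived from the other."""
    # size from type, or default size 1 if neither was specified
    if element_size == -1:
        if element_type in _TOY_CHAR_TYPE_DB:
            element_size = _TOY_CHAR_TYPE_DB[element_type]
        elif element_type == "":
            element_size = 1
    # type from size
    if element_type == "" and element_size in (1, 2, 4):
        element_type = {1: "uint8_t", 2: "uint16_t", 4: "uint32_t"}[element_size]
    return element_type, element_size

# infer_element_type_and_size("", 2) => ("uint16_t", 2)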
def __create_token_sender_by_token_name(fh, TokenName):
    assert type(TokenName) in [str, unicode]

    # Enter token_id into database, if it is not yet defined.
    token_id_db_verify_or_enter_token_id(fh, TokenName)

    # Parse the token argument list
    argument_list = __parse_function_argument_list(fh, TokenName)

    # Create the token sender
    explicit_member_names_f = False
    for arg in argument_list:
        if arg.find("=") != -1:
            explicit_member_names_f = True

    assert blackboard.token_type_definition is not None, \
           "A valid token_type_definition must have been parsed at this point."

    if not explicit_member_names_f:
        # There are only two allowed cases for implicit token member names:
        #    QUEX_TKN_XYZ(Lexeme)     --> call take_text(Lexeme, LexemeEnd)
        #    QUEX_TKN_XYZ(Begin, End) --> call take_text(Begin, End)
        if len(argument_list) == 2:
            return "QUEX_NAME_TOKEN(take_text)(self_write_token_p(), &self, (%s), (%s));\n" % \
                   (argument_list[0], argument_list[1]) + \
                   "self_send(%s);\n" % (TokenName)

        elif len(argument_list) == 1:
            if argument_list[0] == "Lexeme":
                return "QUEX_NAME_TOKEN(take_text)(self_write_token_p(), &self, self.buffer._lexeme_start_p, self.buffer._input_p);\n" \
                       "self_send(%s);\n" % (TokenName)
            elif argument_list[0] == "LexemeNull":
                return "QUEX_NAME_TOKEN(take_text)(self_write_token_p(), &self, LexemeNull, LexemeNull);\n" \
                       "self_send(%s);\n" % (TokenName)
            else:
                error_msg("If one unnamed argument is specified it must be 'Lexeme'\n" + \
                          "or 'LexemeNull'. Found '%s'.\n" % argument_list[0] + \
                          "To cut parts of the lexeme, please, use the 2 argument sender, e.g.\n" + \
                          "QUEX_TKN_MY_ID(Lexeme + 1, LexemeEnd - 2);\n" + \
                          "Alternatively, use named parameters such as 'number=...'.", fh)

        elif len(argument_list) == 0:
            return "self_send(%s);\n" % TokenName

        else:
            error_msg("Since 0.49.1, there are only the following brief token senders that can take\n"
                      "unnamed token arguments:\n"
                      "     one argument:  'Lexeme'   => token.take_text(..., LexemeBegin, LexemeEnd);\n"
                      "     two arguments: Begin, End => token.take_text(..., Begin, End);\n"
                      + "Found: " + repr(argument_list)[1:-1] + ".", fh)

        # Implicit member names have been handled: the function has returned.

    member_value_pairs = map(lambda x: x.split("="), argument_list)
    txt = ""
    for member, value in member_value_pairs:
        if value == "":
            error_msg("One explicit argument name mentioned requires all arguments to\n" + \
                      "be mentioned explicitly. Value '%s' mentioned without argument.\n" \
                      % member, fh)

        if Setup.token_class_file != "":
            error_msg("Member assignments in brief token senders are inadmissible\n" + \
                      "with manually written token classes. User provided file '%s'.\n" % Setup.token_class_file + \
                      "Found member assignment: '%s' = '%s'." % (member, value), fh)
        else:
            member_name = member.strip()
            verify_word_in_list(member_name,
                                blackboard.token_type_definition.get_member_db(),
                                "No member: '%s' in token type description." % member_name, fh)
            idx = value.find("Lexeme")
            if idx != -1:
                if idx != 0 and value[idx - 1] == "(":
                    pass
                else:
                    error_msg("Assignment of token member '%s' with 'Lexeme' directly being involved. The\n" % member_name +
                              "'Lexeme' points into the text buffer and it is not owned by the token object.\n"
                              "\n"
                              "Proposals:\n\n"
                              "   (1) Use '(Lexeme)', i.e. surround 'Lexeme' by brackets to indicate\n"
                              "       that you are aware of the danger. Do this, if at the end of the\n"
                              "       process, the member can be assumed to relate to an object that\n"
                              "       is not directly dependent anymore on 'Lexeme'. This is particularly\n"
                              "       true if the member is of type 'std::string'. Its constructor\n"
                              "       creates a copy of the zero terminated string.\n\n"
                              "   (2) Use token senders without named arguments, for example\n"
                              "          \"%s(Lexeme+1, LexemeEnd-2)\"\n" % TokenName +
                              "          \"%s(Lexeme)\"\n" % TokenName +
                              "       These token senders create a copy of the lexeme and let the token\n"
                              "       own it.", fh)

            access = blackboard.token_type_definition.get_member_access(member_name)
            txt += "self_write_token_p()->%s = %s;\n" % (access, value.strip())

    # Box the token, stamp it with an id and 'send' it
    txt += "self_send(%s);\n" % TokenName
    return txt
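# For illustration: a brief sender 'QUEX_TKN_NUMBER(Lexeme)' (hypothetical
# token name) makes the function above return the C fragment
#
#    QUEX_NAME_TOKEN(take_text)(self_write_token_p(), &self,
#                               self.buffer._lexeme_start_p, self.buffer._input_p);
#    self_send(QUEX_TKN_NUMBER);
#
# while a named-member sender such as 'QUEX_TKN_NUMBER(number=(Lexeme))' yields
# an assignment through the member's access expression followed by the same
# 'self_send(...)' call.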
def do(fh):
    """Parses pattern definitions of the form:

          [ \t]                                         => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]    => space 1;

       In other words, the right hand side *must* be a character set.
    """
    indentation_setup = IndentationSetup(fh)

    # NOTE: Catching of EOF happens in the caller: parse_section(...)
    skip_whitespace(fh)

    while 1 + 1 == 2:
        skip_whitespace(fh)

        if check(fh, ">"):
            indentation_setup.seal()
            indentation_setup.consistency_check(fh)
            return indentation_setup

        # A regular expression state machine
        pattern_str, pattern = regular_expression.parse(fh)

        skip_whitespace(fh)
        if not check(fh, "=>"):
            error_msg("Missing '=>' after character set definition.", fh)

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            error_msg("Missing identifier for indentation element definition.", fh)

        verify_word_in_list(identifier,
                            ["space", "grid", "bad", "newline", "suppressor"],
                            "Unrecognized indentation specifier '%s'." % identifier, fh)

        trigger_set = None
        if identifier in ["space", "bad", "grid"]:
            if len(pattern.sm.states) != 2:
                error_msg("For indentation '%s' only patterns are admissible which\n" % identifier + \
                          "can be matched by a single character, e.g. \" \" or [a-z].", fh)
            transition_map = pattern.sm.get_init_state().transitions().get_map()
            assert len(transition_map) == 1
            trigger_set = transition_map.values()[0]

        skip_whitespace(fh)
        if identifier == "space":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_space(pattern_str, trigger_set, value, fh)
            else:
                # Not a number; is it an identifier?
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_space(pattern_str, trigger_set, variable, fh)
                else:
                    indentation_setup.specify_space(pattern_str, trigger_set, 1, fh)

        elif identifier == "grid":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_grid(pattern_str, trigger_set, value, fh)
            else:
                # Not a number; is it an identifier?
                skip_whitespace(fh)
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_grid(pattern_str, trigger_set, variable, fh)
                else:
                    error_msg("Missing integer or variable name after keyword 'grid'.", fh)

        elif identifier == "bad":
            indentation_setup.specify_bad(pattern_str, trigger_set, fh)

        elif identifier == "newline":
            indentation_setup.specify_newline(pattern_str, pattern.sm, fh)

        elif identifier == "suppressor":
            indentation_setup.specify_suppressor(pattern_str, pattern.sm, fh)

        else:
            assert False, "Unreachable code reached."

        if not check(fh, ";"):
            error_msg("Missing ';' after indentation '%s' specification." % identifier, fh)
def __parse_option(fh, new_mode):

    def get_pattern_object(SM):
        if not SM.is_DFA_compliant():
            result = nfa_to_dfa.do(SM)
        else:
            result = SM
        result = hopcroft.do(result, CreateNewStateMachineF=False)
        return Pattern(result, AllowStateMachineTrafoF=True)

    identifier = read_option_start(fh)
    if identifier is None:
        return False

    verify_word_in_list(identifier, mode_option_info_db.keys(),
                        "mode option", fh.name, get_current_line_info_number(fh))

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("Missing closing '>' for mode option '%s'." % identifier, fh)

        if trigger_set.is_empty():
            error_msg("Empty trigger set for skipper '%s'." % identifier, fh)

        # TriggerSet skipping is implemented the following way: As soon as one
        # element of the trigger set appears, the state machine enters the
        # 'trigger set skipper section'. Enter the skipper as if the opener
        # pattern was a normal pattern and the 'skipper' is the action.
        # NOTE: The correspondent CodeFragment for skipping is created in
        #       'implement_skippers(...)'.
        pattern_sm = StateMachine()
        pattern_sm.add_transition(pattern_sm.init_state_index, trigger_set, AcceptanceF=True)

        # Skipper code is to be generated later
        action = GeneratedCode(skip_character_set.do,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))
        action.data["character_set"] = trigger_set

        new_mode.add_match(pattern_str, action, get_pattern_object(pattern_sm),
                           Comment=E_SpecialPatterns.SKIP)
        return True

    elif identifier in ["skip_range", "skip_nested_range"]:
        # A non-nesting skipper can contain a full-fledged regular expression
        # as opener, since it only affects the trigger. Not so the nested
        # range skipper -- see below.

        # -- opener
        skip_whitespace(fh)
        if identifier == "skip_nested_range":
            # Nested range state machines only accept 'strings', not state machines
            opener_str, opener_sequence = __parse_string(fh, "Opener pattern for 'skip_nested_range'")
            opener_sm = StateMachine.from_sequence(opener_sequence)
        else:
            opener_str, opener_pattern = regular_expression.parse(fh)
            opener_sm = opener_pattern.sm
            # For 'range skipping' the opener sequence is not needed, only the
            # opener state machine is webbed into the pattern matching state machine.
            opener_sequence = None

        skip_whitespace(fh)

        # -- closer
        closer_str, closer_sequence = __parse_string(fh, "Closing pattern for 'skip_range' or 'skip_nested_range'")
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("Missing closing '>' for mode option '%s'." % identifier, fh)

        # Skipper code is to be generated later
        generator_function, comment = {
            "skip_range":        (skip_range.do,        E_SpecialPatterns.SKIP_RANGE),
            "skip_nested_range": (skip_nested_range.do, E_SpecialPatterns.SKIP_NESTED_RANGE),
        }[identifier]
        action = GeneratedCode(generator_function,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))

        action.data["opener_sequence"] = opener_sequence
        action.data["closer_sequence"] = closer_sequence
        action.data["mode_name"]       = new_mode.name

        new_mode.add_match(opener_str, action, get_pattern_object(opener_sm), Comment=comment)
        return True

    elif identifier == "indentation":
        value = indentation_setup.do(fh)

        # Enter 'Newline' and 'Suppressed Newline' as matches into the engine.
        # Similar to skippers, the indentation count is then triggered by the newline.

        # -- Suppressed Newline = Suppressor followed by Newline;
        #    then the newline does not trigger indentation counting.
        suppressed_newline_pattern_str = ""
        if value.newline_suppressor_state_machine.get() is not None:
            suppressed_newline_pattern_str = \
                  "(" + value.newline_suppressor_state_machine.pattern_string() + ")" \
                + "(" + value.newline_state_machine.pattern_string() + ")"

            suppressed_newline_sm = \
                sequentialize.do([value.newline_suppressor_state_machine.get(),
                                  value.newline_state_machine.get()])

            FileName = value.newline_suppressor_state_machine.file_name
            LineN    = value.newline_suppressor_state_machine.line_n
            # Go back to start.
            code = UserCodeFragment("goto %s;" % get_label("$start", U=True), FileName, LineN)

            new_mode.add_match(suppressed_newline_pattern_str, code,
                               get_pattern_object(suppressed_newline_sm),
                               Comment=E_SpecialPatterns.SUPPRESSED_INDENTATION_NEWLINE)

        # When there is an empty line, then there shall be no indentation count on it.
        # Here comes the trick:
        #
        #    Let 'newline' be defined as:  newline ([space]* newline)*
        #
        # This way empty lines are eaten away before the indentation count is activated.

        # -- 'space'
        x0 = StateMachine()
        x0.add_transition(x0.init_state_index, value.indentation_count_character_set(),
                          AcceptanceF=True)
        # -- '[space]*'
        x1 = repeat.do(x0)
        # -- '[space]* newline'
        x2 = sequentialize.do([x1, value.newline_state_machine.get()])
        # -- '([space]* newline)*'
        x3 = repeat.do(x2)
        # -- 'newline ([space]* newline)*'
        x4 = sequentialize.do([value.newline_state_machine.get(), x3])
        # -- NFA to DFA; Hopcroft optimization
        sm = beautifier.do(x4)

        FileName = value.newline_state_machine.file_name
        LineN    = value.newline_state_machine.line_n
        action   = GeneratedCode(indentation_counter.do, FileName, LineN)

        action.data["indentation_setup"] = value

        new_mode.add_match(value.newline_state_machine.pattern_string(), action,
                           get_pattern_object(sm),
                           Comment=E_SpecialPatterns.INDENTATION_NEWLINE)

        # Announce the mode to which the setup belongs
        value.set_containing_mode_name(new_mode.name)

    else:
        value = read_option_value(fh)

    # The 'verify_word_in_list()' call must have ensured that the following holds
    assert mode_option_info_db.has_key(identifier)

    # Is the option of the appropriate value?
    option_info = mode_option_info_db[identifier]
    if option_info.domain is not None and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (value, identifier) + \
                  "Though, possible for this option are only: %s." % repr(option_info.domain)[1:-1], fh)

    # Finally, set the option
    new_mode.add_option(identifier, value)
    return True
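# The 'empty line eating' construction above is easiest to see with plain
# Python regular expressions. A hedged sketch (re syntax instead of quex state
# machines; '[ \t]' standing in for the indentation count character set):
import re

_NEWLINE_EATING_EMPTY_LINES = re.compile(r"\n([ \t]*\n)*")

# Matching at the first newline consumes every following blank (whitespace-only)
# line, so the indentation counter only ever runs on a line with content:
#   _NEWLINE_EATING_EMPTY_LINES.match("\n   \n\t\n    x").group(0) == "\n   \n\t\n"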
def parse_section(fh):
    global default_token_type_definition_triggered_by_mode_definition_f

    # NOTE: End of file is supposed to be reached when trying to read a new
    #       section. Thus, the end-of-file catcher does not encompass the beginning.
    position = fh.tell()
    skip_whitespace(fh)
    word = read_identifier(fh, OnMissingStr="Missing section title")

    verify_word_in_list(word, blackboard.all_section_title_list,
                        "Unknown quex section '%s'" % word, fh)
    try:
        # (*) determine what is defined
        #
        #     -- 'mode { ... }'   => define a mode
        #     -- 'start = ...;'   => define the name of the initial mode
        #     -- 'header { ... }' => define code that is to be pasted on top
        #                            of the engine (e.g. "#include<...>")
        #     -- 'body { ... }'   => define code that is to be pasted in the class' body
        #                            of the engine (e.g. "public: int my_member;")
        #     -- 'init { ... }'   => define code that is to be pasted in the class' constructors
        #                            of the engine (e.g. "my_member = -1;")
        #     -- 'define { ... }' => define pattern shorthands such as IDENTIFIER for [a-z]+
        #     -- 'repeated_token_id = QUEX_TKN_ ...;' => enables token repetition, defines
        #                                                the token id to be repeated.
        #     -- 'token { ... }'      => define token ids
        #     -- 'token_type { ... }' => define a customized token type
        #
        if word in blackboard.fragment_db.keys():
            element_name = blackboard.fragment_db[word]
            fragment     = code_fragment.parse(fh, word, AllowBriefTokenSenderF=False)
            blackboard.__dict__[element_name] = fragment
            return

        elif word == "start":
            mode_name = parse_identifier_assignment(fh)
            if mode_name == "":
                error_msg("Missing mode_name after 'start ='", fh)
            elif not blackboard.initial_mode.sr.is_void():
                error_msg("start mode defined more than once!", fh, DontExitF=True)
                error_msg("previously defined here", blackboard.initial_mode.sr)
            blackboard.initial_mode = CodeUser(mode_name, SourceRef.from_FileHandle(fh))
            return

        elif word == "repeated_token":
            blackboard.token_repetition_token_id_list = parse_token_id_definitions(fh, NamesOnlyF=True)
            for token_name in blackboard.token_repetition_token_id_list:
                verify_word_in_list(token_name[len(Setup.token_id_prefix):],
                                    blackboard.token_id_db.keys(),
                                    "Token ID '%s' not yet defined." % token_name, fh,
                                    ExitF=False,
                                    SuppressCode=NotificationDB.warning_repeated_token_not_yet_defined)
            return

        elif word == "define":
            parse_pattern_name_definitions(fh)
            return

        elif word == "token":
            if Setup.token_id_foreign_definition:
                error_msg("Token id file '%s' has been specified.\n" \
                          % Setup.token_id_foreign_definition_file \
                          + "All token ids must be specified there. Section 'token'\n"
                          + "is not allowed.", fh)
            parse_token_id_definitions(fh)
            return

        elif word == "token_type":
            if Setup.token_class_file != "":
                error_msg("Section 'token_type' is intended to generate a token class.\n" \
                          + "However, the manually written token class file '%s'\n" \
                          % repr(Setup.token_class_file) \
                          + "has been specified on the command line.", fh)

            if blackboard.token_type_definition is None:
                blackboard.token_type_definition = token_type.parse(fh)
                return

            # Error case:
            if default_token_type_definition_triggered_by_mode_definition_f:
                error_msg("Section 'token_type' must appear before first mode definition.", fh)
            else:
                error_msg("Section 'token_type' has been defined twice.", fh, DontExitF=True)
                error_msg("Previously defined here.",
                          blackboard.token_type_definition.sr.file_name,
                          blackboard.token_type_definition.sr.line_n)
            return

        elif word == "mode":
            # When the first mode is parsed, a token_type definition must be
            # present. If it is not, the default token type definition is used.
            if blackboard.token_type_definition is None:
                parse_default_token_definition()
                default_token_type_definition_triggered_by_mode_definition_f = True
            mode.parse(fh)
            return

        else:
            # This case should have been caught by the 'verify_word_in_list' function.
            assert False

    except EndOfStreamException:
        fh.seek(position)
        error_eof(word, fh)
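# For illustration, a quex input file of roughly the following shape exercises
# the branches above. Section names come from 'blackboard.all_section_title_list';
# the concrete pattern, token, and mode names below are hypothetical:
#
#     header  { #include <stdlib.h> }
#     define  { IDENTIFIER  [a-z]+ }
#     token   { IDENTIFIER; NUMBER; }
#     start   = PROGRAM;
#     mode PROGRAM {
#         {IDENTIFIER} => QUEX_TKN_IDENTIFIER(Lexeme);
#     }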
def prepare(command_line, argv):
    """RETURN: True,  if process needs to be started.
               False, if job is done.
    """
    global Setup

    # (*) Classes and their namespace
    __setup_analyzer_class(Setup)
    __setup_token_class(Setup)
    __setup_token_id_prefix(Setup)
    __setup_lexeme_null(Setup)        # Requires 'token_class_name_space'

    # (*) Output programming language
    Setup.language = Setup.language.upper()
    verify_word_in_list(Setup.language, output_language_db.keys(),
                        "Programming language '%s' is not supported." % Setup.language)
    Setup.language_db  = output_language_db[Setup.language]
    Setup.extension_db = global_extension_db[Setup.language]

    # Is the output file naming scheme provided by the extension database?
    # (Validation must happen immediately.)
    if Setup.extension_db.has_key(Setup.output_file_naming_scheme) == False:
        error_msg("File extension scheme '%s' is not provided for language '%s'.\n" \
                  % (Setup.output_file_naming_scheme, Setup.language) + \
                  "Available schemes are: %s." % repr(Setup.extension_db.keys())[1:-1])

    # (*) Output files
    if   Setup.buffer_codec_name == "utf8":  module = utf8_state_split
    elif Setup.buffer_codec_name == "utf16": module = utf16_state_split
    else:                                    module = None
    Setup.buffer_codec_prepare(Setup.buffer_codec_name,
                               Setup.buffer_codec_file, module)

    # AFTER: Setup.buffer_codec_prepare() !!!
    if Setup.language not in ["DOT"]:
        prepare_file_names(Setup)

    if Setup.buffer_byte_order == "<system>":
        Setup.buffer_byte_order = sys.byteorder
        Setup.byte_order_is_that_of_current_system_f = True
    else:
        Setup.byte_order_is_that_of_current_system_f = False

    if Setup.buffer_element_size == "wchar_t":
        error_msg("Since Quex version 0.53.5, 'wchar_t' can no longer be specified\n"
                  "with option '--buffer-element-size' or '-bes'. Please, specify\n"
                  "'--buffer-element-type wchar_t' or '--bet'.")
    if Setup.buffer_element_type == "wchar_t":
        Setup.converter_ucs_coding_name = "WCHAR_T"

    # (*) Determine buffer element type and size (in bytes)
    if Setup.buffer_element_size == -1:
        if global_character_type_db.has_key(Setup.buffer_element_type):
            Setup.buffer_element_size = global_character_type_db[Setup.buffer_element_type][3]
        elif Setup.buffer_element_type == "":
            Setup.buffer_element_size = 1
        else:
            # The buffer element type is not identified in 'global_character_type_db'.
            # => Quex cannot determine its size on its own.
            Setup.buffer_element_size = -1

    if Setup.buffer_element_type == "":
        if Setup.buffer_element_size in [1, 2, 4]:
            Setup.buffer_element_type = {
                1: "uint8_t", 2: "uint16_t", 4: "uint32_t",
            }[Setup.buffer_element_size]
        elif Setup.buffer_element_size == -1:
            pass
        else:
            error_msg("Buffer element type cannot be determined for size '%i' which\n" \
                      % Setup.buffer_element_size + \
                      "has been specified by '-b' or '--buffer-element-size'.")

    type_info = global_character_type_db.get(Setup.buffer_element_type)
    if     type_info is not None and len(type_info) >= 4          \
       and type_info[3] != -1 and Setup.buffer_element_size != -1 \
       and type_info[3] != Setup.buffer_element_size:
        error_msg("\nBuffer element type ('--bet' or '--buffer-element-type') was set to '%s'.\n" \
                  % Setup.buffer_element_type \
                  + "It is well known to be of size %s[byte]. However, the buffer element size\n" \
                  % type_info[3] \
                  + "('-b' or '--buffer-element-size') was specified as '%s'.\n\n" \
                  % Setup.buffer_element_size \
                  + "Quex can continue, but the result is questionable.\n", \
                  DontExitF=True)

    Setup.converter_f = False
    if Setup.converter_iconv_f or Setup.converter_icu_f or len(Setup.converter_user_new_func) != 0:
        Setup.converter_f = True

    # The only case where no converter helper is required is where ASCII
    # (Unicode restricted to [0, FF]) is used.
    Setup.converter_helper_required_f = True
    if Setup.converter_f == False and Setup.buffer_element_size == 1 and Setup.buffer_codec.name == "unicode":
        Setup.converter_helper_required_f = False

    validation.do(Setup, command_line, argv)

    if Setup.converter_ucs_coding_name == "":
        if global_character_type_db.has_key(Setup.buffer_element_type):
            if Setup.buffer_byte_order == "little": index = 1
            else:                                   index = 2
            Setup.converter_ucs_coding_name = \
                global_character_type_db[Setup.buffer_element_type][index]

    if len(Setup.token_id_foreign_definition) != 0:
        if len(Setup.token_id_foreign_definition) > 3:
            error_msg("Option '--foreign-token-id-file' received > 3 followers.\n"
                      "Found: %s" % str(Setup.token_id_foreign_definition)[1:-1])
        if len(Setup.token_id_foreign_definition) > 1:
            Setup.token_id_foreign_definition_file_region_begin_re = \
                __compile_regular_expression(Setup.token_id_foreign_definition[1],
                                             "token id region begin")
        if len(Setup.token_id_foreign_definition) > 2:
            Setup.token_id_foreign_definition_file_region_end_re = \
                __compile_regular_expression(Setup.token_id_foreign_definition[2],
                                             "token id region end")
        Setup.token_id_foreign_definition_file = Setup.token_id_foreign_definition[0]

        CommentDelimiterList = [["//", "\n"], ["/*", "*/"]]
        token_id_file_parse(Setup.token_id_foreign_definition_file,
                            CommentDelimiterList)

    # (*) Compression types
    compression_type_list = []
    for name, ctype in [("compression_template_f",         E_Compression.TEMPLATE),
                        ("compression_template_uniform_f", E_Compression.TEMPLATE_UNIFORM),
                        ("compression_path_f",             E_Compression.PATH),
                        ("compression_path_uniform_f",     E_Compression.PATH_UNIFORM)]:
        if command_line_args_defined(command_line, name):
            compression_type_list.append((command_line_arg_position(name), ctype))
    compression_type_list.sort(key=itemgetter(0))
    Setup.compression_type_list = map(lambda x: x[1], compression_type_list)

    # (*) return Setup ____________________________________________________________
    return True
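# A minimal sketch of the two-way resolution performed above: a known element
# type determines the size via 'global_character_type_db'; a bare size of 1, 2,
# or 4 determines a standard integer type. The helper below is hypothetical,
# not part of quex; it only restates the size-to-type fallback.
def _resolve_element_type(size):
    # Returns the standard C99 type name for a byte count; None if undetermined.
    return {1: "uint8_t", 2: "uint16_t", 4: "uint32_t"}.get(size)

assert _resolve_element_type(2) == "uint16_t"
assert _resolve_element_type(8) is None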
def __parse_option(fh, new_mode):
    def get_pattern_object(SM):
        if not SM.is_DFA_compliant(): result = nfa_to_dfa.do(SM)
        else:                         result = SM
        result = hopcroft.do(result, CreateNewStateMachineF=False)
        return Pattern(result, AllowStateMachineTrafoF=True)

    identifier = read_option_start(fh)
    if identifier is None: return False

    verify_word_in_list(identifier, mode_option_info_db.keys(),
                        "mode option", fh.name, get_current_line_info_number(fh))

    if identifier == "skip":
        # A skipper 'eats' characters at the beginning of a pattern that belong
        # to a specified set of characters. A useful application is most probably
        # the whitespace skipper '[ \t\n]'. The skipper definition allows quex to
        # implement a very effective way to skip these regions.
        pattern_str, trigger_set = regular_expression.parse_character_set(fh, PatternStringF=True)
        skip_whitespace(fh)

        if fh.read(1) != ">":
            error_msg("Missing closing '>' for mode option '%s'." % identifier, fh)

        if trigger_set.is_empty():
            error_msg("Empty trigger set for skipper '%s'." % identifier, fh)

        # TriggerSet skipping is implemented the following way: as soon as one
        # element of the trigger set appears, the state machine enters the
        # 'trigger set skipper section'. Enter the skipper as if the opener
        # pattern was a normal pattern and the 'skipper' is the action.
        # NOTE: The corresponding CodeFragment for skipping is created in
        #       'implement_skippers(...)'.
        pattern_sm = StateMachine()
        pattern_sm.add_transition(pattern_sm.init_state_index, trigger_set, AcceptanceF=True)

        # Skipper code is to be generated later.
        action = GeneratedCode(skip_character_set.do,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))
        action.data["character_set"] = trigger_set

        new_mode.add_match(pattern_str, action, get_pattern_object(pattern_sm),
                           Comment=E_SpecialPatterns.SKIP)
        return True

    elif identifier in ["skip_range", "skip_nested_range"]:
        # A non-nesting skipper can contain a full-fledged regular expression as
        # opener, since it only affects the trigger. Not so the nested range
        # skipper (see below).

        # -- opener
        skip_whitespace(fh)
        if identifier == "skip_nested_range":
            # Nested range state machines only accept 'strings', not state machines.
            opener_str, opener_sequence = __parse_string(fh, "Opener pattern for 'skip_nested_range'")
            opener_sm = StateMachine.from_sequence(opener_sequence)
        else:
            opener_str, opener_pattern = regular_expression.parse(fh)
            opener_sm = opener_pattern.sm
            # For 'range skipping' the opener sequence is not needed, only the
            # opener state machine is webbed into the pattern matching state machine.
            opener_sequence = None

        skip_whitespace(fh)

        # -- closer
        closer_str, closer_sequence = __parse_string(fh, "Closing pattern for 'skip_range' or 'skip_nested_range'")
        skip_whitespace(fh)
        if fh.read(1) != ">":
            error_msg("Missing closing '>' for mode option '%s'." % identifier, fh)

        # Skipper code is to be generated later.
        generator_function, comment = {
            "skip_range":        (skip_range.do,        E_SpecialPatterns.SKIP_RANGE),
            "skip_nested_range": (skip_nested_range.do, E_SpecialPatterns.SKIP_NESTED_RANGE),
        }[identifier]
        action = GeneratedCode(generator_function,
                               FileName=fh.name,
                               LineN=get_current_line_info_number(fh))
        action.data["opener_sequence"] = opener_sequence
        action.data["closer_sequence"] = closer_sequence
        action.data["mode_name"]       = new_mode.name

        new_mode.add_match(opener_str, action, get_pattern_object(opener_sm), Comment=comment)
        return True

    elif identifier == "indentation":
        value = indentation_setup.do(fh)

        # Enter 'Newline' and 'Suppressed Newline' as matches into the engine.
        # Similar to skippers, the indentation count is then triggered by the newline.
        # -- Suppressed Newline = Suppressor followed by Newline;
        #    then the newline does not trigger indentation counting.
        suppressed_newline_pattern_str = ""
        if value.newline_suppressor_state_machine.get() is not None:
            suppressed_newline_pattern_str = \
                  "(" + value.newline_suppressor_state_machine.pattern_string() + ")" \
                + "(" + value.newline_state_machine.pattern_string() + ")"

            suppressed_newline_sm = \
                sequentialize.do([value.newline_suppressor_state_machine.get(),
                                  value.newline_state_machine.get()])

            FileName = value.newline_suppressor_state_machine.file_name
            LineN    = value.newline_suppressor_state_machine.line_n

            # Go back to start.
            code = UserCodeFragment("goto %s;" % get_label("$start", U=True), FileName, LineN)

            new_mode.add_match(suppressed_newline_pattern_str, code,
                               get_pattern_object(suppressed_newline_sm),
                               Comment=E_SpecialPatterns.SUPPRESSED_INDENTATION_NEWLINE)

        # When there is an empty line, then there shall be no indentation count on it.
        # Here comes the trick:
        #
        #    Let 'newline' be defined as:  newline ([space]* newline)*
        #
        # This way empty lines are eaten away before the indentation count is activated.

        # -- 'space'
        x0 = StateMachine()
        x0.add_transition(x0.init_state_index, value.indentation_count_character_set(),
                          AcceptanceF=True)
        # -- '[space]*'
        x1 = repeat.do(x0)
        # -- '[space]* newline'
        x2 = sequentialize.do([x1, value.newline_state_machine.get()])
        # -- '([space]* newline)*'
        x3 = repeat.do(x2)
        # -- 'newline ([space]* newline)*'
        x4 = sequentialize.do([value.newline_state_machine.get(), x3])
        # -- NFA to DFA; Hopcroft optimization
        sm = beautifier.do(x4)

        FileName = value.newline_state_machine.file_name
        LineN    = value.newline_state_machine.line_n
        action   = GeneratedCode(indentation_counter.do, FileName, LineN)
        action.data["indentation_setup"] = value

        new_mode.add_match(value.newline_state_machine.pattern_string(), action,
                           get_pattern_object(sm),
                           Comment=E_SpecialPatterns.INDENTATION_NEWLINE)

        # Announce the mode to which the setup belongs.
        value.set_containing_mode_name(new_mode.name)
    else:
        value = read_option_value(fh)

    # The 'verify_word_in_list()' call must have ensured that the following holds.
    assert mode_option_info_db.has_key(identifier)

    # Is the option of the appropriate value?
    option_info = mode_option_info_db[identifier]
    if option_info.domain is not None and value not in option_info.domain:
        error_msg("Tried to set value '%s' for option '%s'. " % (value, identifier) + \
                  "Though, possible for this option are only: %s." \
                  % repr(option_info.domain)[1:-1], fh)

    # Finally, set the option.
    new_mode.add_option(identifier, value)
    return True
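# For illustration, the options parsed above appear in a quex mode definition
# roughly as follows (mode name and delimiters are hypothetical):
#
#     mode EXAMPLE :
#          <skip:              [ \t\n] >
#          <skip_range:        "/*" "*/" >
#          <skip_nested_range: "{" "}" >
#     {
#         ...
#     }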
def do(setup, command_line, argv):
    """Does a consistency check for setup and the command line."""

    setup.output_directory = os.path.normpath(setup.output_directory)
    if setup.output_directory:
        # Check whether the output directory exists and is writeable.
        if os.access(setup.output_directory, os.F_OK) == False:
            error_msg("The directory %s was specified for output, but does not exist." \
                      % setup.output_directory)
        if os.access(setup.output_directory, os.W_OK) == False:
            error_msg("The directory %s was specified for output, but is not writeable." \
                      % setup.output_directory)

    # If the mode is '--language dot' => check character display options.
    if setup.character_display not in ["hex", "utf8"]:
        error_msg("Character display must be either 'hex' or 'utf8'.\nFound: '%s'" \
                  % setup.character_display)

    # Ensure that options are not specified twice (list options may repeat).
    for parameter, info in SETUP_INFO.items():
        if type(info) != list: continue
        occurence_n = 0
        for option in info[0]:
            occurence_n += argv.count(option)
        if occurence_n > 1 and info[1] not in (SetupParTypes.LIST, SetupParTypes.INT_LIST):
            error_msg("Received more than one of the following options:\n" + \
                      "%s" % repr(info[0])[1:-1])

    # (*) Check for 'Deprecated' options ___________________________________________
    for name, info in DEPRECATED.items():
        command_line_options      = SETUP_INFO[name][0]
        comment                   = info[0]
        depreciated_since_version = info[1]
        for option in command_line_options:
            if command_line.search(option):
                error_msg("Command line option '%s' is ignored.\n" % option + \
                          comment + "\n" + \
                          "Last version of Quex supporting this option is version %s. Please, visit\n" \
                          % depreciated_since_version + \
                          "http://quex.org for further information.")

    # (*) Check for 'Straying' options _____________________________________________
    options = []
    for key, info in SETUP_INFO.items():
        if type(info) != list:  continue
        if key in DEPRECATED:   continue
        if info[1] is not None: options.extend(info[0])
    options.sort(lambda a, b: cmp(a.replace("-", ""), b.replace("-", "")))

    ufos = command_line.unidentified_options(options)
    if len(ufos) != 0:
        error_msg("Unidentified option(s) = " + repr(ufos) + "\n" + \
                  __get_supported_command_line_option_description(options))

    if setup.analyzer_derived_class_name != "" and \
       setup.analyzer_derived_class_file == "":
        error_msg("Specified derived class '%s' on command line, but it was not\n" % \
                  setup.analyzer_derived_class_name + \
                  "specified which file contains its definition.\n" + \
                  "Use command line option '--derived-class-file'.\n")

    if setup.buffer_element_size not in [-1, 1, 2, 4]:
        error_msg("The setting of '--buffer-element-size' (or '-b') can only be\n"
                  "1, 2, or 4 (found %s)." % repr(setup.buffer_element_size))

    if setup.buffer_byte_order not in ["<system>", "little", "big"]:
        error_msg("Byte order (option --endian) must be 'little', 'big', or '<system>'.\n" + \
                  "Note that this option is only interesting for cross platform development.\n" + \
                  "By default, quex automatically chooses the endian type of your system.")

    # A manually written token class requires the token class name to be specified.
    if setup.token_class_file != "" and command_line.search("--token-class", "--tc") == False:
        error_msg("The use of a manually written token class requires that the name of the class\n"
                  "is specified on the command line via the '--token-class' option.")

    # Token queue
    if setup.token_policy != "queue" and command_line.search("--token-queue-size"):
        error_msg("Option --token-queue-size determines a fixed token queue size. This makes\n" + \
                  "only sense in conjunction with '--token-policy queue'.\n")
    if setup.token_queue_size <= setup.token_queue_safety_border + 1:
        if setup.token_queue_size == setup.token_queue_safety_border: cmp_str = "equal to"
        else:                                                         cmp_str = "less than"
        error_msg("Token queue size %i is %s token queue safety border %i + 1.\n" % \
                  (setup.token_queue_size, cmp_str, setup.token_queue_safety_border) + \
                  "Set appropriate values with --token-queue-size and --token-queue-safety-border.")

    # Check that names are valid identifiers.
    if len(setup.token_id_prefix_plain) != 0:
        __check_identifier(setup, "token_id_prefix_plain", "Token prefix")
    __check_identifier(setup, "analyzer_class_name", "Engine name")
    if setup.analyzer_derived_class_name != "":
        __check_identifier(setup, "analyzer_derived_class_name", "Derived class name")

    __check_file_name(setup, "token_class_file",            "file containing token class definition")
    __check_file_name(setup, "analyzer_derived_class_file", "file containing user derived lexer class")
    __check_file_name(setup, "token_id_foreign_definition_file",
                      "file containing user token ids", 0,
                      CommandLineOption=SETUP_INFO["token_id_foreign_definition"][0])
    __check_file_name(setup, "input_mode_files", "quex source file")

    # Check that not more than one converter is specified.
    converter_n = 0
    if setup.converter_iconv_f:                 converter_n += 1
    if setup.converter_icu_f:                   converter_n += 1
    if len(setup.converter_user_new_func) != 0: converter_n += 1
    if converter_n > 1:
        error_msg("More than one character converter has been specified. Note that the\n" + \
                  "options '--icu', '--iconv', and '--converter-new' (or '--cn') are\n" + \
                  "mutually exclusive.")
    if converter_n == 1 and setup.buffer_codec.name != "unicode":
        # If the buffer codec is other than unicode, then no converter shall
        # be used to fill the buffer. Instead, the engine is transformed so
        # that it works directly on the codec.
        error_msg("An engine that is to be generated for a specific codec cannot rely\n" + \
                  "on converters. Do not use '--codec' together with '--icu', '--iconv', or\n" + \
                  "'--converter-new'.")

    # If a converter has been specified and no buffer element size has been
    # specified, it defaults to '1 byte', which is most likely not what is
    # desired for unicode.
    if     converter_n == 1 \
       and setup.buffer_element_size == 1 \
       and not command_line_args_defined(command_line, "buffer_element_size") \
       and not command_line_args_defined(command_line, "buffer_element_type"):
        error_msg("A converter has been specified, but the default buffer element size\n" + \
                  "is left to 1 byte. Consider %s or %s." \
                  % (command_line_args_string("buffer_element_size"),
                     command_line_args_string("buffer_element_type")))

    # If a user defined type is specified for 'engine character type' together
    # with a converter, then the name of the target type must be specified explicitly.
    if     setup.buffer_element_type != "" \
       and not global_character_type_db.has_key(setup.buffer_element_type) \
       and setup.converter_ucs_coding_name == "" \
       and converter_n != 0:
        tc = setup.buffer_element_type
        error_msg("A character code converter has been specified. It is supposed to convert\n" + \
                  "incoming data into an internal buffer of unicode characters. The size of\n" + \
                  "each character is determined by '%s' which is a user defined type.\n" % tc + \
                  "\n" + \
                  "Quex cannot determine automatically the name that the converter requires\n" + \
                  "to produce unicode characters for type '%s'. It must be specified by the\n" % tc + \
                  "command line option %s." \
                  % command_line_args_string("converter_ucs_coding_name"))

    # Token transmission policy
    token_policy_list = ["queue", "single", "users_token", "users_queue"]
    if setup.token_policy not in token_policy_list:
        error_msg("Token policy '%s' not supported. Use one of the following:\n" \
                  % setup.token_policy + repr(token_policy_list)[1:-1])
    elif setup.token_policy == "users_token":
        error_msg("Token policy 'users_token' has been deprecated since 0.49.1. Use\n"
                  "the equivalent policy 'single'.")
    elif setup.token_policy == "users_queue":
        error_msg("Token policy 'users_queue' has been deprecated since 0.49.1.\n")

    # Internal engine character encoding
    def __codec_vs_buffer_element_size(CodecName, RequiredBufferElementSize):
        if   setup.buffer_codec.name != CodecName:                   return
        elif setup.buffer_element_size == RequiredBufferElementSize: return
        if setup.buffer_element_size == -1:
            msg_str = "is undetermined (found type '%s')" % setup.buffer_element_type
        else:
            msg_str = "is not %i (found %i)" % (RequiredBufferElementSize, setup.buffer_element_size)
        error_msg("Using codec '%s' while buffer element size %s.\n" % (CodecName, msg_str) + \
                  "Consult command line argument %s" \
                  % command_line_args_string("buffer_element_size"))

    if setup.buffer_codec.name != "unicode":
        if not setup.buffer_codec_file:
            verify_word_in_list(setup.buffer_codec_name,
                                codec_db.get_supported_codec_list() + ["utf8", "utf16"],
                                "Codec '%s' is not supported." % setup.buffer_codec.name)
        __codec_vs_buffer_element_size("utf8", 1)
        __codec_vs_buffer_element_size("utf16", 2)

    if setup.external_lexeme_null_object and setup.token_class_only_f:
        error_msg("Specifying an external lexeme null object signals an\n"
                  "external token class implementation. The 'token class only'\n"
                  "flag generates a token class considered to be externally\n"
                  "shared. Both flags are mutually exclusive.")

    if setup.string_accumulator_f:
        error_n = NotificationDB.warning_on_no_token_class_take_text
        if error_n in setup.suppressed_notification_list:
            error_msg("The warning upon missing 'take_text' in token type definition is de-\n"
                      "activated by '--suppress %i'. This is dangerous if there is a string\n" \
                      % error_n + \
                      "accumulator. Maybe use '--no-string-accumulator'.",
                      DontExitF=True, WarningF=True,
                      SuppressCode=NotificationDB.warning_on_no_warning_on_missing_take_text)
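# For orientation, a few command line combinations that the checks above reject.
# These are illustrative only; flag spellings are taken from the error messages
# above, the codec name is an arbitrary example:
#
#     quex ... --icu --iconv                          # more than one converter
#     quex ... --codec iso8859-7 --iconv              # codec engine plus converter
#     quex ... --token-policy single --token-queue-size 1024
#                                                     # queue size without queue policy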
def __perform_setup(command_line, argv):
    """RETURN: True,  if process needs to be started.
               False, if job is done.
    """
    global setup

    # (*) Classes and their namespace
    __setup_analyzer_class(setup)
    __setup_token_class(setup)
    __setup_token_id_prefix(setup)
    __setup_lexeme_null(setup)        # Requires 'token_class_name_space'

    # (*) Output programming language
    setup.language = setup.language.upper()
    verify_word_in_list(setup.language,
                        quex_core_engine_generator_languages_db.keys(),
                        "Programming language '%s' is not supported." % setup.language)
    setup.language_db  = quex_core_engine_generator_languages_db[setup.language]
    setup.extension_db = global_extension_db[setup.language]

    # Is the output file naming scheme provided by the extension database?
    # (Validation must happen immediately.)
    if setup.extension_db.has_key(setup.output_file_naming_scheme) == False:
        error_msg("File extension scheme '%s' is not provided for language '%s'.\n" \
                  % (setup.output_file_naming_scheme, setup.language) + \
                  "Available schemes are: %s." % repr(setup.extension_db.keys())[1:-1])

    # Before file names can be prepared, determine the output directory.
    # If 'source packaging' is enabled and no output directory is specified,
    # then take the directory of the source packaging.
    if setup.source_package_directory != "" and setup.output_directory == "":
        setup.output_directory = setup.source_package_directory

    if setup.buffer_codec in ["utf8", "utf16"]:
        setup.buffer_codec_transformation_info = setup.buffer_codec + "-state-split"
    elif setup.buffer_codec_file != "":
        try:
            setup.buffer_codec = os.path.splitext(os.path.basename(setup.buffer_codec_file))[0]
        except:
            error_msg("cannot interpret string following '--codec-file'")
        setup.buffer_codec_transformation_info = \
            codec_db.get_codec_transformation_info(FileName=setup.buffer_codec_file)
    elif setup.buffer_codec != "unicode":
        setup.buffer_codec_transformation_info = \
            codec_db.get_codec_transformation_info(setup.buffer_codec)

    if setup.buffer_codec != "unicode":
        setup.buffer_element_size_irrelevant = True

    # (*) Output files
    if setup.language not in ["DOT"]:
        prepare_file_names(setup)

    if setup.buffer_byte_order == "<system>":
        setup.buffer_byte_order = sys.byteorder
        setup.byte_order_is_that_of_current_system_f = True
    else:
        setup.byte_order_is_that_of_current_system_f = False

    if setup.buffer_element_size == "wchar_t":
        error_msg("Since Quex version 0.53.5, 'wchar_t' can no longer be specified\n"
                  "with option '--buffer-element-size' or '-bes'. Please, specify\n"
                  "'--buffer-element-type wchar_t' or '--bet'.")
    if setup.buffer_element_type == "wchar_t":
        setup.converter_ucs_coding_name = "WCHAR_T"

    make_numbers(setup)

    # (*) Determine buffer element type and size (in bytes)
    if setup.buffer_element_size == -1:
        if global_character_type_db.has_key(setup.buffer_element_type):
            setup.buffer_element_size = global_character_type_db[setup.buffer_element_type][3]
        elif setup.buffer_element_type == "":
            setup.buffer_element_size = 1
        else:
            # The buffer element type is defined, but not known to
            # 'global_character_type_db'; Quex cannot determine its size on its own.
            setup.buffer_element_size = -1

    if setup.buffer_element_type == "":
        if setup.buffer_element_size in [1, 2, 4]:
            setup.buffer_element_type = {
                1: "uint8_t", 2: "uint16_t", 4: "uint32_t",
            }[setup.buffer_element_size]
        elif setup.buffer_element_size == -1:
            pass
        else:
            error_msg("Buffer element type cannot be determined for size '%i' which\n" \
                      % setup.buffer_element_size + \
                      "has been specified by '-b' or '--buffer-element-size'.")

    setup.converter_f = False
    if setup.converter_iconv_f or setup.converter_icu_f:
        setup.converter_f = True

    # The only case where no converter helper is required is where ASCII
    # (Unicode restricted to [0, FF]) is used.
    setup.converter_helper_required_f = True
    if setup.converter_f == False and setup.buffer_element_size == 1 and setup.buffer_codec == "unicode":
        setup.converter_helper_required_f = False

    validation.do(setup, command_line, argv)

    if setup.converter_ucs_coding_name == "":
        if global_character_type_db.has_key(setup.buffer_element_type):
            if setup.buffer_byte_order == "little": index = 1
            else:                                   index = 2
            setup.converter_ucs_coding_name = \
                global_character_type_db[setup.buffer_element_type][index]

    if setup.token_id_foreign_definition_file != "":
        CommentDelimiterList = [["//", "\n"], ["/*", "*/"]]
        # Regular expression to find '#include <something>' and extract the
        # 'something' in a group. Note that '(' ')' cause the storage of parts
        # of the match.
        IncludeRE = "#[ \t]*include[ \t]*[\"<]([^\">]+)[\">]"

        parse_token_id_file(setup.token_id_foreign_definition_file,
                            setup.token_id_prefix,
                            CommentDelimiterList, IncludeRE)
        if setup.token_id_prefix_plain != setup.token_id_prefix:
            # The 'plain', namespace-less token indices are also supported.
            parse_token_id_file(setup.token_id_foreign_definition_file,
                                setup.token_id_prefix_plain,
                                CommentDelimiterList, IncludeRE)

    # (*) Compression types
    compression_type_list = []
    for name, ctype in [("compression_template_f",         E_Compression.TEMPLATE),
                        ("compression_template_uniform_f", E_Compression.TEMPLATE_UNIFORM),
                        ("compression_path_f",             E_Compression.PATH),
                        ("compression_path_uniform_f",     E_Compression.PATH_UNIFORM)]:
        if command_line_args_defined(command_line, name):
            compression_type_list.append((command_line_arg_position(name), ctype))
    compression_type_list.sort(key=itemgetter(0))
    setup.compression_type_list = map(lambda x: x[1], compression_type_list)

    # (*) return setup ____________________________________________________________
    return True
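# A quick check of what 'IncludeRE' above captures; this snippet is runnable
# on its own and only restates the regular expression's behavior:
#
#     import re
#     m = re.search("#[ \t]*include[ \t]*[\"<]([^\">]+)[\">]", '#include <my_ids.h>')
#     assert m.group(1) == "my_ids.h"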
def do(fh):
    """Parses pattern definitions of the form:

           [ \t]                                       => grid 4;
           [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

       In other words, the right hand side *must* be a character set.
    """
    indentation_setup = IndentationSetup(fh)

    # NOTE: Catching of EOF happens in the caller: parse_section(...)
    #
    skip_whitespace(fh)

    while True:
        skip_whitespace(fh)

        if check(fh, ">"):
            indentation_setup.seal()
            indentation_setup.consistency_check(fh)
            return indentation_setup

        # A regular expression state machine
        pattern_str, pattern = regular_expression.parse(fh)

        skip_whitespace(fh)
        if not check(fh, "=>"):
            error_msg("Missing '=>' after character set definition.", fh)

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            error_msg("Missing identifier for indentation element definition.", fh)

        verify_word_in_list(identifier,
                            ["space", "grid", "bad", "newline", "suppressor"],
                            "Unrecognized indentation specifier '%s'." % identifier, fh)

        trigger_set = None
        if identifier in ["space", "bad", "grid"]:
            if len(pattern.sm.states) != 2:
                error_msg("For indentation '%s' only patterns are admissible which\n" % identifier + \
                          "can be matched by a single character, e.g. \" \" or [a-z].", fh)
            transition_map = pattern.sm.get_init_state().transitions().get_map()
            assert len(transition_map) == 1
            trigger_set = transition_map.values()[0]

        skip_whitespace(fh)
        if identifier == "space":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_space(pattern_str, trigger_set, value, fh)
            else:
                # Not a number received; is it an identifier?
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_space(pattern_str, trigger_set, variable, fh)
                else:
                    indentation_setup.specify_space(pattern_str, trigger_set, 1, fh)
        elif identifier == "grid":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_grid(pattern_str, trigger_set, value, fh)
            else:
                # Not a number received; is it an identifier?
                skip_whitespace(fh)
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_grid(pattern_str, trigger_set, variable, fh)
                else:
                    error_msg("Missing integer or variable name after keyword 'grid'.", fh)
        elif identifier == "bad":
            indentation_setup.specify_bad(pattern_str, trigger_set, fh)
        elif identifier == "newline":
            indentation_setup.specify_newline(pattern_str, pattern.sm, fh)
        elif identifier == "suppressor":
            indentation_setup.specify_suppressor(pattern_str, pattern.sm, fh)
        else:
            assert False, "Unreachable code reached."

        if not check(fh, ";"):
            error_msg("Missing ';' after indentation '%s' specification." % identifier, fh)
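# For illustration, a complete indentation option body that the loop above
# accepts. The concrete patterns and counts are hypothetical; only the
# specifier keywords ('space', 'grid', 'bad', 'newline', 'suppressor') are
# fixed by the parser:
#
#     <indentation:
#          [ ]     => space 1;
#          [\t]    => grid  4;
#          "\\\n"  => suppressor;
#          "\n"    => newline;
#     >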