def test_on_UCS_sample_sets(Trafo, unicode_to_transformed_sequence): script_list = [ "Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille", "Buginese", "Buhid", "Canadian_Aboriginal", "Cherokee", "Common", "Cuneiform", "Cypriot", "Deseret", "Gothic", "Greek", "Hanunoo", "Hebrew", "Hiragana", "Inherited", "Kannada", "Han", "Katakana", "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B", "Malayalam", "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Osmanya", "Ogham", "Old_Italic", "Old_Persian", "Phoenician", "Shavian", "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil", "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh", "Ugaritic", "Yi" ] sets = [X(name) for name in script_list] orig = combination.do(map(lambda x: x.sm, sets)) state_n_before, result = transform(Trafo, orig) # print result.get_graphviz_string(Option="hex") for set in sets: set.check(result, unicode_to_transformed_sequence) print "Translated %i groups without abortion on error (OK)" % len(sets) union = NumberSet() for nset in map(lambda set: set.charset, sets): union.unite_with(nset) inverse_union = NumberSet(Interval(0, 0x110000)) inverse_union.subtract(union) # print inverse_union.get_string(Option="hex") check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True), unicode_to_transformed_sequence)
def general_checks(loop_map, appendix_sm_list): print "#_[ Checks ]__________________________________________________" print print "character sets do not intersect", all_set = NumberSet() for lei in loop_map: assert lei.character_set is not None assert not lei.character_set.has_intersection(all_set) all_set.unite_with(lei.character_set) print "[ok]" print "count actions do not appear more than once", count_action_couple_set = set() count_action_plain_set = set() exit_exists_f = False appendix_sm_id_set = set() for lei in loop_map: if lei.count_action is None: assert lei.appendix_sm_id is None exit_exists_f = True elif lei.appendix_sm_id is None: assert lei.incidence_id not in count_action_plain_set count_action_plain_set.add(lei.incidence_id) else: assert lei.incidence_id not in count_action_couple_set count_action_couple_set.add(lei.incidence_id) appendix_sm_id_set.add(lei.appendix_sm_id) print "[ok]" list_id_set = set(sm.get_id() for sm in appendix_sm_list) assert appendix_sm_id_set == list_id_set print "appendix sm-ids are the same in loop map and sm list: [ok]" print "exit character set exits: [%s]" % exit_exists_f print
def general_checks(loop_map, appendix_sm_list): print "#_[ Checks ]__________________________________________________" print print "character sets do not intersect", all_set = NumberSet() for lei in loop_map: assert lei.character_set is not None assert not lei.character_set.has_intersection(all_set) all_set.unite_with(lei.character_set) print "[ok]" print "count actions do not appear more than once", count_action_couple_set = set() count_action_plain_set = set() appendix_sm_id_set = set() print "[ok]" ## if "Split" in sys.argv or "Plain" in sys.argv: ## list_id_set = set(sm.get_id() for sm in appendix_sm_list) ## assert appendix_sm_id_set == list_id_set ## print "appendix sm-ids are the same in loop map and sm list: [ok]" print "exit character set exits: [%s]" % any(lei.aux_count_action is None for lei in loop_map) print
def test_on_UCS_sample_sets(Trafo, unicode_to_transformed_sequence): script_list = [ "Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille", "Buginese", "Buhid", "Canadian_Aboriginal", "Cherokee", "Common", "Cuneiform", "Cypriot", "Deseret", "Gothic", "Greek", "Hanunoo", "Hebrew", "Hiragana", "Inherited", "Kannada", "Han", "Katakana", "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B", "Malayalam", "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Osmanya", "Ogham", "Old_Italic", "Old_Persian", "Phoenician", "Shavian", "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil", "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh", "Ugaritic", "Yi" ] sets = [ X(name) for name in script_list ] orig = get_combined_state_machine(map(lambda x: x.sm, sets)) state_n_before, result = transform(Trafo, orig) # print result.get_graphviz_string(Option="hex") for set in sets: set.check(result, unicode_to_transformed_sequence) print "Translated %i groups without abortion on error (OK)" % len(sets) union = NumberSet() for nset in map(lambda set: set.charset, sets): union.unite_with(nset) inverse_union = NumberSet(Interval(0, 0x110000)) inverse_union.subtract(union) # print inverse_union.get_string(Option="hex") check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True), unicode_to_transformed_sequence)
def general_checks(loop_map, appendix_sm_list): print "#_[ Checks ]__________________________________________________" print print "character sets do not intersect", all_set = NumberSet() for lei in loop_map: assert lei.character_set is not None assert not lei.character_set.has_intersection(all_set) all_set.unite_with(lei.character_set) print "[ok]" print "count actions do not appear more than once", count_action_couple_set = set() count_action_plain_set = set() exit_exists_f = False appendix_sm_id_set = set() for lei in loop_map: if lei.count_action is None: assert lei.appendix_sm_id is None exit_exists_f = True elif lei.appendix_sm_id is None: assert lei.incidence_id not in count_action_plain_set count_action_plain_set.add(lei.incidence_id) else: assert lei.incidence_id not in count_action_couple_set count_action_couple_set.add(lei.incidence_id) appendix_sm_id_set.add(lei.appendix_sm_id) print "[ok]" list_id_set = set(sm.get_id() for sm in appendix_sm_list) assert appendix_sm_id_set == list_id_set print "appendix sm-ids are the same in loop map and sm list: [ok]" print "exit character set exits: [%s]" % exit_exists_f print
def __get_remaining_set(self):
    """Return the complement, with respect to the buffer codec's source set,
    of the union of all mapped character sets.

    Character sets whose count type is BAD, BEGIN_NEWLINE_SUPPRESSOR,
    BEGIN_NEWLINE or END_NEWLINE are left out of the union.
    """
    skipped_types = (E_CharacterCountType.BAD,
                     E_CharacterCountType.BEGIN_NEWLINE_SUPPRESSOR,
                     E_CharacterCountType.BEGIN_NEWLINE,
                     E_CharacterCountType.END_NEWLINE)

    occupied = NumberSet()
    for character_set, info in self.__map:
        if info.cc_type not in skipped_types:
            occupied.unite_with(character_set)

    return occupied.get_complement(Setup.buffer_codec.source_set)
def get_ending_character_set(self):
    """Collect the characters on which some state transits to an acceptance
    state.

    Used to detect whether a newline or suppressor ends with an indentation
    character (grid or space).

    RETURNS: NumberSet -- union of the trigger sets of all transitions that
                          target an acceptance state.
    """
    ending_set = NumberSet()
    for acceptance_index in self.get_acceptance_state_index_list():
        for candidate in self.states.itervalues():
            target_map = candidate.target_map
            # Comparison kept as '== False' to leave behavior untouched.
            if target_map.has_target(acceptance_index) == False:
                continue
            trigger_set = target_map.get_trigger_set_to_target(acceptance_index)
            ending_set.unite_with(trigger_set)
    return ending_set
def _assert_consistency(self):
    """Assert that the entries of this container are well-formed:

    -- no entry is None,
    -- every entry has a character set,
    -- every entry has an 'iid_couple_terminal' or a 'code' (or both),
    -- the character sets of the entries are pairwise disjoint.
    """
    assert all(entry is not None for entry in self)
    assert all(entry.character_set is not None for entry in self)
    assert all((entry.iid_couple_terminal is not None) or (entry.code is not None)
               for entry in self)

    # Transition triggers must not intersect: compare each set against the
    # union of all sets seen before it.
    seen = NumberSet()
    for entry in self:
        assert not entry.character_set.has_intersection(seen)
        seen.unite_with(entry.character_set)
def __wildcard_value_match(self, WildCardValue):
    """Unite the code point sets of all property values that
    'get_wildcard_value_matches()' reports for 'WildCardValue'.

    RETURNS: None      -- if no value matches.
             NumberSet -- freshly built union otherwise.
    """
    union = NumberSet()
    matching_values = self.get_wildcard_value_matches(WildCardValue)
    if len(matching_values) == 0:
        return None

    for matched in matching_values:
        union.unite_with(NumberSet(self.code_point_db[matched]))

    # The union is a new object on every call => no decoupling required.
    return union
def __wildcard_value_match(self, WildCardValue):
    """Unite the code point sets of all property values that
    'get_wildcard_value_matches()' reports for 'WildCardValue'.

    RETURNS: None      -- if no value matches.
             NumberSet -- freshly built union otherwise.
    """
    union = NumberSet()
    matching_values = self.get_wildcard_value_matches(WildCardValue)
    if len(matching_values) == 0:
        return None

    for matched in matching_values:
        union.unite_with(NumberSet(self.code_point_db[matched]))

    # The union is a new object on every call => no decoupling required.
    return union
def get_ending_character_set(self):
    """Collect the characters on which some state transits to an acceptance
    state.

    Used to detect whether a newline or suppressor ends with an indentation
    character (grid or space).

    RETURNS: NumberSet -- union of the trigger sets of all transitions that
                          target an acceptance state.
    """
    ending_set = NumberSet()
    for acceptance_index in self.get_acceptance_state_index_list():
        for candidate in self.states.itervalues():
            target_map = candidate.target_map
            # Comparison kept as '== False' to leave behavior untouched.
            if target_map.has_target(acceptance_index) == False:
                continue
            trigger_set = target_map.get_trigger_set_to_target(acceptance_index)
            ending_set.unite_with(trigger_set)
    return ending_set
def __whitespace_default(self):
    """Build the default whitespace set from space and tabulator.

    Each of the two characters is only taken if 'find_occupier()' of the
    count operation map reports no occupier for it. If neither can be
    taken, an error is logged.

    RETURNS: NumberSet -- the characters that could be taken.
    """
    candidates = (NumberSet(ord(" ")), NumberSet(ord("\t")))

    whitespace = NumberSet()
    for candidate in candidates:
        occupier = self.specifier_count_op_map.find_occupier(candidate, set())
        if not occupier:
            whitespace.unite_with(candidate)

    if whitespace.is_empty():
        error.log("Trying to implement default whitespace ' ' or '\\t' failed.\n"
                  "Characters are occupied by other elements.", self.sr)
    return whitespace
def is_DFA_compliant(self):
    """Tell whether the transitions of this map may be part of a
    deterministic finite automaton (DFA).

    RETURNS: True  => OK.
             False => There are epsilon transitions, or the trigger sets
                      of two different targets intersect.
    """
    # A DFA has no epsilon transitions.
    if len(self.__epsilon_target_index_list) > 0:
        return False

    # No character may trigger to more than one target: compare each
    # trigger set against the union of those seen before.
    seen = NumberSet()
    for trigger_set in self.__db.itervalues():
        if seen.has_intersection(trigger_set):
            return False
        seen.unite_with(trigger_set)
    return True
def is_DFA_compliant(self):
    """Tell whether the transitions of this map may be part of a
    deterministic finite automaton (DFA).

    RETURNS: True  => OK.
             False => There are epsilon transitions, or the trigger sets
                      of two different targets intersect.
    """
    # A DFA has no epsilon transitions.
    if len(self.__epsilon_target_index_list) > 0:
        return False

    # No character may trigger to more than one target: compare each
    # trigger set against the union of those seen before.
    seen = NumberSet()
    for trigger_set in self.__db.itervalues():
        if seen.has_intersection(trigger_set):
            return False
        seen.unite_with(trigger_set)
    return True
for cmd in result.states[s_idx].single_entry: assert not cmd.is_acceptance() print " (OK)" sets = map(lambda name: X(name), ["Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille", "Hanunoo", "Hebrew", "Hiragana", "Inherited", "Kannada", "Katakana", "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B", "Malayalam", "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Ogham", "Old_Italic", "Old_Persian", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil", "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh", "Ugaritic", "Yi"]) orig = get_combined_state_machine(map(lambda x: x.sm, sets)) print "Number of states in state machine:" print " Unicode: %i" % len(orig.states) result = trafo.do(orig) print " UTF8-Splitted: %i" % len(result.states) for set in sets: set.check(result) union = NumberSet() for nset in map(lambda set: set.charset, sets): union.unite_with(nset) inverse_union = NumberSet(Interval(0, 0x110000)) inverse_union.subtract(union) # print inverse_union.get_string(Option="hex") check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True))
def get_character_set(self, Value=None):
    """Returns the character set that corresponds to 'Property==Value'.

    'Value' can be a property value or a property value alias. Spaces in
    'Value' are replaced by underscores before any lookup. For binary
    properties 'Value' must be None.

    RETURNS: On success, a deep copy of the character set (so that the
             internal database cannot be modified through the result).
             On failure, a string containing the error message.
    """
    assert self.type != "Binary" or Value is None

    def get_value_combination(CmbAlias):
        """RETURNS: list of value names for the aliases combined under
                    'CmbAlias'; or an error string, if an alias has no name.
        """
        names = []
        for alias in self.alias_to_alias_combination_db[CmbAlias]:
            name = self.alias_to_name_map.get(alias)
            if name is None:
                return "Unicode database error: no name related to alias '%s'" % alias
            names.append(name)
        return names

    if self.type != "Binary" and Value is None:
        return "Property '%s' requires a value setting.\n" % self.name + \
               "Possible Values: " + \
               self.get_value_list_help()

    if self.code_point_db is None:
        self.init_code_point_db()

    if self.type == "Binary":
        # Decouple, since we refer to an internal database
        return deepcopy(self.code_point_db)

    # All lookups use the adapted value. Before, membership was partly
    # tested with 'Value' while the lookup used 'adapted_value', which
    # ended in a KeyError whenever the two differed.
    adapted_value = Value.replace(" ", "_")
    if adapted_value in self.code_point_db:
        # 'value' is present as name in the code point database
        value = adapted_value
    elif adapted_value in self.alias_to_name_map:
        # 'value' is present as alias in code pointer database
        value = self.alias_to_name_map[adapted_value]
    elif adapted_value in self.alias_to_alias_combination_db:
        # 'value' is present as a combination of aliases
        value = get_value_combination(adapted_value)
        if not isinstance(value, list):
            # Error message: report as is, do not use it as a value name.
            return value
    elif adapted_value in self.name_to_alias_map:
        # The value was a combination of values
        value = get_value_combination(self.name_to_alias_map[adapted_value])
        if not isinstance(value, list):
            # Error message: report as is, do not use it as a value name.
            return value
    else:
        # -- WILDCARD MATCH: Results in a list of property values
        character_set = self.__wildcard_value_match(adapted_value)
        if character_set is None:
            return "Property '%s' cannot have a value or value alias '%s'.\n" % (self.name, Value) + \
                   "Possible Values: " + \
                   self.get_value_list_help()
        # No need to decouple, since character is not a reference to
        # internal database (for safety, do it)
        return deepcopy(character_set)

    if isinstance(value, list):
        result = NumberSet()
        for element in value:
            if element == "Unassigned":
                continue
            entry = self.code_point_db.get(element)
            if entry is None:
                return "%s/%s is not supported by Unicode database." % (self.name, repr(element))
            result.unite_with(entry)
    else:
        result = self.code_point_db.get(value)
        if result is None:
            return "%s/%s is not supported by Unicode database." % (self.name, repr(value))

    # Reference to internal database --> decouple with 'deepcopy'
    return deepcopy(result)
def covers(self, Min, Max):
    """Unite the character sets of all entries in the map and return what
    'covers_range(Min, Max)' reports for that union.
    """
    union = NumberSet()
    for entry in self.__map:
        union.unite_with(entry.character_set)
    covered_f = union.covers_range(Min, Max)
    return covered_f
def _get_all_character_set(*DbList):
    """Return a new NumberSet that is the union of all values (character
    sets) of all databases given as arguments.
    """
    union = NumberSet()
    for database in DbList:
        for member_set in database.itervalues():
            union.unite_with(member_set)
    return union
def _get_all_character_set(*DbList):
    """Return a new NumberSet that is the union of all values (character
    sets) of all databases given as arguments.
    """
    union = NumberSet()
    for database in DbList:
        for member_set in database.itervalues():
            union.unite_with(member_set)
    return union
def covers(self, Min, Max):
    """Unite the character sets of all entries in the map and return what
    'covers_range(Min, Max)' reports for that union.
    """
    union = NumberSet()
    for entry in self.__map:
        union.unite_with(entry.character_set)
    covered_f = union.covers_range(Min, Max)
    return covered_f
def get_trigger_set_union(self):
    """Return a new NumberSet that is the union of all trigger sets stored
    in the transition database.
    """
    union = NumberSet()
    for member_set in self.__db.itervalues():
        union.unite_with(member_set)
    return union
"Buhid", "Canadian_Aboriginal", "Cherokee", "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Yi", ]) orig = get_combined_state_machine(map(lambda x: x.sm, sets)) print "# Number of states in state machine:" print "# Unicode: %i" % len(orig.states) result = trafo.do(orig) print "# UTF8-Splitted: %i" % len(result.states) # print result.get_graphviz_string(Option="hex") for set in sets: set.check(result) union = NumberSet() for nset in map(lambda set: set.charset, sets): union.unite_with(nset) inverse_union = NumberSet(Interval(0, 0x110000)) inverse_union.subtract(union) # print inverse_union.get_string(Option="hex") check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True))
def get_character_set(self, Value=None):
    """Returns the character set that corresponds to 'Property==Value'.

    'Value' can be a property value or a property value alias. Spaces in
    'Value' are replaced by underscores before any lookup. For binary
    properties 'Value' must be None.

    RETURNS: On success, a deep copy of the character set (so that the
             internal database cannot be modified through the result).
             On failure, a string containing the error message.
    """
    assert self.type != "Binary" or Value is None

    def get_value_combination(CmbAlias):
        """RETURNS: list of value names for the aliases combined under
                    'CmbAlias'; or an error string, if an alias has no name.
        """
        names = []
        for alias in self.alias_to_alias_combination_db[CmbAlias]:
            name = self.alias_to_name_map.get(alias)
            if name is None:
                return "Unicode database error: no name related to alias '%s'" % alias
            names.append(name)
        return names

    if self.type != "Binary" and Value is None:
        return "Property '%s' requires a value setting.\n" % self.name + \
               "Possible Values: " + \
               self.get_value_list_help()

    if self.code_point_db is None:
        self.init_code_point_db()

    if self.type == "Binary":
        # Decouple, since we refer to an internal database
        return deepcopy(self.code_point_db)

    # All lookups use the adapted value. Before, membership was partly
    # tested with 'Value' while the lookup used 'adapted_value', which
    # ended in a KeyError whenever the two differed.
    adapted_value = Value.replace(" ", "_")
    if adapted_value in self.code_point_db:
        # 'value' is present as name in the code point database
        value = adapted_value
    elif adapted_value in self.alias_to_name_map:
        # 'value' is present as alias in code pointer database
        value = self.alias_to_name_map[adapted_value]
    elif adapted_value in self.alias_to_alias_combination_db:
        # 'value' is present as a combination of aliases
        value = get_value_combination(adapted_value)
        if not isinstance(value, list):
            # Error message: report as is, do not use it as a value name.
            return value
    elif adapted_value in self.name_to_alias_map:
        # The value was a combination of values
        value = get_value_combination(self.name_to_alias_map[adapted_value])
        if not isinstance(value, list):
            # Error message: report as is, do not use it as a value name.
            return value
    else:
        # -- WILDCARD MATCH: Results in a list of property values
        character_set = self.__wildcard_value_match(adapted_value)
        if character_set is None:
            return "Property '%s' cannot have a value or value alias '%s'.\n" % (self.name, Value) + \
                   "Possible Values: " + \
                   self.get_value_list_help()
        # No need to decouple, since character is not a reference to
        # internal database (for safety, do it)
        return deepcopy(character_set)

    if isinstance(value, list):
        result = NumberSet()
        for element in value:
            if element == "Unassigned":
                continue
            entry = self.code_point_db.get(element)
            if entry is None:
                return "%s/%s is not supported by Unicode database." % (self.name, repr(element))
            result.unite_with(entry)
    else:
        result = self.code_point_db.get(value)
        if result is None:
            return "%s/%s is not supported by Unicode database." % (self.name, repr(value))

    # Reference to internal database --> decouple with 'deepcopy'
    return deepcopy(result)