def add_transition(self, Trigger, TargetStateIdx):
    """Adds a transition according to trigger and target index.

    Trigger may be: None          -- shorthand for 'all remaining triggers',
                                     i.e. the inverse of the current union.
                    int/long      -- a single character code.
                    list          -- interval border list for a NumberSet.
                    Interval      -- a character code interval.
                    NumberSet     -- a set of character codes.

    RETURNS: The target state index (may be created newly).
    """
    # TargetStateIdx is expected as 'long' (state indices) or None.
    assert type(TargetStateIdx) == long or TargetStateIdx == None
    assert Trigger.__class__ in [int, long, list, Interval, NumberSet ] or Trigger == None
    if Trigger == None:  # This is a shorthand to trigger via the remaining triggers
        Trigger = self.get_trigger_set_union().inverse()
    elif type(Trigger) == long:
        # Interval() takes plain ints; convert the 'long' borders.
        Trigger = Interval(int(Trigger), int(Trigger + 1))
    elif type(Trigger) == int:
        Trigger = Interval(Trigger, Trigger + 1)
    elif type(Trigger) == list:
        # ArgumentIsYoursF=True: the NumberSet takes ownership, no copy.
        Trigger = NumberSet(Trigger, ArgumentIsYoursF=True)
    if Trigger.__class__ == Interval:
        if self.__db.has_key(TargetStateIdx):
            self.__db[TargetStateIdx].add_interval(Trigger)
        else:
            self.__db[TargetStateIdx] = NumberSet(Trigger, ArgumentIsYoursF=True)
    else:
        # Trigger is a NumberSet; unite it with any existing trigger set.
        if self.__db.has_key(TargetStateIdx):
            self.__db[TargetStateIdx].unite_with(Trigger)
        else:
            self.__db[TargetStateIdx] = Trigger
    return TargetStateIdx
def seal(self):
    """Finalizes the indentation setup.

    -- If neither spaces nor grids were specified, install the defaults
       ' ' (space count 1) and '\t' (grid width 4), unless those characters
       were declared 'bad'. If both defaults are 'bad', report an error.
    -- If no newline pattern was specified, build a default state machine
       matching '(\r\n)|(\n)'.
    """
    if len(self.space_db) == 0 and len(self.grid_db) == 0:
        default_space = ord(' ')
        default_tab = ord('\t')
        bad = self.bad_character_set
        # Only install a default if it is not declared as a 'bad' character.
        if bad.get().contains(default_space) == False:
            self.specify_space("[ ]", NumberSet(default_space), 1, self.fh)
        if bad.get().contains(default_tab) == False:
            self.specify_grid("[\\t]", NumberSet(default_tab), 4, self.fh)
        # Still empty => both defaults were 'bad'; the user must specify.
        if len(self.space_db) == 0 and len(self.grid_db) == 0:
            error_msg("No space or grid defined for indentation counting. Default\n"
                      "values ' ' and '\\t' could not be used since they are specified as 'bad'.",
                      bad.file_name, bad.line_n)
    if self.newline_state_machine.get() == None:
        # Default newline: '\n' accepts directly; '\r' requires a following '\n'.
        sm = StateMachine()
        end_idx = sm.add_transition(sm.init_state_index, NumberSet(ord('\n')), AcceptanceF=True)
        mid_idx = sm.add_transition(sm.init_state_index, NumberSet(ord('\r')), AcceptanceF=False)
        sm.add_transition(mid_idx, NumberSet(ord('\n')), end_idx, AcceptanceF=False)
        self.specify_newline("(\\r\\n)|(\\n)", sm, self.fh)
def __indentation_add(Info):
    """Generate C code fragments that add up the indentation count 'I'.

    Returns "" when all involved characters count as single spaces, since
    then the column can be computed simply as 'end - begin'.
    """
    # (0) If all involved counts are single spaces, the 'counting' can be done
    #     easily by subtracting 'end - begin', no adaption.
    if Info.has_only_single_spaces():
        return ""

    indent_txt = " " * 16

    def _emit(code, CharSet, Operation):
        # One 'if( <condition> ) { <operation> }' line per character set.
        code.append(indent_txt + "if( ")
        __condition(code, CharSet)
        code.append(" ) { ")
        code.append(Operation)
        code.append(" }\\\n")

    # Sort equal space counts together, and equal grid counts together.
    # Grid counts are indicated by a negative integer count value.
    spaces_db = {}
    grid_db = {}
    for name, count_parameter in Info.count_db.items():
        step = count_parameter.get()
        if step == "bad":
            continue
        char_set = Info.character_set_db[name].get()
        target_db = spaces_db if step >= 0 else grid_db
        target_db.setdefault(step, NumberSet()).unite_with(char_set)

    code = []
    for step, char_set in spaces_db.items():
        _emit(code, char_set, "(I) += %i;" % step)
    for step, char_set in grid_db.items():
        _emit(code, char_set, "(I) += (%i - ((I) %% %i));" % (abs(step), abs(step)))

    return "".join(code)
def __wildcard_value_match(self, WildCardValue):
    """Unite the code point sets of all values matching the wildcard
    expression 'WildCardValue'. Returns None if nothing matches.
    """
    value_list = self.get_wildcard_value_matches(WildCardValue)
    if value_list == []:
        return None
    union = NumberSet()
    for matching_value in value_list:
        union.unite_with(NumberSet(self.code_point_db[matching_value]))
    return union
def indentation_count_character_set(self):
    """Returns the superset of all characters that are involved in
    indentation counting. That is the set of character that can appear
    between newline and the first non whitespace character.
    """
    superset = NumberSet()
    # Spaces and grid characters both contribute to the superset.
    for db in (self.space_db, self.grid_db):
        for parameter in db.values():
            superset.unite_with(parameter.get())
    return superset
def __wildcard_value_match(self, WildCardValue):
    """Build the union of code point sets for every value that matches
    the wildcard expression. None signals 'no match at all'.
    """
    matches = self.get_wildcard_value_matches(WildCardValue)
    if matches == []:
        return None
    accumulator = NumberSet()
    # No decoupling, since the accumulator is computed fresh on every call.
    for value in matches:
        accumulator.unite_with(NumberSet(self.code_point_db[value]))
    return accumulator
def load_UnicodeData(self): fh = open_data_base_file("UnicodeData.txt") # some rows contain aliases, so they need to get converted into values property_general_category = self.db["gc"] property_bidi_class = self.db["bc"] def convert(Property, ValueAlias): """Convert specified ValueAlias to Value of the given property.""" if Property.alias_to_name_map.has_key(ValueAlias): return Property.alias_to_name_map[ValueAlias] return ValueAlias names_db = {} general_category_db = {} bidi_class_db = {} numeric_value_db = {} names_uc1_db = {} iso_comment_db = {} for line in fh.readlines(): if line.find("#") != -1: line = line[:line.find("#")] if line == "" or line.isspace(): continue x = line.split(";") code_point = int("0x" + x[0].strip(), 16) # CodePointIdx = 0 name = x[1].strip().replace(" ", "_") # NameIdx = 1 general_category = x[2].strip().replace(" ", "_") # GeneralCategoryIdx = 2 general_category = convert(property_general_category, general_category) bidi_class = x[4].strip().replace(" ", "_") # BidiClassIdx = 4 bidi_class = convert(property_bidi_class, bidi_class) numeric_value = x[6].strip() # NumericValueIdx = 6 uc1_name = x[10].strip().replace(" ", "_") # NameUC1Idx = 10 iso_comment = x[11].strip().replace(" ", "_") # ISO_CommentIdx = 11 names_db[name] = code_point general_category_db.setdefault(general_category, NumberSet()).quick_append_value(code_point) bidi_class_db.setdefault (bidi_class, NumberSet()).quick_append_value(code_point) numeric_value_db.setdefault (numeric_value, NumberSet()).quick_append_value(code_point) names_uc1_db[uc1_name] = code_point iso_comment_db[iso_comment] = str(code_point) self.db["na"].code_point_db = names_db # Name self.db["gc"].code_point_db = general_category_db # General Category self.db["bc"].code_point_db = bidi_class_db # BidiClass self.db["nv"].code_point_db = numeric_value_db # Numeric Value self.db["na1"].code_point_db = names_uc1_db # Name Unicode 1 self.db["isc"].code_point_db = iso_comment_db # ISO_Comment
def plug_wildcard(self, WildcardPlug):
    """Plug the wildcard character into the transition towards target
    state 'WildcardPlug'. A value of -1 means: no plugging required.
    After plugging the wildcard is consumed (set to None).
    """
    assert isinstance(WildcardPlug, (int, long))
    if WildcardPlug == -1:
        return  # Nothing to plug.
    wildcard_set = NumberSet(self.__wildcard)
    if WildcardPlug in self.__skeleton:
        self.__skeleton[WildcardPlug].unite_with(wildcard_set)
    else:
        self.__skeleton[WildcardPlug] = wildcard_set
    self.__skeleton_key_set.add(WildcardPlug)
    self.__wildcard = None  # There is no more wildcard now
    return
def get_trigger_set_to_target(self, TargetIdx):
    """Returns all triggers that lead to target 'TargetIdx'. If a trigger
    'None' is returned it means that the epsilon transition triggers to
    target state. If the TargetIndex is omitted the set of all triggers,
    except the epsilon triggers, are returned.
    """
    if TargetIdx in self.__db:
        return self.__db[TargetIdx]
    # No transition to that target => empty trigger set.
    return NumberSet()
def __enter_number_set(db, Key, Value):
    """Append interval 'Value' (an Interval or a single int) to the
    NumberSet stored in 'db' under 'Key', creating the entry on demand.
    """
    value_type = Value.__class__.__name__
    assert value_type in ["Interval", "int"]
    if value_type == "int":
        # A bare code point becomes a one-element interval.
        Value = Interval(Value)
    if Key in db:
        db[Key].quick_append_interval(Value, SortF=False)
    else:
        db[Key] = NumberSet(Value)
def is_DFA_compliant(self):
    """Checks if the current state transitions are DFA compliant, i.e. it
    investigates if trigger sets pointing to different targets intersect.

    RETURN: True  => OK
            False => Same triggers point to different target. This cannot be
                     part of a deterministic finite automaton (DFA).
    """
    # DFA's do not have epsilon transitions.
    if len(self.__epsilon_target_index_list) != 0:
        return False
    # Accumulate trigger sets; any overlap with what was seen so far
    # means two different targets share a trigger.
    accumulated = NumberSet()
    for trigger_set in self.__db.values():
        if accumulated.has_intersection(trigger_set):
            return False
        accumulated.unite_with(trigger_set)
    return True
def __display_set(CharSet, cl):
    """Prints the characters of 'CharSet' according to the command line
    options in 'cl': hex vs. utf8 display, and one of three layouts
    (intervals, character names, or single characters).
    """
    if cl.search("--numeric"):
        display = "hex"
    else:
        display = "utf8"
    # Clip to the valid unicode code point range [0, 0x110000).
    CharSet.intersect_with(NumberSet(Interval(0, 0x110000)))
    print "Characters:\n",
    if cl.search("--intervals"):
        __print_set_in_intervals(CharSet, display, 80)
    elif cl.search("--names"):
        __print_set_character_names(CharSet, display, 80)
    else:
        __print_set_single_characters(CharSet, display, 80)
    print
def load_Composition_Exclusion(self):
    """Parse 'CompositionExclusions.txt' and store the excluded code
    points as a NumberSet under property 'CE'.
    """
    exclusion_set = NumberSet()
    for row in parse_table("CompositionExclusions.txt"):
        # Column 0 holds the hexadecimal code point.
        code_point = int("0x" + row[0], 16)
        exclusion_set.quick_append_interval(Interval(code_point, code_point + 1))
    exclusion_set.clean()
    self.db["CE"].code_point_db = exclusion_set
def load_Composition_Exclusion(self):
    """Parse 'CompositionExclusions.txt' (column 0 pre-parsed as numbers)
    and store the excluded code points under property 'CE'.
    """
    # Column 0 contains what is interesting ...
    table = parse_table("CompositionExclusions.txt", NumberColumnList=[0])
    result = NumberSet()
    for row in table:
        code_point = row[0]
        result.quick_append_interval(Interval(code_point, code_point + 1))
    result.clean()
    self.db["CE"].code_point_db = result
def __init__(self, fh=-1):
    """Set up indentation parameters; when a file handle 'fh' is given,
    remember its name and current line for error reporting.
    """
    self.fh = fh
    if fh == -1:
        self.file_name = "no file handle"
        self.line_n = -1
    else:
        self.file_name = fh.name
        self.line_n = get_current_line_info_number(fh)
    self.space_db = {}  # Maps: space width --> character_set
    self.grid_db = {}   # Maps: grid width  --> character_set
    self.bad_character_set = LocalizedParameter("bad", NumberSet())
    self.newline_state_machine = LocalizedParameter("newline", None)
    self.newline_suppressor_state_machine = LocalizedParameter("suppressor", None)
    self.__containing_mode_name = ""
def contains_only_spaces(state_machine):
    """Determines wether there are only spaces on the way to the
    acceptance state.
    """
    space = ord(' ')
    for state in state_machine.states.values():
        transitions = state.transitions()
        # (1) if a pattern contains only ' ', then there is no place for more
        #     than one target state, since every state has only one trigger
        #     and one target state
        if len(transitions.get_target_state_index_list()) > 1:
            return False
        # (2) does state exclusively trigger on ' '?
        trigger_union = transitions.get_trigger_set_union()
        # (2a) does state trigger on ' '?
        if trigger_union.contains(space) == False:
            return False
        # (2b) does state trigger on nothing else?
        if trigger_union.difference(NumberSet(space)).is_empty() == False:
            return False
    return True
def convert_table_to_associative_map(table, ValueColumnIdx, ValueType, KeyColumnIdx):
    """Produces a dictionary that maps from 'keys' to NumberSets. The
    number sets represent the code points for which the key (property)
    is valid.

    ValueColumnIdx: Column that contains the character code interval or
                    string to which one wishes to map.
    KeyColmnIdx:    Column that contains the 'key' to be used for the map

    RETURNS: the resulting associative map.
    """
    db = {}
    if ValueType == "NumberSet":
        for record in table:
            key = record[KeyColumnIdx].strip().replace(" ", "_")
            value = record[ValueColumnIdx]
            if type(value) == int:
                # A single code point becomes a one-element interval.
                value = Interval(value)
            db.setdefault(key, NumberSet()).quick_append_interval(value, SortF=False)
        # The content is a number set; it might be simplified, try it.
        for number_set in db.values():
            number_set.clean()
    elif ValueType in ("number", "string"):
        for record in table:
            key = record[KeyColumnIdx].strip().replace(" ", "_")
            db[key] = record[ValueColumnIdx]
    else:
        raise BaseException("ValueType = '%s' unknown.\n" % ValueType)
    return db
def get_supported_unicode_character_set(CodecAlias, FH=-1, LineN=None):
    """Returns the set of unicode characters for which the given codec
    defines a transformation.
    """
    result = NumberSet()
    transformation_info = get_codec_transformation_info(CodecAlias, FH, LineN)
    # Only the source range matters here; the target begin is irrelevant.
    for source_begin, source_end, dummy in transformation_info:
        result.add_interval(Interval(source_begin, source_end))
    return result
def match_skeleton(self, TransitionMap, TargetIdx, TriggerCharToTarget):
    """A single character transition

                        TriggerCharToTarget --> TargetIdx

    has been detected. The question is, if the remaining transitions of
    the state match the skeleton of the current path. There might be a
    wildcard, that is the character that is overlayed by the first single
    character transition. As long as a transition map is differs only by
    this single character, the wildcard is plugged into the position.

    RETURNS: int > 0, the character that the wildcard shall take so
                      that the skeleton matches the TransitionMap.
             - 1,     if skeleton and TransitionMap match anyway and
                      no wildcard plug is necessary.
             None,    if there is no way that the skeleton and the
                      TransitionMap could match.
    """
    ## ?? The element of a path cannot be triggered by the skeleton! ??
    ## ?? if self.__skeleton.has_key(TargetIdx): return False ??
    ## ?? Why would it not? (fschaef9: 10y04m11d)
    # wildcard_plug: None  => wildcard is still available ('unused')
    #                -1    => wildcard has been used before (none available)
    if self.__wildcard != None:
        wildcard_plug = None  # unused
    else:
        wildcard_plug = -1    # used before
    transition_map_key_set = set(TransitionMap.keys())
    # (1) Target States In TransitionMap and Not in Skeleton
    #
    #     All target states of TransitionMap must be in Skeleton,
    #     except:
    #
    #     (1.1) The single char transition target TargetIdx.
    #     (1.2) Maybe, one that is reached by a single char
    #           transition of wildcard.
    delta_set = transition_map_key_set - self.__skeleton_key_set
    delta_size = len(delta_set)
    if delta_size > 2:
        return None
    for target_idx in delta_set:
        if target_idx == TargetIdx:
            continue                                     # (1.1)
        elif wildcard_plug != None:
            return None   # wildcard already spent elsewhere
        elif not TransitionMap[target_idx].contains_only(self.__wildcard):
            return None   # more than just the wildcard char triggers here
        wildcard_plug = target_idx                       # (1.2)
    # (2) Target States In Skeleton and Not in TransitionMap
    #
    #     All target states of Skeleton must be in TransitionMap,
    #     except:
    #
    #     (2.1) Transition to the target index in skeleton
    #           is covered by current single transition.
    delta_set = self.__skeleton_key_set - transition_map_key_set
    delta_size = len(delta_set)
    if delta_size > 1:
        return None
    if delta_size == 1:
        for target_idx in delta_set:
            if not self.__skeleton[target_idx].contains_only(TriggerCharToTarget):
                return None
        # (2.1) OK, single char covers the transition in skeleton.
    # (3) Target States in both, Skeleton and Transition Map
    #
    #     All correspondent trigger sets must be equal, except:
    #
    #     (3.1) single char transition covers the hole, i.e.
    #           trigger set in transition map + single char ==
    #           trigger set in skeleton. (check this first,
    #           don't waste wildcard).
    #     (3.2) trigger set in skeleton + wildcard == trigger set
    #           in transition map.
    #
    common_set = self.__skeleton_key_set & transition_map_key_set
    for target_idx in common_set:
        sk_trigger_set = self.__skeleton[target_idx]
        tm_trigger_set = TransitionMap[target_idx]
        if sk_trigger_set.is_equal(tm_trigger_set):
            continue
        # (3.1) Maybe the current single transition covers the 'hole'.
        #       (check this first, we do not want to waste the wilcard)
        if can_plug_to_equal(tm_trigger_set, TriggerCharToTarget, sk_trigger_set):
            continue
        elif wildcard_plug == None:
            # (3.2) Can difference between trigger sets be plugged by the wildcard?
            if can_plug_to_equal(sk_trigger_set, self.__wildcard, tm_trigger_set):
                wildcard_plug = target_idx
                continue
            # (3.3) A set extended by wilcard may have only a 'hole' of the
            #       size of the single transition char.
            if can_plug_to_equal(tm_trigger_set,
                                 TriggerCharToTarget,
                                 sk_trigger_set.union(NumberSet(self.__wildcard))):
                wildcard_plug = target_idx
                continue
        # Trigger sets differ and no wildcard or single transition can
        # 'explain' that => skeleton does not fit.
        return None
    if wildcard_plug == None:
        return -1  # No plugging necessary
    return wildcard_plug
def get_trigger_set_union(self):
    """Returns the union of all trigger sets of this transition map,
    i.e. every character on which this state triggers at all.
    """
    union = NumberSet()
    for trigger_set in self.__db.values():
        union.unite_with(trigger_set)
    return union
def get_elementary_trigger_sets(self, StateIdxList):
    """Considers the trigger dictionary that contains a mapping from target
    state index to the trigger set that triggers to it:

               target_state_index   --->   trigger_set

    The trigger sets of different target state indices may intersect. As a
    result, this function produces a list of pairs:

              [ state_index_list, elementary_trigger_set ]

    where the elementary trigger set is the set of all triggers that
    trigger at the same time to all states in the state_index_list. The
    list contains for one state_index_list only one elementary_trigger_set.
    All elementary trigger sets are disjunct, i.e. they do not intersect.

    NOTE: A general solution of this problem would have to consider the
          inspection of all possible subset combinations. The number of
          combinations for N trigger sets is 2^N - which potentially blows
          the calculation power of the computer. Excessive optimizations
          would have to be programmed, if not the following were the case:

    NOTE: Fortunately, we are dealing with one dimensional sets! Thus, there
          is a very effective way to determine the elementary trigger sets.
          Imagine three trigger sets stretching over the range of numbers as
          follows:

          different targets, e.g. T0, T1, T2 are triggered by different
          sets of letters in the alphabet.

                          letters of alphabet
          ---------------------------------------------------->

                  T0  [---------)       [----------)
                  T1      [------)           [-----)
                  T2!         [----------------------)

          => elementary sets:

             only T0       [-------)
             T0, T1            [-)
             only T1             [-)
             T1, T2               [--)
             only T2                 [---)    [----)
             T0, T2                      [---)     [)
             T0, T1, T2                       [-----)
    """
    def DEBUG_print_history(history):
        txt = ""
        for item in history:
            txt += repr(item) + "\n"
        print txt

    # (*) accumulate the transitions for all states in the state list.
    #     transitions to the same target state are combined by union.
    history = []
    for state_idx in StateIdxList:
        # -- trigger dictionary: target_idx --> trigger set that triggers to target
        line_up = self.states[state_idx].transitions().get_trigger_set_line_up()
        # NOTE: Doublicate entries in history are perfectly reasonable at this point,
        #       simply if two states trigger on the same character range to the same
        #       target state. When ranges are opened/closed via the history items
        #       this algo keeps track of doublicates (see below).
        history.extend(line_up)
    # (*) sort history according to position
    history.sort(lambda a, b: cmp(a.position, b.position))
    ## DEBUG_print_history(history)
    # (*) build the elementary subset list
    combinations = {}                              # use dictionary for uniqueness
    map_key_str_to_target_index_combination = {}   # use dictionary for uniqueness
    current_interval_begin = None
    current_involved_target_indices = {}           # use dictionary for uniqueness
    current_involved_targets_epsilon_closure = []
    for item in history:
        # -- add interval and target indice combination to the data
        #    (only build interval when current begin is there,
        #     when the interval size is not zero, and
        #     when the epsilon closure of target states is not empty)
        if current_interval_begin != None and \
           current_interval_begin != item.position and \
           current_involved_target_indices.keys() != []:
            interval = Interval(current_interval_begin, item.position)
            key_str = repr(current_involved_targets_epsilon_closure)
            if not combinations.has_key(key_str):
                combinations[key_str] = NumberSet(interval, ArgumentIsYoursF=True)
                map_key_str_to_target_index_combination[key_str] = \
                    current_involved_targets_epsilon_closure
            else:
                combinations[key_str].unite_with(interval)
        # -- BEGIN / END of interval:
        #    add or delete a target state to the set of currently considered target states
        #    NOTE: More than one state can trigger on the same range to the same target state.
        #          Thus, one needs to keep track of the 'opened' target states.
        if item.change == INTERVAL_BEGIN:
            if current_involved_target_indices.has_key(item.target_idx):
                current_involved_target_indices[item.target_idx] += 1
            else:
                current_involved_target_indices[item.target_idx] = 1
        else:  # == INTERVAL_END
            if current_involved_target_indices[item.target_idx] > 1:
                current_involved_target_indices[item.target_idx] -= 1
            else:
                del current_involved_target_indices[item.target_idx]
        # -- re-compute the epsilon closure of the target states
        current_involved_targets_epsilon_closure = \
            self.get_epsilon_closure_of_state_set(current_involved_target_indices.keys())
        current_involved_targets_epsilon_closure.sort()
        # -- set the begin of interval to come
        current_interval_begin = item.position
    # (*) create the list of pairs [target-index-combination, trigger_set]
    result = []
    for key_str, target_index_combination in map_key_str_to_target_index_combination.items():
        result.append([target_index_combination, combinations[key_str]])
    return result
def get_supported_unicode_character_set(CodecAlias, FH=-1, LineN=None):
    """Collects all unicode source intervals of the codec's transformation
    table into one NumberSet and returns it.
    """
    supported = NumberSet()
    for source_begin, source_end, dummy_target_begin in \
            get_codec_transformation_info(CodecAlias, FH, LineN):
        supported.add_interval(Interval(source_begin, source_end))
    return supported