Example #1
    def add_transition(self, Trigger, TargetStateIdx):
        """Adds a transition according to trigger and target index.
           RETURNS: The target state index (may be created newly).
        """
        assert type(TargetStateIdx) == long or TargetStateIdx == None
        assert Trigger.__class__ in [int, long, list, Interval, NumberSet
                                     ] or Trigger == None

        if Trigger == None:  # This is a shorthand to trigger via the remaining triggers
            Trigger = self.get_trigger_set_union().inverse()
        elif type(Trigger) == long:
            Trigger = Interval(int(Trigger), int(Trigger + 1))
        elif type(Trigger) == int:
            Trigger = Interval(Trigger, Trigger + 1)
        elif type(Trigger) == list:
            Trigger = NumberSet(Trigger, ArgumentIsYoursF=True)

        if Trigger.__class__ == Interval:
            if self.__db.has_key(TargetStateIdx):
                self.__db[TargetStateIdx].add_interval(Trigger)
            else:
                self.__db[TargetStateIdx] = NumberSet(Trigger,
                                                      ArgumentIsYoursF=True)
        else:
            if self.__db.has_key(TargetStateIdx):
                self.__db[TargetStateIdx].unite_with(Trigger)
            else:
                self.__db[TargetStateIdx] = Trigger

        return TargetStateIdx
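
The method above accepts a bare code point, an Interval, a list of intervals, a NumberSet, or None, and normalizes everything to interval form before storing it per target state. A minimal sketch of that normalization, assuming the Quex Interval/NumberSet classes are importable (the module path below is an assumption and differs between Quex versions):

from quex.engine.interval_handling import Interval, NumberSet   # path is an assumption

# A bare code point 'c' becomes the one-character interval [c, c+1) ...
single_char = Interval(ord('a'), ord('a') + 1)

# ... and a list of intervals becomes a NumberSet.
digits = NumberSet([Interval(ord('0'), ord('9') + 1)])

# Triggers to the same target accumulate in one NumberSet per target index.
db = {}
db.setdefault(57, NumberSet()).add_interval(single_char)
db[57].unite_with(digits)
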
Example #2
    def seal(self):
        if len(self.space_db) == 0 and len(self.grid_db) == 0:
            default_space = ord(' ')
            default_tab = ord('\t')
            bad = self.bad_character_set
            if bad.get().contains(default_space) == False:
                self.specify_space("[ ]", NumberSet(default_space), 1, self.fh)
            if bad.get().contains(default_tab) == False:
                self.specify_grid("[\\t]", NumberSet(default_tab), 4, self.fh)

            if len(self.space_db) == 0 and len(self.grid_db) == 0:
                error_msg(
                    "No space or grid defined for indentation counting. Default\n"
                    "values ' ' and '\\t' could not be used since they are specified as 'bad'.",
                    bad.file_name, bad.line_n)

        if self.newline_state_machine.get() == None:
            sm = StateMachine()
            end_idx = sm.add_transition(sm.init_state_index,
                                        NumberSet(ord('\n')),
                                        AcceptanceF=True)
            mid_idx = sm.add_transition(sm.init_state_index,
                                        NumberSet(ord('\r')),
                                        AcceptanceF=False)
            sm.add_transition(mid_idx,
                              NumberSet(ord('\n')),
                              end_idx,
                              AcceptanceF=False)
            self.specify_newline("(\\r\\n)|(\\n)", sm, self.fh)
Example #3
def __indentation_add(Info):
    # (0) If all involved counts are single spaces, the 'counting' can be done
    #     easily by computing 'end - begin'; no adaptation is needed.
    indent_txt = " " * 16
    if Info.has_only_single_spaces():
        return ""

    def __do(txt, CharSet, Operation):
        txt.append(indent_txt + "if( ")
        __condition(txt, CharSet)
        txt.append(" ) { ")
        txt.append(Operation)
        txt.append(" }\\\n")

    txt = []
    spaces_db = {}  # Sort same space counts together
    grid_db = {}  # Sort same grid counts together
    for name, count_parameter in Info.count_db.items():
        count = count_parameter.get()
        character_set = Info.character_set_db[name].get()
        if count == "bad": continue
        # Grid counts are indicated by a negative 'count' value.
        if count >= 0:
            spaces_db.setdefault(count, NumberSet()).unite_with(character_set)
        else:
            grid_db.setdefault(count, NumberSet()).unite_with(character_set)

    for count, character_set in spaces_db.items():
        __do(txt, character_set, "(I) += %i;" % count)

    for count, character_set in grid_db.items():
        __do(txt, character_set,
             "(I) += (%i - ((I) %% %i));" % (abs(count), abs(count)))

    return "".join(txt)
Example #4
    def __wildcard_value_match(self, WildCardValue):
        result = NumberSet()

        value_list = self.get_wildcard_value_matches(WildCardValue)
        if value_list == []: 
            return None

        for value in value_list:
            result.unite_with(NumberSet(self.code_point_db[value]))

        return result
Example #5
 def indentation_count_character_set(self):
     """Returns the superset of all characters that are involved in
        indentation counting. That is the set of characters that can
        appear between a newline and the first non-whitespace character.
     """
     result = NumberSet()
     for character_set in self.space_db.values():
         result.unite_with(character_set.get())
     for character_set in self.grid_db.values():
         result.unite_with(character_set.get())
     return result
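
The superset is built by folding every stored character set into a single NumberSet via unite_with(). A hedged sketch of that accumulation pattern (the import path is an assumption):

from quex.engine.interval_handling import NumberSet   # path is an assumption

result = NumberSet()
for character_set in (NumberSet(ord(' ')), NumberSet(ord('\t'))):
    result.unite_with(character_set)
assert result.contains(ord(' ')) and result.contains(ord('\t'))
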
Example #6
    def __wildcard_value_match(self, WildCardValue):
        result = NumberSet()

        value_list = self.get_wildcard_value_matches(WildCardValue)
        if value_list == []: 
            return None

        for value in value_list:
            result.unite_with(NumberSet(self.code_point_db[value]))

        # No decoupling necessary, since the result is computed freshly on each call.
        return result
Example #7
    def load_UnicodeData(self):
        fh = open_data_base_file("UnicodeData.txt")

        # some rows contain aliases, so they need to get converted into values
        property_general_category = self.db["gc"]
        property_bidi_class       = self.db["bc"]

        def convert(Property, ValueAlias):
            """Convert specified ValueAlias to Value of the given property."""
            if Property.alias_to_name_map.has_key(ValueAlias):
                return Property.alias_to_name_map[ValueAlias]
            return ValueAlias

        names_db            = {}
        general_category_db = {}
        bidi_class_db       = {}
        numeric_value_db    = {}
        names_uc1_db        = {}
        iso_comment_db      = {}

        for line in fh.readlines():
            if line.find("#") != -1: line = line[:line.find("#")]
            if line == "" or line.isspace(): continue

            x = line.split(";")

            code_point       = int("0x" + x[0].strip(), 16)    # CodePointIdx       = 0
            name             = x[1].strip().replace(" ", "_")  # NameIdx            = 1
            general_category = x[2].strip().replace(" ", "_")  # GeneralCategoryIdx = 2
            general_category = convert(property_general_category, general_category)
            bidi_class       = x[4].strip().replace(" ", "_")  # BidiClassIdx       = 4
            bidi_class       = convert(property_bidi_class, bidi_class)
            numeric_value    = x[6].strip()                    # NumericValueIdx    = 6
            uc1_name         = x[10].strip().replace(" ", "_") # NameUC1Idx         = 10
            iso_comment      = x[11].strip().replace(" ", "_") # ISO_CommentIdx     = 11

            names_db[name]                                                            = code_point
            general_category_db.setdefault(general_category, NumberSet()).quick_append_value(code_point)
            bidi_class_db.setdefault      (bidi_class,       NumberSet()).quick_append_value(code_point)
            numeric_value_db.setdefault   (numeric_value,    NumberSet()).quick_append_value(code_point)
            names_uc1_db[uc1_name]                                                    = code_point
            iso_comment_db[iso_comment]                                               = str(code_point)

        self.db["na"].code_point_db  = names_db             # Name
        self.db["gc"].code_point_db  = general_category_db  # General Category
        self.db["bc"].code_point_db  = bidi_class_db        # BidiClass
        self.db["nv"].code_point_db  = numeric_value_db     # Numeric Value
        self.db["na1"].code_point_db = names_uc1_db         # Name Unicode 1
        self.db["isc"].code_point_db = iso_comment_db       # ISO_Comment
Example #8
    def plug_wildcard(self, WildcardPlug):
        assert isinstance(WildcardPlug, (int, long))

        # Finally, if there is a plugging to be performed, then do it.
        if WildcardPlug == -1: return

        if self.__skeleton.has_key(WildcardPlug):
            self.__skeleton[WildcardPlug].unite_with(NumberSet(
                self.__wildcard))
        else:
            self.__skeleton[WildcardPlug] = NumberSet(self.__wildcard)
        self.__skeleton_key_set.add(WildcardPlug)
        self.__wildcard = None  # There is no more wildcard now

        return
Example #9
 def get_trigger_set_to_target(self, TargetIdx):
     """Returns all triggers that lead to target 'TargetIdx'. If a trigger 'None' is returned
        it means that the epsilon transition triggers to target state. If the TargetIndex is 
        omitted the set of all triggers, except the epsilon triggers, are returned.
     """
     if self.__db.has_key(TargetIdx): return self.__db[TargetIdx]
     else: return NumberSet()
Example #10
def __enter_number_set(db, Key, Value):
    ValueType = Value.__class__.__name__
    assert ValueType in ["Interval", "int"]

    if ValueType == "int": Value = Interval(Value)

    if db.has_key(Key): db[Key].quick_append_interval(Value, SortF=False)
    else:               db[Key] = NumberSet(Value)
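
A hedged usage sketch for the helper above: repeated keys accumulate intervals in the same NumberSet, and a plain integer is first wrapped into an Interval (the import path is an assumption):

from quex.engine.interval_handling import Interval, NumberSet   # path is an assumption

db = {}
__enter_number_set(db, "Lu", Interval(0x41, 0x5B))   # 'A'..'Z'
__enter_number_set(db, "Lu", 0xC0)                   # a single code point
# db["Lu"] now holds both ranges in one NumberSet
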
Example #11
    def is_DFA_compliant(self):
        """Checks if the current state transitions are DFA compliant, i.e. it
           investigates if trigger sets pointing to different targets intersect.
           RETURN:  True  => OK
                    False => The same trigger points to different targets. This cannot
                             be part of a deterministic finite automaton (DFA).
        """
        # DFAs do not have epsilon transitions
        if len(self.__epsilon_target_index_list) != 0: return False

        # check whether trigger sets intersect
        all_trigger_sets = NumberSet()
        for trigger_set in self.__db.values():
            if all_trigger_sets.has_intersection(trigger_set):
                return False
            else:
                all_trigger_sets.unite_with(trigger_set)

        return True
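
The check boils down to: accumulate the union of all trigger sets and test every new set against what has been accumulated so far. A hedged sketch with two non-overlapping sets (the import path is an assumption):

from quex.engine.interval_handling import Interval, NumberSet   # path is an assumption

trigger_sets = [NumberSet(Interval(ord('a'), ord('k'))),
                NumberSet(Interval(ord('k'), ord('z')))]
accumulated     = NumberSet()
dfa_compliant_f = True
for trigger_set in trigger_sets:
    if accumulated.has_intersection(trigger_set):
        dfa_compliant_f = False
        break
    accumulated.unite_with(trigger_set)
assert dfa_compliant_f   # [a,k) and [k,z) do not overlap
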
Example #12
def __display_set(CharSet, cl):
    if cl.search("--numeric"): display = "hex"
    else: display = "utf8"

    CharSet.intersect_with(NumberSet(Interval(0, 0x110000)))

    print "Characters:\n",
    if cl.search("--intervals"):
        __print_set_in_intervals(CharSet, display, 80)
    elif cl.search("--names"):
        __print_set_character_names(CharSet, display, 80)
    else:
        __print_set_single_characters(CharSet, display, 80)

    print
Example #13
    def load_Composition_Exclusion(self):
        table = parse_table("CompositionExclusions.txt")

        number_set = NumberSet()
        for row in table:
           begin = int("0x" + row[0], 16)
           number_set.quick_append_interval(Interval(begin, begin + 1))
        number_set.clean()    

        self.db["CE"].code_point_db = number_set
Example #14
    def load_Composition_Exclusion(self):
        # Column 0 contains what is interesting ...
        table = parse_table("CompositionExclusions.txt", NumberColumnList=[0])

        number_set = NumberSet()
        for row in table:
           begin = row[0]
           number_set.quick_append_interval(Interval(begin, begin + 1))
        number_set.clean()    

        self.db["CE"].code_point_db = number_set
Example #15
    def __init__(self, fh=-1):
        self.fh = fh
        if fh != -1:
            self.file_name = fh.name
            self.line_n = get_current_line_info_number(fh)
        else:
            self.file_name = "no file handle"
            self.line_n = -1

        self.space_db = {}  # Maps: space width --> character_set
        self.grid_db = {}  # Maps: grid width  --> character_set
        self.bad_character_set = LocalizedParameter("bad", NumberSet())
        self.newline_state_machine = LocalizedParameter("newline", None)
        self.newline_suppressor_state_machine = LocalizedParameter(
            "suppressor", None)

        self.__containing_mode_name = ""
Example #16
def contains_only_spaces(state_machine):
    """Determines wether there are only spaces on the way to the acceptance state.
    """
    for state in state_machine.states.values():
        target_state_list = state.transitions().get_target_state_index_list()
        # (1) if a pattern contains only ' ', then there is no place for more than
        #     one target state, since every state has only one trigger and one target state
        if len(target_state_list) > 1: return False

        # (2) does state exclusively trigger on ' '?
        #    (2a) does state trigger on ' '?
        all_trigger_set = state.transitions().get_trigger_set_union()
        if all_trigger_set.contains(ord(' ')) == False: return False
        #    (2b) does state trigger on nothing else? 
        if all_trigger_set.difference(NumberSet(ord(' '))).is_empty() == False: return False

    return True
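
The "exclusively space" test is a two-step NumberSet check: the trigger set must contain ' ' and must contain nothing besides it. A hedged sketch of those two calls (the import path is an assumption):

from quex.engine.interval_handling import NumberSet   # path is an assumption

trigger_set = NumberSet(ord(' '))
assert trigger_set.contains(ord(' '))
assert trigger_set.difference(NumberSet(ord(' '))).is_empty()
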
Example #17
def convert_table_to_associative_map(table, ValueColumnIdx, ValueType, KeyColumnIdx):
    """Produces a dictionary that maps from 'keys' to NumberSets. The 
       number sets represent the code points for which the key (property)
       is valid.

       ValueColumnIdx: Column that contains the character code interval or
                       string to which one wishes to map.

       KeyColumnIdx:  Column that contains the 'key' to be used for the map.

       RETURNS: Dictionary that contains the associative map.
    """

    db = {}
    if ValueType == "NumberSet":
        for record in table:
            key   = record[KeyColumnIdx].strip()
            key   = key.replace(" ", "_")
            value = record[ValueColumnIdx]

            if type(value) == int: value = Interval(value)

            db.setdefault(key, NumberSet()).quick_append_interval(value, SortF=False)

    elif ValueType == "number" or ValueType == "string":
        for record in table:
            key   = record[KeyColumnIdx].strip()
            key   = key.replace(" ", "_")
            value = record[ValueColumnIdx]
            db[key] = value
    else:
        raise BaseException("ValueType = '%s' unknown.\n" % ValueType)

    # if the content was a number set, it might be simplified, try it.
    if ValueType == "NumberSet":
        for key, number_set in db.items():
            number_set.clean()

    return db
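
A hedged usage sketch: a small table whose value column holds code points and whose key column holds a property value, mapped into NumberSets keyed by that value (assumes the function and the Quex classes it uses are importable):

table = [[0x41, "Lu"], [0x42, "Lu"], [0x61, "Ll"]]
db = convert_table_to_associative_map(table,
                                      ValueColumnIdx=0,
                                      ValueType="NumberSet",
                                      KeyColumnIdx=1)
# db["Lu"] now covers 0x41 and 0x42, db["Ll"] covers 0x61
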
Example #18
def get_supported_unicode_character_set(CodecAlias, FH=-1, LineN=None):
    result = NumberSet()
    for source_begin, source_end, target_begin in get_codec_transformation_info(CodecAlias, FH, LineN):
        result.add_interval(Interval(source_begin, source_end))
    return result
Example #19
    def match_skeleton(self, TransitionMap, TargetIdx, TriggerCharToTarget):
        """A single character transition 

                        TriggerCharToTarget --> TargetIdx

           has been detected. The question is whether the remaining transitions of
           the state match the skeleton of the current path. There might be a
           wildcard, that is, the character that is overlaid by the first
           single character transition. As long as a transition map differs
           only by this single character, the wildcard is plugged into that
           position.

           RETURNS: 
                    int > 0, the character that the wildcard shall take so
                             that the skeleton matches the TransitionMap.

                        - 1, if skeleton and TransitionMap match anyway and
                             no wildcard plug is necessary.

                       None, if there is no way that the skeleton and the
                             TransitionMap could match.
        """
        ## ?? The element of a path cannot be triggered by the skeleton! ??
        ## ?? if self.__skeleton.has_key(TargetIdx): return False        ??
        ## ?? Why would it not? (fschaef9: 10y04m11d)

        if self.__wildcard != None: wildcard_plug = None  # wildcard still available
        else:                       wildcard_plug = -1    # wildcard already used

        transition_map_key_set = set(TransitionMap.keys())
        # (1) Target States In TransitionMap and Not in Skeleton
        #
        #     All target states of TransitionMap must be in Skeleton,
        #     except:
        #
        #      (1.1) The single char transition target TargetIdx.
        #      (1.2) Maybe, one that is reached by a single char
        #            transition of wildcard.
        delta_set = transition_map_key_set - self.__skeleton_key_set
        delta_size = len(delta_set)
        if delta_size > 2: return None

        for target_idx in delta_set:
            if target_idx == TargetIdx: continue  # (1.1)
            elif wildcard_plug != None: return None
            elif not TransitionMap[target_idx].contains_only(self.__wildcard):
                return None
            wildcard_plug = target_idx  # (1.2)

        # (2) Target States In Skeleton and Not in TransitionMap
        #
        #     All target states of Skeleton must be in TransitionMap,
        #     except:
        #
        #      (2.1) Transition to the target index in skeleton
        #            is covered by current single transition.
        delta_set = self.__skeleton_key_set - transition_map_key_set
        delta_size = len(delta_set)
        if delta_size > 1: return None
        if delta_size == 1:
            for target_idx in delta_set:
                if not self.__skeleton[target_idx].contains_only(
                        TriggerCharToTarget):
                    return None
            # (2.1) OK, single char covers the transition in skeleton.

        # (3) Target States in both, Skeleton and Transition Map
        #
        #     All correspondent trigger sets must be equal, except:
        #
        #      (3.1) single char transition covers the hole, i.e.
        #            trigger set in transition map + single char ==
        #            trigger set in skeleton. (check this first,
        #            don't waste wildcard).
        #      (3.2) trigger set in skeleton + wildcard == trigger set
        #            in transition map.
        #
        common_set = self.__skeleton_key_set & transition_map_key_set
        for target_idx in common_set:
            sk_trigger_set = self.__skeleton[target_idx]
            tm_trigger_set = TransitionMap[target_idx]

            if sk_trigger_set.is_equal(tm_trigger_set): continue

            # (3.1) Maybe the current single transition covers the 'hole'.
            #       (check this first, we do not want to waste the wildcard)
            if can_plug_to_equal(tm_trigger_set, TriggerCharToTarget,
                                 sk_trigger_set):
                continue

            elif wildcard_plug == None:
                # (3.2) Can difference between trigger sets be plugged by the wildcard?
                if can_plug_to_equal(sk_trigger_set, self.__wildcard,
                                     tm_trigger_set):
                    wildcard_plug = target_idx
                    continue
                # (3.3) A set extended by wildcard may have only a 'hole' of the
                #       size of the single transition char.
                if can_plug_to_equal(
                        tm_trigger_set, TriggerCharToTarget,
                        sk_trigger_set.union(NumberSet(self.__wildcard))):
                    wildcard_plug = target_idx
                    continue

            # Trigger sets differ and no wildcard or single transition can
            # 'explain' that => skeleton does not fit.
            return None

        if wildcard_plug == None: return -1  # No plugging necessary
        return wildcard_plug
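
The core of the wildcard logic in step (3.2) is a NumberSet comparison: the skeleton's trigger set plus the wildcard character must equal the trigger set of the transition map. A hedged sketch of that comparison only (the import path is an assumption; can_plug_to_equal itself is not reproduced here):

from quex.engine.interval_handling import Interval, NumberSet   # path is an assumption

wildcard       = ord('b')
sk_trigger_set = NumberSet([Interval(ord('a')), Interval(ord('c'))])
tm_trigger_set = NumberSet(Interval(ord('a'), ord('c') + 1))     # 'a', 'b', 'c'

# True if the wildcard exactly fills the gap between skeleton and transition map.
plug_possible_f = sk_trigger_set.union(NumberSet(wildcard)).is_equal(tm_trigger_set)
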
Example #20
    def get_trigger_set_union(self):
        result = NumberSet()
        for trigger_set in self.__db.values():
            result.unite_with(trigger_set)

        return result
Example #21
    def get_elementary_trigger_sets(self, StateIdxList):
        """Considers the trigger dictionary that contains a mapping from target state index 
           to the trigger set that triggers to it: 
     
                   target_state_index   --->   trigger_set 
    
           The trigger sets of different target state indices may intersect. As a result,
           this function produces a list of pairs:
    
                  [ state_index_list, elementary_trigger_set ]
    
           where the elementary trigger set is the set of all triggers that trigger
           at the same time to all states in the state_index_list. The list contains 
           for one state_index_list only one elementary_trigger_set. All elementary
           trigger sets are disjoint, i.e. they do not intersect.
    
          NOTE: A general solution of this problem would have to consider the
                inspection of all possible subset combinations. The number of
                combinations for N trigger sets is 2^N - which can quickly exceed
                the computational power available. Excessive optimizations
                would have to be programmed, were it not for the following:
    
          NOTE: Fortunately, we are dealing with one-dimensional sets! Thus, there is
                a very effective way to determine the elementary trigger sets. Imagine
                three trigger sets stretching over the range of numbers as follows:

          different targets, e.g. T0, T1, T2 are triggered by different sets of letters
          in the alphabet. 
                                                                    letters of alphabet
                      ---------------------------------------------------->

                  T0  [---------)       [----------)
                  T1          [------)      [-----)
                  T2              [----------------------)    
    
          => elementary sets: 
     
             only T0  [-------)
             T0, T1           [-)
             only T1            [-)
             T1, T2               [--)
             only T2                 [---)          [----)
             T0, T2                      [---)     [)
             T0, T1, T2                      [-----)
        """
        def DEBUG_print_history(history):
            txt = ""
            for item in history:
                txt += repr(item) + "\n"
            print txt

        # (*) accumulate the transitions for all states in the state list.
        #     transitions to the same target state are combined by union.
        history = []
        for state_idx in StateIdxList:
            # -- trigger dictionary:  target_idx --> trigger set that triggers to target
            line_up = self.states[state_idx].transitions().get_trigger_set_line_up()
            # NOTE: Duplicate entries in history are perfectly reasonable at this point,
            #       e.g. if two states trigger on the same character range to the same
            #       target state. When ranges are opened/closed via the history items,
            #       this algorithm keeps track of duplicates (see below).
            history.extend(line_up)

        # (*) sort history according to position
        history.sort(lambda a, b: cmp(a.position, b.position))
        ## DEBUG_print_history(history)

        # (*) build the elementary subset list
        combinations = {}  # use dictionary for uniqueness
        map_key_str_to_target_index_combination = {}  # use dictionary for uniqueness
        current_interval_begin = None
        current_involved_target_indices = {}  # use dictionary for uniqueness
        current_involved_targets_epsilon_closure = []
        for item in history:
            # -- add interval and target index combination to the data
            #    (only build interval when current begin is there,
            #     when the interval size is not zero, and
            #     when the epsilon closure of target states is not empty)
            if current_interval_begin != None and \
               current_interval_begin != item.position and \
               current_involved_target_indices.keys() != []:

                interval = Interval(current_interval_begin, item.position)
                key_str = repr(current_involved_targets_epsilon_closure)
                if not combinations.has_key(key_str):
                    combinations[key_str] = NumberSet(interval,
                                                      ArgumentIsYoursF=True)
                    map_key_str_to_target_index_combination[key_str] = \
                                     current_involved_targets_epsilon_closure
                else:
                    combinations[key_str].unite_with(interval)

            # -- BEGIN / END of interval:
            #    add or delete a target state to the set of currently considered target states
            #    NOTE: More than one state can trigger on the same range to the same target state.
            #          Thus, one needs to keep track of the 'opened' target states.
            if item.change == INTERVAL_BEGIN:
                if current_involved_target_indices.has_key(item.target_idx):
                    current_involved_target_indices[item.target_idx] += 1
                else:
                    current_involved_target_indices[item.target_idx] = 1
            else:  # == INTERVAL_END
                if current_involved_target_indices[item.target_idx] > 1:
                    current_involved_target_indices[item.target_idx] -= 1
                else:
                    del current_involved_target_indices[item.target_idx]

            # -- re-compute the epsilon closure of the target states
            current_involved_targets_epsilon_closure = \
                self.get_epsilon_closure_of_state_set(current_involved_target_indices.keys())
            current_involved_targets_epsilon_closure.sort()

            # -- set the begin of interval to come
            current_interval_begin = item.position

        # (*) create the list of pairs [target-index-combination, trigger_set]
        result = []
        for key_str, target_index_combination in \
                map_key_str_to_target_index_combination.items():
            result.append([target_index_combination, combinations[key_str]])

        return result
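
The sweep over sorted interval boundaries can be illustrated without any Quex classes. The stand-in below (plain Python, not project code) computes the elementary sets for the T0/T1/T2 picture from the docstring: between two consecutive boundary points the set of active targets is constant, so sub-intervals can be grouped by that set.

triggers = {                    # target --> list of [begin, end) ranges
    "T0": [(0, 10), (17, 27)],
    "T1": [(6, 13), (21, 26)],
    "T2": [(9, 31)],
}
points = sorted({p for ranges in triggers.values() for b, e in ranges for p in (b, e)})

elementary = {}                 # frozenset of targets --> list of [begin, end) ranges
for begin, end in zip(points, points[1:]):
    active = frozenset(t for t, ranges in triggers.items()
                       if any(b <= begin and end <= e for b, e in ranges))
    if active:
        elementary.setdefault(active, []).append((begin, end))
# e.g. elementary[frozenset(["T0", "T1", "T2"])] holds the ranges where all
# three targets are triggered at the same time.
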
Example #22
def get_supported_unicode_character_set(CodecAlias, FH=-1, LineN=None):
    result = NumberSet()
    for source_begin, source_end, target_begin in get_codec_transformation_info(
            CodecAlias, FH, LineN):
        result.add_interval(Interval(source_begin, source_end))
    return result