Пример #1
0
    def seal(self):
        """Complete the indentation setup with default values.

        If neither a space nor a grid is defined, ' ' (count 1) and '\\t'
        (grid 4) are specified, unless the character is contained in the
        'bad' character set. If both databases are still empty afterwards,
        'error_msg' is called with the location of the 'bad' parameter.

        If no newline state machine is set, a state machine with the paths
        '\\n' and '\\r' '\\n' is built and passed to 'specify_newline'.
        """
        def nothing_defined():
            return len(self.space_db) == 0 and len(self.grid_db) == 0

        if nothing_defined():
            bad_parameter = self.bad_character_set
            space_code    = ord(' ')
            tab_code      = ord('\t')
            # NOTE(review): the comparisons against 'False' are kept on
            # purpose; presumably 'contains()' returns a plain boolean.
            if bad_parameter.get().contains(space_code) == False:
                self.specify_space("[ ]", NumberSet(space_code), 1, self.fh)
            if bad_parameter.get().contains(tab_code) == False:
                self.specify_grid("[\\t]", NumberSet(tab_code), 4, self.fh)

            if nothing_defined():
                error_msg(
                    "No space or grid defined for indentation counting. Default\n"
                    "values ' ' and '\\t' could not be used since they are specified as 'bad'.",
                    bad_parameter.file_name, bad_parameter.line_n)

        if self.newline_state_machine.get() is not None:
            return

        # init --'\n'--> accept;  init --'\r'--> cr --'\n'--> accept
        newline_sm = StateMachine()
        accept_idx = newline_sm.add_transition(newline_sm.init_state_index,
                                               NumberSet(ord('\n')),
                                               AcceptanceF=True)
        cr_idx = newline_sm.add_transition(newline_sm.init_state_index,
                                           NumberSet(ord('\r')),
                                           AcceptanceF=False)
        newline_sm.add_transition(cr_idx, NumberSet(ord('\n')), accept_idx,
                                  AcceptanceF=False)
        self.specify_newline("(\\r\\n)|(\\n)", newline_sm, self.fh)
Пример #2
0
    def buffer_codec_prepare(self, BufferCodecName, BufferCodecFileName=None, Module=None):
        """Select the codec description and store it in 'self.buffer_codec'.

        BufferCodecName     -- codec name. "utf8" and "utf16" require 'Module'
                               and result in a 'codec_db.CodecDynamicInfo'.
                               "unicode" and "unit-test" result in a
                               'codec_db.CodecInfo' named "unicode". Any other
                               name is passed to
                               'codec_db.CodecTransformationInfo'.
        BufferCodecFileName -- codec file name; only considered if the codec
                               name is neither "utf8" nor "utf16".
        Module              -- passed on to 'codec_db.CodecDynamicInfo'.
        """
        if BufferCodecName in ("utf8", "utf16"):
            assert Module is not None
            result = codec_db.CodecDynamicInfo(BufferCodecName, Module)

        elif BufferCodecFileName:
            # Sanity check: the argument must be usable as a path. The check
            # is done solely inside the 'try', so that a failure is reported
            # through 'error_msg' instead of surfacing as a raw exception.
            try:
                os.path.splitext(os.path.basename(BufferCodecFileName))
            except (TypeError, AttributeError):
                file_in.error_msg("cannot interpret string following '--codec-file'")
            result = codec_db.CodecTransformationInfo(FileName=BufferCodecFileName)

        elif BufferCodecName == "unicode":
            # (Still, 'icu' or 'iconv' may provide converted content, but ...)
            # If the internal buffer is 'unicode', then the pattern's state
            # machines are not converted. The requirement for the pattern's
            # range is the same as for the 'buffer element chunks'.
            result = codec_db.CodecInfo("unicode",
                                NumberSet.from_range(0, self.get_character_value_limit()),
                                NumberSet.from_range(0, self.get_character_value_limit()))

        elif BufferCodecName == "unit-test":
            result = codec_db.CodecInfo("unicode",
                                NumberSet.from_range(-sys.maxint, sys.maxint),
                                NumberSet.from_range(-sys.maxint, sys.maxint))

        else:
            result = codec_db.CodecTransformationInfo(BufferCodecName)

        self.buffer_codec = result
Пример #3
0
    def __sm_newline_default(self):
        """Build the default newline state machine: '\\n' and, if
        'Setup.dos_carriage_return_newline_f' is set, also '\\r\\n'.

        RETURNS: The state machine. None, if '\\n' is already occupied in
                 the count command map; 'error_msg' is then called with
                 'DontExitF=True'.
        """
        lf_set = NumberSet(ord('\n'))
        cr_set = NumberSet(ord('\r'))

        lf_occupier = self.count_command_map.find_occupier(lf_set, set())
        if lf_occupier is not None:
            message = ("Trying to implement default newline: '\\n' or '\\r\\n'.\n"
                       "The '\\n' option is not possible, since it has been occupied by '%s'.\n"
                       "No newline can be defined by default."
                       % cc_type_name_db[lf_occupier.cc_type])
            error_msg(message, lf_occupier.sr, DontExitF=True,
                      SuppressCode=NotificationDB.warning_default_newline_0A_impossible)
            # Without '\n' there is no default newline at all.
            return None

        sm = StateMachine.from_character_set(lf_set)
        if not Setup.dos_carriage_return_newline_f:
            return sm

        cr_occupier = self.count_command_map.find_occupier(cr_set, set())
        if cr_occupier is None:
            sm.add_transition_sequence(sm.init_state_index, [cr_set, lf_set])
            return sm

        # '\r' is taken: report it, but keep the plain '\n' state machine.
        message = ("Trying to implement default newline: '\\n' or '\\r\\n'.\n"
                   "The '\\r\\n' option is not possible, since '\\r' has been occupied by '%s'."
                   % cc_type_name_db[cr_occupier.cc_type])
        error_msg(message, cr_occupier.sr, DontExitF=True,
                  SuppressCode=NotificationDB.warning_default_newline_0D_impossible)
        return sm
Пример #4
0
    def add_transition(self, Trigger, TargetStateIdx):
        """Register 'Trigger' as trigger towards 'TargetStateIdx'.

        Trigger -- an integer (int or long), a list (handed to the NumberSet
                   constructor), an Interval, a NumberSet, or None. None is
                   a shorthand for the inverse of the union of all trigger
                   sets present so far.

        RETURNS: TargetStateIdx as it was passed.
        """
        assert type(TargetStateIdx) == long or TargetStateIdx is None
        assert Trigger is None \
               or Trigger.__class__ in [int, long, list, Interval, NumberSet]

        # Bring the shorthand forms into the shape of Interval or NumberSet.
        if Trigger is None:
            Trigger = self.get_trigger_set_union().inverse()
        else:
            given_type = type(Trigger)
            if given_type == long:
                Trigger = Interval(int(Trigger), int(Trigger + 1))
            elif given_type == int:
                Trigger = Interval(Trigger, Trigger + 1)
            elif given_type == list:
                Trigger = NumberSet(Trigger, ArgumentIsYoursF=True)

        target_known_f = TargetStateIdx in self.__db
        if Trigger.__class__ == Interval:
            if target_known_f:
                self.__db[TargetStateIdx].add_interval(Trigger)
            else:
                self.__db[TargetStateIdx] = NumberSet(Trigger,
                                                      ArgumentIsYoursF=True)
        elif target_known_f:
            self.__db[TargetStateIdx].unite_with(Trigger)
        else:
            # NOTE: The given object itself is stored, not a copy.
            self.__db[TargetStateIdx] = Trigger

        return TargetStateIdx
Пример #5
0
def __indentation_add(Info):
    """Generate the 'if( <condition> ) { <operation> }' lines that adapt the
    counter '(I)' for the space and grid character sets of 'Info'.

    Character sets with the same count are united into one condition.
    Entries with count "bad" are skipped. A negative count denotes a grid.

    RETURNS: The generated text. "" if 'Info.has_only_single_spaces()'
             (the count can then be done by subtracting 'end - begin').
    """
    if Info.has_only_single_spaces():
        return ""

    line_prefix = " " * 16
    code = []

    def emit(CharSet, Operation):
        code.append(line_prefix + "if( ")
        __condition(code, CharSet)
        code.append(" ) { ")
        code.append(Operation)
        code.append(" }\\\n")

    space_set_by_count = {}  # count (>= 0) --> united character set
    grid_set_by_count  = {}  # count (< 0)  --> united character set
    for name, count_parameter in Info.count_db.items():
        count = count_parameter.get()
        character_set = Info.character_set_db[name].get()
        if count == "bad":
            continue
        if count >= 0: collector = space_set_by_count
        else:          collector = grid_set_by_count
        collector.setdefault(count, NumberSet()).unite_with(character_set)

    for count, character_set in space_set_by_count.items():
        emit(character_set, "(I) += %i;" % count)

    for count, character_set in grid_set_by_count.items():
        grid_width = abs(count)
        emit(character_set,
             "(I) += (%i - ((I) %% %i));" % (grid_width, grid_width))

    return "".join(code)
Пример #6
0
def get_supported_unicode_character_set(CodecAlias=None, FileName=None, FH=-1, LineN=None):
    """RETURNS: NumberSet that holds the source interval of every entry
                delivered by 'get_codec_transformation_info()'.

    At least one of 'CodecAlias' and 'FileName' must be given.
    """
    assert not (CodecAlias is None and FileName is None)

    mapping_list = get_codec_transformation_info(CodecAlias, FileName, FH, LineN)
    covered_set  = NumberSet()
    # Entries are triples; the target begin is of no interest here.
    for begin, end, _target_begin in mapping_list:
        covered_set.add_interval(Interval(begin, end))
    return covered_set
Пример #7
0
    def load_UnicodeData(self):
        """Read 'UnicodeData.txt' and fill the code point databases of the
        properties 'na', 'gc', 'bc', 'nv', 'na1' and 'isc' in 'self.db'.

        Columns used (separated by ';'):
            0 code point (hex), 1 name, 2 general category, 4 bidi class,
            6 numeric value, 10 Unicode 1 name, 11 ISO comment.

        Everything from a '#' on is dropped from a line; empty lines are
        skipped. Spaces inside names and values are replaced by '_'.
        """
        fh = open_data_base_file("UnicodeData.txt")
        try:
            line_list = fh.readlines()
        finally:
            # The handle is only needed for reading; close it even on error.
            fh.close()

        # Some rows contain aliases, so they need to get converted into values
        property_general_category = self.db["gc"]
        property_bidi_class       = self.db["bc"]

        def convert(Property, ValueAlias):
            """RETURNS: The value name for 'ValueAlias'; 'ValueAlias' itself
                        if the property has no such alias."""
            return Property.alias_to_name_map.get(ValueAlias, ValueAlias)

        def field(Row, Idx):
            """RETURNS: Column 'Idx', stripped, spaces replaced by '_'."""
            return Row[Idx].strip().replace(" ", "_")

        names_db            = {}
        general_category_db = {}
        bidi_class_db       = {}
        numeric_value_db    = {}
        names_uc1_db        = {}
        iso_comment_db      = {}

        for line in line_list:
            comment_idx = line.find("#")
            if comment_idx != -1: line = line[:comment_idx]
            if line == "" or line.isspace(): continue

            x = line.split(";")

            code_point       = int("0x" + x[0].strip(), 16)       # CodePointIdx       = 0
            name             = field(x, 1)                        # NameIdx            = 1
            general_category = convert(property_general_category,
                                       field(x, 2))               # GeneralCategoryIdx = 2
            bidi_class       = convert(property_bidi_class,
                                       field(x, 4))               # BidiClassIdx       = 4
            numeric_value    = x[6].strip()                       # NumericValueIdx    = 6
            uc1_name         = field(x, 10)                       # NameUC1Idx         = 10
            iso_comment      = field(x, 11)                       # ISO_CommentIdx     = 11

            names_db[name] = code_point
            general_category_db.setdefault(
                general_category, NumberSet()).quick_append_value(code_point)
            bidi_class_db.setdefault(
                bidi_class, NumberSet()).quick_append_value(code_point)
            numeric_value_db.setdefault(
                numeric_value, NumberSet()).quick_append_value(code_point)
            names_uc1_db[uc1_name] = code_point
            # NOTE(review): stored as string, unlike 'names_db' and
            # 'names_uc1_db' which store the integer -- kept as found.
            iso_comment_db[iso_comment] = str(code_point)

        self.db["na"].code_point_db  = names_db             # Name
        self.db["gc"].code_point_db  = general_category_db  # General Category
        self.db["bc"].code_point_db  = bidi_class_db        # BidiClass
        self.db["nv"].code_point_db  = numeric_value_db     # Numeric Value
        self.db["na1"].code_point_db = names_uc1_db         # Name Unicode 1
        self.db["isc"].code_point_db = iso_comment_db       # ISO_Comment
Пример #8
0
 def __get_remaining_set(self):
     """Unite the character sets of all map entries whose count type is
        none of BAD, BEGIN_NEWLINE_SUPPRESSOR, BEGIN_NEWLINE, END_NEWLINE.

        RETURNS: 'get_complement(Setup.buffer_codec.source_set)' of that
                 union.
     """
     skipped_types = (E_CharacterCountType.BAD,
                      E_CharacterCountType.BEGIN_NEWLINE_SUPPRESSOR,
                      E_CharacterCountType.BEGIN_NEWLINE,
                      E_CharacterCountType.END_NEWLINE)
     covered = NumberSet()
     for character_set, info in self.__map:
         if info.cc_type not in skipped_types:
             covered.unite_with(character_set)
     return covered.get_complement(Setup.buffer_codec.source_set)
Пример #9
0
    def load_Composition_Exclusion(self):
        """Read 'CompositionExclusions.txt' and store the values of its
        column 0 as the code point database of property 'CE'.
        """
        # Only column 0 is of interest; it is requested as number column.
        row_list = parse_table("CompositionExclusions.txt", NumberColumnList=[0])

        excluded_set = NumberSet()
        for row in row_list:
            code_point = row[0]
            # One interval [code_point, code_point + 1) per row.
            excluded_set.quick_append_interval(
                Interval(code_point, code_point + 1))
        excluded_set.clean()

        self.db["CE"].code_point_db = excluded_set
Пример #10
0
 def indentation_count_character_set(self):
     """RETURNS: Union of all character sets in 'space_db' and 'grid_db',
                 i.e. all characters involved in indentation counting
                 (those which may appear between newline and the first
                 non-whitespace character).
     """
     union = NumberSet()
     # Spaces first, then grids.
     for database in (self.space_db, self.grid_db):
         for parameter in database.values():
             union.unite_with(parameter.get())
     return union
Пример #11
0
def create_ALL_BUT_NEWLINE_state_machine():
    """RETURNS: State machine with a single transition from the init state
                to an acceptance state. It triggers on the inverse of the
                newline interval, cut down to [0, character value limit)
                if 'Setup' specifies a limit other than 'sys.maxint'.

    NOTE: Buffer control characters are supposed to be filtered out by the
          code generator.
    """
    all_but_newline = NumberSet(Interval(ord("\n")).inverse())

    value_limit = Setup.get_character_value_limit()
    if value_limit != sys.maxint:
        all_but_newline.intersect_with(Interval(0, value_limit))

    sm = StateMachine()
    sm.add_transition(sm.init_state_index, all_but_newline, AcceptanceF=True)
    return sm
Пример #12
0
    def __wildcard_value_match(self, WildCardValue):
        """RETURNS: A freshly built NumberSet; the union of the code point
                    database entries of all values delivered by
                    'get_wildcard_value_matches(WildCardValue)'.
                    None, if there is no such value.
        """
        matching_value_list = self.get_wildcard_value_matches(WildCardValue)
        if len(matching_value_list) == 0:
            return None

        # Built anew on each call => the caller does not need to decouple.
        union = NumberSet()
        for value in matching_value_list:
            union.unite_with(NumberSet(self.code_point_db[value]))
        return union
Пример #13
0
    def __wildcard_value_match(self, WildCardValue):
        """Unite the code point sets of all property values that are
        reported by 'get_wildcard_value_matches()' for 'WildCardValue'.

        RETURNS: New NumberSet (no reference into the database);
                 None if no value matches.
        """
        candidates = self.get_wildcard_value_matches(WildCardValue)
        if len(candidates) == 0:
            return None

        collected = NumberSet()
        for candidate in candidates:
            code_point_set = NumberSet(self.code_point_db[candidate])
            collected.unite_with(code_point_set)
        return collected
Пример #14
0
def create_ALL_BUT_NEWLINE_state_machine(stream):
    """RETURNS: State machine with a single transition from the init state
                to an acceptance state. The trigger set is the result of
                'get_complement(Setup.buffer_codec.source_set)' on the
                newline set.

    If that trigger set is empty, 'error_msg' is called with the source
    reference derived from 'stream'.

    NOTE: Buffer control characters are supposed to be filtered out by the
          code generator.
    """
    newline_set     = NumberSet(Interval(ord("\n")))
    all_but_newline = newline_set.get_complement(Setup.buffer_codec.source_set)
    if all_but_newline.is_empty():
        error_msg("The set of admissible characters contains only newline.\n"
                  "The '.' for 'all but newline' is an empty set.",
                  SourceRef.from_FileHandle(stream))

    sm = StateMachine()
    sm.add_transition(sm.init_state_index, all_but_newline, AcceptanceF=True)
    return sm
Пример #15
0
class Tracker:
    """Accumulates intervals and single letters in a NumberSet.

    match_set  -- the NumberSet of everything considered so far.
    negation_f -- initialized to False; not modified by the methods here.
    """
    def __init__(self):
        self.match_set = NumberSet()
        self.negation_f = False

    def consider_letter(self, CharCode):
        """Add the single value 'CharCode', i.e. [CharCode, CharCode + 1)."""
        self.consider_interval(CharCode, CharCode + 1)

    def consider_interval(self, Begin, End):
        """Add 'Interval(Begin, End)' to the match set.

        RAISES: RegularExpressionException if Begin > End.
        """
        if Begin > End:
            range_description = \
                "found range '%s-%s' which corresponds to %i-%i as unicode code points." % \
                (utf8.map_unicode_to_utf8(Begin), utf8.map_unicode_to_utf8(End), Begin, End)
            raise RegularExpressionException(
                "Character range: '-' requires character with 'lower code' to preceed\n"
                + range_description)

        new_interval = Interval(Begin, End)
        self.match_set.add_interval(new_interval)
Пример #16
0
class Tracker:
    """Collects the considered intervals and letters in 'match_set'.

    The flag 'negation_f' starts as False and is left untouched by the
    methods of this class.
    """
    def __init__(self):
        self.match_set  = NumberSet()
        self.negation_f = False

    def consider_interval(self, Begin, End):
        """Enter the interval given by 'Begin' and 'End' into 'match_set'.

        A 'Begin' greater than 'End' raises a RegularExpressionException.
        """
        if not Begin > End:
            self.match_set.add_interval(Interval(Begin, End))
            return

        begin_str = utf8.map_unicode_to_utf8(Begin)
        end_str   = utf8.map_unicode_to_utf8(End)
        raise RegularExpressionException("Character range: '-' requires character with 'lower code' to preceed\n" + \
                                         "found range '%s-%s' which corresponds to %i-%i as unicode code points." % \
                                         (begin_str, end_str, Begin, End))

    def consider_letter(self, CharCode):
        """Enter the one-element interval starting at 'CharCode'."""
        self.consider_interval(CharCode, CharCode + 1)
Пример #17
0
def create_ALL_BUT_NEWLINE_state_machine():
    """Build the state machine for 'anything but newline'.

    The trigger set is the inverse of the newline interval. If the character
    value limit of 'Setup' differs from 'sys.maxint', the trigger set is
    intersected with 'Interval(0, limit)'.

    RETURNS: State machine with one transition on that trigger set from the
             init state to an acceptance state.

    NOTE: Buffer control characters are supposed to be filtered out by the
          code generator.
    """
    newline_interval = Interval(ord("\n"))
    trigger_set      = NumberSet(newline_interval.inverse())

    limit = Setup.get_character_value_limit()
    if limit != sys.maxint:
        admissible_range = Interval(0, limit)
        trigger_set.intersect_with(admissible_range)

    result = StateMachine()
    result.add_transition(result.init_state_index, trigger_set,
                          AcceptanceF=True)
    return result
Пример #18
0
 def get_trigger_set_to_target(self, TargetIdx):
     """RETURNS: The trigger set stored for 'TargetIdx' (the stored object
                 itself, not a copy). A new, empty NumberSet if nothing is
                 stored for 'TargetIdx'.
     """
     if TargetIdx in self.__db:
         return self.__db[TargetIdx]
     return NumberSet()
Пример #19
0
def CounterSetupLineColumn_Default():
    """RETURNS: The default line/column counter setup. It is created upon
                the first call and stored in the module-level variable
                '_CounterSetupLineColumn_Default'; later calls return the
                stored object.

    Default setup: '\\n' --> "newline" 1, '\\t' --> "grid" 4, anything else
    --> "space" 1.
    """
    global _CounterSetupLineColumn_Default

    if _CounterSetupLineColumn_Default is not None:
        return _CounterSetupLineColumn_Default

    ccm = CountCmdMap()
    ccm.add(NumberSet(ord('\n')), "newline", 1, SourceRef_DEFAULT)
    ccm.add(NumberSet(ord('\t')), "grid",    4, SourceRef_DEFAULT)
    # Define "\else" ...
    ccm.define_else("space", 1, SourceRef_DEFAULT)
    # ... and apply it from minimum to supremum of the codec's source set.
    source_set = Setup.buffer_codec.source_set
    ccm.assign_else_count_command(source_set.minimum(),
                                  source_set.supremum(),
                                  SourceRef_DEFAULT)

    _CounterSetupLineColumn_Default = ParserDataLineColumn(SourceRef_DEFAULT, ccm)
    return _CounterSetupLineColumn_Default
Пример #20
0
def get_newline_in_codec(TrafoInfo):
    """Translate the code of the newline character into the codec given by
       'TrafoInfo'.

       If 'TrafoInfo' is a string, the plain code of '\\n' is returned. For
       strings other than "utf8-state-split" and "utf16-state-split",
       'error_msg' is called before (with 'DontExitF=True').

       RETURNS: The only element of the transformed newline set; according
                to the original notes, None if there is none.
    """
    newline_code = ord('\n')

    if isinstance(TrafoInfo, (str, unicode)):
        if TrafoInfo not in ("utf8-state-split", "utf16-state-split"):
            error_msg("Character encoding '%s' unknown to skipper.\n" % TrafoInfo + \
                      "For line number counting assume code of newline character code to be '0x%02X'." % ord('\n'),
                      DontExitF=True)
        return newline_code

    newline_set = NumberSet(newline_code)
    newline_set.transform(TrafoInfo)
    return newline_set.get_the_only_element()
Пример #21
0
    def is_DFA_compliant(self):
        """Check whether the transitions can be part of a deterministic
           finite automaton (DFA).

           RETURNS: True  => no epsilon transitions, and no two trigger sets
                             intersect.
                    False => there are epsilon transitions, or a trigger set
                             intersects with one considered before it.
        """
        # A DFA has no epsilon transitions.
        if len(self.__epsilon_target_index_list) != 0:
            return False

        # Each trigger set is compared against the union of its predecessors.
        seen = NumberSet()
        for trigger_set in self.__db.values():
            if seen.has_intersection(trigger_set):
                return False
            seen.unite_with(trigger_set)

        return True
Пример #22
0
    def is_DFA_compliant(self):
        """Tells whether this transition map fits a deterministic finite
           automaton: there must be no epsilon target, and the trigger sets
           of different targets must not overlap.

           RETURNS: True  => OK
                    False => epsilon transitions exist, or same triggers
                             point to different targets.
        """
        if len(self.__epsilon_target_index_list) != 0:
            return False

        covered = NumberSet()
        for candidate in self.__db.itervalues():
            overlap_f = covered.has_intersection(candidate)
            if overlap_f:
                return False
            covered.unite_with(candidate)
        return True
Пример #23
0
    def __whitespace_default(self):
        """Determine the default whitespace: those of ' ' and '\\t' for
        which 'find_occupier()' of the count command map reports nothing.

        RETURNS: NumberSet of the available characters. If it is empty,
                 'error_msg' is called before returning.
        """
        available = NumberSet()
        for character in (" ", "\t"):
            candidate = NumberSet(ord(character))
            if not self.count_command_map.find_occupier(candidate, set()):
                available.unite_with(candidate)

        if available.is_empty():
            error_msg("Trying to implement default whitespace ' ' or '\\t' failed.\n"
                      "Characters are occupied by other elements.", self.sr)
        return available
Пример #24
0
def get_all():
    """RETURNS:

       A state machine that 'eats' absolutely everything, i.e.


                               .--- Any ---.
                               |           |
           (init)--- Any --->(( i ))<------'
    """
    def any_character():
        # A new set per transition; the two transitions share no object.
        return NumberSet(Interval(-sys.maxint, sys.maxint))

    sm = StateMachine()

    loop_index = index.get()
    loop_state = State(AcceptanceF=True)
    loop_state.add_transition(any_character(), loop_index)
    sm.states[loop_index] = loop_state

    sm.get_init_state().add_transition(any_character(), loop_index)

    return sm
Пример #25
0
def get_any():
    """RETURNS:

       A state machine that 'eats' any character, but only one.

           (init)--- Any --->(( accept ))
    """
    any_character = NumberSet(Interval(-sys.maxint, sys.maxint))

    sm = StateMachine()
    sm.add_transition(sm.init_state_index, any_character, AcceptanceF=True)
    return sm
Пример #26
0
def __display_set(CharSet, cl):
    if Setup.query_numeric_f: display = "hex"
    else: display = "utf8"

    CharSet.intersect_with(NumberSet(Interval(0, 0x110000)))

    print "Characters:\n"
    if Setup.query_interval_f:
        __print_set_in_intervals(CharSet, display, 80)
    elif Setup.query_unicode_names_f:
        __print_set_character_names(CharSet, display, 80)
    else:
        __print_set_single_characters(CharSet, display, 80)

    print
Пример #27
0
def __display_set(CharSet, cl):
    if cl.search("--numeric"): display = "hex"
    else: display = "utf8"

    CharSet.intersect_with(NumberSet(Interval(0, 0x110000)))

    print "Characters:\n",
    if cl.search("--intervals"):
        __print_set_in_intervals(CharSet, display, 80)
    elif cl.search("--names"):
        __print_set_character_names(CharSet, display, 80)
    else:
        __print_set_single_characters(CharSet, display, 80)

    print
Пример #28
0
def do(section_list, fh):
    """Parses a codec information file. The described codec can only be
    a 'static character length' encoding. That is every character in the
    code occupies the same number of bytes.

    The file is read as a sequence of records of three integers: source
    interval begin, source interval size, target interval begin. For each
    record '[source_begin, source_end, target_begin]' is appended to
    'section_list'. Reading stops upon 'EndOfStreamException' or when an
    integer is missing.

    RETURNS: [0] Set of characters in unicode which are covered by the
                 described codec.
             [1] Range of values in the codec elements.
             [2] Error message if an integer was missing; None else.
    """
    # One message per integer of a record, in reading order.
    missing_message_list = (
        "Missing integer (source interval begin) in codec file.",
        "Missing integer (source interval size) in codec file.",
        "Missing integer (target interval begin) in codec file.",
    )

    source_set = NumberSet()
    drain_set  = NumberSet()
    error_str  = None

    try:
        while error_str is None:
            record = []
            for missing_message in missing_message_list:
                skip_whitespace(fh)
                number = read_integer(fh)
                if number is None:
                    error_str = missing_message
                    break
                record.append(number)
            else:
                # All three integers are present.
                source_begin, source_size, target_begin = record
                source_end = source_begin + source_size
                list.append(section_list, [source_begin, source_end, target_begin])

                source_set.add_interval(Interval(source_begin, source_end))
                drain_set.add_interval(
                    Interval(target_begin, target_begin + source_size))

    except EndOfStreamException:
        pass

    return source_set, drain_set, error_str
Пример #29
0
 def indentation_count_character_set(self):
     """Collect every character that takes part in indentation counting,
        i.e. the characters that can appear between newline and the first
        non whitespace character.

        RETURNS: New NumberSet; the union of the sets of all entries in
                 'space_db' and 'grid_db'.
     """
     collected = NumberSet()
     parameter_list = list(self.space_db.values()) + list(self.grid_db.values())
     for parameter in parameter_list:
         collected.unite_with(parameter.get())
     return collected
Пример #30
0
    def load_Composition_Exclusion(self):
        """Fill the code point database of property 'CE' with the values
        found in column 0 of 'CompositionExclusions.txt'.
        """
        table = parse_table("CompositionExclusions.txt", NumberColumnList=[0])

        result = NumberSet()
        for row in table:
            # Column 0 contains what is interesting: a single code point.
            first = row[0]
            single = Interval(first, first + 1)
            result.quick_append_interval(single)
        result.clean()

        self.db["CE"].code_point_db = result
Пример #31
0
    def __init__(self, fh=-1):
        """Set up an empty indentation setup.

        fh -- file handle from which file name and line number are taken;
              -1 (default) means that there is no file handle.
        """
        self.fh = fh
        if fh != -1:
            self.file_name = fh.name
            self.line_n    = get_current_line_info_number(fh)
        else:
            self.file_name = "no file handle"
            self.line_n    = -1

        # Databases: width --> character set
        self.space_db = {}   # space width --> character_set
        self.grid_db  = {}   # grid width  --> character_set

        # Localized parameters with their initial values
        self.bad_character_set                = LocalizedParameter("bad", NumberSet())
        self.newline_state_machine            = LocalizedParameter("newline", None)
        self.newline_suppressor_state_machine = LocalizedParameter("suppressor", None)

        self.__containing_mode_name = ""
Пример #32
0
    def get_incidence_id_map(self, BeyondIncidenceId=None):
        """RETURNS: A list of pairs: (character_set, incidence_id), one for
                    each entry of the map.

           If BeyondIncidenceId is given, the set delivered by
           'get_complement(Setup.buffer_codec.source_set)' on the union of
           all character sets is appended together with 'BeyondIncidenceId',
           provided that it is not empty.
        """
        pair_list = [(entry.character_set, entry.incidence_id)
                     for entry in self.__map]

        if BeyondIncidenceId is not None:
            covered_set = NumberSet.from_union_of_iterable(
                entry.character_set for entry in self.__map)
            remaining_set = covered_set.get_complement(
                Setup.buffer_codec.source_set)
            if not remaining_set.is_empty():
                pair_list.append((remaining_set, BeyondIncidenceId))

        return pair_list
Пример #33
0
def convert_table_to_associative_map(table, ValueColumnIdx, ValueType,
                                     KeyColumnIdx):
    """Produces a dictionary that maps from the keys found in a table to
       the values found in the same records.

       table:          Sequence of records (indexable by column).

       ValueColumnIdx: Column that contains the character code interval or
                       the number/string to which one wishes to map.

       ValueType:      "NumberSet" -- values of the same key are collected
                                      in a NumberSet; an integer value is
                                      wrapped into an Interval.
                       "number", "string" -- the value is stored as is; a
                                      later record overwrites an earlier
                                      one with the same key.

       KeyColumnIdx:   Column that contains the 'key' to be used for the
                       map. Keys are stripped, spaces are replaced by '_'.

       RAISES: BaseException if ValueType is none of the above.
    """
    if ValueType not in ("NumberSet", "number", "string"):
        raise BaseException("ValueType = '%s' unknown.\n" % ValueType)

    def key_of(Record):
        return Record[KeyColumnIdx].strip().replace(" ", "_")

    db = {}
    if ValueType != "NumberSet":
        for record in table:
            key   = key_of(record)
            value = record[ValueColumnIdx]
            db[key] = value
        return db

    for record in table:
        key   = key_of(record)
        value = record[ValueColumnIdx]

        if type(value) == int: value = Interval(value)

        db.setdefault(key, NumberSet()).quick_append_interval(value,
                                                              SortF=False)

    # The intervals were appended unsorted; let each set tidy itself up.
    for number_set in db.values():
        number_set.clean()

    return db
Пример #34
0
def do(SM):
    """RETURNS: A state machines that matches anything which is 
               not matched by SM.

       Idea: The paths along SM do not guide to acceptance states,
             but to normal states.

             Any drop-out is translated into a transition into 
             the 'accept all state'.

       NOTE: This function produces a finite state automaton which
             is not applicable by itself. It would eat ANYTHING
             from a certain state on.

       SM is not modified; the work is done on a deep copy. Each state of
       SM is asserted to be DFA compliant.
    """
    # The deep copy keeps the state indices, so that states of 'SM' and
    # 'result' can be related by index below.
    result = deepcopy(SM)  # Not clone

    # The 'Accept-All' state: an acceptance state with a transition to itself.
    accept_all_state_index = index.get()
    state = State(AcceptanceF=True)
    state.add_transition(NumberSet(Interval(-sys.maxint, sys.maxint)),
                         accept_all_state_index)
    result.states[accept_all_state_index] = state

    def is_accept_all_state(sm, StateIndex):
        """RETURNS: True, if the state is an acceptance state whose only
                    transition leads to itself and whose trigger set
                    'is_all()'. False, else.
        """
        state = sm.states[StateIndex]
        if not state.is_acceptance(): return False
        tm = state.target_map.get_map()
        if len(tm) != 1: return False
        elif tm.iterkeys().next() != StateIndex: return False
        elif not tm.itervalues().next().is_all(): return False

        # The state is an 'Accept-All' state.
        return True

    # Iterate over the states of the original 'SM'; modify their twins in
    # 'result'. 'SM' itself remains unchanged during the iteration.
    for state_index, state in SM.states.iteritems():
        # deepcopy --> use same state indices in SM and result
        result_state = result.states[state_index]
        assert state.target_map.is_DFA_compliant(), \
               "State machine must be transformed to DFA first: nfa_to_dfa.do()"

        # -- Every transition to 'Accept-All' state becomes a drop-out.
        for target_index in (
                i for i in state.target_map.get_target_state_index_list()
                if is_accept_all_state(SM, i)):
            result_state.target_map.delete_transitions_to_target(target_index)

        # -- Every drop-out becomes a transition to 'Accept-All' state.
        #    The trigger set is taken from the ORIGINAL state, i.e. before
        #    the deletion of transitions above.
        trigger_set = state.target_map.get_trigger_set_union()
        inverse_trigger_set = trigger_set.get_complement(
            Setup.buffer_codec.source_set)
        if not inverse_trigger_set.is_empty():
            result_state.add_transition(inverse_trigger_set,
                                        accept_all_state_index)

    # Every acceptance state becomes a non-acceptance state.
    # Every non-acceptance state becomes an acceptance state; except for
    # the init state, which is left as it is.
    for state_index, state in SM.states.iteritems():
        if state.is_acceptance():
            result.states[state_index].set_acceptance(False)
        elif state_index != SM.init_state_index:
            result.states[state_index].set_acceptance(True)

    result.clean_up()

    return result.clone()
Пример #35
0
def get_unicode_range():
    """RETURNS: 'NumberSet.from_range(0, 0x110000)'."""
    begin, end = 0, 0x110000
    return NumberSet.from_range(begin, end)
Пример #36
0
def get_codec_element_range():
    """RETURNS: 'NumberSet.from_range(0, 0x10000)'.

    The size of a codec element is 2 bytes.
    """
    begin, end = 0, 0x10000
    return NumberSet.from_range(begin, end)
Пример #37
0
    def get_character_set(self, Value=None):
        """Returns the character set that corresponds to 'Property==Value'.

           'Value' can be a property value name, a value alias, an alias for
           a combination of values, the name of such a combination, or a
           wildcard expression. Spaces in 'Value' are treated like
           underscores in ALL lookups. For binary properties 'Value' must be
           None.

           RETURNS: A deep copy of the set found in the code point database,
                    so the caller cannot modify the database through it.
                    A string with an error message, if the request cannot
                    be fulfilled.
        """
        assert self.type != "Binary" or Value is None

        def get_value_combination(CmbAlias):
            """RETURNS: List of the value names behind the combination
                        'CmbAlias'. An error message string, if one of the
                        aliases has no name.
            """
            result = []
            for alias in self.alias_to_alias_combination_db[CmbAlias]:
                name = self.alias_to_name_map.get(alias)
                if name is None:
                    return "Unicode database error: no name related to alias '%s'" % alias
                result.append(name)
            return result

        if self.type != "Binary" and Value is None:
            return "Property '%s' requires a value setting.\n" % self.name + \
                   "Possible Values: " + \
                   self.get_value_list_help()

        if self.code_point_db is None:
            self.init_code_point_db()

        if self.type == "Binary":
            # Decouple, since we refer to an internal database
            return deepcopy(self.code_point_db)

        adapted_value = Value.replace(" ", "_")

        # Membership test and lookup both use 'adapted_value'. Testing with
        # the raw 'Value' while indexing with the adapted one either missed
        # a match or raised KeyError when 'Value' contained spaces.
        if adapted_value in self.code_point_db:
            # 'value' is present as name in the code point database
            value = adapted_value

        elif adapted_value in self.alias_to_name_map:
            # 'value' is present as alias in code pointer database
            value = self.alias_to_name_map[adapted_value]

        elif adapted_value in self.alias_to_alias_combination_db:
            # 'value' is present as a combination of aliases
            value = get_value_combination(adapted_value)
            # An error message must not be mistaken for a value name.
            if not isinstance(value, list): return value

        elif adapted_value in self.name_to_alias_map:
            # The value was a combination of values
            value = get_value_combination(self.name_to_alias_map[adapted_value])
            if not isinstance(value, list): return value

        else:
            # -- WILDCARD MATCH: Results in a list of property values
            character_set = self.__wildcard_value_match(adapted_value)
            if character_set is None:
                return "Property '%s' cannot have a value or value alias '%s'.\n" % (self.name, Value) + \
                       "Possible Values: " + \
                       self.get_value_list_help()
            # No need to decouple, since character is not a reference to
            # internal database (for safety, do it)
            return deepcopy(character_set)

        if isinstance(value, list):
            result = NumberSet()
            for element in value:
                if element == "Unassigned": continue
                entry = self.code_point_db.get(element)
                if entry is None:
                    return "%s/%s is not supported by Unicode database." % (self.name, repr(element))
                result.unite_with(entry)
        else:
            result = self.code_point_db.get(value)
            if result is None:
                return "%s/%s is not supported by Unicode database." % (self.name, repr(value))

        # Reference to internal database --> decouple with 'deepcopy'
        return deepcopy(result)
Пример #38
0
 def __init__(self):
     """Start with an empty match set and the negation flag cleared."""
     self.negation_f = False
     self.match_set  = NumberSet()
Пример #39
0
 def set_all_character_set_UNIT_TEST(self, Begin, End):
     """Replace the source set of 'self.buffer_codec' by
        'NumberSet.from_range(Begin, End)'. As the name tells, this is
        meant for unit tests.
     """
     all_character_set = NumberSet.from_range(Begin, End)
     self.buffer_codec.source_set = all_character_set
Пример #40
0
 def do(DB):
     """Assert that no number set in 'DB' intersects with the union of the
        number sets that were iterated over before it.
     """
     seen = NumberSet()
     for candidate in DB.itervalues():
         assert not candidate.has_intersection(seen)
         seen.unite_with(candidate)
Пример #41
0
 def get_trigger_set_union(self):
     """RETURNS: A new NumberSet that has been united with every trigger
                 set of the map.
     """
     union = NumberSet()
     for trigger_set in self.__db.itervalues():
         union.unite_with(trigger_set)
     return union
Пример #42
0
#-----------------------------------------------------------------------------------------
# mode_db: storing the mode information into a dictionary:
#            key  = mode name
#            item = Mode object
#-----------------------------------------------------------------------------------------
mode_db = {}

#-----------------------------------------------------------------------------------------
# Counter Settings (Default)
#
# Default_NewlineCharDB: the characters below key '1' are those not "set to
#                        '0' newlines"; 0x0D is listed under key '0'.
#                        DOS/Windows: 0x0D, 0x0A --> 1 newline
#-----------------------------------------------------------------------------------------
Default_NewlineCharDB = {
    1: NumberSet([
           Interval(0x0A),      # Line Feed
           Interval(0x0B),      # Vertical Tab
           Interval(0x0C),      # Form Feed
           # 0x0D               --> set to '0' newlines, see below
           Interval(0x85),      # Next Line
           Interval(0x2028),    # Line Separator
           Interval(0x2029),    # Paragraph Separator
       ]),
    0: NumberSet(Interval(0x0D)),   # Carriage Return
}

# Tabulator: Grid of 4 columns
Default_GridCharDB = {
    4: NumberSet(ord('\t')),
}

# Special character sizes are font dependent.
# No assumptions made by default.
Default_SpecialCharDB = {}
Пример #43
0
 def do(DB):
     """Check by assertion that the number sets of 'DB' are disjoint: none
        may intersect with what has been collected from its predecessors.
     """
     collected = NumberSet()
     for number_set in DB.itervalues():
         overlap_f = number_set.has_intersection(collected)
         assert not overlap_f
         collected.unite_with(number_set)