Example #1
def general_checks(loop_map, appendix_sm_list):
    print "#_[ Checks ]__________________________________________________"
    print
    print "character sets do not intersect",
    all_set = NumberSet()
    for lei in loop_map:
        assert lei.character_set is not None
        assert not lei.character_set.has_intersection(all_set)
        all_set.unite_with(lei.character_set)
    print "[ok]"

    print "count actions do not appear more than once",
    count_action_couple_set = set()
    count_action_plain_set  = set()
    exit_exists_f           = False
    appendix_sm_id_set      = set()
    for lei in loop_map:
        if lei.count_action is None: 
            assert lei.appendix_sm_id is None
            exit_exists_f = True
        elif lei.appendix_sm_id is None:
            assert lei.incidence_id not in count_action_plain_set
            count_action_plain_set.add(lei.incidence_id)
        else:
            assert lei.incidence_id not in count_action_couple_set
            count_action_couple_set.add(lei.incidence_id)
            appendix_sm_id_set.add(lei.appendix_sm_id)
    print "[ok]"
    list_id_set = set(sm.get_id() for sm in appendix_sm_list)
    assert appendix_sm_id_set == list_id_set
    print "appendix sm-ids are the same in loop map and sm list: [ok]"
    print "exit character set exits: [%s]" % exit_exists_f

    print
Example #2
    def get(self):
        # Transform 'cursor' into a number set
        result = NumberSet()
        K   = len(self.__cursor)
        if K == 0: return None
        k   = 0
        end = 0
        while k < K - 1:
            begin = end   + self.__cursor[k]
            end   = begin + self.__cursor[k+1]
            if end > self.N: 
                self.__cursor.pop()
                K -= 1
                break
            if begin != end:
                result.quick_append_interval(Interval(begin, end))
            k += 2

        # Increment cursor
        k = 0
        while k < K:
            if k == 0:
                self.__cursor[k] += 2
                if self.__cursor[k] < 8: 
                    break
            else:
                self.__cursor[k] += 1
                if self.__cursor[k] < 3:
                    break
            self.__cursor[k] = 1
            k += 1

        return result
Example #3
    def buffer_codec_prepare(self, BufferCodecName, BufferCodecFileName=None, Module=None):
        """Determines: Setup.buffer_codec_name
                       Setup.buffer_codec
        """
        assert    BufferCodecName == "unit-test" \
               or self.__buffer_element_specification_done_f == True

        if   BufferCodecName in ("utf8", "utf16"):
            assert Module is not None
            result = codec_db.CodecDynamicInfo(BufferCodecName, Module)
        elif BufferCodecFileName:
            os.path.splitext(os.path.basename(BufferCodecFileName))
            try: 
               os.path.splitext(os.path.basename(BufferCodecFileName))
            except:
                error.log("cannot interpret string following '--codec-file'")
            result = codec_db.CodecTransformationInfo(FileName=BufferCodecFileName)
        elif BufferCodecName == "unicode":
            # (Still, 'icu' or 'iconv' may provide converted content, but ...) 
            # If the internal buffer is 'unicode', then the pattern's state 
            # machines are not converted. The requirement for the pattern's
            # range is the same as for the 'buffer element chunks'.
            result = codec_db.CodecInfo("unicode", 
                                NumberSet.from_range(0, self.get_character_value_limit()), 
                                NumberSet.from_range(0, self.get_character_value_limit()))
        elif BufferCodecName == "unit-test":
            result = codec_db.CodecInfo("unicode", 
                                NumberSet.from_range(-sys.maxint, sys.maxint),
                                NumberSet.from_range(-sys.maxint, sys.maxint))

        else:
            result = codec_db.CodecTransformationInfo(BufferCodecName)

        self.buffer_codec = result
Example #4
 def __init__(self):
     EncodingTrafoBySplit.__init__(self, "utf16", 
                                      CodeUnitRange=NumberSet.from_range(0, 0x10000))
     self.error_range_code_unit0 = NumberSet([
         Interval(0x0000, 0xDC00), Interval(0xE000, 0x10000)
     ]).get_complement(NumberSet_All())
     self.error_range_code_unit1 = NumberSet([
         Interval(0xDC00, 0xE000)
     ]).get_complement(NumberSet_All())
Example #5
 def __get_remaining_set(self):
     ignored = (E_CharacterCountType.BAD, 
                E_CharacterCountType.BEGIN_NEWLINE_SUPPRESSOR, 
                E_CharacterCountType.BEGIN_NEWLINE, 
                E_CharacterCountType.END_NEWLINE) 
     result  = NumberSet()
     for character_set, info in self.__map:
         if info.cc_type in ignored: continue
         result.unite_with(character_set)
     return result.get_complement(Setup.buffer_codec.source_set)
Example #6
def prepare(A_list, B_list):
    A = NumberSet()
    B = NumberSet()
    for begin, end in A_list:
        A.add_interval(Interval(begin, end))
    for begin, end in B_list:
        B.add_interval(Interval(begin, end))

    A.assert_consistency()
    B.assert_consistency()
    return A, B
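For orientation, a hedged usage sketch for prepare() above; the interval pairs and expected results are invented, and the import path follows Example #52:

# Hypothetical usage (not part of the original test):
from quex.engine.misc.interval_handling import Interval, NumberSet

A, B = prepare([(0, 10), (20, 30)], [(5, 25)])

overlap = A.intersection(B)                 # [5,10) and [20,25)
assert not overlap.is_empty()
assert A.union(B).is_equal(NumberSet([Interval(0, 30)]))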
Example #7
    def load_Composition_Exclusion(self):
        # Column 0 contains what is interesting ...
        table = parse_table("CompositionExclusions.txt", NumberColumnList=[0])

        number_set = NumberSet()
        for row in table:
           begin = row[0]
           number_set.quick_append_interval(Interval(begin, begin + 1))
        number_set.clean()    

        self.db["CE"].code_point_db = number_set
Example #8
def verify(A, TrafoInfo):
    result = NumberSet()
    for interval in A.get_intervals():
        for x in range(interval.begin, interval.end):
            for source_begin, source_end, target_begin in TrafoInfo:
                if x >= source_begin and x < source_end:
                    offset = x - source_begin
                    y      = target_begin + offset
                    result.add_interval(Interval(y))
    result.assert_consistency()
    return result
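A hedged usage sketch for verify() above. 'TrafoInfo' is assumed to be a list of [source_begin, source_end, target_begin] triples, the shape built by the codec-file parsers in Examples #26/#36; the concrete values are invented for illustration:

# Hypothetical transformation data (EBCDIC-like mapping, illustration only).
from quex.engine.misc.interval_handling import Interval, NumberSet

trafo_info = [
    [0x41, 0x5B, 0xC1],                     # map 'A'..'Z' onto 0xC1..0xDB
]
A      = NumberSet([Interval(0x41, 0x43)])  # 'A', 'B'
mapped = verify(A, trafo_info)
assert mapped.is_equal(NumberSet([Interval(0xC1, 0xC3)]))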
Example #9
 def get_ending_character_set(self):
     """Returns the union of all characters that trigger to an acceptance
        state in the given state machine. This is to detect whether the
        newline or suppressor ends with an indentation character (grid or space).
     """
     result = NumberSet()
     for end_state_index in self.get_acceptance_state_index_list():
         for state in self.states.itervalues():
             if state.target_map.has_target(end_state_index) == False: continue
             result.unite_with(state.target_map.get_trigger_set_to_target(end_state_index))
     return result
Example #10
 def test(X):
     print "#_______________________________________________"
     nset  = NumberSet([ Interval(x, y) for x, y in X])
     clone = nset.clone()
     print "#NumberSet:         %s" % nset
     result = nset.clone()
     result.complement(all)
     print "#NumberSet.inverse: %s" % result
     assert result.is_equal(nset.get_complement(all))
     assert result.intersection(nset).is_empty()
     assert result.union(nset).is_all()
Example #11
    def __wildcard_value_match(self, WildCardValue):
        result = NumberSet()

        value_list = self.get_wildcard_value_matches(WildCardValue)
        if len(value_list) == 0: 
            return None

        for value in value_list:
            result.unite_with(NumberSet(self.code_point_db[value]))

        # No decoupling, since the result is computed fresh each time
        return result
Example #12
def the_intersection(Comment, A, B):
    if B.__class__ == Interval: B = NumberSet(B)

    print "#\n#" + Comment
    print "#  A          = " + repr(A)
    print "#  B          = " + repr(B)
    result = A.intersection(B)
    result.assert_consistency()
    print "#  intersection(A,B) = " + repr(result)
    result = B.intersection(A)
    result.assert_consistency()
    print "#  intersection(B,A) = " + repr(result)
Example #13
def _get_loop_map(TheCountMap, SmList, IidLoopExit):
    """A loop map tells about the behavior of the core loop. It tells what
    needs to happen as a consequence to an incoming character. Two options:

        -- Return to loop (normal case)
        -- Enter the tail (appendix) of a parallel state machine.

    RETURNS: List of LoopMapEntry-s. 

    A LoopMapEntry consists of:

       .character_set: Character set that triggers.
       .count_action:  Count action related to the character set.
                       == None, if the character set causes 'loop exit'.
       .incidence_id:  Incidence Id of terminal that is triggered by character set.
                       -- incidence id of count action terminal, or
                       -- incidence id of couple terminal.
       .appendix_sm:   Appendix state machine
                       -- combined appendix state machines, or
                       -- None, indicating that there is none.
    """
    L = TheCountMap.loop_character_set()

    # 'couple_list': Transitions to 'couple terminals' 
    #                => connect to appendix state machines
    couple_list,     \
    appendix_sm_list = _get_LoopMapEntry_list_parallel_state_machines(TheCountMap, 
                                                                      SmList)

    L_couple = NumberSet.from_union_of_iterable(
        lei.character_set for lei in couple_list
    )

    # 'plain_list': Transitions to 'normal terminals' 
    #               => perform count action and loop.
    L_plain    = L.difference(L_couple)
    plain_list = _get_LoopMapEntry_list_plain(TheCountMap, L_plain)

    # 'L_exit': Transition to exit
    #           => remaining characters cause exit.
    L_loop = NumberSet.from_union_of_iterable(
        x.character_set for x in chain(couple_list, plain_list)
    )
    universal_set = Setup.buffer_codec.source_set
    L_exit        = L_loop.get_complement(universal_set)
    exit_list     = [ LoopMapEntry(L_exit, None, IidLoopExit, None) ]

    result = couple_list + plain_list + exit_list

    assert not any(lei is None for lei in result)
    assert not any(lei.character_set is None for lei in result)
    assert not any(lei.incidence_id is None for lei in result)
    return result, appendix_sm_list
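The exit set above is nothing more than the complement of everything the loop handles, taken relative to the codec's source set. A minimal sketch of that single step, assuming the import path from Example #52 and a made-up 8-bit source set:

from quex.engine.misc.interval_handling import Interval, NumberSet

universal_set = NumberSet.from_range(0, 0x100)     # stand-in for Setup.buffer_codec.source_set
L_loop        = NumberSet([Interval(ord('a'), ord('z') + 1)])
L_exit        = L_loop.get_complement(universal_set)

assert not L_exit.has_intersection(L_loop)
assert L_exit.union(L_loop).is_equal(universal_set)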
Example #14
def create_ALL_BUT_NEWLINE_state_machine(stream):
    global Setup
    result = StateMachine()
    # NOTE: Buffer control characters are supposed to be filtered out by the code
    #       generator.
    trigger_set = NumberSet(Interval(ord("\n"))).get_complement(Setup.buffer_codec.source_set)
    if trigger_set.is_empty():
        error.log("The set of admissible characters contains only newline.\n"
                  "The '.' for 'all but newline' is an empty set.",
                  SourceRef.from_FileHandle(stream))

    result.add_transition(result.init_state_index, trigger_set, AcceptanceF=True) 
    return result
Example #15
    def __init__(self):
        drain_set = NumberSet.from_range(0, 0x100)
        EncodingTrafoBySplit.__init__(self, "utf8", CodeUnitRange=drain_set)
        self.UnchangedRange = 0x7F

        self.error_range_byte0 = NumberSet([
            Interval(0b00000000, 0b01111111+1), Interval(0b11000000, 0b11011111+1),
            Interval(0b11100000, 0b11101111+1), Interval(0b11110000, 0b11110111+1),
            Interval(0b11111000, 0b11111011+1), Interval(0b11111100, 0b11111101+1),
        ]).get_complement(NumberSet_All())

        self.error_range_byteN = NumberSet(
            Interval(0b10000000, 0b10111111+1)
        ).get_complement(NumberSet_All())
Example #16
class Tracker:
    def __init__(self):
        self.match_set  = NumberSet()
        self.negation_f = False
 
    def consider_interval(self, Begin, End):
        if Begin > End:
            raise RegularExpressionException("Character range: '-' requires character with 'lower code' to preceed\n" + \
                                             "found range '%s-%s' which corresponds to %i-%i as unicode code points." % \
                                             (utf8.map_unicode_to_utf8(Begin), utf8.map_unicode_to_utf8(End), Begin, End))

        self.match_set.add_interval(Interval(Begin, End))

    def consider_letter(self, CharCode):
        self.consider_interval(CharCode, CharCode+1)
Example #17
def get_setup(L0, L1, FSM0, FSM1, FSM2):
    # SPECIALITIES: -- sm0 and sm1 have an intersection between their second 
    #                  transition.
    #               -- sm1 transits further upon acceptance.
    #               -- sm2 has only one transition.
    ci_list = [
        CountInfo(dial_db.new_incidence_id(), NumberSet.from_range(L0, L1), 
                  CountAction(E_CharacterCountType.COLUMN, 0)),
    ]

    # Generate State Machine that does not have any intersection with 
    # the loop transitions.
    sm0 = StateMachine()
    si = sm0.add_transition(sm0.init_state_index, FSM0)
    si = sm0.add_transition(si, NS_A, AcceptanceF=True)
    sm0.states[si].mark_acceptance_id(dial_db.new_incidence_id())

    sm1 = StateMachine()
    si0 = sm1.add_transition(sm1.init_state_index, FSM1)
    si  = sm1.add_transition(si0, NS_A, AcceptanceF=True)
    iid1 = dial_db.new_incidence_id()
    sm1.states[si].mark_acceptance_id(iid1)
    si  = sm1.add_transition(si, NS_B, si0)
    sm1.states[si].mark_acceptance_id(iid1)

    sm2 = StateMachine()
    si = sm2.add_transition(sm2.init_state_index, FSM2, AcceptanceF=True)
    sm2.states[si].mark_acceptance_id(dial_db.new_incidence_id())

    return ci_list, [sm0, sm1, sm2]
Example #18
def test_on_UCS_sample_sets(Trafo, unicode_to_transformed_sequence):
    script_list = [
        "Arabic", "Armenian", "Balinese", "Bengali", "Bopomofo", "Braille", "Buginese", "Buhid",
        "Canadian_Aboriginal", "Cherokee", "Common",  "Cuneiform",  "Cypriot",  "Deseret",
        "Gothic",  "Greek",  
        "Hanunoo", "Hebrew", "Hiragana", "Inherited", "Kannada", "Han",  
        "Katakana", "Kharoshthi", "Khmer", "Lao", "Latin", "Limbu", "Linear_B", "Malayalam",
        "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Osmanya", "Ogham", "Old_Italic", "Old_Persian",
        "Phoenician",  "Shavian",  "Syloti_Nagri", 
        "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tamil", "Telugu", "Thaana", "Thai",
        "Tibetan", "Tifinagh", "Ugaritic", "Yi"
    ]
    sets = [ X(name) for name in script_list ]

    orig = get_combined_state_machine(map(lambda x: x.sm, sets))
    state_n_before, result = transform(Trafo, orig)

    # print result.get_graphviz_string(Option="hex")

    for set in sets:
        set.check(result, unicode_to_transformed_sequence)
    print "Translated %i groups without abortion on error (OK)" % len(sets)

    union = NumberSet()
    for nset in map(lambda set: set.charset, sets):
        union.unite_with(nset)

    inverse_union = NumberSet(Interval(0, 0x110000))
    inverse_union.subtract(union)
    # print inverse_union.get_string(Option="hex")
    check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True), 
                   unicode_to_transformed_sequence)
Example #19
 def test(UC):
     global trafo_cp037
     x = NumberSet(UC)
     y = x.clone()
     x.transform_by_table(trafo_cp037)
     x.assert_consistency()
     print "0x%02X --> 0x%s" % (UC, x.get_string(Option="hex"))
Example #20
    def is_DFA_compliant(self):
        """Checks if the current state transitions are DFA compliant, i.e. it
           investigates if trigger sets pointing to different targets intersect.
           RETURNS:  True  => OK
                    False => The same trigger points to different targets. This cannot
                             be part of a deterministic finite automaton (DFA).
        """
        # DFA's do not have epsilon transitions
        if len(self.__epsilon_target_index_list) != 0: return False

        # check whether trigger sets intersect
        all_trigger_sets = NumberSet()
        for trigger_set in self.__db.itervalues():
            if all_trigger_sets.has_intersection(trigger_set): 
                return False
            else:
                all_trigger_sets.unite_with(trigger_set)

        return True
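The determinism check above is a plain pairwise-disjointness test over trigger sets. A self-contained sketch of the same accumulation pattern with hypothetical trigger sets (import path as in Example #52):

from quex.engine.misc.interval_handling import Interval, NumberSet

trigger_sets = [
    NumberSet([Interval(ord('0'), ord('9') + 1)]),
    NumberSet([Interval(ord('a'), ord('f') + 1)]),
    NumberSet([Interval(ord('e'), ord('z') + 1)]),   # overlaps 'e'..'f' => not DFA compliant
]
all_trigger_sets = NumberSet()
dfa_compliant_f  = True
for trigger_set in trigger_sets:
    if all_trigger_sets.has_intersection(trigger_set):
        dfa_compliant_f = False
        break
    all_trigger_sets.unite_with(trigger_set)
assert dfa_compliant_f == False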
Example #21
def test(ci_list, SM_list=[]):
    Setup.buffer_codec.source_set = NumberSet_All()
    ci_map                     = CountInfoMap(ci_list, NumberSet.from_range(0, 100))
    iid_loop_exit              = dial_db.new_incidence_id()
    loop_map, appendix_sm_list = loop._get_loop_map(ci_map, SM_list, iid_loop_exit) 

    print
    print
    print
    general_checks(loop_map, appendix_sm_list)
    print_this(loop_map, appendix_sm_list)
Example #22
 def get_number_set(Cursor):
     if len(Cursor) == 2:
         return S_None
     cursor = copy(Cursor)
     cursor.pop(
         0
     )  # element 0 and '-1' are just helping values, no interval borders.
     result = []
     while len(cursor) != 1:
         begin = cursor.pop(0)
         end = cursor.pop(0)
         result.append(Interval(begin, end))
     return NumberSet(result)
Example #23
    def buffer_codec_prepare(self,
                             BufferCodecName,
                             BufferCodecFileName=None,
                             Module=None):
        """Determines: Setup.buffer_codec_name
                       Setup.buffer_codec
        """
        assert    BufferCodecName == "unit-test" \
               or self.__buffer_element_specification_done_f == True

        if BufferCodecName in ("utf8", "utf16"):
            assert Module is not None
            result = codec_db.CodecDynamicInfo(BufferCodecName, Module)
        elif BufferCodecFileName:
            os.path.splitext(os.path.basename(BufferCodecFileName))
            try:
                os.path.splitext(os.path.basename(BufferCodecFileName))
            except:
                error.log("cannot interpret string following '--codec-file'")
            result = codec_db.CodecTransformationInfo(
                FileName=BufferCodecFileName)
        elif BufferCodecName == "unicode":
            # (Still, 'icu' or 'iconv' may provide converted content, but ...)
            # If the internal buffer is 'unicode', then the pattern's state
            # machines are not converted. The requirement for the pattern's
            # range is the same as for the 'buffer element chunks'.
            result = codec_db.CodecInfo(
                "unicode",
                NumberSet.from_range(0, self.get_character_value_limit()),
                NumberSet.from_range(0, self.get_character_value_limit()))
        elif BufferCodecName == "unit-test":
            result = codec_db.CodecInfo(
                "unicode", NumberSet.from_range(-sys.maxint, sys.maxint),
                NumberSet.from_range(-sys.maxint, sys.maxint))

        else:
            result = codec_db.CodecTransformationInfo(BufferCodecName)

        self.buffer_codec = result
Example #24
def test(Border, List):
    x = NumberSet([Interval(a, b) for a, b in List])
    y = deepcopy(x)
    z = deepcopy(x)
    print "Border:              %s" % Border
    print "NumberSet:           %s" % x
    x.cut_lesser(Border)
    x.assert_consistency()
    y.cut_greater_or_equal(Border)
    y.assert_consistency()
    print "cut_lesser           --> %s" % x
    print "cut_greater_or_equal --> %s" % y
    print "______________________________________"

    assert x.union(y).is_equal(z)
Example #25
def __display_set(CharSet, cl):
    if Setup.query_numeric_f: display = "hex"
    else: display = "utf8"

    CharSet.intersect_with(NumberSet(Interval(0, 0x110000)))

    print "Characters:\n"
    if Setup.query_interval_f:
        __print_set_in_intervals(CharSet, display, 80)
    elif Setup.query_unicode_names_f:
        __print_set_character_names(CharSet, display, 80)
    else:
        __print_set_single_characters(CharSet, display, 80)

    print
Example #26
def do(section_list, fh):
    """Parses a codec information file. The described codec can only be
    a 'static character length' encoding. That is, every character in the
    code occupies the same number of bytes.

    RETURNS: [0] Set of characters in unicode which are covered by the
                 described codec.
             [1] Range of values in the codec elements.
    """
    source_set = NumberSet()
    drain_set = NumberSet()

    error_str = None

    try:
        while error_str is None:
            skip_whitespace(fh)
            source_begin = read_integer(fh)
            if source_begin is None:
                error_str = "Missing integer (source interval begin) in codec file."
                continue

            skip_whitespace(fh)
            source_size = read_integer(fh)
            if source_size is None:
                error_str = "Missing integer (source interval size) in codec file."
                continue

            skip_whitespace(fh)
            target_begin = read_integer(fh)
            if target_begin is None:
                error_str = "Missing integer (target interval begin) in codec file."
                continue

            source_end = source_begin + source_size
            list.append(section_list, [source_begin, source_end, target_begin])

            source_set.add_interval(Interval(source_begin, source_end))
            drain_set.add_interval(
                Interval(target_begin, target_begin + source_size))

    except EndOfStreamException:
        pass

    return source_set, drain_set, error_str
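For illustration, a tiny hypothetical codec table and the sets it yields; this mirrors the loop body above with the triples given directly, so the file-reading helpers (skip_whitespace, read_integer) are not needed:

# Each entry: (source_begin, source_size, target_begin) -- invented values.
from quex.engine.misc.interval_handling import Interval, NumberSet

section_list = []
source_set   = NumberSet()
drain_set    = NumberSet()
for source_begin, source_size, target_begin in [(0x20, 0x60, 0x20), (0xA0, 0x60, 0x100)]:
    source_end = source_begin + source_size
    section_list.append([source_begin, source_end, target_begin])
    source_set.add_interval(Interval(source_begin, source_end))
    drain_set.add_interval(Interval(target_begin, target_begin + source_size))

assert source_set.is_equal(NumberSet([Interval(0x20, 0x80), Interval(0xA0, 0x100)]))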
Example #27
    def load_Composition_Exclusion(self):
        # Column 0 contains what is interesting ...
        table = parse_table("CompositionExclusions.txt", NumberColumnList=[0])

        number_set = NumberSet()
        for row in table:
            begin = row[0]
            number_set.quick_append_interval(Interval(begin, begin + 1))
        number_set.clean()

        self.db["CE"].code_point_db = number_set
Example #28
def verify(A, TrafoInfo):
    result = NumberSet()
    for interval in A.get_intervals():
        for x in range(interval.begin, interval.end):
            for source_begin, source_end, target_begin in TrafoInfo:
                if x >= source_begin and x < source_end:
                    offset = x - source_begin
                    y = target_begin + offset
                    result.add_interval(Interval(y))
    result.assert_consistency()
    return result
Example #29
    def get_incidence_id_map(self, BeyondIncidenceId=None):
        """RETURNS: A list of pairs: (character_set, incidence_id) 
             
           All same counting actions are referred to by the same incidence id.

           If BeyondIncidenceId is given, then the remaining set of characters
           is associated with 'BeyondIncidenceId'.
        """
        result = [ (x.character_set, x.incidence_id) for x in self.__map ]

        if BeyondIncidenceId is None:
            return result

        all_set    = NumberSet.from_union_of_iterable(x.character_set for x in self.__map)
        beyond_set = all_set.get_complement(Setup.buffer_codec.source_set)
        if not beyond_set.is_empty(): 
            result.append((beyond_set, BeyondIncidenceId))
        return result
Example #30
    def get_incidence_id_map(self, BeyondIncidenceId=None):
        """RETURNS: A list of pairs: (character_set, incidence_id) 
             
           All same counting actions are referred to by the same incidence id.

           If BeyondIncidenceId is given, then the remaining set of characters
           is associated with 'BeyondIncidenceId'.
        """
        result = [(x.character_set, x.incidence_id) for x in self.__map]

        if BeyondIncidenceId is None:
            return result

        all_set = NumberSet.from_union_of_iterable(x.character_set
                                                   for x in self.__map)
        beyond_set = all_set.get_complement(Setup.buffer_codec.source_set)
        if not beyond_set.is_empty():
            result.append((beyond_set, BeyondIncidenceId))
        return result
Example #31
def the_difference(Comment, A, B, ViceVersaF=True):
    if B.__class__ == Interval: B = NumberSet(B)

    def __do_this(A, B):
        interval_list = B.get_intervals()
        for interval in interval_list:
            print "#"
            print "#  A                 = " + repr(A)
            print "#  B                 = " + repr(interval)
            X = deepcopy(A)
            safe = Interval(interval.begin, interval.end)
            X.cut_interval(safe)
            X.assert_consistency()
            safe.begin = 7777
            safe.end = 0000
            print "#  A.cut_interval(B) = " + repr(X)

    print "#\n# " + Comment + "_" * (80 - len(Comment))
    __do_this(A, B)
    if ViceVersaF: __do_this(B, A)
Example #32
def convert_table_to_associative_map(table, ValueColumnIdx, ValueType,
                                     KeyColumnIdx):
    """Produces a dictionary that maps from 'keys' to NumberSets. The 
       number sets represent the code points for which the key (property)
       is valid.

       ValueColumnIdx: Column that contains the character code interval or
                       string to which one wishes to map.

       KeyColumnIdx:  Column that contains the 'key' to be used for the map

       self.db = database to contain the associative map.
    """

    db = {}
    if ValueType == "NumberSet":
        for record in table:
            key = record[KeyColumnIdx].strip()
            key = key.replace(" ", "_")
            value = record[ValueColumnIdx]

            if type(value) == int: value = Interval(value)

            db.setdefault(key, NumberSet()).quick_append_interval(value,
                                                                  SortF=False)

    elif ValueType == "number" or ValueType == "string":
        for record in table:
            key = record[KeyColumnIdx].strip()
            key = key.replace(" ", "_")
            value = record[ValueColumnIdx]
            db[key] = value
    else:
        raise BaseException("ValueType = '%s' unknown.\n" % ValueType)

    # if the content was a number set, it might be simplified, try it.
    if ValueType == "NumberSet":
        for key, number_set in db.items():
            number_set.clean()

    return db
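A hedged usage sketch for the 'NumberSet' branch above. The table rows are invented; they assume parse_table() delivers Interval objects or plain integers in the value column, which is exactly what the branch handles:

from quex.engine.misc.interval_handling import Interval, NumberSet

table = [
    [Interval(0x0041, 0x005B), "Latin"],    # value column 0, key column 1
    [Interval(0x0061, 0x007B), "Latin"],
    [0x00C9,                   "Latin"],    # single code points are wrapped into Interval-s
]
db = convert_table_to_associative_map(table, ValueColumnIdx=0,
                                      ValueType="NumberSet", KeyColumnIdx=1)
assert db["Latin"].is_equal(NumberSet([Interval(0x41, 0x5B),
                                       Interval(0x61, 0x7B),
                                       Interval(0xC9)]))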
Example #33
    def __whitespace_default(self):
        """Try to define default whitespace ' ' or '\t' if their positions
        are not yet occupied in the count_command_map.
        """
        cs0 = NumberSet(ord(" "))
        cs1 = NumberSet(ord("\t"))
        result = NumberSet()
        if not self.specifier_count_op_map.find_occupier(cs0, set()):
            result.unite_with(cs0)
        if not self.specifier_count_op_map.find_occupier(cs1, set()):
            result.unite_with(cs1)

        if result.is_empty():
            error.log("Trying to implement default whitespace ' ' or '\\t' failed.\n"
                      "Characters are occupied by other elements.", self.sr)
        return result
Example #34
def unary(TheList):
    global correct_n
    global X
    correct_n = 0
    X = NumberSet([Interval(p,q) for p,q in TheList])
    print "# %s ---------------------" % X
    equal("inv(inv(X))",           "X")

    equal("uni(X, inv(X))", "All")
    equal("uni(inv(X), X)", "All")
    equal("uni(X, None)",   "X")
    equal("uni(None, X)",   "X")
    equal("uni(X, All)",    "All")
    equal("uni(All, X)",    "All")

    equal("itsct(X, inv(X))", "None")
    equal("itsct(inv(X), X)", "None")
    equal("itsct(X, None)",   "None")
    equal("itsct(None, X)",   "None")
    equal("itsct(X, All)",    "X")
    equal("itsct(All, X)",    "X")

    equal("diff(X, inv(X))", "X")
    equal("diff(inv(X), X)", "inv(X) ")
    equal("diff(X, None)",   "X")
    equal("diff(None, X)",   "None")
    equal("diff(X, All)",    "None")
    equal("diff(All, X)",    "inv(X) ")

    equal("symdiff(X, inv(X))", "All")
    equal("symdiff(inv(X), X)", "All")
    equal("symdiff(X, None)",   "X")
    equal("symdiff(None, X)",   "X")
    equal("symdiff(X, All)",    "inv(X)")
    equal("symdiff(All, X)",    "inv(X) ")

    print "No abort --> %i x korrekt" % correct_n
    return
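The identities exercised above can also be written directly against the NumberSet API; a minimal sketch with a made-up set X, using the INTEGER_MAX-based universal set from Example #52 (the helpers inv/uni/itsct/diff/symdiff are local to that test and not repeated here):

from quex.engine.misc.interval_handling import Interval, NumberSet
from quex.constants import INTEGER_MAX

all_set = NumberSet.from_range(-INTEGER_MAX, INTEGER_MAX)
X       = NumberSet([Interval(10, 20), Interval(30, 40)])
X_inv   = X.get_complement(all_set)

assert X_inv.get_complement(all_set).is_equal(X)   # inv(inv(X)) == X
assert X.union(X_inv).is_all()                     # uni(X, inv(X)) == All
assert X.intersection(X_inv).is_empty()            # itsct(X, inv(X)) == None
assert X.difference(X_inv).is_equal(X)             # diff(X, inv(X)) == X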
Example #35
def do(BufferCodecName, BufferCodecFileName=""):
    from   quex.engine.state_machine.transformation.base              import EncodingTrafoUnicode, \
                                                                             EncodingTrafoNone
    from quex.engine.state_machine.transformation.table import EncodingTrafoByTable
    from quex.engine.state_machine.transformation.utf8_state_split import EncodingTrafoUTF8
    from quex.engine.state_machine.transformation.utf16_state_split import EncodingTrafoUTF16

    if BufferCodecName == "none":
        return EncodingTrafoNone()

    elif BufferCodecName == "utf8":
        return EncodingTrafoUTF8()

    elif BufferCodecName == "utf16":
        return EncodingTrafoUTF16()

    elif BufferCodecFileName:
        os.path.splitext(os.path.basename(BufferCodecFileName))
        try:
            os.path.splitext(os.path.basename(BufferCodecFileName))
        except:
            error.log("cannot interpret string following '--encoding-file'")
        return EncodingTrafoByTable(FileName=BufferCodecFileName)

    elif BufferCodecName in ("unicode", "utf32"):
        # (Still, 'icu' or 'iconv' may provide converted content, but ...)
        # If the internal buffer is 'unicode', then the pattern's state
        # machines are not converted. The requirement for the pattern's
        # range is the same as for the 'buffer element chunks'.
        return EncodingTrafoUnicode(NumberSet(Interval(0, 0x110000)),
                                    Name=BufferCodecName)

    elif BufferCodecName == "unit-test":
        return EncodingTrafoUnicode(NumberSet_All(), NumberSet_All())

    else:
        return EncodingTrafoByTable(BufferCodecName)
Example #36
def do(section_list, fh):
    """Parses a codec information file. The described codec can only be
    a 'static character length' encoding. That is, every character in the
    code occupies the same number of bytes.

    RETURNS: [0] Set of characters in unicode which are covered by the
                 described codec.
             [1] Range of values in the codec elements.
    """
    source_set = NumberSet()
    drain_set  = NumberSet()

    error_str = None

    try:
        while error_str is None:
            skip_whitespace(fh)
            source_begin = read_integer(fh)
            if source_begin is None:
                error_str = "Missing integer (source interval begin) in codec file."
                continue

            skip_whitespace(fh)
            source_size = read_integer(fh)
            if source_size is None:
                error_str = "Missing integer (source interval size) in codec file." 
                continue

            skip_whitespace(fh)
            target_begin = read_integer(fh)
            if target_begin is None:
                error_str = "Missing integer (target interval begin) in codec file."
                continue

            source_end = source_begin + source_size
            list.append(section_list, [source_begin, source_end, target_begin])

            source_set.add_interval(Interval(source_begin, source_end))
            drain_set.add_interval(Interval(target_begin, target_begin + source_size))

    except EndOfStreamException:
        pass

    return source_set, drain_set, error_str
Example #37
if len(sys.argv) < 2 or not (sys.argv[1] in [
        "ANSI-C-PlainMemory", "ANSI-C", "Cpp", "Cpp_StrangeStream"
]):
    print "Language argument not acceptable, use --hwut-info"
    sys.exit(0)

Language = sys.argv[1]
__Setup_init_language_database(Language)

StrangeStream_str = ""
if Language.find("StrangeStream") != -1:
    StrangeStream_str = " -DQUEX_OPTION_STRANGE_ISTREAM_IMPLEMENTATION "

trigger_set = NumberSet(
    [Interval(ord('a'),
              ord('z') + 1),
     Interval(ord('A'),
              ord('Z') + 1)])

TestStr = "abcdefg_HIJKLMNOP-qrstuvw'XYZ12ok3"

compile_and_run(Language,
                create_character_set_skipper_code(Language, TestStr,
                                                  trigger_set),
                StrangeStream_str=StrangeStream_str)

TestStr = "-hijklmnop_qrstuvw#xyz9"

compile_and_run(Language,
                create_character_set_skipper_code(Language, TestStr,
                                                  trigger_set),
Example #38
if "1" in sys.argv:

    def test(UC):
        global trafo_cp037
        x = NumberSet(UC)
        y = x.clone()
        x.transform_by_table(trafo_cp037)
        x.assert_consistency()
        print "0x%02X --> 0x%s" % (UC, x.get_string(Option="hex"))

    for letter in xrange(-2, 258):
        test(letter)

elif "all" in sys.argv:

    x = NumberSet(Interval(0, 0x100))
    y = x.clone()
    x.transform_by_table(trafo_cp037)
    x.assert_consistency()
    print "0x%s --> 0x%s" % (y, x.get_string(Option="hex"))

elif "some" in sys.argv:
    x = NumberSet(Interval(0, 0x32))
    y = x.clone()
    x.transform_by_table(trafo_cp037)
    x.assert_consistency()
    print "0x%s --> 0x%s" % (y, x.get_string(Option="hex"))

    x = NumberSet(Interval(0x42, 0x80))
    y = x.clone()
    x.transform_by_table(trafo_cp037)
Example #39
if "1" in sys.argv:
    def test(UC):
        global trafo_cp037
        x = NumberSet(UC)
        y = x.clone()
        x.transform_by_table(trafo_cp037)
        x.assert_consistency()
        print "0x%02X --> 0x%s" % (UC, x.get_string(Option="hex"))

    for letter in xrange(-2, 258):
        test(letter)

elif "all" in sys.argv:

    x = NumberSet(Interval(0, 0x100))
    y = x.clone()
    x.transform_by_table(trafo_cp037)
    x.assert_consistency()
    print "0x%s --> 0x%s" % (y, x.get_string(Option="hex"))

elif "some" in sys.argv:
    x = NumberSet(Interval(0, 0x32))
    y = x.clone()
    x.transform_by_table(trafo_cp037)
    x.assert_consistency()
    print "0x%s --> 0x%s" % (y, x.get_string(Option="hex"))

    x = NumberSet(Interval(0x42, 0x80))
    y = x.clone()
    x.transform_by_table(trafo_cp037)
Example #40
 def __init__(self):
     self.match_set = NumberSet()
     self.negation_f = False
Example #41
        if ta.door_id in done: continue
        assert len(ta.command_list) == 1
        cmd = ta.command_list[0]
        print "%s => %s" % (ta.door_id, cmd.content.router_element)
        done.add(ta.door_id)

def print_this(AnalyzerList):
    print "#_[ Print %i analyzer(s) ]______________________________" % len(AnalyzerList)
    print
    for i, analyzer in enumerate(AnalyzerList):
        print "--( %i: init si = %i )-------------------------\n" % (i, analyzer.init_state_index)
        print analyzer
        print_drop_out(analyzer)


NS_A = NumberSet.from_range(0x600, 0x601) # UTF8: D8 80 => 216, 128
NS_B = NumberSet.from_range(0x601, 0x602) # UTF8: D8 81 => 216, 129
NS_C = NumberSet.from_range(0x640, 0x641) # UTF8: D9 80 => 217, 128

appendix_sm_id = 4711L

if "loop" in sys.argv:
    loop_map = loop.LoopMap([
        TestLME(NS_A, dial.new_incidence_id(), None),
    ])
    column_n_per_code_unit = 5
elif "appendix" in sys.argv:
    loop_map = loop.LoopMap([
        TestLME(NS_A, dial.new_incidence_id(), appendix_sm_id), # appendix_sm_id
    ])
    column_n_per_code_unit = 5
Example #42
 def get_sm(SmId, Trigger):
     sm = StateMachine.from_IncidenceIdMap([
         (NumberSet.from_range(Trigger, Trigger + 1), SmId)
     ])
     sm.set_id(SmId)
     return sm
Example #43
def get_codec_element_range():
    """Codec element's size is 1 byte."""
    return NumberSet.from_range(0, 0x100)
Example #44
def get_elementary_trigger_sets(StateIdxList,
                                sm=None,
                                epsilon_closure_db=None):
    """NOTE: 'epsilon_closure_db' must previously be calculcated by 
             sm.get_epsilon_closure_db(). This has to happen once
             and for all in order to save computation time.
       TODO: Performance--at the bottom of this file there is a class 
             that might be directly used for indexing into a dictionary
             for caching the epsilon closures: MultiOccurrenceNumberList.
             (Tests showed that on average a state combination requires
              6x to evaluate into a closure).
    
       Considers the trigger dictionary that contains a mapping from target state index 
       to the trigger set that triggers to it: 
 
               target_state_index   --->   trigger_set 

       The trigger sets of different target state indices may intersect. As a result,
       this function produces a list of pairs:

              [ state_index_list, elementary_trigger_set ]

       where the elementary trigger set is the set of all triggers that trigger
       at the same time to all states in the state_index_list. The list contains 
       for one state_index_list only one elementary_trigger_set. All elementary
       trigger sets are disjoint, i.e. they do not intersect.

      NOTE: A general solution of this problem would have to consider the 
            inspection of all possible subset combinations. The number of 
            combinations for N trigger sets is 2^N - which potentially blows
            the calculation power of the computer. Excessive optimizations
            would have to be programmed, if not the following were the case: 

      NOTE: Fortunately, we are dealing with one dimensional sets! Thus, there is
            a very effective way to determine the elementary trigger sets. Imagine
            three trigger sets stretching over the range of numbers as follows:

      different targets, e.g. T0, T1, T2 are triggered by different sets of letters
      in the alphabet. 
                                                                letters of alphabet
                  ---------------------------------------------------->

              T0  [---------)       [----------)
              T1          [------)      [-----)
              T2              [----------------------)    

      => elementary sets: 
 
         only T0  [-------)
         T0, T1           [-)
         only T1            [-)
         T1, T2               [--)
         only T2                 [---)          [----)
         T0, T2                      [---)     [)
         T0, T1, T2                      [-----)
    """
    # For Documentation Purposes: The following approach has been proven to be SLOWER
    #                             than the one currently implemented. Maybe some time
    #                             it can be tweaked to be faster.
    #
    #                             Also, it is not proven to be correct!
    #
    ##    trigger_list = []
    ##    for state_index in StateIdxList:
    ##        state = sm.states[state_index]
    ##        for target_index, trigger_set in state.target_map:
    ##            target_epsilon_closure = epsilon_closure_db[target_index]
    ##            interval_list          = trigger_set.get_intervals(PromiseToTreatWellF=True)
    ##            trigger_list.extend([x, target_epsilon_closure] for x in interval_list])
    ##
    ##    trigger_list.sort(key=lambda x: x[0].begin)
    ##    for element in trigger_list:
    ##        # ... continue as shown below
    ##
    ##    return combination_list

    ## Special Case -- Quickly Done: One DFA_State, One Target DFA_State
    ##  (Improvement is merely measurable).
    ##  if len(StateIdxList) == 1:
    ##      state_idx = list(StateIdxList)[0]
    ##      if len(epsilon_closure_db[state_idx]) == 1:
    ##           tm = sm.states[state_idx].target_map.get_map()
    ##           if not tm:
    ##               return {}
    ##           elif len(tm) == 1:
    ##               target, trigger_set = tm.iteritems().next()
    ##               current_target_epsilon_closure = epsilon_closure_db[target]
    ##               return { tuple(sorted(current_target_epsilon_closure)): trigger_set }

    ## TODO: Get the epsilon closure before the loop over history!
    ##
    ##       sm.get_epsilon_closure_of_state_set(current_target_indices,
    ##                                             epsilon_closure_db)

    # (*) Accumulate the transitions for all states in the state list.
    #     transitions to the same target state are combined by union.
    history = _get_plain_line_up(
        [sm.states[si].target_map for si in StateIdxList])

    # (*) build the elementary subset list
    combinations = {}  # use dictionary for uniqueness
    current_interval_begin = None
    current_target_indices = {}  # use dictionary for uniqueness
    current_target_epsilon_closure = []
    for item in history:
        # -- add interval and target index combination to the data
        #    (only build interval when current begin is there,
        #     when the interval size is not zero, and
        #     when the epsilon closure of target states is not empty)
        if     current_interval_begin is not None      \
           and current_interval_begin != item.position \
           and len(current_target_indices) != 0:

            interval = Interval(current_interval_begin, item.position)

            # key = tuple(current_target_epsilon_closure)
            key = tuple(sorted(current_target_epsilon_closure))
            combination = combinations.get(key)
            if combination is None:
                combinations[key] = NumberSet(interval, ArgumentIsYoursF=True)
            else:
                combination.unite_with(interval)

        # -- BEGIN / END of interval:
        #    add or delete a target state to the set of currently considered target states
        #    NOTE: More than one state can trigger on the same range to the same target state.
        #          Thus, one needs to keep track of the 'opened' target states.
        if item.change == E_Border.BEGIN:
            if current_target_indices.has_key(item.target_idx):
                current_target_indices[item.target_idx] += 1
            else:
                current_target_indices[item.target_idx] = 1
        else:  # == E_Border.END
            if item.target_idx not in current_target_indices:
                print "#ERROR:", history
            if current_target_indices[item.target_idx] > 1:
                current_target_indices[item.target_idx] -= 1
            else:
                del current_target_indices[item.target_idx]

        # -- re-compute the epsilon closure of the target states
        current_target_epsilon_closure = \
            sm.get_epsilon_closure_of_state_set(current_target_indices,
                                                epsilon_closure_db)
        # -- set the begin of interval to come
        current_interval_begin = item.position

    ## if proposal is not None:
    ##    if    len(proposal)     != len(combinations) \
    ##       or proposal.keys()   != combinations.keys() \
    ##       or not proposal.values()[0].is_equal(combinations.values()[0]):
    ##        print "##proposal:    ", proposal
    ##        print "##combinations:", combinations

    # (*) create the list of pairs [target-index-combination, trigger_set]
    return combinations
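The docstring's picture boils down to a sweep over the sorted interval borders. Below is a hedged, self-contained illustration of that idea in plain Python (hypothetical data, no quex types), independent of the _get_plain_line_up/E_Border machinery used above:

# Self-contained sketch of the border-sweep idea (hypothetical data).
def elementary_sets(trigger_db):
    # trigger_db: target -> list of half-open (begin, end) intervals
    borders = []
    for target, intervals in trigger_db.items():
        for begin, end in intervals:
            borders.append((begin, +1, target))
            borders.append((end,   -1, target))
    borders.sort()
    result  = {}       # frozenset(targets) -> list of (begin, end) segments
    open_db = {}       # target -> number of currently open intervals
    last    = None
    for position, delta, target in borders:
        # Emit the segment since the previous border with the targets open there.
        if last is not None and open_db and position != last:
            result.setdefault(frozenset(open_db), []).append((last, position))
        open_db[target] = open_db.get(target, 0) + delta
        if open_db[target] == 0: del open_db[target]
        last = position
    return result

print elementary_sets({"T0": [(0, 10)], "T1": [(5, 15)], "T2": [(8, 20)]})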
    "Buhid",
    "Canadian_Aboriginal",
    "Cherokee",
    "Syloti_Nagri",
    "Syriac",
    "Tagalog",
    "Tagbanwa",
    "Tai_Le",
    "Yi",
])

orig = get_combined_state_machine(map(lambda x: x.sm, sets))
print "# Number of states in state machine:"
print "#   Unicode:       %i" % len(orig.states)
result = trafo.do(orig)
print "#   UTF8-Splitted: %i" % len(result.states)

# print result.get_graphviz_string(Option="hex")

for set in sets:
    set.check(result)

union = NumberSet()
for nset in map(lambda set: set.charset, sets):
    union.unite_with(nset)

inverse_union = NumberSet(Interval(0, 0x110000))
inverse_union.subtract(union)
# print inverse_union.get_string(Option="hex")
check_negative(result, inverse_union.get_intervals(PromiseToTreatWellF=True))
Example #46
 def __init__(self, Name, CodeUnitRange):
     base.EncodingTrafo.__init__(self, Name, NumberSet.from_range(0, 0x110000),
                                 CodeUnitRange)
Example #47
    sys.exit(0)

if len(sys.argv) < 2:
    print "Argument not acceptable, use --hwut-info"
    sys.exit(0)

BS = int(sys.argv[1])

if BS not in [5, 6, 7, 8]:
    print "Argument not acceptable, use --hwut-info"
    sys.exit(0)

Language = "Cpp"
__Setup_init_language_database(Language)

trigger_set = NumberSet([Interval(0x600, 0x700)])
Setup.buffer_codec_set(bc_factory.do("utf8"), 1)


def make(TriggerSet, BufferSize):
    Language = "ANSI-C-from-file"
    code = create_character_set_skipper_code(Language,
                                             "",
                                             TriggerSet,
                                             QuexBufferSize=BufferSize,
                                             InitialSkipF=False,
                                             OnePassOnlyF=True)
    exe_name, tmp_file_name = compile(Language, code)
    return exe_name, tmp_file_name

Example #48
def _get_loop_map(loop_config, CaMap, SmList, IidLoopExit, L_subset):
    """A loop map tells about the behavior of the core loop. It tells what
    needs to happen as a consequence to an incoming character. Two options:

    L_subset = NumberSet containing characters that are actually part of 
               the loop. 'None' => all characters of 'CaMap' are considered.

        -- Return to loop (normal case)
        -- Enter the tail (appendix) of a parallel state machine.

    RETURNS: List of LoopMapEntry-s. 

    A LoopMapEntry consists of:

       .character_set: Character set that triggers.
       .count_action:  Count action related to the character set.
                       == None, if the character set causes 'loop exit'.
       .iid_couple_terminal:  Incidence Id of terminal that is triggered by character set.
                       -- incidence id of count action terminal, or
                       -- incidence id of couple terminal.
       .appendix_sm:   Appendix state machine
                       -- combined appendix state machines, or
                       -- None, indicating that there is none.
    """
    assert all(
        _state_machine_tagged_with_matching_incidence_ids(sm) for sm in SmList)
    # State machines are not to be transformed at this point in time
    assert all(not _exists_bad_lexatom_detector_state(sm) for sm in SmList)

    CaMap.prune(Setup.buffer_encoding.source_set)
    L = CaMap.union_of_all()

    L_couple = NumberSet.from_union_of_iterable(
        sm.get_beginning_character_set() for sm in SmList)

    # 'plain_list': Transitions to 'normal terminals'
    #               => perform count action and loop.
    L_plain = L.difference(L_couple)
    if L_subset is not None: L_plain.intersect_with(L_subset)
    L_loop = L_plain.union(L_couple)
    L_exit = L_loop.get_complement(Setup.buffer_encoding.source_set)

    plain_list = _get_LoopMapEntry_list_plain(loop_config, CaMap, L_plain)

    exit_list = []
    if not L_exit.is_empty():
        exit_list.append(
            LoopMapEntry(L_exit,
                         IidLoopExit,
                         Code=loop_config.cmd_list_CA_GotoTerminal(
                             None, IidLoopExit)))

    # 'couple_list': Transitions to 'couple terminals'
    #                => connect to appendix state machines
    couple_list,               \
    combined_appendix_sm_list, \
    appendix_cmd_list_db       = parallel_state_machines.do(loop_config, CaMap, SmList)

    assert L_couple.is_equal(
        NumberSet.from_union_of_iterable(lei.character_set
                                         for lei in couple_list))

    result = LoopMap(
        couple_list,  # --> jump to appendix sm-s
        plain_list,  # --> iterate to loop start
        exit_list)  # --> exit loop

    return result, combined_appendix_sm_list, appendix_cmd_list_db
Example #49
    print "Argument not acceptable, use --hwut-info"
    sys.exit(0)


def build(ISetup):
    Language = "ANSI-C"
    txt = create_indentation_handler_code(Language,
                                          "<by command line>",
                                          ISetup,
                                          BufferSize=3)
    executable_name, \
    source           = compile(Language, txt, AssertsActionvation_str = "")
    return executable_name, source


pattern_newline = get_Pattern_Prep(DFA.from_character_set(NumberSet(
    ord('\n'))))
pattern_suppressed_newline = get_Pattern_Prep(
    DFA.from_sequence([ord(x) for x in "\\\n"]))

indent_setup = IndentationCount_Pre(
    SourceRef_VOID,
    WhiteSpaceCharacterSet=NumberSet([Interval(ord(x)) for x in " :"]),
    BadSpaceCharacterSet=None,
    PatternNewline=pattern_newline,
    PatternSuppressedNewline=pattern_suppressed_newline,
    PatternListComment=[])

if "FIRST" in sys.argv or len(sys.argv) <= 2:
    exe, tmp_file = build(indent_setup)

exe = "tmp.c.exe"
Example #50
 def get_trigger_set_union(self):
     interval_list = []
     for trigger_set in self.__db.itervalues():
         interval_list.extend(trigger_set.get_intervals())
     return NumberSet.from_IntervalList(interval_list)
Example #51
        if lei.appendix_sm_has_transitions_f
    ]


def print_this(AnalyzerList):
    print "#_[ Print %i analyzer(s) ]______________________________" % len(
        AnalyzerList)
    print
    for i, analyzer in enumerate(AnalyzerList):
        print "--( %i: init si = %i )-------------------------\n" % (
            i, analyzer.init_state_index)
        print analyzer


if encoding == "unicode":
    NS_A = NumberSet.from_range(ord('A'), ord('A') + 1)
    NS_B = NumberSet.from_range(ord('B'), ord('B') + 1)
    NS_C = NumberSet.from_range(ord('C'), ord('C') + 1)
    NS_D = NumberSet.from_range(ord('D'), ord('D') + 1)
    NS_E = NumberSet.from_range(ord('E'), ord('E') + 1)
else:
    NS_A = NumberSet.from_range(0x600, 0x601)
    NS_B = NumberSet.from_range(0x601, 0x602)
    NS_C = NumberSet.from_range(0x602, 0x603)
    NS_D = NumberSet.from_range(0x603, 0x604)
    NS_E = NumberSet.from_range(0x604, 0x605)

CA_0 = CountAction(E_CharacterCountType.COLUMN, 5)
CA_1 = CountAction(E_CharacterCountType.LINE, 1)
CA_2 = CountAction(E_CharacterCountType.GRID, 2)
CA_3 = CountAction(E_CharacterCountType.WHITESPACE, 3)
Example #52
#! /usr/bin/env python
import sys
import os
sys.path.insert(0, os.environ["QUEX_PATH"])

from quex.engine.misc.interval_handling import Interval, NumberSet
from quex.constants import INTEGER_MAX

all = NumberSet.from_range(-INTEGER_MAX, INTEGER_MAX)

if "--hwut-info" in sys.argv:
    print "NumberSet: Inverse"
    print "CHOICES: 1, 2, serious;"
    sys.exit(0)

def test(NSet):
    print "# write output in temporary file: 'tmp'"    
    print "# plot with gnuplot:"
    print "# > plot \"tmp\" w l"
    
    print NSet.gnuplot_string(1)
    result = NSet.get_complement(all)
    result.assert_consistency()
    print result.gnuplot_string(0)

if "1" in sys.argv:
    test(NumberSet([Interval(10,20),   Interval(21,30),
                    Interval(50,70),   Interval(71,80),
                    Interval(80,81),   Interval(82,90),
                    Interval(90,100),  Interval(110,130),
                    Interval(150,170), Interval(171,190),
Example #53
def get_Pattern(ValueList):
    return Pattern.from_character_set(
        NumberSet([Interval(ord(x)) for x in ValueList]))
Example #54
class EncodingTrafoUTF16(EncodingTrafoBySplit):
    UnchangedRange = 0x10000
    def __init__(self):
        EncodingTrafoBySplit.__init__(self, "utf16", 
                                         CodeUnitRange=NumberSet.from_range(0, 0x10000))
        self.error_range_code_unit0 = NumberSet([
            Interval(0x0000, 0xDC00), Interval(0xE000, 0x10000)
        ]).get_complement(NumberSet_All())
        self.error_range_code_unit1 = NumberSet([
            Interval(0xDC00, 0xE000)
        ]).get_complement(NumberSet_All())

    def prune(self, number_set):
        global ForbiddenRange
        number_set.subtract(ForbiddenRange)
        number_set.mask(0, 0x110000)

    def get_interval_sequences(self, Orig):
        interval_1word, intervals_2word = _get_contigous_intervals(Orig)

        result = []
        if interval_1word is not None:
            result.append([interval_1word])

        if intervals_2word is not None:
            result.extend(
                _get_trigger_sequence_for_interval(interval)
                for interval in intervals_2word
            )
        return result

    def lexatom_n_per_character(self, CharacterSet):
        """If all characters in a unicode character set state machine require the
        same number of bytes to be represented this number is returned.  Otherwise,
        'None' is returned.

        RETURNS:   N > 0  number of bytes required to represent any character in the 
                          given state machine.
                   None   characters in the state machine require different numbers of
                          bytes.
        """
        assert isinstance(CharacterSet, NumberSet)

        interval_list = CharacterSet.get_intervals(PromiseToTreatWellF=True)
        front = interval_list[0].begin     # First element of number set
        back  = interval_list[-1].end - 1  # Last element of number set
        # Determine the number of code units required to represent the first and
        # the last character of the number set. The number of code units per
        # character increases monotonically, so only the borders need to be considered.
        front_chunk_n = len(unicode_to_utf16(front))
        back_chunk_n  = len(unicode_to_utf16(back))
        if front_chunk_n != back_chunk_n: return None
        else:                             return front_chunk_n

    def _plug_encoding_error_detectors(self, sm):
        """Adorn states with transitions to the 'on_encoding_error' handler if the 
        input value lies beyond the limits. The state machine is an implementation
        of linear sequences of intervals. Thus, the 'code unit position' can be 
        determined by the number of transitions from the init state.

        sm = mini state machine that implements the transition sequences.

        Bad ranges for the two code units (2 byte each):
            1st: 0xDC00 - 0xDFFF
            2nd: 0x0000 - 0xDBFF, 0xE000 - 0x10000
        """
        # 'CodeUnit[0]' appears at the init state
        # (Adapt trigger map before entering the 'on bad lexatom state'
        init_tm = sm.get_init_state().target_map.get_map()
        workset = set(init_tm.iterkeys()) 
        for si, trigger_set in init_tm.iteritems():
            assert not trigger_set.has_intersection(self.error_range_code_unit0)

        bad_lexatom_state_index = self._plug_encoding_error_detector_single_state(sm, init_tm)

        # 'CodeUnit[>0]' appear all at later states
        done = set([bad_lexatom_state_index])
        while workset:
            si = workset.pop()
            tm = sm.states[si].target_map.get_map()
            done.add(si)

            # Only add bad lexatom detection to states that transit on lexatoms
            # (Bad lexatom states, btw. do not have transitions)
            if not tm: continue

            for trigger_set in tm.itervalues():
                assert not trigger_set.has_intersection(self.error_range_code_unit1)

            workset.update(new_si for new_si in tm.iterkeys() if new_si not in done) 
            tm[bad_lexatom_state_index] = self.error_range_code_unit1

    def _plug_encoding_error_detector_single_state(self, sm, target_map):
        bad_lexatom_state_index = sm.access_bad_lexatom_state()
        if target_map: 
            target_map[bad_lexatom_state_index] = self.error_range_code_unit0
        return bad_lexatom_state_index

    def adapt_source_and_drain_range(self, LexatomByteN):
        EncodingTrafoBySplit.adapt_source_and_drain_range(self, LexatomByteN)
        self.error_range_code_unit0.mask_interval(self.lexatom_range)
        self.error_range_code_unit1.mask_interval(self.lexatom_range)
        if LexatomByteN == -1:
            return
        elif LexatomByteN >= 2: 
            return
        else:
            # if there are fewer than 2 bytes for the lexatoms, then only the
            # unicode range from 0x00 to 0xFF can be treated.
            self.source_set.mask(0x00, 0x100)
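lexatom_n_per_character() above only relies on the UTF-16 code-unit count growing monotonically with the code point. A standalone stand-in for the unicode_to_utf16 length (standard UTF-16 rule; the quex helper itself is not shown here):

def utf16_code_unit_n(CodePoint):
    # Standard UTF-16: code points below 0x10000 need one code unit,
    # everything above needs a surrogate pair (two code units).
    if CodePoint < 0x10000: return 1
    else:                   return 2

assert utf16_code_unit_n(ord('A')) == 1
assert utf16_code_unit_n(0x1F600)  == 2     # outside the BMP => surrogate pair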
Example #55
                self.__cursor[k] += 2
                if self.__cursor[k] < 8:
                    break
            else:
                self.__cursor[k] += 1
                if self.__cursor[k] < 3:
                    break
            self.__cursor[k] = 1
            k += 1

        return result


generator = NumberSetGenerator()

all = NumberSet.from_range(-sys.maxint, sys.maxint)

# Generate 100 NumberSets
number_set_list = []
for i in range(100):
    result = generator.get()
    number_set_list.append(generator.get())


def test(N1, Op1, N2, Op2):
    global number_set_list
    the_tester = Tester(N1, Op1, N2, Op2)

    # Permutate all existing intervals against each other
    count_n = 0
    for i, x in enumerate(number_set_list):
Example #56
def get_unicode_range():
    return NumberSet.from_range(0, 0x110000)
Example #57
def create_random_number_set():
    result = NumberSet()
    for begin, end in create_random_interval_list(False):
        result.quick_append_interval(Interval(begin, end))
    result.clean()
    return result
Example #58
def _enter(result, begin, end, target_state_setup):
    entry = result.get(target_state_setup)
    if entry is None:
        result[target_state_setup] = NumberSet.from_range(begin, end)
    else:
        entry.quick_append_interval(Interval(begin, end))