Пример #1
0
def get_trigger_sequence_for_contigous_byte_range_interval(X, L):
    """Build the byte-wise trigger sequence for the interval 'X'.

       The first value of the interval (X.begin) and its last value
       (X.end - 1) are converted by 'unicode_to_utf8()'. For every byte
       position 0 .. L-1 one Interval is created that reaches from the byte
       of the first value up to and including the byte of the last value.

       NOTE(review): presumably both sequences must have the length L, i.e.
                     'X' must be contigous -- this is not checked here.

       RETURNS: List of L Interval objects, one for each byte position.
    """
    first_bytes = unicode_to_utf8(X.begin)
    last_bytes  = unicode_to_utf8(X.end - 1)

    trigger_sequence = []
    for position in range(L):
        # '+ 1' because the second argument is used as the position right
        # behind the last byte value (same convention as 'X.end' above).
        byte_range = Interval(first_bytes[position], last_bytes[position] + 1)
        trigger_sequence.append(byte_range)
    return trigger_sequence
Пример #2
0
def split_interval_according_to_utf8_byte_sequence_length(X):
    """Cut the unicode interval 'X' into pieces whose values all have the
       same utf8 byte sequence length.

       'X' is clipped IN PLACE before splitting: a negative 'begin' is set
       to 0 and an 'end' greater than UTF8_MAX is set to UTF8_MAX + 1.

       RETURNS: None, if the size of the (clipped) interval is zero.
                Otherwise, a dictionary that maps

                    byte sequence length --> Interval of that length
    """
    if X.begin < 0:
        X.begin = 0
    if X.end > UTF8_MAX:
        X.end = UTF8_MAX + 1

    if X.size() == 0:
        return None

    # Byte sequence length of the last value that lies inside the interval.
    length_of_last = len(unicode_to_utf8(X.end - 1))

    result = {}
    begin  = X.begin
    while True:
        # Byte sequence length of the first value not yet covered.
        length = len(unicode_to_utf8(begin))
        # The border is used as the end of the piece for 'length' and as
        # the begin of the piece that follows.
        border = UTF8_BORDERS[length - 1]
        if length != length_of_last:
            result[length] = Interval(begin, border)
            begin = border
            continue
        # Last piece: it ends where the given interval ends.
        result[length] = Interval(begin, X.end)
        return result
Пример #3
0
def get_trigger_sequence_for_contigous_byte_range_interval(X, L):
    """Return one byte Interval per utf8 byte position for the interval 'X'.

    The byte sequences of the first value (X.begin) and of the last value
    (X.end - 1) are obtained from 'unicode_to_utf8()'. At each of the byte
    positions 0 .. L-1 the resulting Interval starts at the byte of the
    first value and includes the byte of the last value.

    NOTE(review): presumably 'X' is expected to be contigous, so that both
                  sequences have L bytes -- this function does not verify it.
    """
    lower_sequence = unicode_to_utf8(X.begin)
    upper_sequence = unicode_to_utf8(X.end - 1)

    def byte_interval(index):
        # The upper byte itself belongs to the range, hence '+ 1'.
        return Interval(lower_sequence[index], upper_sequence[index] + 1)

    return [byte_interval(index) for index in range(L)]
Пример #4
0
def homogeneous_chunk_n_per_character(CharacterSet):
    """Determine whether every character of 'CharacterSet' is represented
    by the same number of utf8 bytes.

    Only the first value of the first interval and the last value of the
    last interval are converted. The original author's reasoning: the
    number of bytes per character increases monotonously, so looking at the
    borders is sufficient.

    NOTE(review): presumably 'get_intervals()' delivers the intervals in
                  ascending order and the set is not empty -- confirm.

    RETURNS:   N > 0  number of bytes that both border characters require.
               None   the border characters require different numbers of
                      bytes.
    """
    assert isinstance(CharacterSet, NumberSet)

    intervals = CharacterSet.get_intervals(PromiseToTreatWellF=True)
    lowest    = intervals[0].begin      # first element of the number set
    highest   = intervals[-1].end - 1   # last element of the number set

    chunk_n_of_lowest  = len(unicode_to_utf8(lowest))
    chunk_n_of_highest = len(unicode_to_utf8(highest))

    if chunk_n_of_lowest == chunk_n_of_highest:
        return chunk_n_of_lowest
    return None
Пример #5
0
def homogeneous_chunk_n_per_character(CharacterSet):
    """Tell how many utf8 bytes each character of 'CharacterSet' occupies,
    provided that this number is the same for all of them.

    The check inspects the two borders of the number set only: the begin of
    the first interval and the last value of the last interval. According to
    the original author, the number of bytes per character increases
    monotonously, which is why the borders are sufficient.

    NOTE(review): presumably the interval list is sorted and not empty --
                  verify against 'NumberSet.get_intervals()'.

    RETURNS:   N > 0  byte count shared by both border characters.
               None   the border characters differ in their byte count.
    """
    assert isinstance(CharacterSet, NumberSet)

    interval_seq = CharacterSet.get_intervals(PromiseToTreatWellF=True)
    first_value = interval_seq[0].begin
    last_value = interval_seq[-1].end - 1  # 'end' is one past the last value

    byte_n_first = len(unicode_to_utf8(first_value))
    byte_n_last = len(unicode_to_utf8(last_value))

    return byte_n_first if byte_n_first == byte_n_last else None
Пример #6
0
def split_interval_according_to_utf8_byte_sequence_length(X):
    """Cut the unicode interval 'X' into pieces whose values all have the
       same utf8 byte sequence length.

       'X' is adapted IN PLACE before splitting: a 'begin' of -sys.maxint
       becomes 0 and an 'end' of sys.maxint becomes 0x110000. Afterwards,
       'end' must not exceed 0x110000 (asserted).

       RETURNS: Dictionary that maps

                    byte sequence length --> Interval of that length
    """
    if X.begin == -sys.maxint:
        X.begin = 0
    if X.end == sys.maxint:
        X.end = 0x110000
    assert X.end <= 0x110000  # Interval must lie in unicode range

    # Byte sequence length of the last value that lies inside the interval.
    length_of_last = len(unicode_to_utf8(X.end - 1))

    result = {}
    begin  = X.begin
    while True:
        # Byte sequence length of the first value not yet covered.
        length = len(unicode_to_utf8(begin))
        # The border is used as the end of the piece for 'length' and as
        # the begin of the piece that follows.
        border = utf8_border[length - 1]
        if length != length_of_last:
            result[length] = Interval(begin, border)
            begin = border
            continue
        # Last piece: it ends where the given interval ends.
        result[length] = Interval(begin, X.end)
        return result
Пример #7
0
def split_interval_according_to_utf8_byte_sequence_length(X):
    """Partition the unicode interval 'X' by utf8 byte sequence length.

       Before partitioning, 'X' itself is modified: 'begin' is raised to 0
       if it is negative, and 'end' is lowered to UTF8_MAX + 1 if it lies
       above UTF8_MAX.

       RETURNS: None        if 'X.size()' is zero after the adaption.
                Dictionary  { L: Interval } where all values of an Interval
                            are represented by L bytes.
    """
    if X.begin < 0:
        X.begin = 0
    if X.end > UTF8_MAX:
        X.end = UTF8_MAX + 1

    if X.size() == 0:
        return None

    # Number of bytes that the last value inside the interval requires.
    final_byte_n = len(unicode_to_utf8(X.end - 1))

    by_byte_n = {}
    piece_begin = X.begin
    while True:
        byte_n = len(unicode_to_utf8(piece_begin))
        # Looked up on every pass, also on the last one where only the
        # end of 'X' is used.
        piece_end = UTF8_BORDERS[byte_n - 1]
        if byte_n != final_byte_n:
            by_byte_n[byte_n] = Interval(piece_begin, piece_end)
            piece_begin = piece_end
        else:
            # The last value's byte count is reached: close with 'X.end'.
            by_byte_n[byte_n] = Interval(piece_begin, X.end)
            break

    return by_byte_n
Пример #8
0
def split_interval_into_contigous_byte_sequence_range(X, L):
    """Use the fact that utf8 byte sequences of increasing unicode values relate
       to increasing byte sequence values. Consider the unicode interval [0x12345,
       0x17653].

                    Unicode   UTF8-byte sequence

                    012345    F0.92.8D.85
                              ...
                    01237F    F0.92.8D.BF
                    012380    F0.92.8E.80
                              ...
                    012FFF    F0.92.BF.BF
                    013000    F0.93.80.80
                              ...
                    016FFF    F0.96.BF.BF
                    017000    F0.97.80.80
                              ...
                    01763F    F0.97.98.BF
                    017640    F0.97.99.80
                              ...
                    017653    F0.97.99.93


       The utf8 sequences of the values in the sub-interval [0x12345, 0x1237F]
       only differ with respect to the last byte, but they all trigger to the
       'original target state', so they can be combined into a trigger sequence

                 [F0, 92, 8D, [85,BF]]

       Analogously, the values in [0x12380, 0x12FFF] differ only with respect
       to the last two bytes. All of them trigger to the original target
       state, thus they can be combined to

                 [F0, 92, [8E,BF], [80,BF]]

       A contigous interval is an interval where such combinations are valid.
       This function splits a given interval into such intervals.

       REQUIRES: The byte sequences in the given interval **must** all have
                 the same length L. 'X' must not be empty (asserted).

       RETURNS: Tuple (list, p), where 'list' contains the 'contigous'
                intervals and 'p' is the index of the first byte in which
                the sequences of the first and the last value differ.
                If 'X' contains a single value, or if L == 1, then the
                result is ([X], 0).
    """
    # A byte in a utf8 sequence can only have a certain range depending
    # on its position. UTF8 sequences look like the following dependent
    # on their length:
    #
    #       Length:   Byte Masks for each byte
    #
    #       1 byte    0xxxxxxx
    #       2 bytes   110xxxxx 10xxxxxx
    #       3 bytes   1110xxxx 10xxxxxx 10xxxxxx
    #       4 bytes   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    #       5 bytes   ...
    #
    # where 'free' bits are indicated by 'x'.
    # Min. value of a byte = where all 'x' are zero.
    # Max. value of a byte = where all 'x' are 1.
    #
    # The tables for the lead byte are keyed by 'L - 1' (0 => 1 byte,
    # ..., 3 => 4 bytes). Inside this function both helpers are only called
    # for byte indices > 0, so the lead byte branch is not taken here.
    def min_byte_value(ByteIndex):
        if ByteIndex == 0:
            return { 0: 0x00, 1: 0xC0, 2: 0xE0, 3: 0xF0 }[L - 1]
        return 0x80

    def max_byte_value(ByteIndex):
        if ByteIndex == 0:
            return { 0: 0x7F, 1: 0xDF, 2: 0xEF, 3: 0xF7 }[L - 1]
        return 0xBF

    def find_first_diff_byte(front_sequence, back_sequence):
        # Find the first byte that is different in the front and back sequence
        for i in range(L-1):
            if front_sequence[i] != back_sequence[i]: return i
        # At least the last byte must be different. That's why it **must** be the
        # one different if no previous byte was it.
        return L - 1

    assert X.size() != 0

    if X.size() == 1: return [ X ], 0
    # If the utf8 sequence consist of one byte, then the range cannot be split.
    if L == 1: return [ X ], 0

    front_sequence = unicode_to_utf8(X.begin)
    back_sequence  = unicode_to_utf8(X.end - 1)
    p      = find_first_diff_byte(front_sequence, back_sequence)
    result = []
    current_begin = X.begin
    # Working copy; it is modified byte by byte to compute interval borders.
    byte_sequence = copy(front_sequence)

    # (1) Ascending part: starting at the last byte and walking towards 'p',
    #     fill up each byte behind 'p' to its maximum. Each step closes one
    #     interval in which only the bytes from 'q' on vary.
    for q in reversed(range(p + 1, L)):
        # There **must** be at least one overrun, even for 'q=p+1', since 'p+1'
        # indexes the first byte after the first byte that was different. If 'p'
        # indexed that last byte this block is never entered.
        byte_sequence[q] = max_byte_value(q)
        current_end      = utf8_to_unicode(byte_sequence) + 1
        result.append(Interval(current_begin, current_end))
        current_begin    = current_end

    # (2) Middle part: the values of byte 'p' that lie between the front's
    #     and the back's value. If 'p' is the last byte, the back's value
    #     itself is included.
    if front_sequence[p] + 1 != back_sequence[p]:
        if p == L - 1: byte_sequence[p] = back_sequence[p]
        else:          byte_sequence[p] = back_sequence[p] - 1
        current_end      = utf8_to_unicode(byte_sequence) + 1
        result.append(Interval(current_begin, current_end))
        current_begin    = current_end

    # (3) Descending part: byte 'p' now has the back's value; walk from
    #     'p + 1' to the last byte and approach the back sequence.
    byte_sequence[p] = back_sequence[p]
    for q in range(p + 1, L):
        if back_sequence[q] == min_byte_value(q):
            # Nothing lies below the minimum: no interval for this byte.
            byte_sequence[q] = back_sequence[q]
        else:
            if q == L - 1: byte_sequence[q] = back_sequence[q]
            else:          byte_sequence[q] = back_sequence[q] - 1
            current_end      = utf8_to_unicode(byte_sequence) + 1
            result.append(Interval(current_begin, current_end))
            # NOTE(review): this tests the begin of the interval that has
            #               just been appended, not its end -- kept as it was.
            if current_begin == X.end: break
            current_begin    = current_end
            byte_sequence[q] = back_sequence[q]

    # Whatever has not been covered yet reaches up to the end of 'X'.
    if current_begin != X.end:
        result.append(Interval(current_begin, X.end))

    return result, p
Пример #9
0
def unicode_interval_to_utf8_intervals(X):
    """Convert the unicode interval 'X' into a list of byte intervals.

       The first value (X.begin) and the last value (X.end - 1) of 'X' are
       converted by 'unicode_to_utf8()'. The bytes of both sequences are
       paired position by position, and each pair is turned into an
       Interval that starts at the byte of the first value and includes the
       byte of the last value.

       If the two sequences differ in length, only the positions that exist
       in both are paired.

       RETURNS: List of Interval objects, one per paired byte position.
    """
    front_list = unicode_to_utf8(X.begin)
    back_list  = unicode_to_utf8(X.end - 1)
    # A list comprehension always delivers a list; 'map()' only delivers an
    # iterator where it is implemented lazily.
    return [ Interval(front, back + 1) for front, back in zip(front_list, back_list) ]
Пример #10
0
def get_trigger_sequence_for_contigous_byte_range_interval(X, L):
    """Translate the interval 'X' into a sequence of L byte intervals.

       'unicode_to_utf8()' delivers the byte sequence of the first value
       (X.begin) and of the last value (X.end - 1). The Interval at byte
       position 'k' starts at byte 'k' of the first value and includes
       byte 'k' of the last value.

       NOTE(review): presumably 'X' has to be contigous, so that both byte
                     sequences consist of L bytes -- not verified here.

       RETURNS: List with one Interval for each byte position 0 .. L-1.
    """
    begin_bytes = unicode_to_utf8(X.begin)
    end_bytes   = unicode_to_utf8(X.end - 1)

    sequence = []
    k = 0
    while k < L:
        # The byte of the last value is part of the range, hence '+ 1'.
        sequence.append(Interval(begin_bytes[k], end_bytes[k] + 1))
        k += 1
    return sequence