def get_trigger_sequence_for_contigous_byte_range_interval(X, L):
    """Build the per-byte trigger intervals for a contiguous unicode interval.

    The utf8 sequences of the first value (X.begin) and of the last value
    (X.end - 1) of the interval are computed. For each of the first L byte
    positions an 'Interval' is created that reaches from the byte of the
    first sequence to the byte of the last sequence (inclusive, i.e. the
    interval's end is the last byte plus one).

    X -- interval providing '.begin' and '.end'; '.end - 1' is taken as the
         last value of the interval.
    L -- number of byte positions to consider. Both utf8 sequences must
         provide at least L bytes; a contiguous interval is expected to
         produce utf8 sequences of equal length.

    RETURNS: List of L byte intervals, ordered by byte position.
    """
    first_bytes = unicode_to_utf8(X.begin)
    last_bytes = unicode_to_utf8(X.end - 1)

    trigger_sequence = []
    for position in range(L):
        lower = first_bytes[position]
        upper = last_bytes[position] + 1
        trigger_sequence.append(Interval(lower, upper))
    return trigger_sequence
def split_interval_according_to_utf8_byte_sequence_length(X):
    """Split a unicode interval by utf8 byte sequence length.

    NOTE: The borders of 'X' are clamped **in place**: a negative begin is
    set to 0 and an end beyond UTF8_MAX is set to UTF8_MAX + 1.

    RETURNS: None, if the (clamped) interval is empty.
             Otherwise a dictionary that maps

                 utf8 byte sequence length --> sub-interval of X

             where all values of a sub-interval are encoded with the number
             of bytes given by the key.
    """
    if X.begin < 0:
        X.begin = 0
    if X.end > UTF8_MAX:
        X.end = UTF8_MAX + 1

    if X.size() == 0:
        return None

    intervals_by_length = {}
    segment_begin = X.begin
    # Number of bytes required by the last value inside the interval.
    final_length = len(unicode_to_utf8(X.end - 1))

    while True:
        # Number of bytes required by the first value of the current segment.
        length = len(unicode_to_utf8(segment_begin))
        # Border that closes the range of values encoded with 'length' bytes.
        border = UTF8_BORDERS[length - 1]
        if length == final_length:
            break
        intervals_by_length[length] = Interval(segment_begin, border)
        segment_begin = border

    # The last segment ends where the interval ends, not at a length border.
    intervals_by_length[length] = Interval(segment_begin, X.end)
    return intervals_by_length
def get_trigger_sequence_for_contigous_byte_range_interval(X, L):
    """Build the per-byte trigger intervals for a contiguous unicode interval.

    The utf8 sequences of the first value (X.begin) and of the last value
    (X.end - 1) of the interval are computed. For each of the first L byte
    positions an 'Interval' is created that reaches from the byte of the
    first sequence to the byte of the last sequence (inclusive, i.e. the
    interval's end is the last byte plus one).

    X -- interval providing '.begin' and '.end'; '.end - 1' is taken as the
         last value of the interval.
    L -- number of byte positions to consider. Both utf8 sequences must
         provide at least L bytes; a contiguous interval is expected to
         produce utf8 sequences of equal length.

    RETURNS: List of L byte intervals, ordered by byte position.
    """
    first_bytes = unicode_to_utf8(X.begin)
    last_bytes = unicode_to_utf8(X.end - 1)

    trigger_sequence = []
    for position in range(L):
        lower = first_bytes[position]
        upper = last_bytes[position] + 1
        trigger_sequence.append(Interval(lower, upper))
    return trigger_sequence
def homogeneous_chunk_n_per_character(CharacterSet):
    """Determine whether all characters of a set need the same number of
    utf8 bytes.

    Only the smallest and the largest character of the set are inspected.
    Since the number of bytes per character never decreases with growing
    unicode values, the borders decide for the whole set.

    CharacterSet -- a 'NumberSet'; its interval list must not be empty,
                    since the first and the last interval are accessed.

    RETURNS: N > 0  number of bytes required to represent any character
                    of the given set.
             None   if the characters of the set require different numbers
                    of bytes.
    """
    assert isinstance(CharacterSet, NumberSet)

    # NOTE(review): 'PromiseToTreatWellF=True' presumably hands out the
    # set's own interval list -- it is only read here, never modified.
    intervals = CharacterSet.get_intervals(PromiseToTreatWellF=True)
    lowest = intervals[0].begin       # first element of the number set
    highest = intervals[-1].end - 1   # last element of the number set

    lowest_byte_n = len(unicode_to_utf8(lowest))
    highest_byte_n = len(unicode_to_utf8(highest))

    return lowest_byte_n if lowest_byte_n == highest_byte_n else None
def split_interval_according_to_utf8_byte_sequence_length(X):
    """Split a unicode interval by utf8 byte sequence length.

    NOTE: The borders of 'X' are adapted **in place**: a begin equal to
    '-sys.maxint' is replaced by 0 and an end equal to 'sys.maxint' is
    replaced by 0x110000 (presumably these values mark an unbounded
    border -- confirm against 'Interval').

    After that, the end of the interval must not exceed 0x110000.

    RETURNS: Dictionary that maps

                 utf8 byte sequence length --> sub-interval of X

             where all values of a sub-interval are encoded with the number
             of bytes given by the key.
    """
    if X.begin == -sys.maxint:
        X.begin = 0
    if X.end == sys.maxint:
        X.end = 0x110000
    assert X.end <= 0x110000  # Interval must lie in unicode range

    intervals_by_length = {}
    segment_begin = X.begin
    # Number of bytes required by the last value inside the interval.
    final_length = len(unicode_to_utf8(X.end - 1))

    while True:
        # Number of bytes required by the first value of the current segment.
        length = len(unicode_to_utf8(segment_begin))
        # Border that closes the range of values encoded with 'length' bytes.
        border = utf8_border[length - 1]
        if length == final_length:
            break
        intervals_by_length[length] = Interval(segment_begin, border)
        segment_begin = border

    # The last segment ends where the interval ends, not at a length border.
    intervals_by_length[length] = Interval(segment_begin, X.end)
    return intervals_by_length
def split_interval_according_to_utf8_byte_sequence_length(X):
    """Split a unicode interval by utf8 byte sequence length.

    NOTE: The borders of 'X' are clamped **in place**: a negative begin is
    set to 0 and an end beyond UTF8_MAX is set to UTF8_MAX + 1.

    RETURNS: None, if the (clamped) interval is empty.
             Otherwise a dictionary that maps

                 utf8 byte sequence length --> sub-interval of X

             where all values of a sub-interval are encoded with the number
             of bytes given by the key.
    """
    if X.begin < 0:
        X.begin = 0
    if X.end > UTF8_MAX:
        X.end = UTF8_MAX + 1

    if X.size() == 0:
        return None

    intervals_by_length = {}
    segment_begin = X.begin
    # Number of bytes required by the last value inside the interval.
    final_length = len(unicode_to_utf8(X.end - 1))

    while True:
        # Number of bytes required by the first value of the current segment.
        length = len(unicode_to_utf8(segment_begin))
        # Border that closes the range of values encoded with 'length' bytes.
        border = UTF8_BORDERS[length - 1]
        if length == final_length:
            break
        intervals_by_length[length] = Interval(segment_begin, border)
        segment_begin = border

    # The last segment ends where the interval ends, not at a length border.
    intervals_by_length[length] = Interval(segment_begin, X.end)
    return intervals_by_length
def split_interval_into_contigous_byte_sequence_range(X, L):
    """Split a unicode interval into 'contigous' sub-intervals.

    Use the fact that utf8 byte sequences of increasing unicode values
    relate to increasing byte sequence values. Consider the unicode
    interval [0x12345, 0x17653].

           Unicode   UTF8-byte sequence

           012345    F0.92.8D.85
           ...
           01237F    F0.92.8D.BF
           012380    F0.92.8E.80
           ...
           012FFF    F0.92.BF.BF
           013000    F0.93.80.80
           ...
           016FFF    F0.96.BF.BF
           017000    F0.97.80.80
           ...
           01763F    F0.97.98.BF
           017640    F0.97.99.80
           ...
           017653    F0.97.99.93

    The utf8 sequences of the values in the sub-interval [0x12345, 0x1237F]
    only differ with respect to the last byte, but they all trigger to the
    'original target state', so they can be combined into a trigger sequence

           [F0, 92, 8D, [85,BF]]

    Analogously, the values in [0x12380, 0x12FFF] differ only with respect
    to the last two bytes, and the last byte covers its complete range
    [80,BF]. So, they can be combined to

           [F0, 92, [8E,BF], [80,BF]]

    A contigous interval is an interval where such combinations are valid.
    This function splits a given interval into such intervals.

    X -- unicode interval ('.begin', '.end', '.size()'); must not be empty.
    L -- length of the utf8 byte sequences of the values in X.

    REQUIRES: The byte sequences of all values in the given interval
              **must** have the same length L.

    RETURNS: Tuple (interval list, p), where the interval list contains the
             'contigous' sub-intervals in increasing order and 'p' is the
             index of the first byte in which the utf8 sequences of the
             first and the last value of X differ. If X contains only one
             value or L == 1, then '[ X ], 0' is returned.
    """
    # A byte in a utf8 sequence can only have a certain range depending
    # on its position. UTF8 sequences look like the following dependent
    # on their length:
    #
    #       Length:   Byte Masks for each byte
    #
    #       1 byte    0xxxxxxx
    #       2 bytes   110xxxxx 10xxxxxx
    #       3 bytes   1110xxxx 10xxxxxx 10xxxxxx
    #       4 bytes   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    #       5 bytes   ...
    #
    # where 'free' bits are indicated by 'x'.
    # Min. value of a byte = where all 'x' are zero.
    # Max. value of a byte = where all 'x' are 1.
    #
    def min_byte_value(ByteIndex):
        # The range of the lead byte depends on the sequence length L,
        # therefore the table is keyed by L (1..4).
        if ByteIndex == 0:
            return {1: 0x00, 2: 0xC0, 3: 0xE0, 4: 0xF0}[L]
        return 0x80

    def max_byte_value(ByteIndex):
        # See 'min_byte_value()': the table is keyed by the length L.
        if ByteIndex == 0:
            return {1: 0x7F, 2: 0xDF, 3: 0xEF, 4: 0xF7}[L]
        return 0xBF

    def find_first_diff_byte(front_sequence, back_sequence):
        # Find the first byte that is different in the front and back sequence
        for i in range(L - 1):
            if front_sequence[i] != back_sequence[i]:
                return i
        # At least the last byte must be different. That's why it **must** be
        # the one different if no previous byte was it.
        return L - 1

    assert X.size() != 0

    if X.size() == 1:
        return [X], 0
    # If the utf8 sequence consists of one byte, then the range cannot be split.
    if L == 1:
        return [X], 0

    front_sequence = unicode_to_utf8(X.begin)
    back_sequence = unicode_to_utf8(X.end - 1)
    p = find_first_diff_byte(front_sequence, back_sequence)

    result = []
    current_begin = X.begin
    byte_sequence = copy(front_sequence)

    # (1) Moving from the last byte towards 'p': fill each byte behind 'p'
    #     up to its maximum, so that each step closes one sub-interval.
    for q in reversed(range(p + 1, L)):
        # There **must** be at least one overrun, even for 'q=p+1', since
        # 'p+1' indexes the first byte after the first byte that was
        # different. If 'p' indexed the last byte this block is never entered.
        byte_sequence[q] = max_byte_value(q)
        current_end = utf8_to_unicode(byte_sequence) + 1
        result.append(Interval(current_begin, current_end))
        current_begin = current_end

    # (2) The values where byte 'p' lies strictly between the front's and
    #     the back's byte 'p' (if there are any).
    if front_sequence[p] + 1 != back_sequence[p]:
        if p == L - 1:
            byte_sequence[p] = back_sequence[p]
        else:
            byte_sequence[p] = back_sequence[p] - 1
        current_end = utf8_to_unicode(byte_sequence) + 1
        result.append(Interval(current_begin, current_end))
        current_begin = current_end

    # (3) Moving from 'p' towards the last byte: approach the back sequence
    #     byte by byte.
    byte_sequence[p] = back_sequence[p]
    for q in range(p + 1, L):
        if back_sequence[q] == min_byte_value(q):
            # Nothing lies below the minimum => no sub-interval to close.
            byte_sequence[q] = back_sequence[q]
        else:
            if q == L - 1:
                byte_sequence[q] = back_sequence[q]
            else:
                byte_sequence[q] = back_sequence[q] - 1
            current_end = utf8_to_unicode(byte_sequence) + 1
            result.append(Interval(current_begin, current_end))
            if current_begin == X.end:
                break
            current_begin = current_end
        byte_sequence[q] = back_sequence[q]

    # Whatever has not been covered yet, forms the last sub-interval.
    if current_begin != X.end:
        result.append(Interval(current_begin, X.end))

    return result, p
def unicode_interval_to_utf8_intervals(X):
    """Translate a unicode interval into one byte interval per utf8 byte
    position.

    The utf8 sequences of the first value (X.begin) and of the last value
    (X.end - 1) are computed. The byte interval at position 'i' reaches from
    byte 'i' of the first sequence to byte 'i' of the last sequence
    (inclusive, i.e. the interval's end is the last byte plus one).

    X -- interval providing '.begin' and '.end'; '.end - 1' is taken as the
         last value of the interval.

    REQUIRES: The utf8 sequences of both borders must have the same length.

    RETURNS: List of byte intervals, ordered by byte position.
    """
    front_list = unicode_to_utf8(X.begin)
    back_list = unicode_to_utf8(X.end - 1)

    # Pairing sequences of different length is meaningless; it would either
    # drop bytes or pair a byte with 'nothing'.
    assert len(front_list) == len(back_list), \
        "borders of interval differ in utf8 byte sequence length"

    # A real list is returned (not a lazy 'map' object), so that the result
    # can be indexed and iterated repeatedly.
    return [Interval(front, back + 1)
            for front, back in zip(front_list, back_list)]