def split_contigous_intervals_for_surrogates(Begin, End):
    """Cut [Begin, End) at the borders where the first UTF-16 word changes.

    Both limits are unicode code points in the two-word range; 'End' is
    exclusive. The result is a list of one, two, or three 'Interval'
    objects that together cover [Begin, End):

       -- one interval, if the first and the last character of the range
          share the same first word.
       -- otherwise:
          (1) [Begin, end of the domain of Begin's first word)
          (2) optionally, the complete domains of all first words strictly
              between the first and the last one.
          (3) [begin of the domain of the last first word, End)

    'unicode_to_utf16', 'utf16_to_unicode', and 'Interval' are defined
    outside this block.
    """
    assert Begin >= 0x10000
    assert End <= 0x110000

    # Only the first word of each sequence matters for the split decision.
    first_lead = unicode_to_utf16(Begin)[0]
    last_lead = unicode_to_utf16(End - 1)[0]

    if first_lead == last_lead:
        # Whole range lies inside a single first-word domain.
        return [Interval(Begin, End)]

    # (1) One past the last code point reachable with the first lead word,
    #     i.e. second word == 0xDFFF.
    head_end = utf16_to_unicode([first_lead, 0xDFFF]) + 1

    # Must hold, because the lead words of front and back differ.
    assert End > head_end
    pieces = [Interval(Begin, head_end)]

    tail_begin = head_end
    if first_lead + 1 != last_lead:
        # (2) At least one complete first-word domain lies in between.
        tail_begin = utf16_to_unicode([last_lead - 1, 0xDFFF]) + 1
        pieces.append(Interval(head_end, tail_begin))

    # (3) Remainder inside the domain of the last lead word.
    pieces.append(Interval(tail_begin, End))
    return pieces
def split_contigous_intervals_for_surrogates(Begin, End):
    """Cut [Begin, End) at the borders where the first UTF-16 word changes.

    Both limits are unicode code points in the two-word range; 'End' is
    exclusive and must be greater than 'Begin'. The result is a list of
    'Interval' objects that together cover [Begin, End), none of which
    mixes a partial second-word range with a change of the first word.

    RETURNS: list of 'Interval'

       -- a single interval, if first and last character share the same
          first word (the second word then runs over one contiguous range).
       -- otherwise up to three intervals:
          (1) [Begin, end of the domain of Begin's first word)
          (2) optionally, the complete domains of all first words strictly
              between the first and the last one.
          (3) [begin of the domain of the last first word, End)

    'ForbiddenRange.end - 1' is used as the highest possible second word
    (0xDFFF according to the original comments; 'ForbiddenRange' is defined
    outside this block and only read here).
    """
    assert Begin >= 0x10000
    assert End <= 0x110000
    assert End > Begin

    # Only the first word of each sequence matters for the split decision.
    lead_front = unicode_to_utf16(Begin)[0]
    lead_back = unicode_to_utf16(End - 1)[0]

    # (*) Same first word => the interval is contiguous as it is.
    if lead_front == lead_back:
        return [Interval(Begin, End)]

    # (*) Different first words => split along first-word domains.
    last_trail = ForbiddenRange.end - 1

    # (1) 'Begin' until the second word would run past its last value.
    cut = utf16_to_unicode([lead_front, last_trail]) + 1
    # Must hold, because the first words of front and back differ.
    assert End > cut
    pieces = [Interval(Begin, cut)]

    if lead_front + 1 != lead_back:
        # (2) Complete first-word domains between front and back.
        mid_cut = utf16_to_unicode([lead_back - 1, last_trail]) + 1
        # Must hold, because at least one whole domain lies in between.
        assert mid_cut > cut
        pieces.append(Interval(cut, mid_cut))
        cut = mid_cut

    # (3) Begin of the last first-word domain until 'End'.
    if End > cut:
        pieces.append(Interval(cut, End))

    return pieces
def get_trigger_sequence_for_interval(X):
    """Translate a unicode interval into a sequence of UTF-16 word intervals.

    'X' is a half-open interval [X.begin, X.end) of unicode code points.
    It must lie entirely in the one-word range (X.end <= 0x10000) or
    entirely in the two-word range (X.begin >= 0x10000).

    RETURNS: list of intervals, one per UTF-16 word.

       [ X ]            if X lies in the one-word range; X itself is
                        returned, not a copy.
       [ first, second] if X lies in the two-word range: the interval of
                        the first words and the interval of the second
                        words, each spanned by the words of X.begin and
                        X.end - 1.

    NOTE(review): In the two-word case the result only describes X exactly
    if X does not cross a first-word border with a partial second-word
    range; presumably callers split with
    'split_contigous_intervals_for_surrogates' beforehand -- confirm.
    """
    # 'X.end' is exclusive. Thus, an interval ending exactly at 0x10000 has
    # 0xFFFF as its last element and still lies entirely in the one-word
    # range. (Comparing with '<' wrongly rejected such intervals.)
    assert X.begin >= 0x10000 or X.end <= 0x10000

    # An interval in the one-word range remains the same.
    if X.end <= 0x10000:
        return [X]

    # An interval in the two-word range is described by one interval per
    # word, spanned by the sequences of its first and its last element.
    front_seq = unicode_to_utf16(X.begin)
    back_seq = unicode_to_utf16(X.end - 1)

    return [
        Interval(front_seq[0], back_seq[0] + 1),
        Interval(front_seq[1], back_seq[1] + 1),
    ]
def homogeneous_chunk_n_per_character(CharacterSet):
    """Tell whether all characters of a set need the same number of chunks.

    The chunk count of a character is the length of the sequence returned
    by 'unicode_to_utf16' (defined outside this block) for that character.
    Only the first element of the first interval and the last element of
    the last interval are inspected.

    NOTE(review): This relies on the interval list being sorted in
    ascending order and on the chunk count never decreasing with the code
    point value -- confirm against 'NumberSet.get_intervals()'.

    RETURNS: N > 0  number of chunks required to represent any character
                    of the given set.
             None   the characters of the set require different numbers
                    of chunks.
    """
    assert isinstance(CharacterSet, NumberSet)

    intervals = CharacterSet.get_intervals(PromiseToTreatWellF=True)
    lowest = intervals[0].begin       # first element of the number set
    highest = intervals[-1].end - 1   # last element ('end' is exclusive)

    # Only the borders are considered; see the note in the docstring.
    chunk_n_low = len(unicode_to_utf16(lowest))
    chunk_n_high = len(unicode_to_utf16(highest))

    return chunk_n_low if chunk_n_low == chunk_n_high else None