def split_contigous_intervals_for_surrogates(Begin, End):
    """Cut the code point range [Begin, End) at first-surrogate borders.

    The range must lie in the region that UTF-16 encodes as two code
    units (Begin >= 0x10000, End <= 0x110000; End is exclusive).  The
    result is a list of one or three-at-most Interval objects which,
    taken in order, cover [Begin, End) without gaps:

      * one interval, if the first and the last code point of the range
        share the same first surrogate;
      * otherwise a 'head' (Begin up to the end of Begin's first-surrogate
        domain), an optional 'middle' (all first-surrogate domains that
        are covered completely), and a 'tail' (start of the last
        first-surrogate domain up to End).

    No loop is needed: all fully covered domains are contiguous and are
    therefore returned as a single 'middle' interval.
    """
    assert Begin >= 0x10000
    assert End <= 0x110000

    # Code unit sequences of the first and the last code point in range.
    seq_of_first = unicode_to_utf16(Begin)
    seq_of_last = unicode_to_utf16(End - 1)
    lead_first = seq_of_first[0]
    lead_last = seq_of_last[0]

    # Same first surrogate on both ends => nothing to split.
    if lead_first == lead_last:
        return [Interval(Begin, End)]

    # (1) Head: from Begin until the second surrogate passes 0xDFFF.
    head_end = utf16_to_unicode([lead_first, 0xDFFF]) + 1

    # The first surrogates differ, so the range must extend beyond the
    # domain of the first one.
    assert End > head_end

    pieces = [Interval(Begin, head_end)]
    tail_begin = head_end

    # (2) Middle: the first surrogate increases while the second surrogate
    #     runs over the complete range [0xDC00, 0xDFFF].  Present only if
    #     there is at least one domain strictly between the two ends.
    if lead_first + 1 != lead_last:
        tail_begin = utf16_to_unicode([lead_last - 1, 0xDFFF]) + 1
        pieces.append(Interval(head_end, tail_begin))

    # (3) Tail: from the begin of the last first-surrogate domain to End.
    pieces.append(Interval(tail_begin, End))
    return pieces
def get_trigger_sequence_for_interval(X):
    """Return the sequence of code unit intervals that corresponds to X.

    X is a half-open interval [X.begin, X.end) of code points which lies
    either entirely below 0x10000 or entirely at/above 0x10000.

    RETURNS:
        [X]            if X lies below 0x10000 (X itself is returned,
                       not a copy).
        [First, Second] otherwise, where 'First' spans the first code
                       units and 'Second' spans the second code units of
                       the UTF-16 sequences of X's first and last code
                       point.

    NOTE(review): the two-interval form describes a 'rectangle' of first
    and second code units.  It appears to be exact only for intervals that
    do not run over a first-surrogate border in a partial manner, such as
    those produced by split_contigous_intervals_for_surrogates() -- confirm
    against the callers.
    """
    # X.end is exclusive (see the 'X.end - 1' below).  Thus, an interval
    # that ends exactly at 0x10000 still lies entirely below 0x10000 and
    # must be accepted here.
    assert X.begin >= 0x10000 or X.end <= 0x10000

    # An interval below 0x10000 remains the same.
    if X.end <= 0x10000:
        return [X]

    # An interval >= 0x10000 is split up into one interval for each of
    # the two code units.
    front_seq = unicode_to_utf16(X.begin)
    back_seq = unicode_to_utf16(X.end - 1)  # last code point inside X

    return [
        Interval(front_seq[0], back_seq[0] + 1),
        Interval(front_seq[1], back_seq[1] + 1),
    ]