Пример #1
0
def split_contigous_intervals_for_surrogates(Begin, End):
    """Cut the code point range [Begin, End) at lead-surrogate domain borders.

    'End' is treated as exclusive: the last code point considered is End - 1.
    The range is expected to satisfy Begin >= 0x10000 and End <= 0x110000.

    The result is a list of one, two or three Interval objects:

      * one interval (Begin, End) if Begin and End - 1 yield the same first
        element from 'unicode_to_utf16()';
      * otherwise a leading interval from Begin up to the end of the first
        lead domain, optionally a middle interval covering all lead values
        strictly between the first and the last one, and a trailing
        interval that reaches up to End.

    NOTE(review): 'unicode_to_utf16()' and 'utf16_to_unicode()' are defined
    elsewhere; presumably they convert between a code point and its list of
    UTF-16 code units -- confirm against their definitions.
    """
    assert Begin >= 0x10000
    assert End   <= 0x110000

    def domain_end(lead):
        # One past the code point that pairs 'lead' with the trail value 0xDFFF.
        return utf16_to_unicode([lead, 0xDFFF]) + 1

    first_units = unicode_to_utf16(Begin)
    last_units  = unicode_to_utf16(End - 1)
    first_lead  = first_units[0]
    last_lead   = last_units[0]

    # Both ends share the same first code unit: nothing to split.
    if first_lead == last_lead:
        return [Interval(Begin, End)]

    # (1) From Begin to the end of the first lead domain.
    cut = domain_end(first_lead)
    # Guaranteed by the differing first code units checked above.
    assert End > cut
    pieces = [Interval(Begin, cut)]

    # (2) Lead values strictly between the first and the last one, each
    #     paired with the trail values up to 0xDFFF.
    if first_lead + 1 != last_lead:
        middle_stop = domain_end(last_lead - 1)
        pieces.append(Interval(cut, middle_stop))
        cut = middle_stop

    # (3) From the start of the last lead domain up to End.
    pieces.append(Interval(cut, End))

    return pieces
Пример #2
0
def get_trigger_sequence_for_interval(X):
    """Return the sequence of intervals that represents 'X' per code unit.

    'X' is an interval with attributes 'begin' and 'end', where 'end' is
    exclusive (the last code point covered is X.end - 1). It must not
    straddle 0x10000: either X.begin >= 0x10000 or X.end <= 0x10000.

    Returns:
        [X] unchanged if the interval lies entirely below 0x10000;
        otherwise a list of two Interval objects, the first spanning the
        first elements and the second spanning the second elements of the
        sequences that 'unicode_to_utf16()' yields for X.begin and
        X.end - 1.

    NOTE(review): 'unicode_to_utf16()' is defined elsewhere; presumably it
    returns the [lead, trail] surrogate pair for code points >= 0x10000 --
    confirm against its definition.
    """
    # The interval either lies entirely >= 0x10000 or entirely < 0x10000.
    # Since X.end is exclusive, an interval that ends exactly at 0x10000
    # still lies entirely below 0x10000 and must be accepted.
    assert X.begin >= 0x10000 or X.end <= 0x10000

    # An interval entirely below 0x10000 remains the same.
    if X.end <= 0x10000:
        return [X]

    # An interval >= 0x10000 is split up into one interval per code unit.
    front_seq = unicode_to_utf16(X.begin)
    back_seq  = unicode_to_utf16(X.end - 1)

    return [Interval(front_seq[0], back_seq[0] + 1),
            Interval(front_seq[1], back_seq[1] + 1)]