Example #1
import tokenizer  # required by all the examples below (Miðeind Tokenizer package)
def test_composite_phrases() -> None:
    s = "Orða- og tengingasetning."
    #    0123456789012345678901234
    #    ^   ^^  ^               ^
    #      x
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 4, 5, 8, 24]
    assert byte_indexes == [0, 5, 6, 9, 25]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 4, 5, 8, 24, 25]
    assert byte_indexes == [0, 5, 6, 9, 25, 26]

    # The whole phrase gets merged into a single token by tokenize().
    s = "Orða- og tengingasetning."
    #    0123456789012345678901234
    #    ^                       ^
    #      x
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 24]
    assert byte_indexes == [0, 25]
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 24, 25]
    assert byte_indexes == [0, 25, 26]
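
# A minimal sketch of how the char indexes can be used, assuming (as the
# assertions above suggest) that with last_is_end=True the list also carries
# the end offset of the text: consecutive index pairs then slice the original
# string into the exact spans, including leading whitespace, that each token
# was read from. The helper name below is illustrative, not a tokenizer API.
from typing import List

def spans_from_char_indexes(s: str, char_indexes: List[int]) -> List[str]:
    # char_indexes is assumed to come from calculate_indexes(toks, last_is_end=True)
    return [s[i:j] for i, j in zip(char_indexes, char_indexes[1:])]

# For s = "Orða- og tengingasetning." and char_indexes == [0, 24, 25] this
# yields ["Orða- og tengingasetning", "."]; joining the spans reconstructs s.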
Example #2
def test_paragraph_markers() -> None:
    s = "[[Stutt setning.]][[]][[Önnur setning.]]"
    #    012345678901234567890123456789012345678901234567
    #    ^^^    ^       ^^ ^ ^ ^ ^    ^       ^^
    #                                   x
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 29, 37, 38]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 30, 38, 39]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 29, 37, 38, 40]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 30, 38, 39, 41]

    # The tokenize function transforms paragraph markers. Test that the
    # indexes are still calculated correctly after that step.
    # Note that the dropped empty paragraph markers ("[[]]") yield no indexes.
    s = "[[Stutt setning.]][[]][[Önnur setning.]]"
    #    012345678901234567890123456789012345678901234567
    #    ^ ^    ^       ^^ ^     ^    ^       ^^
    #                                   x
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 24, 29, 37, 38]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 24, 30, 38, 39]
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 24, 29, 37, 38, 40]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 24, 30, 38, 39, 41]
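
# A minimal sketch of the relationship implied by the assertions above: each
# byte index equals the UTF-8 length of the character prefix that ends at the
# corresponding char index. The helper below is illustrative only.
def check_char_byte_correspondence(s, char_indexes, byte_indexes) -> None:
    for ci, bi in zip(char_indexes, byte_indexes):
        assert len(s[:ci].encode("utf-8")) == bi

# E.g. for s = "[[Stutt setning.]][[]][[Önnur setning.]]", char index 29 maps
# to byte index 30 because the prefix contains the two-byte letter "Ö".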
Example #3
def test_converted_measurements() -> None:
    s = "Stillið ofninn á 12° C til að baka kökuna."
    #    012345678901234567890123456789012345678901
    #    ^      ^      ^ ^     ^   ^  ^    ^      ^
    #          x        x   x        x       x
    toks = tokenizer.tokenize(s, convert_measurements=True)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 7, 14, 16, 22, 26, 29, 34, 41]
    assert byte_indexes == [0, 8, 15, 18, 25, 29, 33, 38, 46]
    toks = tokenizer.tokenize(s, convert_measurements=True)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 7, 14, 16, 22, 26, 29, 34, 41, 42]
    assert byte_indexes == [0, 8, 15, 18, 25, 29, 33, 38, 46, 47]
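
# A minimal sketch, assuming (consistently with the values asserted above) that
# the indexes refer to positions in the original input even when
# convert_measurements rewrites the token text: the byte indexes can then be
# used to slice the UTF-8 encoding of the input directly. Illustrative only.
def byte_spans(s, byte_indexes):
    # byte_indexes is assumed to come from calculate_indexes(toks, last_is_end=True)
    data = s.encode("utf-8")
    return [data[i:j] for i, j in zip(byte_indexes, byte_indexes[1:])]

# byte_spans("Stillið ofninn á 12° C til að baka kökuna.",
#            [0, 8, 15, 18, 25, 29, 33, 38, 46, 47])[0].decode("utf-8") == "Stillið"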
Example #4
def test_lengthening_substitutions() -> None:
    s = "Þetta er 3ji báturinn!"
    #    0123456789012345678901
    #    ^    ^  ^   ^        ^
    #    x             x
    #             !             lengthening happens here (3ji->þriðji)
    toks = tokenizer.parse_tokens(
        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 5, 8, 12, 21]
    assert byte_indexes == [0, 6, 9, 13, 23]
    toks = tokenizer.parse_tokens(
        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 5, 8, 12, 21, 22]
    assert byte_indexes == [0, 6, 9, 13, 23, 24]
Example #5
def test_larger_case() -> None:
    s = "Þessi setning er í lengra lagi og er með bæði eins og tveggja bæta stafi."
    #    0123456789012345678901234567890123456789012345678901234567890123456789012
    #    ^    ^       ^  ^ ^      ^    ^  ^  ^   ^    ^    ^  ^       ^    ^     ^
    #    x                x                     x  xx                   x
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [
        0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72
    ]
    assert byte_indexes == [
        0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78
    ]
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [
        0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73
    ]
    assert byte_indexes == [
        0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79
    ]
Example #6
def test_iterator_cases() -> None:
    s = [
        "Þessi ", "setning ", "er ", "í ", "lengra ", "lagi ", "og ", "er ",
        "með ", "bæði ", "eins ", "og ", "tveggja ", "bæta ", "stafi."
    ]
    # (char and byte indexes match those in the similar single-string test above)
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [
        0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72
    ]
    assert byte_indexes == [
        0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78
    ]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [
        0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73
    ]
    assert byte_indexes == [
        0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79
    ]

    s = ["Stutt setning.", "", "Önnur setning."]
    #     01234567890123        45678901234567
    #     ^    ^       ^        ^    ^       ^
    #                           x
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 5, 13, 14, 19, 27]
    assert byte_indexes == [0, 5, 13, 14, 20, 28]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 5, 13, 14, 19, 27, 28]
    assert byte_indexes == [0, 5, 13, 14, 20, 28, 29]

    # parse_tokens exposes implementation details for this input; use tokenize instead.
    s = [" Stutt setning. ", "\n \n", "Önnur setning."]
    #     0123456789012345    6 78     90123456789012
    #     ^     ^       ^^                  ^       ^
    #                                  x
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 6, 14, 15, 24, 32]
    assert byte_indexes == [0, 6, 14, 15, 25, 33]
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 6, 14, 15, 24, 32, 33]
    assert byte_indexes == [0, 6, 14, 15, 25, 33, 34]
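
# A minimal sketch for the chunked-input case above, assuming the indexes
# address the concatenation of the chunks: joining the chunks first lets the
# char indexes be used exactly as for a single input string. Illustrative only.
def chunked_spans_demo() -> None:
    full = "".join(["Stutt setning.", "", "Önnur setning."])
    char_indexes = [0, 5, 13, 14, 19, 27, 28]  # as asserted above (last_is_end=True)
    spans = [full[i:j] for i, j in zip(char_indexes, char_indexes[1:])]
    assert spans == ["Stutt", " setning", ".", "Önnur", " setning", "."]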
Example #7
def test_small_easy_cases() -> None:
    s = "Bara ASCII."
    #    01234567890
    #    ^   ^     ^
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 4, 10]
    assert byte_indexes == [0, 4, 10]
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 4, 10, 11]
    assert byte_indexes == [0, 4, 10, 11]

    s = "Á bát."
    # char:
    #    012345
    #    ^^   ^
    # byte:
    # two-byte letters:
    #    x  x
    # indexes:
    #    023467
    #    ^^   ^
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 1, 5]
    assert byte_indexes == [0, 2, 7]
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 1, 5, 6]
    assert byte_indexes == [0, 2, 7, 8]

    s = "endar á ö"
    #    012345678
    #    ^    ^ ^
    #          x x
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 5, 7]
    assert byte_indexes == [0, 5, 8]
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 5, 7, 9]
    assert byte_indexes == [0, 5, 8, 11]
Example #8
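# The constants below are used by this test but are not shown in the excerpt;
# these definitions are an assumption consistent with the byte counts asserted
# further down.
ACCENT = "\u0301"   # combining acute accent, 2 bytes in UTF-8
UMLAUT = "\u0308"   # combining diaeresis, 2 bytes in UTF-8
EM_DASH = "\u2014"  # em dash, 3 bytes in UTF-8
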
def test_small_difficult_cases() -> None:
    s = ""
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == []
    assert byte_indexes == []
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0]
    assert byte_indexes == [0]

    s = " "
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0]
    assert byte_indexes == [0]
    toks = tokenizer.parse_tokens([s])
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks,
                                                             last_is_end=True)
    assert char_indexes == [0, 1]
    assert byte_indexes == [0, 1]

    # Single-byte characters
    for x in ["a", "A", ".", "?", "!"]:
        s = x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0]
        assert byte_indexes == [0]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 1]
        assert byte_indexes == [0, 1]

        s = " " + x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0]
        assert byte_indexes == [0]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 2]
        assert byte_indexes == [0, 2]

        s = "  " + x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0]
        assert byte_indexes == [0]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 3]
        assert byte_indexes == [0, 3]

        s = "  " + x + " "
        # example:
        #   "  a "
        #    0123
        #    ^  ^
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0, 3]
        assert byte_indexes == [0, 3]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 3, 4]
        assert byte_indexes == [0, 3, 4]

        s = " " + x + " " + x
        # example:
        #   " a a"
        #    ^ ^
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0, 2]
        assert byte_indexes == [0, 2]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 2, 4]
        assert byte_indexes == [0, 2, 4]

    # Two-byte characters
    for x in ["þ", "æ", "á"]:
        s = x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0], s
        assert byte_indexes == [0], s
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 1], s
        assert byte_indexes == [0, 2], s

        s = " " + x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0]
        assert byte_indexes == [0]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 2]
        assert byte_indexes == [0, 3]

        s = "  " + x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0]
        assert byte_indexes == [0]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 3]
        assert byte_indexes == [0, 4]

        s = "  " + x + " "
        # example bytes:
        #   "  þ_ "
        #    01234
        #    ^   ^
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0, 3]
        assert byte_indexes == [0, 4]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 3, 4]
        assert byte_indexes == [0, 4, 5]

        s = " " + x + " " + x
        # example bytes:
        #   " þ_ þ_"
        #    012345
        #    ^  ^
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0, 2]
        assert byte_indexes == [0, 3]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 2, 4]
        assert byte_indexes == [0, 3, 6]

    # Two-code-point characters
    # These strings contain two Unicode code points that are rendered as one letter.
    # Python counts them as two characters.
    # In addition, the combining accent and umlaut code points are two bytes each in UTF-8.
    for x in ["a" + ACCENT, "o" + UMLAUT]:
        s = x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0], s
        assert byte_indexes == [0], s
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 2], s
        assert byte_indexes == [0, 3], s

        s = " " + x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0]
        assert byte_indexes == [0]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 3]
        assert byte_indexes == [0, 4]

        s = "  " + x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0]
        assert byte_indexes == [0]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 4]
        assert byte_indexes == [0, 5]

        s = "  " + x + " "
        # example chars:
        #   "  a´ "
        #    01234
        #    ^  ^^
        # example bytes:
        #   "  a´_ "
        #    012345
        #    ^  ^ ^
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0, 4]
        assert byte_indexes == [0, 5]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 4, 5]
        assert byte_indexes == [0, 5, 6]

        s = " " + x + " " + x
        # example chars:
        #   " a´ a´"
        #    012345
        #    ^  ^
        # example bytes:
        #   " a´_ a´_"
        #    01234567
        #    ^   ^
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0, 3]
        assert byte_indexes == [0, 4]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 3, 6]
        assert byte_indexes == [0, 4, 8]

    # The em dash is 3 bytes in UTF-8
    for x in [EM_DASH]:
        s = x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0], s
        assert byte_indexes == [0], s
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 1], s
        assert byte_indexes == [0, 3], s

        s = " " + x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0]
        assert byte_indexes == [0]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 2]
        assert byte_indexes == [0, 4]

        s = "  " + x
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0]
        assert byte_indexes == [0]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 3]
        assert byte_indexes == [0, 5]

        s = "  " + x + " "
        # example chars:
        #   "  a "
        #    0123
        #    ^  ^
        # example bytes:
        #   "  a__ "
        #    012345
        #    ^    ^
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0, 3]
        assert byte_indexes == [0, 5]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 3, 4]
        assert byte_indexes == [0, 5, 6]

        s = " " + x + " " + x
        # example chars:
        #   " a a"
        #    0123
        #    ^ ^
        # example bytes:
        #   " a__ a__"
        #    01234567
        #    ^   ^
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
        assert char_indexes == [0, 2]
        assert byte_indexes == [0, 4]
        toks = tokenizer.parse_tokens([s])
        char_indexes, byte_indexes = tokenizer.calculate_indexes(
            toks, last_is_end=True)
        assert char_indexes == [0, 2, 4]
        assert byte_indexes == [0, 4, 8]
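
# Quick reference for the UTF-8 widths relied on by the byte-index assertions
# above (verifiable in any Python interpreter).
assert len("a".encode("utf-8")) == 1       # ASCII letters and punctuation: 1 byte
assert len("þ".encode("utf-8")) == 2       # Icelandic letters such as þ, æ, á: 2 bytes
assert len("\u0301".encode("utf-8")) == 2  # combining acute accent (ACCENT): 2 bytes
assert len("\u2014".encode("utf-8")) == 3  # em dash (EM_DASH): 3 bytes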