Exemplo n.º 1
0
def test_alignment_complex():
    """Check ``Alignment.from_strings`` when both tokenizations split differently.

    The first sequence keeps "i listened to" as one token while splitting
    "'" / "s" and "podcasts" / "."; the second sequence does the reverse.
    The ``lengths`` and ``data`` arrays are verified in both directions.
    """
    coarse_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."]
    fine_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
    alignment = Alignment.from_strings(coarse_tokens, fine_tokens)

    # First sequence -> second sequence.
    forward = alignment.x2y
    assert list(forward.lengths) == [3, 1, 1, 1, 1, 1]
    assert list(forward.data) == [0, 1, 2, 3, 4, 4, 5, 5]

    # Second sequence -> first sequence.
    backward = alignment.y2x
    assert list(backward.lengths) == [1, 1, 1, 1, 2, 2]
    assert list(backward.data) == [0, 0, 0, 1, 2, 3, 4, 5]
Exemplo n.º 2
0
def test_alignment_case_insensitive():
    """Check that tokens differing only in letter case still align.

    The two sequences spell the same text with different capitalisation
    ("I"/"i", "obama"/"Obama", "podcasts"/"PODCASTS"); the only structural
    difference is "'" + "s" versus "'s".
    """
    mixed_case_tokens = ["I", "listened", "to", "obama", "'", "s", "podcasts", "."]
    other_case_tokens = ["i", "listened", "to", "Obama", "'s", "PODCASTS", "."]
    alignment = Alignment.from_strings(mixed_case_tokens, other_case_tokens)

    # First sequence -> second sequence.
    forward = alignment.x2y
    assert list(forward.lengths) == [1, 1, 1, 1, 1, 1, 1, 1]
    assert list(forward.data) == [0, 1, 2, 3, 4, 4, 5, 6]

    # Second sequence -> first sequence.
    backward = alignment.y2x
    assert list(backward.lengths) == [1, 1, 1, 1, 2, 1, 1]
    assert list(backward.data) == [0, 1, 2, 3, 4, 5, 6, 7]
Exemplo n.º 3
0
def get_spacy_alignment(ref, hyp) -> list:
    """Align the tokenizations of a reference and a hypothesis corpus.

    Items are paired positionally with ``zip``, so pairing stops at the end
    of the shorter input and any surplus items in the longer one are ignored.

    :param ref: iterable of reference items; each item is passed as the first
        argument to ``Alignment.from_strings`` (presumably a list of token
        strings -- confirm against the caller)
    :param hyp: iterable of hypothesis items; each item is passed as the
        second argument to ``Alignment.from_strings``
    :return: a list containing one alignment object per ``(ref, hyp)`` pair
    """
    return [
        Alignment.from_strings(ref_tokens, hyp_tokens)
        for ref_tokens, hyp_tokens in zip(ref, hyp)
    ]
Exemplo n.º 4
0
def test_alignment_spaces(en_vocab):
    """Exercise ``Alignment.from_strings`` on inputs with whitespace tokens.

    Covers leading and trailing whitespace tokens that appear in only one
    sequence, in both sequences with different counts, and in both sequences
    with the same characters tokenized differently. The final cases only
    verify that building the alignment does not raise.

    ``en_vocab`` is requested in the signature but not used in the body.
    """

    def check(
        other_tokens,
        spacy_tokens,
        x2y_lengths=None,
        x2y_data=None,
        y2x_lengths=None,
        y2x_data=None,
    ):
        # Build the alignment, then compare only the expectations that were
        # supplied, always in the order x2y.lengths, x2y.data, y2x.lengths,
        # y2x.data.
        result = Alignment.from_strings(other_tokens, spacy_tokens)
        if x2y_lengths is not None:
            assert list(result.x2y.lengths) == x2y_lengths
        if x2y_data is not None:
            assert list(result.x2y.data) == x2y_data
        if y2x_lengths is not None:
            assert list(result.y2x.lengths) == y2x_lengths
        if y2x_data is not None:
            assert list(result.y2x.data) == y2x_data

    # single leading whitespace
    check(
        [" ", "i listened to", "obama", "'", "s", "podcasts", "."],
        ["i", "listened", "to", "obama", "'s", "podcasts."],
        x2y_lengths=[0, 3, 1, 1, 1, 1, 1],
        x2y_data=[0, 1, 2, 3, 4, 4, 5, 5],
        y2x_lengths=[1, 1, 1, 1, 2, 2],
        y2x_data=[1, 1, 1, 2, 3, 4, 5, 6],
    )

    # multiple leading whitespace tokens
    check(
        [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."],
        ["i", "listened", "to", "obama", "'s", "podcasts."],
        x2y_lengths=[0, 0, 3, 1, 1, 1, 1, 1],
        x2y_data=[0, 1, 2, 3, 4, 4, 5, 5],
        y2x_lengths=[1, 1, 1, 1, 2, 2],
        y2x_data=[2, 2, 2, 3, 4, 5, 6, 7],
    )

    # both with leading whitespace, not identical
    check(
        [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."],
        [" ", "i", "listened", "to", "obama", "'s", "podcasts."],
        x2y_lengths=[1, 0, 3, 1, 1, 1, 1, 1],
        x2y_data=[0, 1, 2, 3, 4, 5, 5, 6, 6],
        y2x_lengths=[1, 1, 1, 1, 1, 2, 2],
        y2x_data=[0, 2, 2, 2, 3, 4, 5, 6, 7],
    )

    # same leading whitespace, different tokenization
    check(
        [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."],
        ["  ", "i", "listened", "to", "obama", "'s", "podcasts."],
        x2y_lengths=[1, 1, 3, 1, 1, 1, 1, 1],
        x2y_data=[0, 0, 1, 2, 3, 4, 5, 5, 6, 6],
        y2x_lengths=[2, 1, 1, 1, 1, 2, 2],
        y2x_data=[0, 1, 2, 2, 2, 3, 4, 5, 6, 7],
    )

    # only one with trailing whitespace
    check(
        ["i listened to", "obama", "'", "s", "podcasts", ".", " "],
        ["i", "listened", "to", "obama", "'s", "podcasts."],
        x2y_lengths=[3, 1, 1, 1, 1, 1, 0],
        x2y_data=[0, 1, 2, 3, 4, 4, 5, 5],
        y2x_lengths=[1, 1, 1, 1, 2, 2],
        y2x_data=[0, 0, 0, 1, 2, 3, 4, 5],
    )

    # different trailing whitespace
    check(
        ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "],
        ["i", "listened", "to", "obama", "'s", "podcasts.", " "],
        x2y_lengths=[3, 1, 1, 1, 1, 1, 1, 0],
        x2y_data=[0, 1, 2, 3, 4, 4, 5, 5, 6],
        y2x_lengths=[1, 1, 1, 1, 2, 2, 1],
        y2x_data=[0, 0, 0, 1, 2, 3, 4, 5, 6],
    )

    # same trailing whitespace, different tokenization
    check(
        ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "],
        ["i", "listened", "to", "obama", "'s", "podcasts.", "  "],
        x2y_lengths=[3, 1, 1, 1, 1, 1, 1, 1],
        x2y_data=[0, 1, 2, 3, 4, 4, 5, 5, 6, 6],
        y2x_lengths=[1, 1, 1, 1, 2, 2, 2],
        y2x_data=[0, 0, 0, 1, 2, 3, 4, 5, 6, 7],
    )

    # differing whitespace is allowed (only the data arrays are checked)
    check(
        ["a", " \n ", "b", "c"],
        ["a", "b", " ", "c"],
        x2y_data=[0, 1, 3],
        y2x_data=[0, 2, 3],
    )

    # other differences in whitespace are allowed (must simply not raise)
    check([" ", "a"], ["  ", "a", " "])
    check(["a", " "], ["a", "  "])
Exemplo n.º 5
0
def test_alignment_different_texts():
    """Check that aligning sequences with different text raises ``ValueError``.

    The two sequences are identical except for their first token
    ("she" versus "i").
    """
    shared_tail = ["listened", "to", "obama", "'s", "podcasts", "."]
    first_tokens = ["she", *shared_tail]
    second_tokens = ["i", *shared_tail]

    with pytest.raises(ValueError):
        Alignment.from_strings(first_tokens, second_tokens)