def test_TokenLabelPairs_two_datasets(dataset):
    """Token/label triples from repeated datasets are concatenated in order."""

    datasets = [dataset, dataset]

    # One full pass over the fixture tokens; the expected output repeats it.
    single_pass = [(char, char.lower(), char.lower()) for char in "ABCDEFG"]
    expected = single_pass * 3

    tlp = TokenLabelPairs(line_limit=73,
                          respect_line_endings=True,
                          respect_doc_endings=False)
    actual = tlp.run(datasets)

    assert actual == expected
def test_TokenLabelPairs_respects_ignores_doc_endings(dataset):
    """With respect_doc_endings=False, no end-of-document marker is emitted."""

    datasets = [dataset, dataset, dataset]

    # Three repeats of the fixture tokens, with no separators between docs.
    expected = [(char, ) for char in "ABCDEFG"] * 3

    tlp = TokenLabelPairs(line_limit=73,
                          respect_line_endings=False,
                          respect_doc_endings=False)
    actual = tlp.run(datasets)

    assert actual == expected
def test_TokenLabelPairs_retains_line_endings():
    """
    Rodrigues et al. retain the line endings as they appear in the text, meaning
    that on average a line is very short.
    """

    # Four newline-only tokens; start, end and id all track the index.
    doc = {
        "_input_hash": 1337,
        "tokens": [
            {"text": "\n", "start": i, "end": i, "id": i} for i in range(4)
        ],
    }

    datasets = [[doc], [doc]]

    # Every whitespace-only token is expected to come back as None.
    expected = [(None, )] * 4

    tlp = TokenLabelPairs(respect_line_endings=True)
    actual = tlp.run(datasets)

    assert actual == expected
def test_TokenLabelPairs_ignores_line_endings():
    """With line_limit=2, a None separator follows every two tokens."""

    # One token per character; start, end and id all track the index.
    doc = {
        "_input_hash": 1337,
        "tokens": [
            {"text": char, "start": i, "end": i, "id": i}
            for i, char in enumerate("abcd")
        ],
    }

    datasets = [[doc]]

    expected = [
        ("a", ),
        ("b", ),
        (None, ),
        ("c", ),
        ("d", ),
        (None, ),
    ]

    tlp = TokenLabelPairs(line_limit=2, respect_line_endings=False)
    actual = tlp.run(datasets)

    assert actual == expected
def test_yield_token_label_pair(tokens, spans):
    """yield_token_label_pair pairs each token text with its span label and
    terminates the stream with a (None, None) sentinel."""

    expected = [
        ("A", "a"),
        ("B", "b"),
        ("C", "c"),
        ("D", "d"),
        ("E", "e"),
        ("F", "f"),
        ("G", "g"),
        (None, None),
    ]

    tlp = TokenLabelPairs(line_limit=73, respect_line_endings=True)

    # Use fresh names instead of rebinding (shadowing) the fixture parameters.
    token_texts = [token["text"] for token in tokens]
    span_labels = [span["label"] for span in spans]

    # zip(a, b) directly — equivalent to the original's zip(*[a, b]) but clear.
    tokens_and_labels = list(zip(token_texts, span_labels))
    actual = list(tlp.yield_token_label_pair(tokens_and_labels))

    assert expected == actual
def test_TokenLabelPairs_works_on_unlabelled(dataset):
    """Documents stripped of their "spans" key still yield token tuples."""

    # Plain loop for the side effect — the original built and discarded a
    # list via `[doc.pop("spans") for doc in dataset]`.
    for doc in dataset:
        doc.pop("spans")

    datasets = [dataset]

    # Three docs of A-G, each followed by a (None, ) end-of-line/doc marker.
    expected = ([(char, ) for char in "ABCDEFG"] + [(None, )]) * 3

    tlp = TokenLabelPairs(line_limit=73, respect_line_endings=True)
    actual = tlp.run(datasets)

    assert actual == expected
def test_reference_spans_real_example(doc):
    """Token/label pairs extracted from a real annotated reference section.

    The `doc` fixture supplies a realistic tokenised document; this test pins
    the exact (token, label) sequence produced by TokenLabelPairs.run with
    respect_line_endings=False. Labels are "o" (outside), "author", "title",
    and "year".
    """

    # Hand-transcribed golden output — do not reorder or reformat.
    expected = [
        ("References", "o"),
        ("1", "o"),
        (".", "o"),
        ("United", "author"),
        ("Nations", "author"),
        ("Development", "author"),
        ("Programme", "author"),
        ("(", "author"),
        ("UNDP", "author"),
        (")", "author"),
        (".", "o"),
        ("A", "o"),
        ("Guide", "title"),
        ("to", "title"),
        ("Civil", "title"),
        ("Society", "title"),
        ("Organizations", "title"),
        ("working", "title"),
        ("on", "title"),
        ("Democratic", "title"),
        ("Governance", "title"),
        ("[", "title"),
        ("online", "title"),
        ("publication].", "title"),
        ("New", "o"),
        ("York", "o"),
        (",", "o"),
        ("NY", "o"),
        (";", "o"),
        ("UNDP", "o"),
        (";", "o"),
        ("2005", "year"),
        (".", "year"),
        ("(", "o"),
        ("Available", "o"),
        ("from", "o"),
        (":", "o"),
        ("http://www.undp.org", "o"),
        ("/", "o"),
        ("content", "o"),
        ("/", "o"),
        ("dam", "o"),
        ("/", "o"),
        ("aplaws", "o"),
        ("/", "o"),
        ("publication", "o"),
        ("/", "o"),
        ("en", "o"),
        ("/", "o"),
        ("publications", "o"),
        ("/", "o"),
        ("democratic-", "o"),
        ("governance", "o"),
        ("/", "o"),
        ("oslo", "o"),
        ("-", "o"),
        ("governance", "o"),
        ("-", "o"),
        ("center", "o"),
        ("/", "o"),
        ("civic", "o"),
        ("-", "o"),
        ("engagement", "o"),
        ("/", "o"),
        ("a", "o"),
        ("-", "o"),
        ("guide", "o"),
        ("-", "o"),
        ("to", "o"),
        ("-", "o"),
        ("civil", "o"),
        ("-", "o"),
        ("society-", "o"),
        ("organizations", "o"),
        ("-", "o"),
        ("working", "o"),
        ("-", "o"),
        ("on", "o"),
        ("-", "o"),
        ("democratic", "o"),
        ("-", "o"),
        ("governance-/3665%20Booklet_heleWEB_.pdf", "o"),
        (",", "o"),
        ("accessed", "o"),
        ("15", "o"),
        ("February", "o"),
        ("2017", "o"),
        (")", "o"),
        (".", "o"),
        ("2", "o"),
        (".", "o"),
        ("Mental", "o"),
        ("Health", "author"),
        ("Peer", "author"),
        ("Connection", "author"),
        ("(", "author"),
        ("MHPC", "author"),
        (")", "author"),
        (".", "author"),
        ("Mental", "o"),
        ("Health", "title"),
        ("Peer", "title"),
        ("Connection", "title"),
        ("[", "title"),
        ("website].", "o"),
        ("Buffalo", "o"),
        (",", "o"),
        ("NY", "o"),
        (";", "o"),
        ("MHPC", "o"),
        (";", "o"),
        ("n.d", "o"),
        (".", "o"),
        ("(", "o"),
        ("Available", "o"),
        ("from", "o"),
        (":", "o"),
        ("http://wnyil.org/mhpc.html", "o"),
        (",", "o"),
        ("a", "o"),
        ("ccessed", "o"),
        ("15", "o"),
        ("February", "o"),
        ("2017", "o"),
        (")", "o"),
        (".", "o"),
        ("3", "o"),
        (".", "o"),
        ("Avery", "o"),
        ("S", "author"),
        (",", "author"),
        ("Mental", "author"),
        ("Health", "author"),
        ("Peer", "author"),
        ("Connection", "author"),
        ("(", "author"),
        ("MHPC", "author"),
        (")", "author"),
        (".", "author"),
        ("Channels", "o"),
        ("2013", "o"),
        (",", "o"),
        ("“", "o"),
        ("Not", "title"),
        ("Without", "title"),
        ("Us", "title"),
        ("”", "title"),
        ("[", "title"),
        ("video].", "o"),
        ("Western", "o"),
        ("New", "o"),
        ("York", "o"),
        ("(", "o"),
        ("WNY", "o"),
        (")", "o"),
        (";", "o"),
        ("Squeeky", "o"),
        ("Wheel", "o"),
        (";", "o"),
        ("2013", "o"),
        (".", "year"),
        ("(", "year"),
        ("Available", "o"),
        ("from", "o"),
        (":", "o"),
        ("https://vimeo.com/62705552", "o"),
        (",", "o"),
        ("accessed", "o"),
        ("15", "o"),
        ("February", "o"),
        ("2017", "o"),
        (")", "o"),
        (".", "o"),
        ("4", "o"),
        (".", "o"),
        ("Alzheimer", "o"),
        ("'s", "o"),
        ("Disease", "author"),
        ("International", "author"),
        ("(", "author"),
        ("ADI", "author"),
        (")", "author"),
        (".", "author"),
        ("How", "author"),
        ("to", "o"),
        ("develop", "title"),
        ("an", "title"),
        ("Alzheimer", "title"),
        ("'s", "title"),
        ("association", "title"),
        ("and", "title"),
        ("get", "title"),
        ("results", "title"),
        ("[", "title"),
        ("website].", "title"),
        ("United", "title"),
        ("Kingdom", "title"),
        (";", "o"),
        ("ADI", "o"),
        (";", "o"),
        ("2006", "o"),
        (".", "o"),
        ("(", "year"),
        ("Available", "year"),
        ("from", "o"),
        (":", "o"),
        ("https:/", "o"),
        ("/", "o"),
        ("www.alz.co.uk", "o"),
        ("/", "o"),
        ("how-", "o"),
        ("to", "o"),
        ("-", "o"),
        ("develop", "o"),
        ("-", "o"),
        ("an", "o"),
        ("-", "o"),
        ("association", "o"),
        (",", "o"),
        ("accessed", "o"),
        ("15", "o"),
        ("February", "o"),
        ("2017", "o"),
        (")", "o"),
        (".", "o"),
        ("5", "o"),
        (".", "o"),
        ("Normal", "o"),
        ("Difference", "o"),
        ("Mental", "o"),
        ("Health", "author"),
        ("Kenya", "author"),
        ("(", "author"),
        ("NDMHK", "author"),
        (")", "author"),
        (".", "author"),
        ("About", "author"),
        ("Us", "author"),
        ("[", "o"),
        ("website].", "title"),
        ("Kenya", "title"),
        (";", "o"),
        ("NDMHK", "o"),
        (";", "o"),
        ("n.d", "o"),
        (".", "o"),
        ("(", "o"),
        ("Available", "o"),
        ("from", "o"),
        (":", "o"),
        # NOTE(review): 250 pairs precede this marker — presumably the
        # default line_limit inserts a (None, None) here; confirm against
        # TokenLabelPairs' default.
        (None, None),
        ("http://www.normal-difference.org/?page_id=15", "o"),
        (",", "o"),
        ("ac", "o"),
        ("cessed", "o"),
        ("15", "o"),
        ("February", "o"),
        ("2017", "o"),
        (")", "o"),
        (".", "o"),
        ("6", "o"),
        (".", "o"),
        ("TOPSIDE", "o"),
        (".", "o"),
        ("Training", "o"),
        ("Opportunities", "author"),
        ("for", "author"),
        ("Peer", "o"),
        ("Supporters", "title"),
        ("with", "title"),
        ("Intellectual", "title"),
        ("Disabilities", "title"),
        ("in", "title"),
        ("Europe", "title"),
        ("[", "title"),
        ("website", "title"),
        ("]", "title"),
        (";", "o"),
        ("TOPSIDE", "o"),
        (";", "o"),
        ("n.d", "o"),
        (".", "o"),
        ("(", "o"),
        ("Available", "o"),
        ("from", "o"),
        (":", "o"),
        ("http://www.peer-support.eu/about-the-project/", "o"),
        (",", "o"),
        ("accessed", "o"),
        ("15", "o"),
        ("February", "o"),
        ("2017", "o"),
        (")", "o"),
        (".", "o"),
        ("7", "o"),
        (".", "o"),
        ("KOSHISH", "o"),
        ("National", "o"),
        ("Mental", "o"),
        ("Health", "o"),
        ("Self", "author"),
        ("-", "author"),
        ("help", "author"),
        ("Organisation", "author"),
        (".", "author"),
        ("Advocacy", "author"),
        ("and", "author"),
        ("Awareness", "author"),
        ("[", "o"),
        ("website].", "title"),
        ("Nepal", "title"),
        (";", "o"),
        ("KOSHISH", "o"),
        (";", "o"),
        ("2015", "o"),
        (".", "o"),
        ("(", "o"),
        ("Available", "o"),
        ("from", "year"),
        (":", "year"),
        ("http://koshishnepal.org/advocacy", "o"),
        (",", "o"),
        ("accessed", "o"),
        ("15", "o"),
        ("February", "o"),
        ("2017", "o"),
        (")", "o"),
        (".", "o"),
        ("8", "o"),
        (".", "o"),
        ("Dementia", "o"),
        ("Alliance", "o"),
        ("International", "o"),
        ("(", "o"),
        ("DAI", "o"),
        (")", "author"),
        (".", "author"),
        ("Dementia", "author"),
        ("Alliance", "author"),
        ("International", "author"),
        ("[", "author"),
        ("website].", "o"),
        ("Ankeny", "title"),
        (",", "title"),
        ("IA", "title"),
        (";", "o"),
        ("DAI", "o"),
        (";", "o"),
        ("2014/2015", "o"),
        (".", "o"),
        ("(", "o"),
        ("Available", "o"),
        ("from", "o"),
        (":", "o"),
        ("http://www.dementiaallianceinternational.org/", "o"),
        (",", "o"),
        ("accessed", "o"),
        ("15", "o"),
        ("February", "o"),
        ("2017", "o"),
        (")", "o"),
        (".", "o"),
        ("9", "o"),
        (".", "o"),
        # NOTE(review): trailing end-of-stream sentinel — presumably emitted
        # at the end of the document; confirm.
        (None, None),
    ]

    dataset = [doc]
    datasets = [dataset]

    tlp = TokenLabelPairs(respect_line_endings=False)
    actual = tlp.run(datasets)

    assert actual == expected
def test_TokenLabelPairs_cleans_whitespace():
    """Trailing whitespace is stripped from token texts; tokens that are
    entirely whitespace come back as None."""

    # (text, start, end, id) for each fixture token. Note: the token with
    # id 9 has end=6 in the original fixture — preserved verbatim.
    token_specs = [
        ("A ", 0, 0, 0),
        ("B  ", 1, 1, 1),
        ("C   ", 2, 2, 2),
        ("D\t", 3, 3, 3),
        ("E\t\t", 4, 4, 4),
        ("F\t\t \t", 5, 5, 5),
        ("G \t \t \t \t", 6, 6, 6),
        ("\n", 7, 7, 7),
        ("\n ", 8, 8, 8),
        ("\n\t \t \t \t", 9, 6, 9),
    ]

    doc = {
        "_input_hash": 1337,
        "tokens": [
            {"text": text, "start": start, "end": end, "id": token_id}
            for text, start, end, token_id in token_specs
        ],
    }

    datasets = [[doc]]

    expected = [(char, ) for char in "ABCDEFG"] + [(None, )] * 3

    tlp = TokenLabelPairs(line_limit=73, respect_line_endings=True)
    actual = tlp.run(datasets)

    assert expected == actual