예제 #1
0
def test_insert_hints_overlapping() -> None:
    """Check that `insert_hints` copes with entities whose mentions overlap in the text.

    "coagulation factor VII" (Gene) and "factor VII deficiency" (Disease) share the words
    "factor VII" at both places they occur. The expected text is the two lowercased entity
    hints, then `HINT_SEP_SYMBOL`, then the original text.
    """
    text = (
        "Mutation pattern in clinically asymptomatic coagulation factor VII deficiency. A total of"
        " 122 subjects, referred after presurgery screening or checkup for prolonged prothrombin"
        " time, were characterized for the presence of coagulation factor VII deficiency."
    )
    # Offsets are (start, end) character positions with an exclusive end, so that
    # text[start:end] is exactly the mention. The first-occurrence end offsets used to be one
    # character too large (67 and 78), which disagreed with the second-occurrence pairs.
    gene = schemas.PubtatorEntity(
        mentions=["coagulation factor VII", "coagulation factor VII"],
        offsets=[(44, 66), (222, 244)],
        label="Gene",
    )
    disease = schemas.PubtatorEntity(
        mentions=["factor VII deficiency", "factor VII deficiency"],
        offsets=[(56, 77), (234, 255)],
        label="Disease",
    )
    pubator_annotation = schemas.PubtatorAnnotation(
        pmid="8844208",
        text=text,
        entities={"2155": gene, "D005168": disease},
    )
    expected = (
        f"coagulation factor vii @GENE@ factor vii deficiency @DISEASE@ {HINT_SEP_SYMBOL} {text}"
    )

    # insert_hints rewrites `text` in place, so read it back afterwards.
    pubator_annotation.insert_hints()
    actual = pubator_annotation.text

    assert actual == expected
예제 #2
0
def test_insert_hints_no_mutation() -> None:
    """Check that `insert_hints` changes `text` and leaves every other attribute as it was."""
    text = (
        "Different lobular distributions of altered hepatocyte tight junctions in rat models of"
        " intrahepatic and extrahepatic cholestasis.")
    entities = {
        "D002780": schemas.PubtatorEntity(
            mentions=["intrahepatic cholestasis"],
            offsets=[(87, 128)],
            label="Disease",
        ),
        "D001651": schemas.PubtatorEntity(
            mentions=["extrahepatic cholestasis"],
            offsets=[(104, 128)],
            label="Disease",
        ),
    }
    annotation = schemas.PubtatorAnnotation(pmid="9862868", text=text, entities=entities)
    # Deep copy taken before the call so it cannot share state with the object under test.
    snapshot = copy.deepcopy(annotation)

    annotation.insert_hints()

    # The text is the only attribute that is allowed to differ from the snapshot.
    assert annotation.text != snapshot.text
    for attribute in ("pmid", "entities", "relations"):
        assert getattr(annotation, attribute) == getattr(snapshot, attribute)
예제 #3
0
def test_insert_hints_compound() -> None:
    """Check the hints produced by `insert_hints` for a compound mention.

    "intrahepatic and extrahepatic cholestasis" names two diseases; both entities end at the
    same offset and the first one spans the whole phrase.
    """
    text = (
        "Different lobular distributions of altered hepatocyte tight junctions in rat models of"
        " intrahepatic and extrahepatic cholestasis.")
    intrahepatic = schemas.PubtatorEntity(
        mentions=["intrahepatic cholestasis"],
        offsets=[(87, 128)],
        label="Disease",
    )
    extrahepatic = schemas.PubtatorEntity(
        mentions=["extrahepatic cholestasis"],
        offsets=[(104, 128)],
        label="Disease",
    )
    annotation = schemas.PubtatorAnnotation(
        pmid="9862868",
        text=text,
        entities={"D002780": intrahepatic, "D001651": extrahepatic},
    )
    expected = f"intrahepatic cholestasis @DISEASE@ extrahepatic cholestasis @DISEASE@ {HINT_SEP_SYMBOL} {text}"

    # insert_hints rewrites `text` in place.
    annotation.insert_hints()

    assert annotation.text == expected
예제 #4
0
def test_parse_pubtator_compound_ent() -> None:
    """Check how `parse_pubtator` handles an annotation line that names two entities.

    The last annotation line carries two pipe-separated IDs ("D006509|D015658") and two
    pipe-separated component mentions. The expected result has one component mention recorded
    under each of the two IDs.
    """
    # A truncated example taken from the CDR dataset
    pmid = "17854040"
    title_text = (
        "Mutations associated with lamivudine-resistance in therapy-na  ve hepatitis B virus (HBV)"
        " infected patients with and without HIV co-infection: implications for antiretroviral"
        " therapy in HBV and HIV co-infected South African patients. infected patients with and"
        " without HIV co-infection: implications for antiretroviral therapy in HBV and HIV"
        " co-infected South African patients.")
    abstract_text = (
        "This was an exploratory study to investigate lamivudine-resistant hepatitis B virus (HBV)"
        " strains in selected lamivudine-na  ve HBV carriers with and without human"
        " immunodeficiency virus (HIV) co-infection in South African patients. Thirty-five"
        " lamivudine-naive HBV infected patients with or without HIV co-infection were studied: 15"
        " chronic HBV mono-infected patients and 20 HBV-HIV co-infected patients."
    )
    # Title line, abstract line, then one tab-separated annotation per line.
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t26\t36\tlamivudine\tChemical\tD019259
    {pmid}\t59\t61\tna\tChemical\tD012964
    {pmid}\t66\t98\thepatitis B virus (HBV) infected\tDisease\tD006509
    {pmid}\t125\t141\tHIV co-infection\tDisease\tD015658
    {pmid}\t186\t209\tHBV and HIV co-infected\tDisease\tD006509|D015658\tHBV infected|HIV infected
    """

    expected_entities = {
        "D019259": schemas.PubtatorEntity(
            mentions=["lamivudine"],
            offsets=[(26, 36)],
            label="Chemical",
        ),
        "D012964": schemas.PubtatorEntity(
            mentions=["na"],
            offsets=[(59, 61)],
            label="Chemical",
        ),
        "D006509": schemas.PubtatorEntity(
            mentions=["hepatitis B virus (HBV) infected", "HBV infected"],
            offsets=[(66, 98), (186, 209)],
            label="Disease",
        ),
        "D015658": schemas.PubtatorEntity(
            mentions=["HIV co-infection", "HIV infected"],
            offsets=[(125, 141), (194, 209)],
            label="Disease",
        ),
    }
    # No relations are passed, so the comparison below is against the schema's default.
    expected = schemas.PubtatorAnnotation(
        pmid=pmid,
        text=f"{title_text} {abstract_text}",
        entities=expected_entities,
    )

    parsed = util.parse_pubtator(pubtator_content, text_segment=util.TextSegment.both)

    # Separate asserts so a failure points at the attribute that differs.
    assert parsed[0].text == expected.text
    assert parsed[0].entities == expected.entities
    assert parsed[0].relations == expected.relations
예제 #5
0
def test_pubtator_entity_to_string() -> None:
    """Check `PubtatorEntity.to_string` with and without sorting."""
    # The fixture contains:
    # - multi-word mentions
    # - overlapping mentions
    # - multiple duplicate mentions
    # - at least two unique mentions (case-insensitive)
    # - mentions that are not already ordered by first appearance
    mentions = [
        "factor vii deficiency",
        "factor vii deficiency",
        "Factor VII Deficiency",
        "factor vii deficient",
    ]
    offsets = [(200, 221), (100, 121), (20, 41), (0, 21)]
    entity = schemas.PubtatorEntity(mentions=mentions, offsets=offsets, label="Disease")

    # Sorting is, and should remain, the default.
    expected = f"factor vii deficient {COREF_SEP_SYMBOL} factor vii deficiency @DISEASE@"
    assert entity.to_string() == expected

    # With sort=False the order of the mentions is not fixed, so only order-independent
    # properties are checked: the overall length and the presence of each piece.
    unsorted = entity.to_string(sort=False)
    assert len(unsorted) == len(expected)
    for fragment in (
        "factor vii deficient",
        "factor vii deficiency",
        "@DISEASE@",
        COREF_SEP_SYMBOL,
    ):
        assert fragment in unsorted
예제 #6
0
def test_insert_hints() -> None:
    """Check `insert_hints` against a list of edge cases.

    The expected text is the lowercased entity hints (coreferent mentions joined with
    `COREF_SEP_SYMBOL`), then `HINT_SEP_SYMBOL`, then the original text.
    """
    # A truncated example taken from the GDA dataset. It contains a few edge cases:
    # - coreferent mention
    # - entities that differ in case
    # - parenthesized entity
    # - multiple identical mentions of an entity
    text = (
        "Apolipoprotein E epsilon4 allele, elevated midlife total cholesterol level, and high"
        " midlife systolic blood pressure are independent risk factors for late-life Alzheimer disease."
        " BACKGROUND: Presence of the apolipoprotein E (apoE) epsilon4 allele, which is involved in"
        " cholesterol metabolism, is the most important genetic risk factor for Alzheimer disease."
        " Elevated midlife values for total cholesterol level and blood pressure have been"
        " implicated recently as risk factors for Alzheimer disease.")

    # Offsets are (start, end) character positions with an exclusive end, so that
    # text[start:end] is exactly the mention. The first gene offset used to be (0, 17), one
    # character longer than "Apolipoprotein E", unlike every other pair in this fixture.
    gene = schemas.PubtatorEntity(
        mentions=["Apolipoprotein E", "apolipoprotein E", "apoE"],
        offsets=[(0, 16), (207, 223), (225, 229)],
        label="Gene",
    )
    disease = schemas.PubtatorEntity(
        mentions=["Alzheimer disease", "Alzheimer disease", "Alzheimer disease"],
        offsets=[(160, 177), (339, 356), (479, 496)],
        label="Disease",
    )
    pubator_annotation = schemas.PubtatorAnnotation(
        pmid="12160362",
        text=text,
        entities={"348": gene, "D000544": disease},
    )
    expected = f"apolipoprotein e {COREF_SEP_SYMBOL} apoe @GENE@ alzheimer disease @DISEASE@ {HINT_SEP_SYMBOL} {text}"

    # insert_hints rewrites `text` in place, so read it back afterwards.
    pubator_annotation.insert_hints()
    actual = pubator_annotation.text

    assert actual == expected
예제 #7
0
    def test_filter_hypernyms(self):
        """Check that `cdr._filter_hypernyms` fills in `filtered_relations` on the annotation."""
        text = (
            "Carbamazepine-induced cardiac dysfunction. A patient with sinus bradycardia and"
            " atrioventricular block, induced by carbamazepine, prompted an extensive"
            " literature review of all previously reported cases."
        )
        entities = {
            "D002220": schemas.PubtatorEntity(
                mentions=["Carbamazepine", "carbamazepine"],
                offsets=[(0, 13), (115, 128)],
                label="Chemical",
            ),
            "D006331": schemas.PubtatorEntity(
                mentions=["cardiac dysfunction"],
                offsets=[(22, 41)],
                label="Disease",
            ),
            "D001919": schemas.PubtatorEntity(
                mentions=["bradycardia"],
                offsets=[(64, 75)],
                label="Disease",
            ),
            "D054537": schemas.PubtatorEntity(
                mentions=["atrioventricular block"],
                offsets=[(80, 102)],
                label="Disease",
            ),
        }
        # Only the two specific diseases are annotated as related to the chemical.
        relations = [("D002220", "D001919", "CID"), ("D002220", "D054537", "CID")]
        annotation = schemas.PubtatorAnnotation(
            text=text,
            pmid="",
            entities=entities,
            relations=relations,
        )

        # The result is read back from the annotation itself, not from a return value.
        cdr._filter_hypernyms([annotation])

        # D006331 is a hypernym of D001919 and/or D054537 and so it should be filtered.
        assert annotation.filtered_relations == [("D002220", "D006331", "CID")]
예제 #8
0
def test_pubtator_entity_get_offsets() -> None:
    """Check that `get_offsets` returns `(0, 21)` for a set of unordered offsets."""
    unordered_offsets = [(200, 221), (100, 121), (20, 41), (0, 21)]
    # This method needs neither real mentions nor a label, so both are left empty.
    entity = schemas.PubtatorEntity(
        mentions=[""] * 4,
        offsets=unordered_offsets,
        label="",
    )

    assert entity.get_offsets() == (0, 21)
예제 #9
0
def test_pubtator_annotation_to_string() -> None:
    """Check `PubtatorAnnotation.to_string` with and without sorting."""
    # The fixture contains:
    # - at least one entity with multiple mentions, including a unique mention
    # - at least two relations with different head entities
    # - at least one n-ary relation
    # - relations that are not already ordered by first appearance
    entities = {
        "D008094": schemas.PubtatorEntity(
            mentions=["lithium", "lithium", "Li", "Li"],
            offsets=[(54, 61), (111, 118), (941, 943), (1333, 1335)],
            label="Chemical",
        ),
        "D006973": schemas.PubtatorEntity(
            mentions=["hypertension", "hypertension"],
            offsets=[(1000, 1012), (1500, 1512)],
            label="Disease",
        ),
        "D011507": schemas.PubtatorEntity(
            mentions=["proteinuria", "proteinuria"],
            offsets=[(975, 986), (1466, 1477)],
            label="Disease",
        ),
        "D007676": schemas.PubtatorEntity(
            mentions=["chronic renal failure", "chronic renal failure"],
            offsets=[(70, 91), (1531, 1552)],
            label="Disease",
        ),
    }
    relations = [
        ("D008094", "D006973", "CID"),
        ("D008094", "D011507", "CID"),
        ("D008094", "D007676", "CID"),
        # This is an artificial n-ary relation.
        ("D008094", "D006973", "D011507", "CID"),
    ]
    # Neither text nor a PMID is needed to exercise this method.
    annotation = schemas.PubtatorAnnotation(pmid="", text="", entities=entities, relations=relations)

    # Sorting is, and should remain, the default.
    chemical = f"lithium {COREF_SEP_SYMBOL} li @CHEMICAL@"
    expected = " ".join(
        [
            f"{chemical} chronic renal failure @DISEASE@ @CID@",
            f"{chemical} proteinuria @DISEASE@ @CID@",
            f"{chemical} hypertension @DISEASE@ @CID@",
            f"{chemical} hypertension @DISEASE@ proteinuria @DISEASE@ @CID@",
        ]
    )
    assert annotation.to_string() == expected

    # With sort=False the order is not fixed, so only order-independent properties are
    # checked: the overall length and the presence of each piece.
    # Note that the "li" check is already implied by the "lithium" check.
    unsorted = annotation.to_string(sort=False)
    assert len(unsorted) == len(expected)
    for fragment in (
        "lithium",
        "li",
        "chronic renal failure",
        "proteinuria",
        "hypertension",
        "@CHEMICAL@",
        "@DISEASE@",
        "@CID@",
        COREF_SEP_SYMBOL,
    ):
        assert fragment in unsorted
예제 #10
0
def test_parse_pubtator() -> None:
    """Check `parse_pubtator` for each text segment: title, abstract, and both.

    The same PubTator content is parsed three times. Each run is compared with the text,
    entities and relations expected for that segment.
    """
    # A truncated example taken from the CDR dataset
    pmid = "18020536"
    title_text = (
        "Associations between use of benzodiazepines or related drugs and health, physical"
        " abilities and cognitive function: a non-randomised clinical study in the elderly."
    )
    abstract_text = (
        "OBJECTIVE: To describe associations between the use of benzodiazepines or related drugs"
        " (BZDs/RDs) and health, functional abilities and cognitive function in the elderly."
        " METHODS: A non-randomised clinical study of patients aged > or =65 years admitted to"
        " acute hospital wards during 1 month. 164 patients (mean age +/- standard deviation [SD]"
        " 81.6 +/- 6.8 years) were admitted. Of these, nearly half (n = 78) had used BZDs/RDs"
        " before admission, and the remainder (n = 86) were non-users. Cognitive ability was"
        " assessed by the Mini-Mental State Examination (MMSE). Patients scoring > or =20 MMSE"
        " sum points were interviewed (n = 79) and questioned regarding symptoms and functional"
        " abilities during the week prior to admission.")
    # Include a dummy annotation with ID == -1. These should be ignored.
    pubtator_content = f"""
    {pmid}|t|{title_text}
    {pmid}|a|{abstract_text}
    {pmid}\t28\t43\tbenzodiazepines\tChemical\tD001569
    {pmid}\t219\t234\tbenzodiazepines\tChemical\tD001569
    {pmid}\t253\t257\tBZDs\tChemical\tD001569
    {pmid}\t583\t587\tBZDs\tChemical\tD001569
    {pmid}\t1817\t1826\ttiredness\tDisease\tD005221
    {pmid}\t0\t0\tArbitrary\tArbitrary\t-1
    {pmid}\tCID\tD001569\tD005221
    """

    title_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=["benzodiazepines"],
            offsets=[(28, 43)],
            label="Chemical",
        ),
    }
    abstract_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=["benzodiazepines", "BZDs", "BZDs"],
            offsets=[(219, 234), (253, 257), (583, 587)],
            label="Chemical",
        ),
        "D005221": schemas.PubtatorEntity(
            mentions=["tiredness"],
            offsets=[(1817, 1826)],
            label="Disease",
        ),
    }
    # For the combined segment, the title mentions come first, then the abstract mentions.
    both_entities = {
        "D001569": schemas.PubtatorEntity(
            mentions=title_entities["D001569"].mentions + abstract_entities["D001569"].mentions,
            offsets=title_entities["D001569"].offsets + abstract_entities["D001569"].offsets,
            label="Chemical",
        ),
        "D005221": schemas.PubtatorEntity(
            mentions=["tiredness"],
            offsets=[(1817, 1826)],
            label="Disease",
        ),
    }

    # One row per segment: (segment, expected text, expected entities, expected relations).
    # The title-only case expects no relations.
    cases = [
        (util.TextSegment.title, title_text, title_entities, []),
        (
            util.TextSegment.abstract,
            abstract_text,
            abstract_entities,
            [("D001569", "D005221", "CID")],
        ),
        (
            util.TextSegment.both,
            f"{title_text} {abstract_text}",
            both_entities,
            [("D001569", "D005221", "CID")],
        ),
    ]
    for segment, segment_text, segment_entities, segment_relations in cases:
        expected = schemas.PubtatorAnnotation(
            pmid=pmid,
            text=segment_text,
            entities=segment_entities,
            relations=segment_relations,
        )
        actual = util.parse_pubtator(pubtator_content, text_segment=segment)
        # Breaking up the asserts leads to much clearer outputs when the test fails
        assert actual[0].text == expected.text
        assert actual[0].entities == expected.entities
        assert actual[0].relations == expected.relations
예제 #11
0
def test_query_pubtator() -> None:
    """Check `query_pubtator` for each text segment: title, abstract, and both.

    One PMID is queried for gene concepts three times. Each result is expected to hold exactly
    one annotation, keyed by PMID, with the text and entities for that segment and no
    relations.

    NOTE(review): `query_pubtator` presumably calls the live PubTator service, so this test
    would need network access. Confirm against `util`.
    """
    pmid = "19285439"
    title_text = (
        "The ubiquitin ligase RNF5 regulates antiviral responses by mediating degradation"
        " of the adaptor protein MITA.")
    abstract_text = (
        "Viral infection activates transcription factors NF-kappaB and IRF3, which collaborate to"
        " induce type I interferons (IFNs) and elicit innate antiviral response. MITA (also known"
        " as STING) has recently been identified as an adaptor that links virus-sensing receptors"
        " to IRF3 activation. Here, we showed that the E3 ubiquitin ligase RNF5 interacted with"
        " MITA in a viral-infection-dependent manner. Overexpression of RNF5 inhibited"
        " virus-triggered IRF3 activation, IFNB1 expression, and cellular antiviral response,"
        " whereas knockdown of RNF5 had opposite effects. RNF5 targeted MITA at Lys150 for"
        " ubiquitination and degradation after viral infection. Both MITA and RNF5 were located at"
        " the mitochondria and endoplasmic reticulum (ER) and viral infection caused their"
        " redistribution to the ER and mitochondria, respectively. We further found that"
        " virus-induced ubiquitination and degradation of MITA by RNF5 occurred at the"
        " mitochondria. These findings suggest that RNF5 negatively regulates virus-triggered"
        " signaling by targeting MITA for ubiquitination and degradation at the mitochondria."
    )
    title_entities = {
        "6048": schemas.PubtatorEntity(
            mentions=["RNF5"],
            offsets=[(21, 25)],
            label="Gene",
        ),
        "340061": schemas.PubtatorEntity(
            mentions=["MITA"],
            offsets=[(104, 108)],
            label="Gene",
        ),
    }
    abstract_entities = {
        "4790": schemas.PubtatorEntity(
            mentions=["NF-kappaB"],
            offsets=[(158, 167)],
            label="Gene",
        ),
        "3661": schemas.PubtatorEntity(
            mentions=["IRF3", "IRF3", "IRF3"],
            offsets=[(172, 176), (378, 382), (554, 558)],
            label="Gene",
        ),
        "340061": schemas.PubtatorEntity(
            mentions=["MITA", "STING", "MITA", "MITA", "MITA", "MITA", "MITA"],
            offsets=[
                (270, 274),
                (290, 295),
                (461, 465),
                (684, 688),
                (762, 766),
                (1000, 1004),
                (1136, 1140),
            ],
            label="Gene",
        ),
        "6048": schemas.PubtatorEntity(
            mentions=["RNF5", "RNF5", "RNF5", "RNF5", "RNF5", "RNF5", "RNF5"],
            offsets=[
                (440, 444),
                (523, 527),
                (643, 647),
                (670, 674),
                (771, 775),
                (1008, 1012),
                (1071, 1075),
            ],
            label="Gene",
        ),
        "3456": schemas.PubtatorEntity(
            mentions=["IFNB1"],
            offsets=[(571, 576)],
            label="Gene",
        ),
    }
    # For the combined segment, the title mentions come first, then the abstract mentions.
    both_entities = {
        "6048": schemas.PubtatorEntity(
            mentions=title_entities["6048"].mentions + abstract_entities["6048"].mentions,
            offsets=title_entities["6048"].offsets + abstract_entities["6048"].offsets,
            label="Gene",
        ),
        "340061": schemas.PubtatorEntity(
            mentions=title_entities["340061"].mentions + abstract_entities["340061"].mentions,
            offsets=title_entities["340061"].offsets + abstract_entities["340061"].offsets,
            label="Gene",
        ),
        "4790": schemas.PubtatorEntity(
            mentions=["NF-kappaB"],
            offsets=[(158, 167)],
            label="Gene",
        ),
        "3661": schemas.PubtatorEntity(
            mentions=abstract_entities["3661"].mentions,
            offsets=abstract_entities["3661"].offsets,
            label="Gene",
        ),
        "3456": schemas.PubtatorEntity(
            mentions=["IFNB1"],
            offsets=[(571, 576)],
            label="Gene",
        ),
    }

    # One row per segment: (segment, expected text, expected entities).
    cases = [
        (util.TextSegment.title, title_text, title_entities),
        (util.TextSegment.abstract, abstract_text, abstract_entities),
        (util.TextSegment.both, f"{title_text} {abstract_text}", both_entities),
    ]
    for segment, segment_text, segment_entities in cases:
        expected = schemas.PubtatorAnnotation(
            pmid=pmid,
            text=segment_text,
            entities=segment_entities,
            relations=[],
        )
        actual = util.query_pubtator(pmids=[pmid], concepts=["gene"], text_segment=segment)
        # Breaking up the asserts leads to much clearer outputs when the test fails
        assert len(actual) == 1
        assert actual[expected.pmid].text == expected.text
        assert actual[expected.pmid].entities == expected.entities
        assert actual[expected.pmid].relations == expected.relations