def test_multiple_matches(self):
    """All HPO terms present in a sentence are extracted."""
    extractor = Extractor(correct_spelling=False,
                          remove_overlapping=True,
                          resolve_conflicts=True)

    result = extractor.hpo("l pre auricular ear pit.")
    self.assertEqual(result.hpids, ['HP:0004467'])

    result = extractor.hpo("Coloboma, microphthalmia, macrocephaly, ear pit.")
    expected = {'HP:0000589', 'HP:0004467', 'HP:0000568', 'HP:0000256'}
    self.assertEqual(set(result.hpids), expected)
def test_hpo_big_text_max_neighbors(self):
    """Parsing a full page yields different results for different
    max_neighbors settings.

    NOTE(review): the first local was previously named ``hpo_max_2`` even
    though its extractor uses ``max_neighbors=1``; renamed to match the
    actual setting. Assertions are unchanged.
    """
    # test parsing a page
    extract = Extractor(max_neighbors=1,
                        correct_spelling=True,
                        remove_overlapping=False)
    hpo_max_1 = extract.hpo(test_case11_text).hpids

    extract = Extractor(max_neighbors=3,
                        correct_spelling=True,
                        remove_overlapping=False)
    hpo_max_3 = extract.hpo(test_case11_text).hpids

    self.assertNotEqual(hpo_max_1, hpo_max_3)
def test_remove_overlapping(self):
    """remove_overlapping drops matches whose spans overlap broader ones."""
    text = "Polycystic kidney disease and myoclonus seizures."

    extractor = Extractor(correct_spelling=False, remove_overlapping=False)
    resp = extractor.hpo(text)
    self.assertEqual(
        set(resp.hpids),
        {'HP:0000113', 'HP:0000112', 'HP:0001250', 'HP:0002123',
         'HP:0001336'})

    extractor = Extractor(correct_spelling=False, remove_overlapping=True)
    resp = extractor.hpo(text)
    self.assertEqual(set(resp.hpids), {'HP:0002123', 'HP:0000113'})
def test_extract_full_context(self):
    """With phenotypes_only=False, non-phenotype entries carry their type."""
    extractor = Extractor(max_neighbors=2,
                          correct_spelling=False,
                          phenotypes_only=False)
    resp = extractor.hpo("X linked")
    first_entry = resp.entries[0]
    self.assertEqual(first_entry['hpid'][0], 'HP:0001417')
    self.assertEqual(first_entry['type'], 'mode_of_inheritance')
def test_hpo_big_text_spellcheck_off(self):
    """Parsing a full page with the spellchecker disabled."""
    extractor = Extractor(max_neighbors=2,
                          correct_spelling=False,
                          remove_overlapping=True)
    result = extractor.hpo(test_case11_text)
    self.assertEqual(result.n_entries, 7)
def test_conflict_instantiate(self):
    """Re-instantiating Extractor must not leak resolve_conflicts state.

    "ASD" is ambiguous (two HPO ids); with conflict resolution on, exactly
    one survives, with it off, both are kept.
    """
    for resolve, expected_count in ((True, 1), (False, 2), (True, 1)):
        extract = Extractor(resolve_conflicts=resolve)
        self.assertEqual(extract.resolve_conflicts, resolve)
        res = extract.hpo("ASD")
        self.assertEqual(len(res.entries[0]['hpid']), expected_count)
def test_extract_ambiguous(self):
    """The conflict resolver picks the contextually correct sense of ASD."""
    extractor = Extractor(resolve_conflicts=True)
    expected = [{"hpid": ["HP:0001631"],
                 "index": [44, 47],
                 "matched": "ASD"}]
    result = extractor.hpo(
        "secundum, all underwent surgical repair for ASD except for 1 individual whose defect spontaneously closed"
    )
    self.assertEqual(expected, result.entries_sans_context)
def test_stop_word_phenos(self):
    """Stop words between phenotype tokens respect max_neighbors."""
    # A stop word inside a multiword phenotype is bridged with neighbors=3
    extractor = Extractor(correct_spelling=True, max_neighbors=3)
    expected = [{"hpid": ["HP:0001263"],
                 "index": [0, 23],
                 "matched": "developmental and delay"}]
    self.assertEqual(
        extractor.hpo("developmental and delay").entries_sans_context,
        expected)

    # ...but not with max_neighbors=1
    extractor = Extractor(correct_spelling=True, max_neighbors=1)
    self.assertEqual(
        extractor.hpo("developmental and delay").entries_sans_context, [])

    extractor = Extractor(correct_spelling=False)

    # Single phenotype followed by a multiword phenotype
    expected = [{"hpid": ["HP:0000750"],
                 "index": [0, 12],
                 "matched": "Speech delay"},
                {"hpid": ["HP:0100710"],
                 "index": [17, 26],
                 "matched": "impulsive"}]
    self.assertEqual(
        extractor.hpo("Speech delay and impulsive").entries_sans_context,
        expected)

    # Multiword phenotype followed by a single phenotype
    expected = [{"hpid": ["HP:0100710"],
                 "index": [0, 9],
                 "matched": "Impulsive"},
                {"hpid": ["HP:0000750"],
                 "index": [14, 26],
                 "matched": "speech delay"}]
    self.assertEqual(
        extractor.hpo("Impulsive and speech delay").entries_sans_context,
        expected)
def test_negated_hpo_retention(self):
    """Negated phenotypes are reported via negated_hpids."""
    extractor = Extractor(correct_spelling=False,
                          remove_overlapping=True,
                          resolve_conflicts=True,
                          max_neighbors=2,
                          phenotypes_only=False,
                          remove_negated=True)

    # (input text, expected negated_hpids) pairs
    cases = (
        ("Patient has developmental delay but no hypotonia",
         ["HP:0001252"]),
        ("developmental delay and a wide mouth", []),
        ("developmental delay with no wide mouth", ['HP:0000154']),
        ("developmental delay without a wide mouth", ['HP:0000154']),
        ("no developmental delay, but has a wide mouth", ['HP:0001263']),
        ("the patient has a wide mouth but no developmental delay.",
         ['HP:0001263']),
    )
    for text, expected in cases:
        self.assertEqual(expected, extractor.hpo(text).negated_hpids)

    # Both sides of an "either ... or" negation are negated
    resp = extractor.hpo(
        "the patient does not have either a wide mouth or developmental delay."
    )
    self.assertEqual({'HP:0000154', 'HP:0001263'}, set(resp.negated_hpids))
def test_extract_without_negated(self):
    """remove_negated drops negated terms from hpids.

    Negation that is part of the matched string itself ("absent speech")
    must not count as negation.
    """
    extractor = Extractor(remove_negated=True)
    resp = extractor.hpo("the patient presents with absent speech")
    self.assertEqual(resp.hpids, ['HP:0001344'])

    # detect_negation() alone does not remove anything
    extractor = Extractor()
    resp = extractor.hpo("developmental delay and a wide mouth")
    resp.detect_negation()
    self.assertEqual({'HP:0000154', 'HP:0001263'}, set(resp.hpids))

    # ...removal happens only after remove_negated()
    resp = extractor.hpo("developmental delay with no wide mouth")
    resp.detect_negation()
    resp.remove_negated()
    self.assertEqual(['HP:0001263'], resp.hpids)

    # With remove_negated=True the extractor does both steps itself
    cases = (
        ("developmental delay without a wide mouth", ['HP:0001263']),
        ("no developmental delay, but has a wide mouth", ['HP:0000154']),
        ("the patient has a wide mouth but no developmental delay.",
         ['HP:0000154']),
        ("the patient does not have either a wide mouth or developmental delay.",
         []),
    )
    for text, expected in cases:
        extractor = Extractor(remove_negated=True)
        self.assertEqual(expected, extractor.hpo(text).hpids)
def test_extract_json_property(self):
    """resp.json serializes entries_sans_context as a JSON string."""
    extractor = Extractor(max_neighbors=2)
    expected = json.dumps([{"hpid": ["HP:0000154"],
                            "index": [16, 26],
                            "matched": "wide mouth"}])
    resp = extractor.hpo("Wide gait and a wide mouth")
    self.assertEqual(expected, resp.json)
def test_extract_from_repeated_context(self):
    """A repeated word ('wide') only matches inside the real phenotype."""
    extractor = Extractor()
    expected = [{"hpid": ["HP:0000154"],
                 "index": [16, 26],
                 "matched": "wide mouth"}]
    resp = extractor.hpo("Wide gait and a wide mouth")
    self.assertEqual(expected, resp.entries_sans_context)
def test_handling_term_hyphenation(self):
    """Hyphenated HPO term names match both with hyphens and with spaces."""
    extractor = Extractor(correct_spelling=False,
                          remove_overlapping=True,
                          resolve_conflicts=True,
                          max_neighbors=2,
                          phenotypes_only=False)

    # Candidate terms: hyphenated names containing no commas or periods
    hyphenated_phenos = [
        (hpo_network.nodes()[node]['name'], node)
        for node in hpo_network.nodes()
        if ('-' in hpo_network.nodes()[node]['name']
            and ',' not in hpo_network.nodes()[node]['name']
            and '.' not in hpo_network.nodes()[node]['name'])
    ]

    # Phenotypes where word-order is important is a limitation of the
    # current parsing method
    known_bugs = ['HP:0000510', 'HP:0030932', 'HP:0001215']
    long_phenos = ['HP:0011654', 'HP:0410303', 'HP:0000654', 'HP:0000847',
                   'HP:0000864', 'HP:0000877', 'HP:0001074']

    skip = known_bugs + long_phenos
    hyphenated_phenos = [item for item in hyphenated_phenos
                         if item[1] not in skip]
    hyphenated_phenos = [item for item in hyphenated_phenos
                         if item[1] not in non_phenos]

    # current version is not expected to extract very long phenotypes,
    # so only check the first ten candidates
    for name, hpid in hyphenated_phenos[:10]:
        self.assertEqual(extractor.hpo(name).hpids, [hpid])
        # replacing hyphens with spaces must not change the match
        dehyphenated = name.replace('-', ' ')
        self.assertEqual(extractor.hpo(dehyphenated).hpids, [hpid])
def test_custom_synonyms(self):
    """User-supplied synonyms are honored during extraction."""
    custom_syn = {"HP:0001263": ['DD', 'GDD'],
                  "HP:0000729": ['ASD', 'PDD']}
    extractor = Extractor(custom_synonyms=custom_syn)
    expected = [{"hpid": ["HP:0001263"], "index": [0, 3], "matched": "GDD"},
                {"hpid": ["HP:0001263"], "index": [4, 6], "matched": "DD"}]
    self.assertEqual(extractor.hpo("GDD DD").entries_sans_context, expected)
def test_capitalization_affecting_outcome(self):
    """Capitalization and leading whitespace must not change the result."""
    extractor = Extractor(correct_spelling=False)

    equivalent_inputs = (
        "enlarged heart",
        " enlarged heart",
        "Enlarged heart",
        " Enlarged heart",
        "Male with Sotos, enlarged heart",
    )
    for text in equivalent_inputs:
        self.assertEqual(extractor.hpo(text).hpids, ['HP:0001640'])

    resp = extractor.hpo("Myoclonus Seizures")
    self.assertEqual(set(resp.hpids), {'HP:0002123'})
def test_iteration_over_chunks(self):
    """Repeated extractions on one Extractor keep working in any order."""
    forward = ['Developmental delay', 'Hypotonia']
    backward = ['Hypotonia', 'Developmental delay']

    # Spellcheck off: both orders work
    extractor = Extractor(correct_spelling=False)
    for sentence in forward + backward:
        self.assertNotEqual(extractor.hpo(sentence).n_entries, 0)

    # Spellcheck on: both orders work, repeatedly
    extractor = Extractor(correct_spelling=True)
    for sentence in forward + backward + forward + backward:
        self.assertNotEqual(extractor.hpo(sentence).n_entries, 0)
def run_phenoseries_experiment(outdir=None,
                               phenotypic_series_filepath=None,
                               min_hpos=2,
                               min_entities=4,
                               phenoseries_fraction=1.0,
                               scoring_method="HRSS",
                               threads=1,
                               omim_phenotypes_file=None,
                               pairwise_mim_scores_file=None):
    """Run the OMIM phenotypic-series ranking experiment.

    Builds (or loads) per-MIM HPO term lists, scores all MIM pairs with
    phenopy, and writes a rank DataFrame plus intermediate files to
    ``outdir``.

    :param outdir: output directory; defaults to the current working dir.
    :param phenotypic_series_filepath: TSV mapping PS ids to MIM ids.
    :param min_hpos: MIMs with ``len(hpo_ids) <= min_hpos`` are dropped.
        NOTE(review): the comparison is ``<=`` although comments say
        "less than" — preserved as-is; confirm intent before changing.
    :param min_entities: phenotypic series with fewer MIMs are dropped.
    :param phenoseries_fraction: fraction of the PS file to sample.
    :param scoring_method: phenopy scoring method name (e.g. "HRSS").
    :param threads: worker count passed to the scorer.
    :param omim_phenotypes_file: "" to scrape OMIM, else a precomputed TSV.
    :param pairwise_mim_scores_file: "" to compute scores, else a TSV.
    """
    if outdir is None:
        # BUG FIX: was `outdir = os.getcwd` (bound the function object, not
        # the path string), which would break every os.path.join(outdir, ...)
        outdir = os.getcwd()

    # load HPO network: data directory and annotation files
    phenopy_data_directory = os.path.join(os.getenv("HOME"), ".phenopy/data")
    obo_file = os.path.join(phenopy_data_directory, "hp.obo")
    disease_to_phenotype_file = os.path.join(phenopy_data_directory,
                                             "phenotype.hpoa")

    hpo_network, alt2prim, _ = generate_annotated_hpo_network(
        obo_file, disease_to_phenotype_file, ages_distribution_file=None)

    # read the phenotypic series file as a DataFrame
    psdf = pd.read_csv(
        phenotypic_series_filepath,
        sep="\t",
        comment="#",
        names=["PS", "MIM", "Phenotype"],
    )
    # null phenotypes are actually null MIM id fields, so just drop these
    psdf = psdf.dropna().sample(frac=phenoseries_fraction, random_state=42)
    psdf.reset_index(inplace=True, drop=True)

    # phenotypic series -> list of (deduplicated) omim ids, keeping only
    # series with at least two MIMs
    ps2mimids = {}
    for ps, mim_ids in psdf.groupby(["PS"])["MIM"]:
        if len(mim_ids) >= 2:
            ps2mimids[ps] = list(set([int(mid) for mid in mim_ids.tolist()]))

    # invert: mim id -> list of phenotypic series it belongs to
    mim2psids = {}
    for mim_id, ps in psdf.groupby(["MIM"])["PS"]:
        mim2psids[int(mim_id)] = ps.tolist()

    # OMIM text sections mined for phenotype mentions
    fields_to_use = [
        "text",
        "description",
        "otherFeatures",
        "biochemicalFeatures",
        "diagnosis",
        "clinicalFeatures",
    ]

    if omim_phenotypes_file == "":
        logger.info("Scraping OMIM Diseases text")
        mim_texts = {}
        for mim_id in mim2psids:
            mim_response = request_mimid_info(mim_id)
            try:
                mim_info = mim_response.json()
            except AttributeError:
                # NOTE(review): a single failed request aborts the whole
                # scrape (`break`, not `continue`) — confirm this is intended.
                break
            mim_text = mim_info["omim"]["entryList"][0]["entry"][
                "textSectionList"]

            # concatenate the text of the sections we care about
            all_mim_text = ""
            for text_section in mim_text:
                section_name = text_section["textSection"]["textSectionName"]
                if section_name in fields_to_use:
                    # unique_section_names.add(section_name)
                    all_mim_text += " " + text_section["textSection"][
                        "textSectionContent"]

            mim_texts[mim_id] = all_mim_text

        # instantiate txt2hpo's Exctractor class to perform named entity
        # recognition on the scraped text
        extractor = Extractor(remove_negated=True,
                              max_neighbors=3,
                              correct_spelling=False)

        # loop over the MIM ids and extract hpo ids from each MIM's text
        mim_hpos = {}
        for mim_id in mim2psids:
            mim_hpos[mim_id] = extractor.hpo(mim_texts[mim_id]).hpids

        mimdf = pd.DataFrame()
        mimdf["omim_id"] = list(mim2psids.keys())
        mimdf["hpo_terms"] = mimdf["omim_id"].apply(
            lambda mim_id: mim_hpos[mim_id])
        mimdf.to_csv(os.path.join(outdir, "omim_phenotypes.txt"),
                     index=False,
                     sep='\t')
    else:
        logger.info("You passed an OMIM disease to phenotype file")
        try:
            mimdf = pd.read_csv(omim_phenotypes_file, sep="\t")
            mimdf["omim_id"] = mimdf["omim_id"].astype(int)
            mimdf["hpo_terms"] = mimdf["hpo_terms"].apply(literal_eval)
            mim_hpos = dict(zip(mimdf["omim_id"], mimdf["hpo_terms"]))
        except FileNotFoundError:
            sys.exit("Please provide a valid file path")

    # do we need this?
    # mim_hpos = {mim_id: hpos for mim_id, hpos in mim_hpos.items()}

    # normalize HPO ids (map alternates to primaries, drop unknowns)
    for mim_id, hpo_ids in mim_hpos.items():
        mim_hpos[mim_id] = convert_and_filter_hpoids(hpo_ids, hpo_network,
                                                     alt2prim)

    # collect entities (mims) that have too few HPO terms
    mims_to_remove = []
    for mim_id, hpo_ids in mim_hpos.copy().items():
        if len(hpo_ids) <= min_hpos:
            mims_to_remove.append(mim_id)

    # remove those mims from each phenotypic series
    experiment_ps2mimids = {}
    for ps, mimids in ps2mimids.copy().items():
        experiment_ps2mimids[ps] = []
        for ps_mim_id in mimids:
            if ps_mim_id not in mims_to_remove:
                experiment_ps2mimids[ps].append(ps_mim_id)

    # after removing entities, make sure each series still has the minimum
    # number of entities
    remove_these_ps = []
    for ps, mimids in experiment_ps2mimids.items():
        if len(mimids) < min_entities:
            remove_these_ps.append(ps)
    for psid in remove_these_ps:
        del experiment_ps2mimids[psid]

    # unique list of surviving entity ids, for scoring
    experiment_omims = set()
    for psid, mim_ids in experiment_ps2mimids.items():
        for mim in mim_ids:
            experiment_omims.add(mim)
    experiment_omims = list(experiment_omims)

    # DataFrame of surviving entities and their HPO terms
    mimdf = pd.DataFrame()
    mimdf["omim_id"] = experiment_omims
    mimdf["hpo_terms"] = mimdf["omim_id"].apply(
        lambda mim_id: mim_hpos[mim_id])

    if pairwise_mim_scores_file == "":
        scorer = Scorer(hpo_network, scoring_method=scoring_method)
        records = [{
            "record_id": mim_id,
            "terms": convert_and_filter_hpoids(hpo_terms, hpo_network,
                                               alt2prim),
            "weights": {},
        } for mim_id, hpo_terms in dict(
            zip(mimdf["omim_id"], mimdf["hpo_terms"])).items()]

        results = scorer.score_records(
            records, records,
            half_product(len(records), len(records)),
            threads=threads)

        pairwise_scores = pd.DataFrame(
            results, columns=["mimid1", "mimid2", "phenopy-score"])
        # convert to square form
        pairwise_scores = pairwise_scores.set_index(
            ["mimid1", "mimid2"]).unstack()
        # This pandas method chain fills in the missing scores of the square
        # matrix with the values from the transpose of df.
        pairwise_scores = (pairwise_scores["phenopy-score"].reset_index(
            drop=True).fillna(
                pairwise_scores.T.droplevel(0).reset_index(
                    drop=True)).set_index(pairwise_scores.index, drop=True))
        # reindex rows and columns with the mimdf order
        pairwise_scores = pairwise_scores.reindex(mimdf["omim_id"].tolist())
        pairwise_scores = pairwise_scores[mimdf["omim_id"].tolist()]
        pd.DataFrame(pairwise_scores).to_csv(
            os.path.join(outdir, 'phenoseries.psim_matrix.txt'), sep='\t')
    else:
        pairwise_scores = pd.read_csv(pairwise_mim_scores_file, sep='\t')

    ranksdf = make_rank_dataframe(
        pairwise_scores.astype(float).values, mimdf, experiment_ps2mimids)
    ranksdf.to_csv(os.path.join(outdir, "phenoseries.rankdf.txt"), sep="\t")
def test_hpo_big_text_spellcheck_on(self):
    """Parsing a full page with the spellchecker enabled (the default)."""
    extractor = Extractor(max_neighbors=2, remove_overlapping=False)
    result = extractor.hpo(test_case11_text)
    self.assertEqual(result.n_entries, 12)
def test_hpo(self):
    """End-to-end extraction scenarios for Extractor.hpo()."""
    extractor = Extractor(correct_spelling=False)

    # Single phenotype
    self.assertEqual(
        extractor.hpo("Hypotonia").entries_sans_context,
        [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonia"}])

    # Non-phenotypic leading term
    self.assertEqual(
        extractor.hpo("Word hypotonia").entries_sans_context,
        [{"hpid": ["HP:0001252"], "index": [5, 14], "matched": "hypotonia"}])

    # Punctuation handling
    self.assertEqual(
        extractor.hpo("Word, hypotonia").entries_sans_context,
        [{"hpid": ["HP:0001252"], "index": [6, 15], "matched": "hypotonia"}])

    # Multiword phenotype
    self.assertEqual(
        extractor.hpo("Developmental delay").entries_sans_context,
        [{"hpid": ["HP:0001263"], "index": [0, 19],
          "matched": "Developmental delay"}])

    # Multiword phenotype with reversed word order
    self.assertEqual(
        extractor.hpo("Delay developmental").entries_sans_context,
        [{"hpid": ["HP:0001263"], "index": [0, 19],
          "matched": "Delay developmental"}])

    # Inflectional endings
    self.assertEqual(
        extractor.hpo("Hypotonic").entries_sans_context,
        [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonic"}])

    # Inflectional endings + reversed word order
    self.assertEqual(
        extractor.hpo("Delayed development").entries_sans_context,
        [{"hpid": ["HP:0001263"], "index": [0, 19],
          "matched": "Delayed development"}])

    # Multiword phenotype following an unrelated phenotypic term
    self.assertEqual(
        extractor.hpo("Delay hearing loss").entries_sans_context,
        [{"hpid": ["HP:0000365"], "index": [6, 18],
          "matched": "hearing loss"}])

    # Multiword phenotype preceding an unrelated phenotypic term
    self.assertEqual(
        extractor.hpo("Hearing loss following").entries_sans_context,
        [{"hpid": ["HP:0000365"], "index": [0, 12],
          "matched": "Hearing loss"}])

    # Two multiword phenotypes interrupted by an unrelated term
    self.assertEqual(
        extractor.hpo(
            "Hearing loss following developmental delay"
        ).entries_sans_context,
        [{"hpid": ["HP:0000365"], "index": [0, 12],
          "matched": "Hearing loss"},
         {"hpid": ["HP:0001263"], "index": [23, 42],
          "matched": "developmental delay"}])

    # Spellchecker on: misspelling is corrected before matching
    extractor = Extractor(correct_spelling=True)
    self.assertEqual(
        extractor.hpo("Hyptonic").entries_sans_context,
        [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonic"}])

    # Spellchecker off: the same misspelling yields no match
    extractor = Extractor(correct_spelling=False)
    self.assertEqual(extractor.hpo("Hyptonic").entries_sans_context, [])

    self.assertEqual(
        [{"hpid": ["HP:0002757"], "index": [12, 30],
          "matched": "multiple fractures"},
         {"hpid": ["HP:0000938"], "index": [35, 45],
          "matched": "osteopenia"}],
        extractor.hpo(
            "Female with multiple fractures and osteopenia NA NA"
        ).entries_sans_context)

    self.assertEqual(
        [{"hpid": ["HP:0001156"], "index": [30, 43],
          "matched": "brachydactyly"}],
        extractor.hpo(
            "Female with fourth metacarpal brachydactyly"
        ).entries_sans_context)

    # Overlapping matches are all retained when remove_overlapping=False
    extractor = Extractor(correct_spelling=False, remove_overlapping=False)
    expected = [
        {"hpid": ["HP:0000964"], "index": [10, 16], "matched": "eczema"},
        {"hpid": ["HP:0000988"], "index": [18, 27], "matched": "skin rash"},
        {"hpid": ["HP:0000988"], "index": [23, 27], "matched": "rash"},
        {"hpid": ["HP:0008070"], "index": [33, 44],
         "matched": "sparse hair"},
    ]
    self.assertEqual(
        expected,
        extractor.hpo(
            "Male with eczema, skin rash, and sparse hair"
        ).entries_sans_context)

    # Abbreviated phenotype
    self.assertEqual(
        extractor.hpo("RA").entries_sans_context,
        [{"hpid": ["HP:0001370"], "index": [0, 2], "matched": "RA"}])

    # Multiple phenotypes in one string
    self.assertEqual(
        extractor.hpo("Hypotonia, developmental delay").entries_sans_context,
        [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonia"},
         {"hpid": ["HP:0001263"], "index": [11, 30],
          "matched": "developmental delay"}])

    # Term indexing when the text is chunked by max_length
    extractor = Extractor(correct_spelling=False,
                          max_length=20,
                          chunk_by="max_length")
    self.assertEqual(
        extractor.hpo("Developmental delay, hypotonia").entries_sans_context,
        [{"hpid": ["HP:0001263"], "index": [0, 19],
          "matched": "Developmental delay"},
         {"hpid": ["HP:0001252"], "index": [21, 30],
          "matched": "hypotonia"}])