def generate_annotated_hpo_network(obo_file, disease_to_phenotype_file, annotations_file=None, ages_distribution_file=None):
    """Load an HPO network and annotate it with disease associations.

    Args:
        obo_file: Passed to ``load_network`` to build the initial network.
        disease_to_phenotype_file: Passed to ``load_d2p`` together with the
            network and the result of ``generate_alternate_ids``.
        annotations_file: Optional; forwarded unchanged to ``annotate``.
        ages_distribution_file: Optional; forwarded unchanged to ``annotate``.

    Returns:
        A 3-tuple ``(hpo_network, alt2prim, disease_records)``: the network
        returned by ``annotate``, the value returned by
        ``generate_alternate_ids``, and the disease records from ``load_d2p``.
    """
    network = load_network(obo_file)
    alt2prim = generate_alternate_ids(network)

    # Disease records plus the phenotype -> diseases associations.
    disease_records, phenotype_to_diseases = load_d2p(
        disease_to_phenotype_file, network, alt2prim
    )

    # The number of disease records is handed to annotate() as a count.
    num_disease_records = len(disease_records)
    annotated_network = annotate(
        network,
        phenotype_to_diseases,
        num_disease_records,
        alt2prim,
        annotations_file=annotations_file,
        ages_distribution_file=ages_distribution_file,
    )

    return annotated_network, alt2prim, disease_records
def test_ic_custom(self):
    """Check the 'ic' value of ``self.hpo_id`` when a custom annotations file is supplied."""
    annotations_path = os.path.join(self.parent_dir, 'data/test.score-long.txt')

    # Annotate a freshly loaded network so the class-level one is not reused here.
    annotated = annotate(
        load_network(self.obo_file),
        self.phenotype_to_diseases,
        self.num_diseases_annotated,
        self.alt2prim,
        annotations_file=annotations_path,
    )

    observed_ic = annotated.nodes[self.hpo_id]['ic']
    # Third positional argument is the number of decimal places compared.
    self.assertAlmostEqual(observed_ic, 6.38, 1)
def test_age_weight(self):
    """Exercise age-based term weights and ``best_match_weighted_average``."""
    first_terms = ['HP:0001251', 'HP:0001263', 'HP:0001290', 'HP:0004322']  # ATAX, DD, HYP, SS
    second_terms = ['HP:0001263', 'HP:0001249', 'HP:0001290']  # DD, ID, HYP

    # Re-annotate the network, this time supplying the ages distribution file.
    self.hpo_network = annotate(
        self.hpo_network,
        self.phenotype_to_diseases,
        self.num_diseases_annotated,
        self.alt2prim,
        ages_distribution_file=self.ages_distribution_file,
    )

    first_age, second_age = 9.0, 4.0

    # Each term list is weighted with the *other* age value; the expected
    # score asserted below was produced with this pairing.
    first_weights = {
        'age': calculate_age_weights(first_terms, second_age, self.hpo_network),
    }
    second_weights = {
        'age': calculate_age_weights(second_terms, first_age, self.hpo_network),
    }

    # Pairwise score matrix: one row per first term, one 'score' column per second term.
    pairwise_scores = [
        [4.22595743e-02, 3.92122308e-02, 3.04851573e-04],
        [1.07473687e-01, 5.05101655e-01, 3.78305515e-04],
        [3.69780479e-04, 3.78305515e-04, 4.64651944e-01],
        [4.17139800e-04, 4.12232546e-04, 3.67984322e-04],
    ]
    row_labels = pd.Index(first_terms, name='a')
    column_labels = pd.MultiIndex.from_arrays(
        [['score'] * len(second_terms), second_terms],
        names=[None, 'b'],
    )
    score_frame = pd.DataFrame(pairwise_scores, index=row_labels, columns=column_labels)

    # Best match weighted average using the age weights.
    weighted_score = self.scorer.best_match_weighted_average(
        score_frame, first_weights, second_weights
    )
    self.assertAlmostEqual(weighted_score, 0.3741, 4)

    # With every weight set to 1.0 the result should equal the unweighted BMA.
    first_weights = {'disease_frequency': [1.0] * len(first_terms)}
    second_weights = {'disease_frequency': [1.0] * len(second_terms)}
    uniform_score = self.scorer.best_match_weighted_average(
        score_frame, first_weights, second_weights
    )
    self.assertAlmostEqual(uniform_score, 0.2985, 4)

    # A term missing from the network, then a term in the network with no age:
    # both are expected to yield a single weight of 1.0.
    for lone_term in ('HP:Not_a_term', 'HP:0000001'):
        fallback_weights = calculate_age_weights([lone_term], second_age, self.hpo_network)
        self.assertEqual(fallback_weights, [1.0])
def test_score_pairs_age(self):
    """Read a records file and score record pairs with the 'BMWA' summarization."""
    # Re-annotate the network with the ages distribution file before parsing.
    self.hpo_network = annotate(
        self.hpo_network,
        self.phenotype_to_diseases,
        self.num_diseases_annotated,
        self.alt2prim,
        ages_distribution_file=self.ages_distribution_file,
    )

    records_path = os.path.join(self.parent_dir, 'data/test.score-short.txt')
    records = parse_input(records_path, self.hpo_network, self.alt2prim)

    scorer = Scorer(self.hpo_network, summarization_method='BMWA', min_score_mask=None)

    def pick(record_ids):
        # Keep only the records whose 'record_id' is one of the requested ids.
        return [rec for rec in records if rec['record_id'] in record_ids]

    # Two records scored against each other.
    chosen = pick(['118200', '118210'])
    results = scorer.score_records(chosen, chosen, [(0, 1)])
    self.assertEqual(len(results), 1)

    # Expected value: weighted mean of the listed scores with the listed weights.
    expected = np.average(
        [0.166, 1.0, 1.0, 0.125, 0.25, 1.0, 1.0],
        weights=[0.481, 1.0, 1.0, 0.0446, 1.0, 1.0, 1.0],
    )
    self.assertAlmostEqual(float(results[0][2]), expected, 2)

    # Identical records where one has an age and the other does not.
    chosen = pick(['118210', '118211'])
    results = scorer.score_records(chosen, chosen, [(0, 1)])
    self.assertEqual(len(results), 1)
    self.assertAlmostEqual(float(results[0][2]), 1.0, 1)
def test_annotate_network(self):
    """Annotate a freshly loaded network and check the 'ic' of one node."""
    network = load_network(self.obo_file)
    alt2prim = generate_alternate_ids(network)

    # Phenotype -> diseases associations from the HPOA file under the test data dir.
    hpoa_path = os.path.join(self.parent_dir, 'data/phenotype.hpoa')
    disease_records, phenotype_to_diseases = load_d2p(hpoa_path, network, alt2prim)

    annotated = annotate(
        network,
        phenotype_to_diseases,
        len(disease_records),
        alt2prim,
    )

    node_ic = annotated.nodes['HP:0010863']['ic']
    self.assertAlmostEqual(node_ic, 5.69, 2)
def setUpClass(cls):
    """Build the fixtures stored on ``cls``: paths, the annotated network and d2p data.

    NOTE(review): no ``@classmethod`` decorator is visible in this span;
    confirm it is present where this method is defined.
    """
    # Directory containing this test module; data files live beneath it.
    here = os.path.dirname(os.path.realpath(__file__))
    cls.parent_dir = here

    # Fixed identifiers and paths that do not depend on the loaded network.
    cls.hpo_id = 'HP:0010863'
    cls.obo_file = os.path.join(here, 'data/hp.obo')
    cls.disease_to_phenotype_file = os.path.join(here, 'data/phenotype.hpoa')
    cls.disease_to_phenotype_output_file = os.path.join(here, 'data/phenotype.noparents.hpoa')

    # Load the network and derive the alternate-id mapping from it.
    raw_network = load_network(cls.obo_file)
    cls.alt2prim = generate_alternate_ids(raw_network)

    # Load the disease-to-phenotype associations.
    cls.disease_records, cls.phenotype_to_diseases = load_d2p(
        cls.disease_to_phenotype_file, raw_network, cls.alt2prim
    )
    cls.num_diseases_annotated = len(cls.disease_records)

    # Store the network returned by annotate().
    cls.hpo_network = annotate(
        raw_network,
        cls.phenotype_to_diseases,
        cls.num_diseases_annotated,
        cls.alt2prim,
    )
def setUp(cls):
    """Prepare paths, the annotated network, d2p data and a ``Scorer`` on the receiver.

    NOTE(review): the receiver is named ``cls`` in the original and is kept
    as is; whether this method is decorated is not visible in this span.
    """
    # Directory containing this test module; data files live beneath it.
    here = os.path.dirname(os.path.realpath(__file__))
    cls.parent_dir = here

    # Input file locations.
    cls.obo_file = os.path.join(here, 'data/hp.obo')
    cls.ages_distribution_file = os.path.join(here, 'data/phenotype_age.tsv')
    cls.disease_to_phenotype_file = os.path.join(here, 'data/phenotype.hpoa')

    # Load the network and derive the alternate-id mapping from it.
    raw_network = load_network(cls.obo_file)
    cls.alt2prim = generate_alternate_ids(raw_network)

    # Load the disease-to-phenotype associations.
    cls.disease_records, cls.phenotype_to_diseases = load_d2p(
        cls.disease_to_phenotype_file, raw_network, cls.alt2prim
    )
    cls.num_diseases_annotated = len(cls.disease_records)

    # Annotated without an ages distribution file; individual tests may re-annotate.
    cls.hpo_network = annotate(
        raw_network,
        cls.phenotype_to_diseases,
        cls.num_diseases_annotated,
        cls.alt2prim,
    )

    # Scorer over the annotated network, with min_score_mask set to None.
    cls.scorer = Scorer(cls.hpo_network, min_score_mask=None)