def get_priors_from_models(models, categories, obo, dark=1):
    """Compute cohort-wide phenotype frequencies

    Arguments:
        models        dictionary of Entity objects
        categories    set, determines what models to use in the calculation
        obo           object of class Obo
        dark          integer, dark count for phenotype normalization

    Returns:
        dict mapping phenotypes (from obo) to values [0,1]
        integer, number of models used to inform the prior
    """

    # get the subset of the models that satisfy the criteria
    # (only the dictionary values are needed, the keys are ignored)
    hits = filter_entities_cat(list(models.values()), categories)

    # the ontology ids are used several times below; fetch them once
    phenotypes = list(obo.ids())

    # defaults used for imputation are all zero; every running count
    # starts at the dark count
    obodefaults = dict.fromkeys(phenotypes, 0)
    freqcounts = dict.fromkeys(phenotypes, dark)

    # transfer phenotypes into representations
    for entity in hits:
        # prepare a concise representation from the entity's own data
        rep = Representation(name=entity.id)
        for datum in entity.data:
            rep.set(datum.phenotype, datum.value)
        # convert to a complete representation; rep.data is indexed by
        # every ontology id in the counting loop below
        rep.impute(obo, obodefaults)
        # count phenotypes
        for phenotype in phenotypes:
            freqcounts[phenotype] += rep.data[phenotype]

    # convert counts into frequencies
    result = counts_p(freqcounts, len(hits), dark)
    return result, len(hits)
def test_impute_fromseeds_highfirst(self):
    """imputing values from manually-specified seeds."""

    rep = Representation({})
    # data for two children; DOID:4 is higher in tree, so should gain
    rep.set("DOID:0014667", 0.4).set("DOID:0080015", 0.3)
    rep.impute(self.obo, self.obodef,
               seeds=["DOID:0014667", "DOID:0080015"])

    # the seeded terms keep the values they were given
    for term, value in (("DOID:0014667", 0.4), ("DOID:0080015", 0.3)):
        self.assertAlmostEqual(rep.get(term), value, msg="should remain")

    # expected ancestor value: the two seeded values combined with a
    # third factor of 0.2 (the "prior" mentioned in the message)
    combined = 1 - ((1 - 0.4) * (1 - 0.3) * (1 - 0.2))
    self.assertAlmostEqual(
        rep.get("DOID:4"), combined,
        msg="ancestor gains from two children (and prior)")
    self.assertAlmostEqual(
        rep.get("DOID:655"), 0.2, msg="remain; new DOID:4")
def test_impute_fromseeds_lowfirst(self):
    """imputing values from manually-specified seeds."""

    rep = Representation({})
    # an inconsistent set of values: DOID:4 is higher in tree, so it
    # cannot have a lower value than DOID:0014667
    rep.set("DOID:0014667", 0.4)
    rep.set("DOID:4", 0.1)
    # the seed carrying the lower value (DOID:4) is listed first
    rep.impute(self.obo, self.obodef, seeds=["DOID:4", "DOID:0014667"])

    # both of these terms are expected to take the value set on DOID:4
    for term in ("DOID:0080015", "DOID:655"):
        self.assertEqual(rep.get(term), 0.1, "child of DOID:4")
def test_impute_down_ordering(self):
    """updating values in representation via negative evidence."""

    # two representations holding exactly the same raw values
    forward = Representation({})
    forward.set("DOID:3650", 0.01)
    forward.set("DOID:0014667", 0.05)
    backward = Representation({})
    backward.set("DOID:3650", 0.01)
    backward.set("DOID:0014667", 0.05)

    # imputation down should not depend on order of the seeds
    forward.impute(self.obo, self.obodef,
                   seeds=["DOID:3650", "DOID:0014667"])
    backward.impute(self.obo, self.obodef,
                    seeds=["DOID:0014667", "DOID:3650"])

    self.assertEqual(forward.data, backward.data, "all values the same")
def test_get_reference(self):
    """extract one reference from a representation set."""

    # two imputed representations, each seeded with a single value
    ref_a = Representation(name="refA").set("DOID:0014667", 0.4)
    ref_a.impute(obo, obodefaults)
    ref_b = Representation(name="refB").set("DOID:0080015", 0.6)
    ref_b.impute(obo, obodefaults)

    # reference set keyed by the two reference names
    refset = ReferenceSet({"refA": 0.5, "refB": 0.5}, ids=obo.ids())
    refset.add(ref_a)
    refset.add(ref_b)

    # the representation read back should match what was inserted
    extracted = refset.get_representation("refA")
    self.assertTrue(extracted.equal(ref_a))
def test_sum_with_impute(self):
    """sum of values associated with the representation."""

    rep = Representation({})
    rep.set("DOID:0014667", 1)

    # take the sum once on the raw data and once after imputation
    before = rep.sum()
    rep.impute(self.obo, self.obodef)
    after = rep.sum()

    self.assertEqual(before, 1, "value of one phenotype")
    self.assertGreater(
        after, 2, msg="value for one phenotype+ancestors+defaults")
def test_impute_up_avoid_doubles(self):
    """updating values in representation via positive evidence in DAG"""

    # DOID:11044 in test ontology has two paths to root (DOID:4):
    # one is direct (a shortcut), another goes through 0080015
    rep = Representation({})
    rep.set("DOID:11044", 0.4)
    rep.impute(self.obo, self.obodef)

    self.assertGreater(
        rep.get("DOID:0080015"), 0.2, "ancestor should increase")
    # the intermediate term and the root are expected to end up equal
    intermediate = rep.get("DOID:0080015")
    root = rep.get("DOID:4")
    self.assertAlmostEqual(
        intermediate, root,
        msg="4 should get bumped once, despite two paths from 11044")
def test_add_incrementally(self):
    """transferring values into a representation set."""

    # two imputed representations, each seeded with a single value
    ref_a = Representation(name="refA").set("DOID:0014667", 0.4)
    ref_a.impute(obo, obodefaults)
    ref_b = Representation(name="refB").set("DOID:0080015", 0.6)
    ref_b.impute(obo, obodefaults)

    # add the representations one at a time
    refset = ReferenceSet({"refA": 0.5, "refB": 0.5}, ids=obo.ids())
    refset.add(ref_a)
    refset.add(ref_b)

    # (phenotype, reference name, expected value, failure message)
    checks = (
        ("DOID:0014667", "refA", 0.4,
         "refset should contain inserted data"),
        ("DOID:0080015", "refB", 0.6,
         "refset should contain inserted data"),
        ("DOID:4", "refB", 0.6,
         "refset should contain imputed data"),
    )
    for phenotype, refname, expected, message in checks:
        self.assertEqual(refset.get(phenotype, refname), expected, message)
def test_impute_down(self):
    """updating values in representation via negative evidence."""

    rr = Representation(dict())
    rr.set("unrelated", 0.8)
    rr.set("DOID:0014667", 0.05)
    rr.impute(self.obo, self.obodef)

    # The failure messages must be passed as msg=...: the third positional
    # parameter of assertAlmostEqual is `places`, so a positional string
    # would be used as the rounding precision and raise TypeError whenever
    # the two values are not exactly equal.
    self.assertAlmostEqual(
        rr.get("unrelated"), 0.8, msg="out-of-ontology terms remain")
    self.assertAlmostEqual(
        rr.get("DOID:0014667"), 0.05, msg="set value should remain")
    self.assertAlmostEqual(
        rr.get("DOID:4"), 0.2, msg="ancestors should remain")
    self.assertAlmostEqual(
        rr.get("DOID:0080015"), 0.2, msg="unrelated terms get default")
    # NOTE(review): the expected 0.05 equals the value set on DOID:0014667
    # above, which sits oddly with the "unaffected" wording -- confirm
    # whether the message text is what was intended
    self.assertAlmostEqual(
        rr.get("DOID:655"), 0.05, msg="children are unaffected")
def test_impute_up_always_increases(self):
    """updating values in representation via positive evidence."""

    # work on a copy of the shared defaults, with two entries raised
    priors = self.obodef.copy()
    priors["DOID:0014667"] = 0.5
    priors["DOID:4"] = 1

    rep = Representation({})
    rep.set("DOID:3650", 0.25)
    rep.impute(self.obo, priors)

    self.assertEqual(
        rep.get("DOID:3650"), 0.25, "set value should remain")
    self.assertGreater(
        rep.get("DOID:0060158"), 0.25,
        "ancestors should receive greater score")
    self.assertEqual(
        rep.get("DOID:655"), 0.2, "unrelated should stay at default")
    # ancestor that has already a higher score than what is propagated
    self.assertGreater(
        rep.get("DOID:0014667"), 0.5,
        "ancestor should receive score greater than its prior")
def test_impute_fromseeds_auto(self):
    """imputing values from automatically-ordered seeds."""

    # specify an inconsistent set of values, DOID:4 is higher in tree, so
    # cannot have a lower value than DOID:0014667
    # However, low DOID:4 can impact on other branches
    # NOTE(review): rr1 and rr2 are built by identical calls (neither
    # passes seeds=), although the first message below mentions "manual"
    # -- confirm whether rr1 was meant to use explicit seeds
    rr1 = Representation(dict())
    rr1.set("DOID:0014667", 0.4).set("DOID:4", 0.1)
    rr1.impute(self.obo, self.obodef)
    # auto seeds
    rr2 = Representation(dict())
    rr2.set("DOID:0014667", 0.4).set("DOID:4", 0.1)
    rr2.impute(self.obo, self.obodef)
    # auto seeds, different initial ordering
    rr3 = Representation(dict())
    rr3.set("DOID:4", 0.1).set("DOID:0014667", 0.4)
    rr3.impute(self.obo, self.obodef)

    # assertEqual (rather than assertTrue on ==) reports the differing
    # entries when the data do not match
    self.assertEqual(
        rr1.data, rr2.data, "auto and manual should have same data")
    self.assertEqual(
        rr2.data, rr3.data, "should be = regardless of input order")
    self.assertGreater(
        rr1.data["DOID:0014667"], 0.2,
        "DOID:0014667 increase by direct evidence")
    self.assertGreater(
        rr1.data["DOID:4"], 0.2, "DOID:4 increases driven by 0014667")
    self.assertEqual(
        rr1.data["DOID:11044"], 0.1, "low raw DOID:4 propagates down")
def test_impute_up(self):
    """updating values in representation via positive evidence."""

    rep = Representation({})
    rep.set("unrelated", 0.8).set("DOID:0014667", 0.4)
    rep.impute(self.obo, self.obodef)

    # values that were set explicitly are read back unchanged
    self.assertEqual(
        rep.get("unrelated"), 0.8, msg="out-of-ontology terms remain")
    self.assertEqual(
        rep.get("DOID:0014667"), 0.4, msg="set value should remain")
    # the ancestor must end up above the seeded value
    self.assertGreater(
        rep.get("DOID:4"), 0.4, msg="ancestors should receive greater score")
    # the remaining terms are expected at 0.2
    remaining = (
        ("DOID:0080015", "unrelated terms get default"),
        ("DOID:655", "children are unaffected"),
    )
    for term, message in remaining:
        self.assertEqual(rep.get(term), 0.2, msg=message)