def test_positive_parent_multi(self):
    """Locate positive ancestors when a term has more than one parent."""
    # this ontology links Y7 to both Y2 and Y1
    multi_obo = MinimalObo(join(testdir, "Ymulti.obo"))
    priors = dict.fromkeys(multi_obo.ids(), 0.0001)
    priors["Y:003"] = 0.0002
    priors["Y:005"] = 0.0002

    # two references carrying slightly different annotations
    refset = ReferenceSet(
        dict(refA=0.5, refB=0.5), ids=multi_obo.ids(), row_priors=priors
    )
    ref_a = Representation(name="refA")
    ref_a.set("Y:002", 0.5).set("Y:005", 1).impute(multi_obo, priors)
    ref_b = Representation(name="refB")
    ref_b.set("Y:001", 0.5).impute(multi_obo, priors)
    refset.add(ref_a).add(ref_b)
    refset.learn_obo(multi_obo)

    found_a = refset._positive_ancestor(
        refset.columns["refA"], refset.rows["Y:007"]
    )
    self.assertEqual(found_a, refset.rows["Y:002"], "Y2 is a positive ancestor")

    found_b = refset._positive_ancestor(
        refset.columns["refB"], refset.rows["Y:007"]
    )
    self.assertEqual(
        found_b, refset.rows["Y:001"], "Y1 is a positive immediate parent"
    )
def test_impute_fromseeds_highfirst(self):
    """Impute from an explicit seed list holding two child terms."""
    rep = Representation({})
    # data for two children; DOID:4 is higher in the tree, so it should gain
    rep.set("DOID:0014667", 0.4)
    rep.set("DOID:0080015", 0.3)
    rep.impute(
        self.obo,
        self.obodef,
        seeds=["DOID:0014667", "DOID:0080015"],
    )

    self.assertAlmostEqual(rep.get("DOID:0014667"), 0.4, msg="should remain")
    self.assertAlmostEqual(rep.get("DOID:0080015"), 0.3, msg="should remain")
    expected_ancestor = 1 - ((1 - 0.4) * (1 - 0.3) * (1 - 0.2))
    self.assertAlmostEqual(
        rep.get("DOID:4"),
        expected_ancestor,
        msg="ancestor gains from two children (and prior)",
    )
    self.assertAlmostEqual(rep.get("DOID:655"), 0.2, msg="remain; new DOID:4")
def get_priors_from_models(models, categories, obo, dark=1):
    """Compute cohort-wide phenotype frequencies.

    Arguments:
        models      dictionary of Entity objects
        categories  set, determines what models to use in the calculation
        obo         object of class Obo
        dark        integer, dark count for phenotype normalization

    Returns:
        dict mapping phenotypes (from obo) to values [0,1]
        integer, number of models used to inform the prior
    """
    # get a subset of the models that satisfy the criteria
    # (local name avoids shadowing the builtin `all`)
    candidates = list(models.values())
    hits = filter_entities_cat(candidates, categories)

    # every count starts at the dark count; imputation defaults start at zero
    obodefaults = dict.fromkeys(obo.ids(), 0)
    freqcounts = dict.fromkeys(obo.ids(), dark)

    for entity in hits:
        # prepare a concise representation from the entity's own data
        rep = Representation(name=entity.id)
        for datum in entity.data:
            rep.set(datum.phenotype, datum.value)
        # convert to a complete representation covering the ontology
        rep.impute(obo, obodefaults)
        # accumulate the imputed value of every phenotype
        for phenotype in obo.ids():
            freqcounts[phenotype] += rep.data[phenotype]

    # convert counts into frequencies
    num_hits = len(hits)
    result = counts_p(freqcounts, num_hits, dark)
    return result, num_hits
def test_set_feature_float(self):
    """A value set as an int is retrieved as a float."""
    rr = Representation()
    rr.set("abc", 1)
    self.assertEqual(rr.get("abc"), 1.0)
    # assertIs reports both operands on failure, unlike assertTrue(... is ...)
    self.assertIs(type(rr.get("abc")), float)
def test_has(self):
    """Check which keys a representation reports as set."""
    rep = Representation({"xyz": 0.2})
    rep.set("bob", 0.4)

    for key, note in (("xyz", "set in constructor"), ("bob", "set manually")):
        self.assertTrue(rep.has(key), note)
    self.assertFalse(rep.has("alice"), "not set")
def test_bg_nonverbose(self):
    """A model value equal to the prior leaves the score unchanged."""
    term = "Y:004"
    background_model = Representation(name="TP")
    background_model.set(term, self.priors[term])

    chain = self.rs.inference_chain(background_model, "refA", verbose=False)
    chain.evaluate()
    self.assertEqual(chain.posterior, chain.prior)
def get_complete_null(dbpath):
    """Build the complete representation stored under the id "null"."""
    null_rep = Representation(name="null")
    table = ReferenceCompletePhenotypeTable(dbpath)
    null_rows = DBGenerator(table, where={"id": "null"})
    # copy each (phenotype, value) record into the representation
    for record in null_rows.next():
        null_rep.set(record["phenotype"], record["value"])
    return null_rep
def test_keys(self):
    """Keys cover both default entries and manually set entries."""
    rep = Representation({"xyz": 0.2})
    rep.set("bob", 0.4)
    rep.set("xyz", 0.3)
    rep.defaults(self.defaults)

    expected_keys = ["abc", "xyz", "bob"]
    self.assertEqual(
        rep.keys(),
        expected_keys,
        "keys should have defaults and non-defaults",
    )
def test_impute_fromseeds_lowfirst(self):
    """Impute from explicit seeds, processing the low-valued term first."""
    rep = Representation({})
    # inconsistent input: DOID:4 is higher in the tree, so it cannot
    # have a lower value than DOID:0014667
    rep.set("DOID:0014667", 0.4)
    rep.set("DOID:4", 0.1)
    rep.impute(self.obo, self.obodef, seeds=["DOID:4", "DOID:0014667"])

    for child in ("DOID:0080015", "DOID:655"):
        self.assertEqual(rep.get(child), 0.1, "child of DOID:4")
def test_priors_reps(self):
    """Derive phenotype priors from a small set of representations."""
    rep_a = Representation(name="A")
    rep_a.set("Y:006", 1)
    rep_b = Representation(name="B")
    rep_b.set("Y:001", 0.8)

    # only the count of contributing representations is checked here
    _, num_used = get_priors_from_reps({"A": rep_a, "B": rep_b}, obo)
    self.assertGreater(num_used, 0)
def test_impute_down_ordering(self):
    """Imputation with negative evidence gives the same data for any seed order."""
    # two representations with identical starting values
    pair = []
    for _ in range(2):
        rep = Representation({})
        rep.set("DOID:3650", 0.01).set("DOID:0014667", 0.05)
        pair.append(rep)
    forward, backward = pair

    # imputation down should not depend on order of the seeds
    forward.impute(self.obo, self.obodef, seeds=["DOID:3650", "DOID:0014667"])
    backward.impute(self.obo, self.obodef, seeds=["DOID:0014667", "DOID:3650"])
    self.assertEqual(forward.data, backward.data, "all values the same")
def test_copy(self):
    """A copy starts equal to its source and then changes independently."""
    source = Representation(self.defaults, name="hello")
    source.set("abc", 0.5)
    duplicate = source.copy()

    self.assertEqual(source.name, duplicate.name)
    self.assertEqual(source.get("abc"), duplicate.get("abc"))

    # changing the copy must leave the source untouched
    duplicate.set("abc", 0.75)
    self.assertEqual(source.get("abc"), 0.5)
    self.assertEqual(duplicate.get("abc"), 0.75)
def test_general_representation_get(self):
    """Read back manual, default, and overridden values."""
    rep = Representation({"xyz": 0.2})
    rep.set("bob", 0.4)
    rep.set("xyz", 0.3)
    rep.defaults(self.defaults)

    expectations = (
        ("bob", 0.4, "value should come from manual input"),
        ("abc", 0.1, "value should come from defaults dict"),
        ("xyz", 0.3, "value should come from manual override"),
    )
    for key, expected, note in expectations:
        self.assertEqual(rep.get(key), expected, note)
def test_sum_with_impute(self):
    """The sum of values grows once the representation is imputed."""
    rep = Representation({})
    rep.set("DOID:0014667", 1)

    before = rep.sum()
    rep.impute(self.obo, self.obodef)
    after = rep.sum()

    self.assertEqual(before, 1, "value of one phenotype")
    self.assertGreater(
        after, 2, msg="value for one phenotype+ancestors+defaults"
    )
def test_general_representation_get2(self):
    """Read back values when defaults are applied before the specific ones."""
    # same checks as the sibling test, but defaults go in first
    rep = Representation({"abc": 0.1, "xyz": 0.2})
    rep.defaults(self.defaults)
    rep.set("bob", 0.4)
    rep.set("xyz", 0.3)

    expectations = (
        ("bob", 0.4, "value should come from manual input"),
        ("abc", 0.1, "value should come from defaults dict"),
        ("xyz", 0.3, "value should come from manual override"),
    )
    for key, expected, note in expectations:
        self.assertEqual(rep.get(key), expected, note)
def test_impute_up_avoid_doubles(self):
    """Positive evidence reaches an ancestor once, even via two DAG paths."""
    rep = Representation({})
    # in the test ontology DOID:11044 has two paths to the root (DOID:4):
    # a direct shortcut, and another through DOID:0080015
    rep.set("DOID:11044", 0.4)
    rep.impute(self.obo, self.obodef)

    self.assertGreater(
        rep.get("DOID:0080015"), 0.2, "ancestor should increase"
    )
    via_intermediate = rep.get("DOID:0080015")
    at_root = rep.get("DOID:4")
    self.assertAlmostEqual(
        via_intermediate,
        at_root,
        msg="4 should get bumped once, despite two paths from 11044",
    )
def test_positive_parent(self):
    """Find the nearest term with a positive value, starting at the term itself."""
    refset = ReferenceSet({"refA": 0.5}, ids=Yobo.ids(), row_priors=Ydefaults)
    ref_a = Representation(name="refA")
    ref_a.set("Y:002", 0.5).set("Y:005", 1).impute(Yobo, Ydefaults)
    refset.add(ref_a)
    refset.learn_obo(Yobo)

    column = refset.columns["refA"]
    # (query term, expected positive ancestor, failure message)
    cases = (
        ("Y:002", "Y:002", "Y2 is itself is positive"),
        ("Y:007", "Y:002", "Y2 is immediate parent of Y7"),
        ("Y:006", "Y:005", "Y5 is immediate parent of Y6"),
    )
    for query, expected, note in cases:
        self.assertEqual(
            refset._positive_ancestor(column, refset.rows[query]),
            refset.rows[expected],
            note,
        )
def test_impute_down(self):
    """updating values in representation via negative evidence."""
    rr = Representation(dict())
    rr.set("unrelated", 0.8)
    rr.set("DOID:0014667", 0.05)
    rr.impute(self.obo, self.obodef)

    # The third positional parameter of assertAlmostEqual is `places`, not
    # `msg`; a message passed positionally raises TypeError whenever the two
    # values are not exactly equal. Messages therefore go through msg=.
    self.assertAlmostEqual(rr.get("unrelated"), 0.8,
                           msg="out-of-ontology terms remain")
    self.assertAlmostEqual(rr.get("DOID:0014667"), 0.05,
                           msg="set value should remain")
    self.assertAlmostEqual(rr.get("DOID:4"), 0.2,
                           msg="ancestors should remain")
    self.assertAlmostEqual(rr.get("DOID:0080015"), 0.2,
                           msg="unrelated terms get default")
    self.assertAlmostEqual(rr.get("DOID:655"), 0.05,
                           msg="children are unaffected")
def make_target_reference(reference, oomap, oo_median=None):
    """Translate one representation into the phenotypes of another ontology.

    Each source phenotype is expanded through oomap into (phenotype, score)
    pairs. When oo_median is given, the source value is scaled by
    tanh(score / oo_median). A target phenotype reached more than once keeps
    the largest value.
    """
    target = Representation(name=reference.name)
    target.title = reference.title
    rescale = oo_median is not None

    for source_term, source_value in reference.data.items():
        for target_term, similarity in oomap[source_term]:
            if rescale:
                candidate = source_value * tanh(similarity / oo_median)
            else:
                candidate = source_value
            # never lower a value that an earlier mapping already assigned
            if target.has(target_term):
                candidate = max(candidate, target.get(target_term))
            target.set(target_term, candidate)

    return target
def test_impute_up_always_increases(self):
    """Ancestors gain score from positive evidence, even above a high prior."""
    rep = Representation({})
    rep.set("DOID:3650", 0.25)

    priors = self.obodef.copy()
    priors.update({"DOID:0014667": 0.5, "DOID:4": 1})
    rep.impute(self.obo, priors)

    self.assertEqual(rep.get("DOID:3650"), 0.25, "set value should remain")
    self.assertGreater(
        rep.get("DOID:0060158"), 0.25, "ancestors should receive greater score"
    )
    self.assertEqual(
        rep.get("DOID:655"), 0.2, "unrelated should stay at default"
    )
    # this ancestor already starts above the value being propagated
    self.assertGreater(
        rep.get("DOID:0014667"),
        0.5,
        "ancestor should receive score greater than its prior",
    )
def test_equality(self):
    """Compare representations by name and by content."""
    baseline = Representation(self.defaults, name="hello")
    twin = Representation(self.defaults, name="hello")
    renamed = Representation(self.defaults, name="bye")
    changed = Representation(self.defaults, name="hello")
    changed.set("abc", 100)
    empty = Representation()
    extended = Representation(self.defaults, name="hello")
    extended.set("qqq", 20)

    self.assertTrue(baseline.equal(twin), "all is the same")

    # every one of these must compare as different from the baseline
    unequal = (
        (5, "argument is not a Representation"),
        (renamed, "same content, but different name"),
        (changed, "same name, but different content"),
        (empty, "r5 is empty"),
        (extended, "r6 has more keys"),
        (range(4), "must compare to Representation"),
    )
    for other, note in unequal:
        self.assertFalse(baseline.equal(other), note)
def test_impute_fromseeds_auto(self):
    """imputing values from automatically-ordered seeds."""
    # specify an inconsistent set of values, DOID:4 is higher in tree, so
    # cannot have a lower value than DOID:0014667
    # However, low DOID:4 can impact on other branches
    rr1 = Representation(dict())
    rr1.set("DOID:0014667", 0.4).set("DOID:4", 0.1)
    rr1.impute(self.obo, self.obodef)
    # auto seeds
    rr2 = Representation(dict())
    rr2.set("DOID:0014667", 0.4).set("DOID:4", 0.1)
    rr2.impute(self.obo, self.obodef)
    # auto seeds, different initial ordering
    rr3 = Representation(dict())
    rr3.set("DOID:4", 0.1).set("DOID:0014667", 0.4)
    rr3.impute(self.obo, self.obodef)

    # assertEqual (rather than assertTrue(a == b)) reports the differing
    # entries on failure and matches test_impute_down_ordering
    self.assertEqual(rr1.data, rr2.data,
                     "auto and manual should have same data")
    self.assertEqual(rr2.data, rr3.data,
                     "should be = regardless of input order")
    self.assertGreater(rr1.data["DOID:0014667"], 0.2,
                       "DOID:0014667 increase by direct evidence")
    self.assertGreater(rr1.data["DOID:4"], 0.2,
                       "DOID:4 increases driven by 0014667")
    self.assertEqual(rr1.data["DOID:11044"], 0.1,
                     "low raw DOID:4 propagates down")
def test_underflow(self):
    """attempt to get underflow in individual p."""
    # let model have very sure values
    model = Representation(name="underflow")
    model.set("Y:007", 0.00001).set("Y:004", 1).set("Y:003", 1)
    # let ref universe have two annotations and one null
    refA = Representation(name="refA").set("Y:003", 1)
    refB = Representation(name="refB").set("Y:003", 1)
    rs = ReferenceSet(dict(null=0.98, refA=0.001, refB=0.001),
                      ids=self.obo.ids())
    rs.add(refA).add(refB)
    rs.learn_obo(self.obo)
    rs.prep()
    result = rs.inference(model, verbose=True)

    # The refB check used to appear twice (copy-paste); each reference is
    # now checked exactly once.
    # NOTE(review): the duplicate may have been meant for "null" -- confirm
    # that the inference result carries a "null" entry before adding it.
    for name in ("refA", "refB"):
        self.assertGreaterEqual(result[name], 0,
                                msg="must always be a number, even if zero")
def test_impute_up(self):
    """Positive evidence raises ancestors and leaves other terms alone."""
    rep = Representation({})
    rep.set("unrelated", 0.8)
    rep.set("DOID:0014667", 0.4)
    rep.impute(self.obo, self.obodef)

    self.assertEqual(
        rep.get("unrelated"), 0.8, msg="out-of-ontology terms remain"
    )
    self.assertEqual(
        rep.get("DOID:0014667"), 0.4, msg="set value should remain"
    )
    self.assertGreater(
        rep.get("DOID:4"), 0.4, msg="ancestors should receive greater score"
    )
    # terms that do not gain evidence stay at the 0.2 default
    for term, note in (
        ("DOID:0080015", "unrelated terms get default"),
        ("DOID:655", "children are unaffected"),
    ):
        self.assertEqual(rep.get(term), 0.2, msg=note)
def test_subset(self):
    """make a refset smaller by ignoring some features."""
    # create a reference set
    rs = ReferenceSet(OrderedDict(refA=0.5, refB=0.5),
                      ids=testfeatures, row_priors=zerovals)
    # add some data to the two representations
    r1 = Representation(name="refA")
    r1.set("a", 0.1).set("b", 0.2).set("c", 0.3).set("d", 0.4)
    r2 = Representation(name="refB")
    r2.set("c", 0.6).set("d", 0.7).set("e", 0.8)
    rs.add(r1).add(r2)

    # manually create arrays with the reference set data
    expected_raw_A = [0.1, 0.2, 0.3, 0.4, 0.0]
    expected_raw_B = [0.0, 0.0, 0.6, 0.7, 0.8]
    # These were assertTrue(x, expected), which treats `expected` as the
    # failure message and passes for any non-empty list; assertEqual
    # actually compares the contents.
    # NOTE(review): assumes rs.data[i] holds the values of reference i in
    # feature order -- confirm against ReferenceSet.
    self.assertEqual(list(rs.data[0]), expected_raw_A)
    self.assertEqual(list(rs.data[1]), expected_raw_B)

    # subset to a smaller number of features
    # myids - here c is repeated twice, z is not in the original features
    myids = ["e", "c", "a", "z", "c"]
    rs = ReferenceMatrix(rs, myids)

    # check new shape (three features and two references)
    self.assertEqual(len(rs.rows), 3)
    self.assertEqual(len(rs.row_names), 3)
    self.assertEqual(rs.data.shape, (3, 2))

    # check that the relevant rows are present
    result = set(rs.rows.keys())
    expected = set(myids)
    expected.remove("z")
    self.assertEqual(result, expected)

    # check data subset in output (compared by sum, independent of row order)
    output_A = [0.1, 0.3, 0.0]
    output_B = [0.0, 0.6, 0.8]
    self.assertEqual(sum(rs.data[:, 0]), sum(output_A))
    self.assertEqual(sum(rs.data[:, 1]), sum(output_B))
def test_inference_chain(self):
    """compute an inference chain."""
    # create a reference set
    refA = Representation(name="refA")
    refA.set("Y:002", 1).impute(Yobo, Ydefaults)
    refB = Representation(name="refB")
    refB.set("Y:001", 1).impute(Yobo, Ydefaults)
    rs = ReferenceSet(dict(refA=0.5, refB=0.5),
                      ids=Yobo.ids(), row_priors=Ydefaults)
    rs.add(refA).add(refB)
    rs.learn_obo(Yobo)

    # compute a chain object explaining scoring steps
    chain = rs.inference_chain(refA, "refB", verbose=True)
    self.assertEqual(chain.__dict__["model"], "refA")
    self.assertEqual(chain.__dict__["reference"], "refB")
    self.assertGreater(len(chain.data), 2,
                       "data chain should describe multiple features")

    # assertIn reports the missing key and the container on failure
    first_step = chain.data[0].__dict__
    self.assertIn("background", first_step,
                  "chain data have comparison information")
    self.assertIn("result", first_step,
                  "chain data have TP/FP/etc codes")
def test_FP_with_fp_penalty(self):
    """A lower fp_penalty gives a false positive a higher posterior."""
    # separate reference set with its own priors
    adjusted = self.priors.copy()
    adjusted["Y:003"] = 0.4
    adjusted["Y:002"] = 0.15
    adjusted["Y:007"] = 0.1
    refset = ReferenceSet(
        dict(null=0.4, ref=0.3), ids=Yobo.ids(), row_priors=adjusted
    )

    reference = Representation(name="ref")
    reference.set("Y:001", 1).impute(Yobo, adjusted)
    # after imputation, place Y:007 at half of its prior
    reference.set("Y:007", adjusted["Y:007"] / 2)
    refset.add(self.refnull).add(reference)
    refset.learn_obo(Yobo)

    fp_model = Representation(name="model").set("Y:007", 0.35)

    lenient = refset.inference_chain(
        fp_model, "ref", verbose=True, fp_penalty=0.1
    )
    lenient.evaluate_inference()
    self.assertGreater(lenient.posterior, lenient.prior)

    strict = refset.inference_chain(
        fp_model, "ref", verbose=True, fp_penalty=1
    )
    strict.evaluate_inference()
    self.assertLess(strict.posterior, lenient.posterior)
class ReferenceMatrixTests(unittest.TestCase):
    """Tests for neighbor lookup and averaging of reference representations."""

    def setUp(self):
        """Build a reference matrix from a null reference and four others."""
        self.refnull = Representation(data=null_defaults, name="null")

        self.refA = Representation(data=Ydefaults, name="refA")
        self.refA.set("Y:004", 1)

        self.refB = Representation(data=Ydefaults, name="refB")
        self.refB.set("Y:004", 1).set("Y:001", 0.6).set("Y:003", 0.5)

        self.refC = Representation(data=Ydefaults, name="refC")
        self.refC.set("Y:004", 1).set("Y:001", 0.5)

        self.refD = Representation(data=Ydefaults, name="refD")
        self.refD.set("Y:004", 0.1)

        # every reference gets the same prior
        names = ("null", "refA", "refB", "refC", "refD")
        ref_priors = {name: 0.1 for name in names}
        refset = ReferenceSet(ref_priors, ids=Ydefaults.keys())
        members = (self.refnull, self.refA, self.refB, self.refC, self.refD)
        for member in members:
            refset.add(member)
        self.rm = ReferenceMatrix(refset, list(Ydefaults.keys()))

    def test_neighbors_cosine(self):
        """Find the references most similar to a given one."""
        # column index of each non-null reference; not used by the assertion
        indexes = {
            name: self.rm.columns[name]
            for name in ["refA", "refB", "refC", "refD"]
        }
        neighbors = self.rm.nearest_neighbors("refC", 2)
        self.assertEqual(neighbors, ["refB", "refA"])

    def test_average(self):
        """Average two references into a single representation."""
        averaged = self.rm.get_average(["refB", "refC"])
        expectations = (
            ("Y:004", 1.0),
            ("Y:001", 0.55),
            ("Y:007", Ydefaults["Y:007"]),
        )
        for term, expected in expectations:
            self.assertAlmostEqual(averaged.get(term), expected)
def test_set_feature(self):
    """A value stored under a key is returned unchanged."""
    key, value = "abc", 0.2
    rep = Representation()
    rep.set(key, value)
    self.assertEqual(rep.get(key), 0.2)