def test_average(self):
    """Summarize phenotypes using an average (consistent values).

    Five data points spread over three phenotypes are added to one entity.
    After averaging, exactly one datum per phenotype must remain, carrying
    the averaged tpr and fpr of its inputs.
    """
    # first add several pieces of evidence into an entity object
    m = Entity("abc", "genes")
    d1 = PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05))
    d2 = PhenotypeDatum("MP:007", Experiment(0, 0.4, 0.05))
    d3 = PhenotypeDatum("MP:002", Experiment(1, 0.6, 0.05))
    d4 = PhenotypeDatum("MP:007", Experiment(0, 0.4, 0.15))
    d5 = PhenotypeDatum("MP:009", Experiment(1, 0.6, 0.05))
    m.add(d1).add(d2).add(d3)
    m.add(d4).add(d5)
    self.assertEqual(len(m.data), 5)

    m.average()
    self.assertEqual(len(m.data), 3)
    expected_tpr = {"MP:002": 0.7, "MP:007": 0.4, "MP:009": 0.6}
    expected_fpr = {"MP:002": 0.05, "MP:007": 0.1, "MP:009": 0.05}
    expected_val = {"MP:002": 1, "MP:007": 0, "MP:009": 1}
    # check that the average contains all phenotypes, each exactly once
    self.assertEqual({datum.phenotype for datum in m.data},
                     set(expected_val))
    for datum in m.data:
        iphen = datum.phenotype
        iexp = datum.experiment
        self.assertEqual(iexp.value, expected_val[iphen])
        # tpr and fpr are floating-point means, so compare with a tolerance
        self.assertAlmostEqual(iexp.tpr, expected_tpr[iphen])
        self.assertAlmostEqual(iexp.fpr, expected_fpr[iphen])
def test_consensus_imputed(self):
    """Summarize multiple rows of phenotypes using a consensus, with imputed values.

    Two of the three data points carry values strictly between 0 and 1 for
    the same phenotype; the consensus must collapse them into one datum
    holding the mean value, tpr and fpr.
    """
    # first add several pieces of evidence into an entity object
    m = Entity("abc", "genes")
    d1 = PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05))
    # here add with value between 0 and 1
    d2 = PhenotypeDatum("MP:007", Experiment(0.6, 0.4, 0.05))
    d3 = PhenotypeDatum("MP:007", Experiment(0.4, 0.6, 0.05))
    m.add(d1).add(d2).add(d3)
    self.assertEqual(len(m.data), 3)

    # check that the consensus matches the inputs
    m.consensus()
    self.assertEqual(len(m.data), 2)
    expected_tpr = {"MP:002": 0.8, "MP:007": 0.5}
    expected_fpr = {"MP:002": 0.05, "MP:007": 0.05}
    expected_val = {"MP:002": 1, "MP:007": 0.5}
    # each phenotype must be present exactly once
    self.assertEqual({datum.phenotype for datum in m.data},
                     set(expected_val))
    for datum in m.data:
        iphen = datum.phenotype
        iexp = datum.experiment
        # all three quantities are floating-point means; use a tolerance
        self.assertAlmostEqual(iexp.value, expected_val[iphen])
        self.assertAlmostEqual(iexp.tpr, expected_tpr[iphen])
        self.assertAlmostEqual(iexp.fpr, expected_fpr[iphen])
def test_comparisons(self):
    """Initialization and comparison.

    Experiments with identical fields compare equal; a different tpr or an
    object of another class compares unequal.
    """
    reference = Experiment(1, 0.4, 0.2)
    duplicate = Experiment(1, 0.4, 0.2)
    higher_tpr = Experiment(1, 0.5, 0.2)
    self.assertEqual(reference, duplicate, "all equal values")
    self.assertNotEqual(reference, higher_tpr, "tpr not equal")
    self.assertNotEqual(reference, 0, "different classes")
def test_equivalent_same_phenotypes(self):
    """Entities with the same phenotypes are equivalent.

    The two entities receive the same phenotypes in opposite order.
    """
    first = Entity("A", "X")
    second = Entity("A", "X")
    for phenotype in ("MP:002", "MP:005"):
        first.add(PhenotypeDatum(phenotype, Experiment(1, 0.8, 0.05)))
    for phenotype in ("MP:005", "MP:002"):
        second.add(PhenotypeDatum(phenotype, Experiment(1, 0.8, 0.05)))
    # equivalence must hold in both directions
    self.assertTrue(first.equivalent(second))
    self.assertTrue(second.equivalent(first))
def test_trim_easy_keep(self):
    """Trimming does not eliminate a node that the caller asks to keep."""
    entity = Entity("abc", "genes")
    kept = PhenotypeDatum("DOID:4", Experiment(1, 0.8, 0.05))
    other = PhenotypeDatum("DOID:11044", Experiment(1, 0.8, 0.05))
    entity.add(kept).add(other)
    self.assertEqual(len(entity.data), 2)
    # DOID:4 is passed in the keep-set, so both data points must survive
    entity.trim_ancestors(self.obo, {"DOID:4"})
    self.assertEqual(len(entity.data), 2)
def test_trim_nothing(self):
    """Trimming leaves the data untouched when there is nothing to trim."""
    entity = Entity("abc", "genes")
    first = PhenotypeDatum("DOID:3650", Experiment(1, 0.8, 0.05))
    second = PhenotypeDatum("DOID:11044", Experiment(1, 0.8, 0.05))
    entity.add(first).add(second)
    self.assertEqual(len(entity.data), 2)
    entity.trim_ancestors(self.obo)
    # both phenotypes are expected to remain
    self.assertEqual(len(entity.data), 2)
def test_lt_by_value(self):
    """Experiments can be ordered for sorting; the value decides the order."""
    positive = Experiment(1, tpr=0.8, fpr=0.2)
    positive_twin = Experiment(1, tpr=0.8, fpr=0.2)
    self.assertFalse(positive < positive_twin,
                     "equal experiments cannot be lt")
    self.assertFalse(positive_twin < positive,
                     "equal experiments cannot be lt")

    negative = Experiment(0, tpr=0.8, fpr=0.2)
    self.assertTrue(negative < positive, "lt due to value")

    # larger tpr and fpr must not outweigh the smaller value
    negative_strong = Experiment(0, tpr=1, fpr=0.3)
    self.assertTrue(negative_strong < positive,
                    "lt despite tpr and fpr values")
def test_equivalent_different_phenotypes(self):
    """Entities with different phenotypes are not equivalent.

    Both entities hold two phenotypes, but only one of them is shared.
    """
    first = Entity("A", "X")
    second = Entity("A", "X")
    # add two phenotypes to each entity; MP:001 and MP:002 differ
    for phenotype in ("MP:001", "MP:005"):
        first.add(PhenotypeDatum(phenotype, Experiment(1, 0.8, 0.05)))
    for phenotype in ("MP:005", "MP:002"):
        second.add(PhenotypeDatum(phenotype, Experiment(1, 0.8, 0.05)))
    self.assertFalse(first.equivalent(second))
    self.assertFalse(second.equivalent(first))
def test_trim_easy(self):
    """Trimming eliminates the root node."""
    entity = Entity("abc", "genes")
    root = PhenotypeDatum("DOID:4", Experiment(1, 0.8, 0.05))
    specific = PhenotypeDatum("DOID:11044", Experiment(1, 0.8, 0.05))
    entity.add(root).add(specific)
    self.assertEqual(len(entity.data), 2)
    entity.trim_ancestors(self.obo)
    # only the more specific term is expected to survive
    self.assertEqual(len(entity.data), 1)
    self.assertEqual(entity.data[0].phenotype, "DOID:11044")
def get_gxd(gxd_path, emp_map, tprfpr):
    """Read a file with marker-emapa associations.

    Arguments:
        gxd_path    tab-separated file with columns
                    feature.primaryIdentifier, structure.identifier, strength
        emp_map     dict mapping EMAPA ids to collections of other ids
        tprfpr      2-tuple with (tpr, fpr)

    Returns:
        dict mapping marker ids to Entity objects holding consensus
        phenotype data
    """
    tpr, fpr = tprfpr[0], tprfpr[1]
    entities = {}

    # collect all the mappings from the raw file
    with open_file(gxd_path, "rt") as handle:
        for row in csv.DictReader(handle, delimiter="\t", quotechar="'"):
            feature = row["feature.primaryIdentifier"]
            emapa = row["structure.identifier"]
            strength = row["strength"]
            # an entity is registered for every feature, even if none of
            # its rows end up contributing a phenotype
            if feature not in entities:
                entities[feature] = Entity("GXD_" + feature, "expression",
                                           marker_id=feature)
                entities[feature].set_description("expression", 1)
                entities[feature].set_description("source", "GXD")
            if emapa not in emp_map:
                continue
            if strength not in gxd_strength:
                continue
            # tpr is interpolated between fpr and tpr by the strength factor
            scaled_tpr = fpr + (tpr - fpr) * gxd_strength[strength]
            row_exp = Experiment(1, scaled_tpr, fpr)
            # "Absent" rows are recorded as negative phenotypes
            if strength == "Absent":
                row_exp.value = 0
            entity = entities[feature]
            for mp in emp_map[emapa]:
                entity.add(PhenotypeDatum(mp, row_exp))

    # collapse repeated phenotypes into a consensus value
    for entity in entities.values():
        entity.consensus()
    return entities
def test_trim_medium(self):
    """Trimming eliminates ancestors when there are several leaves."""
    entity = Entity("abc", "genes")
    phenotypes = ("DOID:4", "DOID:11044", "DOID:0080015", "DOID:655")
    for phenotype in phenotypes:
        entity.add(PhenotypeDatum(phenotype, Experiment(1, 0.8, 0.05)))
    self.assertEqual(len(entity.data), 4)
    entity.trim_ancestors(self.obo)
    self.assertEqual(len(entity.data), 2)
    # only these two terms are expected to remain
    remaining = {datum.phenotype for datum in entity.data}
    self.assertEqual(remaining, {"DOID:11044", "DOID:655"})
def test_add_phenotype_data(self):
    """Added phenotype data are stored in order and show up in their string form."""
    entity = Entity("abc", "genes", marker_id="X:001", marker_symbol="x001")
    self.assertEqual(len(entity.data), 0, "initial model has no pheontypes")
    entity.add(PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.0555)))
    entity.add(PhenotypeDatum("MP:007", Experiment(1, 0.456, 0.0234)))
    self.assertEqual(len(entity.data), 2, "just added two phenotypes")
    # check content of each datum through its string representation
    first_str, second_str = str(entity.data[0]), str(entity.data[1])
    self.assertIn("002", first_str)
    self.assertIn("555", first_str)
    self.assertIn("234", second_str)
def test_equivalent_phenotypes(self):
    """Entities with different phenotypes cannot be the same.

    One entity holds a phenotype while the other holds none.
    """
    with_data = Entity("A", "X")
    without_data = Entity("A", "X")
    with_data.add(PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05)))
    self.assertFalse(with_data.equivalent(without_data))
    self.assertFalse(without_data.equivalent(with_data))
def consensus(self):
    """Redefine the phenotype data, collapsing multiple entries using a consensus.

    For every phenotype the experiments are split into three classes by
    value: exactly 0, strictly between 0 and 1, and exactly 1. The largest
    class wins (ties favour the in-between class, then 0, then 1). The
    collapsed datum carries the mean value, tpr and fpr of the winning
    class; its tpr is scaled by the fraction of experiments in that class,
    but never drops below the fpr. The timestamp of the first entry of the
    phenotype is kept.

    Returns:
        self, with self.data replaced by the collapsed data
    """
    collapsed = []
    # partition the data by phenotype, then scan each phenotype
    for phenotype, entries in self._split_phenotypes().items():
        experiments = [entry.experiment for entry in entries]
        stamps = [entry.timestamp for entry in entries]
        zeros = [exp for exp in experiments if exp.value == 0]
        partials = [exp for exp in experiments if 0 < exp.value < 1]
        ones = [exp for exp in experiments if exp.value == 1]
        num_majority = max(len(zeros), len(partials), len(ones))
        num_total = len(zeros) + len(partials) + len(ones)
        # the order of these checks defines how ties are broken
        if num_majority == len(partials):
            winners = partials
        elif num_majority == len(zeros):
            winners = zeros
        else:
            winners = ones
        majority = mean([exp.value for exp in winners])
        tpr = mean([exp.tpr for exp in winners])
        fpr = mean([exp.fpr for exp in winners])
        # penalize discordance, keeping tpr at least as large as fpr
        tpr = max(fpr, tpr * float(num_majority) / num_total)
        collapsed.append(
            PhenotypeDatum(phenotype, Experiment(majority, tpr, fpr),
                           stamps[0]))
    self.data = collapsed
    return self
def test_default_timestamp(self):
    """A datum created without an explicit timestamp receives a current one.

    The default timestamp is parsed with timestamp_format and must lie
    within one minute of the moment the datum was created.
    """
    now = datetime.now()
    datum = PhenotypeDatum("MP:1", Experiment(1, 0.7, 0.1))
    stamp = datetime.strptime(datum.timestamp, timestamp_format)
    # use the absolute difference: the stamp is taken after `now`, so a
    # signed difference would also accept timestamps far in the future
    diff = abs((now - stamp).total_seconds())
    self.assertLess(diff, 60, "stamps should be within a few seconds")
def test_average_2(self):
    """Summarize phenotypes using an average (discordant values)."""
    # three pieces of evidence for one phenotype, one of them negative
    entity = Entity("abc", "genes")
    evidence = [
        Experiment(1, 0.8, 0.05),
        Experiment(0, 0.4, 0.05),
        Experiment(1, 0.6, 0.05),
    ]
    for experiment in evidence:
        entity.add(PhenotypeDatum("MP:002", experiment))
    self.assertEqual(len(entity.data), 3)

    # check that the average matches the inputs
    entity.average()
    self.assertEqual(len(entity.data), 1)
    averaged = entity.data[0]
    self.assertEqual(averaged.phenotype, "MP:002")
    self.assertGreater(averaged.experiment.value, 0)
    # the negative experiment contributes zero to the tpr sum
    self.assertAlmostEqual(averaged.experiment.tpr, (0.8 + 0.0 + 0.6) / 3)
    self.assertAlmostEqual(averaged.experiment.fpr, 0.05)
def test_init(self):
    """A new datum exposes its phenotype, experiment fields and a timestamp."""
    experiment = Experiment(1, 0.7, 0.1)
    datum = PhenotypeDatum("MP:1", experiment)
    self.assertEqual(datum.phenotype, "MP:1")
    # value, tpr and fpr are readable directly on the datum
    self.assertEqual(datum.value, 1)
    self.assertEqual(datum.tpr, 0.7)
    self.assertEqual(datum.fpr, 0.1)
    self.assertIsNotNone(datum.timestamp)
def control0(self, refname, refrep):
    """Create a model with the same phenotypes as a representation.

    Only the keys of the representation are used: every phenotype is
    recorded with value 1 and this object's tpr, fpr and timestamp,
    whatever value the representation stores for it.

    Arguments:
        refname     name of the reference; the model is named
                    refname + "_match"
        refrep      representation object providing keys() with phenotypes

    Returns:
        the result of trim_ancestors on the new model, using self.obo
    """
    model = tech_model(refname + "_match", "match")
    model.set_description("control_for", refname)
    timestamp = self.timestamp
    for phen in refrep.keys():
        phen_exp = Experiment(1, self.tpr, self.fpr)
        model.add(PhenotypeDatum(phen, phen_exp, timestamp))
    return model.trim_ancestors(self.obo)
def test_consensus_2(self):
    """Summarize multiple rows of phenotypes using a consensus with some discordance.

    Two positive and one negative experiment are recorded for the same
    phenotype; the consensus must be positive, with a tpr penalized by the
    fraction of concordant experiments.
    """
    # first add several pieces of evidence into an entity object
    m = Entity("abc", "genes")
    d1 = PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05))
    d2 = PhenotypeDatum("MP:002", Experiment(0, 0.4, 0.05))
    d3 = PhenotypeDatum("MP:002", Experiment(1, 0.6, 0.05))
    m.add(d1).add(d2).add(d3)
    self.assertEqual(len(m.data), 3)

    # check that the consensus matches the inputs
    m.consensus()
    self.assertEqual(len(m.data), 1)
    self.assertEqual(m.data[0].phenotype, "MP:002")
    iexp = m.data[0].experiment
    self.assertEqual(iexp.value, 1)
    # the tpr will be lower than (0.6+0.8)/2
    # it should be (0.7*2/3); compare with a tolerance because the order of
    # the floating-point operations differs from the implementation
    self.assertAlmostEqual(iexp.tpr, 0.7 * (2 / 3))
    self.assertAlmostEqual(iexp.fpr, 0.05)
def get_file_phenotypes(filepath, timestamp):
    """Get model phenotypes from a file.

    Arguments:
        filepath    tab-separated file with columns id, phenotype, value,
                    TPR, FPR and optionally timestamp; may be None
        timestamp   timestamp used when the file has no timestamp column

    Returns:
        dict mapping ids to lists of PhenotypeDatum objects, in file order
        (an empty dict when filepath is None)
    """
    phenotypes = {}
    if filepath is None:
        return phenotypes
    with open_file(filepath, "rt") as handle:
        for row in csv.DictReader(handle, delimiter="\t", quotechar="'"):
            experiment = Experiment(row["value"], row["TPR"], row["FPR"])
            # a timestamp column in the file takes precedence
            stamp = row["timestamp"] if "timestamp" in row else timestamp
            datum = PhenotypeDatum(row["phenotype"], experiment, stamp)
            phenotypes.setdefault(row["id"], []).append(datum)
    return phenotypes
def average(self):
    """Redefine the phenotype data, collapsing multiple entries using an average.

    For every phenotype the collapsed datum carries the mean value and the
    mean fpr of its experiments. When the mean value is positive, the tpr
    is the mean of tpr weighted by each experiment's value; otherwise it is
    the plain mean tpr. The new data are created without an explicit
    timestamp.

    Returns:
        self, with self.data replaced by the collapsed data
    """
    averaged = []
    # partition the data by phenotype, then scan each phenotype
    for phenotype, entries in self._split_phenotypes().items():
        experiments = [entry.experiment for entry in entries]
        # this part requires work
        val = mean([exp.value for exp in experiments])
        if val > 0:
            tpr_terms = [exp.tpr * exp.value for exp in experiments]
        else:
            tpr_terms = [exp.tpr for exp in experiments]
        tpr = mean(tpr_terms)
        fpr = mean([exp.fpr for exp in experiments])
        # create a single simplified phenotype
        averaged.append(PhenotypeDatum(phenotype, Experiment(val, tpr, fpr)))
    self.data = averaged
    return self
def prep_IMPC(datapath, tprfpr, pthreshold, simplify="average", obo=None):
    """Parse IMPC statistical results and assemble a set of models.

    Every accepted row of the file yields one phenotype datum, which is
    added to marker-level and allele-level models. For each marker or
    allele and zygosity, six models exist, with suffixes F, FA, M, MA, U
    and UA.

    Args:
        datapath: path to the raw data file (comma-separated, with
            double-quote quoting); when None, an empty dict is returned
        tprfpr: list with two elements (tpr, fpr)
        pthreshold: float, threshold for significance, passed on to
            get_value together with each row
        simplify: string, method for simplifying multiple data type
            ('consensus' or 'average'; any other value, e.g. 'none',
            leaves the data as they are)
        obo: object of class MinimalObo, passed on to
            get_parameter_phenotype_map

    Returns:
        dict mapping model ids to model objects
    """
    models = dict()
    if datapath is None:
        return models
    now = now_timestamp()
    base_tpr, base_fpr = tprfpr[0], tprfpr[1]
    # sex codes whose data are recorded into male / female models
    male = set(["M", "B", "U"])
    female = set(["F", "B", "U"])

    def create_models(id, category, zygosity, row):
        """Create a family of model definitions, for sex=FMU, neg_phen=01.

        Models that already exist in `models` are left untouched.

        Arguments:
            id          marker or allele identifier
            category    string, here 'marker' or 'allele'
            zygosity    string with zygosity code
            row         dict with a data row, passed on to impc_model
        """
        prefix = "IMPC_" + id + "_" + zygosity + "_"
        for suffix in ["F", "FA", "M", "MA", "U", "UA"]:
            # note: `id` is reused here to hold the full model identifier
            id = prefix + suffix
            if id not in models:
                models[id] = impc_model(id, category, row, zygosity)
                models[id].set_description("sex", sex_code(suffix))
                with_negative = negative_code(suffix)
                models[id].set_description("neg_phenotypes", with_negative)

    def add_to_model(datum, id, zygosity, suffix):
        """Add a datum into an existing model definition.

        Arguments:
            datum       phenotype and experiment result
            id, zygosity, suffix     characterization of model
        """
        id = "IMPC_" + id + "_" + zygosity + "_" + suffix
        models[id].add(datum)

    def add_set_to_models(datum, row, val, sex):
        """Helper to add a datum to a set of models, for alleles, markers.

        Arguments:
            datum     phenotype and experiment result
            row       dict
            val       value of phenotype (0/1)
            sex       one-letter code
        """
        # only the first three letters of the zygosity are used, and
        # "hem" is treated like "hom"
        zygosity = (row["zygosity"])[:3]
        zygosity = "hom" if zygosity == "hem" else zygosity
        marker = row["marker_accession_id"]
        allele = row["allele_accession_id"]
        # perhaps create model definitions
        create_models(marker, "marker", zygosity, row)
        create_models(allele, "allele", zygosity, row)
        # record phenotypes into the models
        # NOTE(review): nesting reconstructed from a collapsed source; it is
        # assumed that positive-only models take data with val == 1 while
        # the "A" models take every datum - confirm against the repository
        if val == 1:
            add_to_model(datum, marker, zygosity, sex)
            add_to_model(datum, allele, zygosity, sex)
        add_to_model(datum, marker, zygosity, sex + "A")
        add_to_model(datum, allele, zygosity, sex + "A")

    # get a map from parameter to mp_terms - used for negative phenotypes
    parameter_phenotype_map = get_parameter_phenotype_map(datapath, obo)
    with open_file(datapath, "rt") as f:
        reader = csv.DictReader(f, delimiter=",", quotechar="\"")
        for row in reader:
            # skip over bad data rows
            if row["status"] not in ("Success", "Successful"):
                continue
            if row["allele_symbol"] == "":
                continue
            # get a phenotype MP id
            phenotype = row["mp_term_id"].strip()
            # redefine some phenotypes
            # (this handles morphology MP:0002169 annotations)
            parameter = row["parameter_name"].strip()
            if phenotype + " " + parameter in redef:
                phenotype = redef[phenotype + " " + parameter]
            # rows without a phenotype fall back on the parameter mapping
            if phenotype == "" and parameter in parameter_phenotype_map:
                phenotype = parameter_phenotype_map[parameter]
            if phenotype == "" or phenotype == "MP:0002169":
                continue
            sex = sex_code(row["phenotype_sex"])
            # identify whether this is a positive or a negative phenotype
            value = get_value(row, pthreshold)
            # add data at marker level, allele level, by gender
            hit = Experiment(value, base_tpr, base_fpr)
            datum = PhenotypeDatum(phenotype, hit, now)

            # every datum goes into the "U" models; the "M" and "F" models
            # receive it depending on the sex code of the row
            add_set_to_models(datum, row, value, "U")
            if sex in male:
                add_set_to_models(datum, row, value, "M")
            if sex in female:
                add_set_to_models(datum, row, value, "F")
    # some models may have redundant rows (e.g. a phenotype recorded twice)
    # so collapse into a consensus or an average here
    if simplify == "consensus":
        for id in models:
            models[id].consensus()
    elif simplify == "average":
        for id in models:
            models[id].average()
    return models
""" Tests for contents of phenoscoring/phenotypedatum.py """ import unittest from datetime import datetime from scoring.experiment import Experiment from phenoscoring.phenotypedatum import PhenotypeDatum from phenoscoring.time import timestamp_format e1 = Experiment(1, 0.8, 0.05) e2 = Experiment(0, 0.8, 0.05) e3 = Experiment(1, 0.6, 0.15) class PhenotypeDatumTests(unittest.TestCase): """Test cases for handling object with phenotype and experiment""" def test_init(self): """init of basic object""" datum = PhenotypeDatum("MP:1", Experiment(1, 0.7, 0.1)) self.assertEqual(datum.phenotype, "MP:1") self.assertEqual(datum.value, 1) self.assertEqual(datum.tpr, 0.7) self.assertEqual(datum.fpr, 0.1) self.assertFalse(datum.timestamp is None) def test_str(self): """object can be summarized."""