예제 #1
0
    def test_average(self):
        """Summarize phenotypes using an average (consistent values).

        Five observations covering three phenotypes are collapsed by
        Entity.average(); each phenotype must appear exactly once with the
        mean tpr/fpr of its observations.
        """

        # first add several pieces of evidence into an entity object
        m = Entity("abc", "genes")
        d1 = PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05))
        d2 = PhenotypeDatum("MP:007", Experiment(0, 0.4, 0.05))
        d3 = PhenotypeDatum("MP:002", Experiment(1, 0.6, 0.05))
        d4 = PhenotypeDatum("MP:007", Experiment(0, 0.4, 0.15))
        d5 = PhenotypeDatum("MP:009", Experiment(1, 0.6, 0.05))
        m.add(d1).add(d2).add(d3)
        m.add(d4).add(d5)
        self.assertEqual(len(m.data), 5)

        # expected (value, tpr, fpr) for each phenotype after averaging
        expected = {
            "MP:002": (1, 0.7, 0.05),
            "MP:007": (0, 0.4, 0.1),
            "MP:009": (1, 0.6, 0.05),
        }

        # check that the average contains all phenotypes, each exactly once
        m.average()
        self.assertEqual(len(m.data), 3)
        self.assertEqual(set(_.phenotype for _ in m.data), set(expected))
        for datum in m.data:
            value, tpr, fpr = expected[datum.phenotype]
            iexp = datum.experiment
            self.assertEqual(iexp.value, value)
            # tpr and fpr are computed as floating-point means, so compare
            # with a tolerance rather than for exact equality
            self.assertAlmostEqual(iexp.tpr, tpr)
            self.assertAlmostEqual(iexp.fpr, fpr)
예제 #2
0
    def test_consensus_imputed(self):
        """Summarize multiple rows of phenotypes using a consensus, with imputed values.

        Two observations of MP:007 carry values strictly between 0 and 1;
        the consensus is expected to hold the mean of their value and tpr.
        """

        # first add several pieces of evidence into an entity object
        m = Entity("abc", "genes")
        d1 = PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05))
        # here add with value between 0 and 1
        d2 = PhenotypeDatum("MP:007", Experiment(0.6, 0.4, 0.05))
        d3 = PhenotypeDatum("MP:007", Experiment(0.4, 0.6, 0.05))
        m.add(d1).add(d2).add(d3)
        self.assertEqual(len(m.data), 3)

        # expected (value, tpr, fpr) for each phenotype after the consensus
        expected = {
            "MP:002": (1, 0.8, 0.05),
            "MP:007": (0.5, 0.5, 0.05),
        }

        # check that the consensus matches the inputs
        m.consensus()
        self.assertEqual(len(m.data), 2)
        self.assertEqual(set(_.phenotype for _ in m.data), set(expected))
        for datum in m.data:
            value, tpr, fpr = expected[datum.phenotype]
            iexp = datum.experiment
            # all three numbers come out of floating-point means, so compare
            # with a tolerance rather than for exact equality
            self.assertAlmostEqual(iexp.value, value)
            self.assertAlmostEqual(iexp.tpr, tpr)
            self.assertAlmostEqual(iexp.fpr, fpr)
예제 #3
0
    def test_comparisons(self):
        """Initialization and equality comparison of experiments."""

        baseline = Experiment(1, 0.4, 0.2)
        twin = Experiment(1, 0.4, 0.2)
        other_tpr = Experiment(1, 0.5, 0.2)

        # identical settings compare equal
        self.assertEqual(baseline, twin, "all equal values")
        # a change in tpr, or a non-Experiment object, compares unequal
        for unequal, reason in ((other_tpr, "tpr not equal"),
                                (0, "different classes")):
            self.assertNotEqual(baseline, unequal, reason)
예제 #4
0
    def test_equivalent_same_phenotypes(self):
        """Entities holding the same phenotypes are equivalent, whatever the insertion order."""

        first = Entity("A", "X")
        second = Entity("A", "X")
        phenotypes = ["MP:002", "MP:005"]

        # same content, but inserted in opposite orders
        for phen in phenotypes:
            first.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))
        for phen in reversed(phenotypes):
            second.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))

        # equivalence must hold in both directions
        self.assertTrue(first.equivalent(second))
        self.assertTrue(second.equivalent(first))
예제 #5
0
    def test_trim_easy_keep(self):
        """Trimming keeps a node when it is explicitly listed as one to keep."""

        entity = Entity("abc", "genes")
        for term in ("DOID:4", "DOID:11044"):
            entity.add(PhenotypeDatum(term, Experiment(1, 0.8, 0.05)))
        self.assertEqual(len(entity.data), 2)

        # DOID:4 is named in the keep-set, so both entries must survive
        entity.trim_ancestors(self.obo, {"DOID:4"})
        self.assertEqual(len(entity.data), 2)
예제 #6
0
    def test_trim_nothing(self):
        """Trimming leaves the data untouched when nothing can be removed."""

        entity = Entity("abc", "genes")
        for term in ("DOID:3650", "DOID:11044"):
            entity.add(PhenotypeDatum(term, Experiment(1, 0.8, 0.05)))
        self.assertEqual(len(entity.data), 2)

        # the number of entries is the same before and after trimming
        entity.trim_ancestors(self.obo)
        self.assertEqual(len(entity.data), 2)
예제 #7
0
    def test_lt_by_value(self):
        """Experiments can be ordered for sorting; the value decides first."""

        positive = Experiment(1, tpr=0.8, fpr=0.2)
        positive_twin = Experiment(1, tpr=0.8, fpr=0.2)

        # two identical experiments: neither is smaller than the other
        self.assertFalse(positive < positive_twin,
                         "equal experiments cannot be lt")
        self.assertFalse(positive_twin < positive,
                         "equal experiments cannot be lt")

        # a value of 0 sorts before a value of 1 ...
        negative = Experiment(0, tpr=0.8, fpr=0.2)
        self.assertTrue(negative < positive, "lt due to value")

        # ... even when tpr and fpr are larger
        negative_high = Experiment(0, tpr=1, fpr=0.3)
        self.assertTrue(negative_high < positive,
                        "lt despite tpr and fpr values")
예제 #8
0
    def test_equivalent_different_phenotypes(self):
        """Entities with the same number of phenotypes, but different ones, are not equivalent."""

        first = Entity("A", "X")
        second = Entity("A", "X")

        # two phenotypes each; only MP:005 is shared
        for phen in ("MP:001", "MP:005"):
            first.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))
        for phen in ("MP:005", "MP:002"):
            second.add(PhenotypeDatum(phen, Experiment(1, 0.8, 0.05)))

        # non-equivalence must hold in both directions
        self.assertFalse(first.equivalent(second))
        self.assertFalse(second.equivalent(first))
예제 #9
0
    def test_trim_easy(self):
        """Trimming eliminates the root node."""

        entity = Entity("abc", "genes")
        for term in ("DOID:4", "DOID:11044"):
            entity.add(PhenotypeDatum(term, Experiment(1, 0.8, 0.05)))
        self.assertEqual(len(entity.data), 2)

        # only the non-root entry should remain after trimming
        entity.trim_ancestors(self.obo)
        remaining = [datum.phenotype for datum in entity.data]
        self.assertEqual(len(remaining), 1)
        self.assertEqual(remaining[0], "DOID:11044")
예제 #10
0
def get_gxd(gxd_path, emp_map, tprfpr):
    """Read a file with marker-EMAPA associations into expression entities.

    Arguments:
        gxd_path   path to a tab-separated file; the columns read are
                   feature.primaryIdentifier, structure.identifier, strength
        emp_map    dict mapping EMAPA ids to collections of phenotype ids
        tprfpr     2-tuple with (tpr, fpr)

    Returns:
        dict mapping marker ids to Entity objects; each entity has been
        collapsed with its consensus() method
    """

    tpr, fpr = tprfpr[0], tprfpr[1]

    entities = dict()
    with open_file(gxd_path, "rt") as handle:
        for row in csv.DictReader(handle, delimiter="\t", quotechar="'"):
            marker = row["feature.primaryIdentifier"]
            structure = row["structure.identifier"]
            strength = row["strength"]

            # an entity is registered for every marker seen in the file,
            # even when none of its rows ends up contributing a phenotype
            if marker not in entities:
                entities[marker] = Entity("GXD_" + marker,
                                          "expression",
                                          marker_id=marker)
                entities[marker].set_description("expression", 1)
                entities[marker].set_description("source", "GXD")
            entity = entities[marker]

            # rows with an unmapped structure or unknown strength are skipped
            if structure not in emp_map or strength not in gxd_strength:
                continue

            # the tpr is interpolated between fpr and tpr by the strength
            # factor; "Absent" rows are recorded as negative phenotypes
            scaled_tpr = fpr + (tpr - fpr) * gxd_strength[strength]
            experiment = Experiment(1, scaled_tpr, fpr)
            if strength == "Absent":
                experiment.value = 0
            for mp in emp_map[structure]:
                entity.add(PhenotypeDatum(mp, experiment))

    # collapse repeated phenotypes into a consensus value
    for entity in entities.values():
        entity.consensus()

    return entities
예제 #11
0
    def test_trim_medium(self):
        """Trimming eliminates nodes when there are several leaves."""

        entity = Entity("abc", "genes")
        terms = ("DOID:4", "DOID:11044", "DOID:0080015", "DOID:655")
        for term in terms:
            entity.add(PhenotypeDatum(term, Experiment(1, 0.8, 0.05)))
        self.assertEqual(len(entity.data), 4)

        # two of the four entries are expected to survive the trimming
        entity.trim_ancestors(self.obo)
        self.assertEqual(len(entity.data), 2)
        remaining = {datum.phenotype for datum in entity.data}
        self.assertEqual(remaining, {"DOID:11044", "DOID:655"})
예제 #12
0
    def test_add_phenotype_data(self):
        """Added phenotype data is stored in order and shows up in str()."""

        entity = Entity("abc", "genes",
                        marker_id="X:001", marker_symbol="x001")
        self.assertEqual(len(entity.data), 0,
                         "initial model has no pheontypes")

        # add two data points, one at a time
        entity.add(PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.0555)))
        entity.add(PhenotypeDatum("MP:007", Experiment(1, 0.456, 0.0234)))
        self.assertEqual(len(entity.data), 2, "just added two phenotypes")

        # the string summaries should reveal the phenotype id and the fpr
        summary_first = str(entity.data[0])
        summary_second = str(entity.data[1])
        self.assertIn("002", summary_first)
        self.assertIn("555", summary_first)
        self.assertIn("234", summary_second)
예제 #13
0
    def test_equivalent_phenotypes(self):
        """An entity with a phenotype is not equivalent to one without any."""

        with_data = Entity("A", "X")
        without_data = Entity("A", "X")
        with_data.add(PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05)))

        # non-equivalence must hold in both directions
        for left, right in ((with_data, without_data),
                            (without_data, with_data)):
            self.assertFalse(left.equivalent(right))
예제 #14
0
 def consensus(self):
     """Collapse repeated entries of each phenotype into one consensus entry.

     For every phenotype the experiments are grouped by value (exactly 0,
     strictly between 0 and 1, exactly 1) and the largest group wins. The
     consensus value, tpr and fpr are means over the winning group; the tpr
     is then scaled by the fraction of experiments in that group, but never
     drops below the fpr. The timestamp of the first entry is retained.

     Replaces self.data and returns self.
     """

     collapsed = []
     parts = self._split_phenotypes()
     for phenotype in parts:
         entries = parts[phenotype]
         experiments = [entry.experiment for entry in entries]
         timestamps = [entry.timestamp for entry in entries]

         # split the experiments by the kind of value they report
         zeros = [exp for exp in experiments if exp.value == 0]
         partials = [exp for exp in experiments if 0 < exp.value < 1]
         ones = [exp for exp in experiments if exp.value == 1]
         total = len(zeros) + len(partials) + len(ones)

         # pick the largest group; ties are resolved in favour of
         # partial values first, then zeros, then ones
         winners = partials
         if len(zeros) > len(winners):
             winners = zeros
         if len(ones) > len(winners):
             winners = ones

         value = mean([exp.value for exp in winners])
         tpr = mean([exp.tpr for exp in winners])
         fpr = mean([exp.fpr for exp in winners])
         # penalize the tpr for discordant evidence, with fpr as the floor
         tpr = max(fpr, tpr*float(len(winners))/total)
         collapsed.append(PhenotypeDatum(phenotype,
                                         Experiment(value, tpr, fpr),
                                         timestamps[0]))

     self.data = collapsed
     return self
예제 #15
0
    def test_default_timestamp(self):
        """A datum created without a timestamp receives one automatically."""

        # record the time just before creating the datum
        reference = datetime.now()
        datum = PhenotypeDatum("MP:1", Experiment(1, 0.7, 0.1))

        # the stored timestamp is a string in the project's timestamp format
        parsed = datetime.strptime(datum.timestamp, timestamp_format)
        elapsed = (reference - parsed).total_seconds()
        self.assertLess(elapsed, 60, "stamps should be within a few seconds")
예제 #16
0
    def test_average_2(self):
        """Summarize phenotypes using an average (discordant values)."""

        # three observations of one phenotype; the middle one disagrees
        entity = Entity("abc", "genes")
        observations = ((1, 0.8), (0, 0.4), (1, 0.6))
        for value, tpr in observations:
            entity.add(PhenotypeDatum("MP:002", Experiment(value, tpr, 0.05)))
        self.assertEqual(len(entity.data), 3)

        # check that the average collapses the inputs into a single entry
        entity.average()
        self.assertEqual(len(entity.data), 1)
        averaged = entity.data[0]
        self.assertEqual(averaged.phenotype, "MP:002")
        self.assertGreater(averaged.experiment.value, 0)
        # the discordant observation contributes a tpr of zero to the mean
        self.assertAlmostEqual(averaged.experiment.tpr, (0.8 + 0.0 + 0.6) / 3)
        self.assertAlmostEqual(averaged.experiment.fpr, 0.05)
예제 #17
0
    def test_init(self):
        """Initialize a basic object and read back its fields."""

        experiment = Experiment(1, 0.7, 0.1)
        datum = PhenotypeDatum("MP:1", experiment)

        self.assertEqual(datum.phenotype, "MP:1")
        # value, tpr and fpr of the experiment are exposed on the datum
        self.assertEqual((datum.value, datum.tpr, datum.fpr), (1, 0.7, 0.1))
        # no timestamp was supplied, yet one must be present
        self.assertIsNotNone(datum.timestamp)
예제 #18
0
    def control0(self, refname, refrep):
        """Create a model with the same phenotypes as a representation.

        Arguments:
            refname    string, name of the reference; the model id is
                       refname + "_match" and refname is recorded in the
                       "control_for" description
            refrep     mapping keyed by phenotype id; only the keys are used

        Returns:
            the result of calling trim_ancestors(self.obo) on the new model
        """

        model = tech_model(refname + "_match", "match")
        model.set_description("control_for", refname)
        timestamp = self.timestamp
        for phen in refrep.keys():
            # every phenotype is recorded as a positive hit with the
            # object's own tpr/fpr; the values stored in refrep are not used
            phen_exp = Experiment(1, self.tpr, self.fpr)
            model.add(PhenotypeDatum(phen, phen_exp, timestamp))
        return model.trim_ancestors(self.obo)
예제 #19
0
    def test_consensus_2(self):
        """Summarize multiple rows of phenotypes using a consensus with some discordance.

        Two of three observations report value 1, so the consensus value
        is 1 and the tpr is the mean of the two agreeing observations,
        scaled down by the fraction of observations that agree (2 of 3).
        """

        # first add several pieces of evidence into an entity object
        m = Entity("abc", "genes")
        d1 = PhenotypeDatum("MP:002", Experiment(1, 0.8, 0.05))
        d2 = PhenotypeDatum("MP:002", Experiment(0, 0.4, 0.05))
        d3 = PhenotypeDatum("MP:002", Experiment(1, 0.6, 0.05))
        m.add(d1).add(d2).add(d3)
        self.assertEqual(len(m.data), 3)

        # check that the consensus matches the inputs
        m.consensus()
        self.assertEqual(len(m.data), 1)
        self.assertEqual(m.data[0].phenotype, "MP:002")
        iexp = m.data[0].experiment
        self.assertEqual(iexp.value, 1)
        # the tpr will be lower than (0.6+0.8)/2; it should be 0.7*2/3.
        # The number is the result of floating-point arithmetic, so it is
        # compared with a tolerance rather than for exact equality.
        self.assertAlmostEqual(iexp.tpr, 0.7 * 2 / 3)
        self.assertAlmostEqual(iexp.fpr, 0.05)
예제 #20
0
def get_file_phenotypes(filepath, timestamp):
    """Read model phenotypes from a tab-separated file.

    Arguments:
        filepath    path to the file, or None; the columns read are
                    id, phenotype, value, TPR, FPR and, when present,
                    timestamp
        timestamp   timestamp used for rows without a timestamp column

    Returns:
        dict mapping ids to lists of PhenotypeDatum objects, in file order;
        an empty dict when filepath is None
    """

    phenotypes = dict()
    if filepath is None:
        return phenotypes

    with open_file(filepath, "rt") as handle:
        for row in csv.DictReader(handle, delimiter="\t", quotechar="'"):
            experiment = Experiment(row["value"], row["TPR"], row["FPR"])
            # a timestamp recorded in the file overrides the default one
            stamp = row["timestamp"] if "timestamp" in row else timestamp
            datum = PhenotypeDatum(row["phenotype"], experiment, stamp)
            phenotypes.setdefault(row["id"], []).append(datum)
    return phenotypes
예제 #21
0
    def average(self):
        """Collapse repeated entries of each phenotype into one averaged entry.

        The value and fpr are plain means over all entries of a phenotype.
        When the mean value is positive, each tpr is weighted by its
        experiment's value before averaging; otherwise the tpr is a plain
        mean. The new entries are created without an explicit timestamp.

        Replaces self.data and returns self.
        """

        parts = self._split_phenotypes()
        averaged = []
        for phenotype in parts:
            experiments = [entry.experiment for entry in parts[phenotype]]
            value = mean([exp.value for exp in experiments])
            # this part requires work (note carried over from the original)
            if value > 0:
                tprs = [exp.tpr*exp.value for exp in experiments]
            else:
                tprs = [exp.tpr for exp in experiments]
            tpr = mean(tprs)
            fpr = mean([exp.fpr for exp in experiments])
            # a single simplified entry stands in for the phenotype
            summary = Experiment(value, tpr, fpr)
            averaged.append(PhenotypeDatum(phenotype, summary))
        self.data = averaged
        return self
예제 #22
0
def prep_IMPC(datapath, tprfpr, pthreshold, simplify="average", obo=None):
    """Parse IMPC statistical results and assemble a set of models.

    Args:
        datapath:    path to the results file (read as comma-separated);
                     when None, an empty dict is returned
        tprfpr:      list with two elements (tpr, fpr)
        pthreshold:  float, forwarded to get_value to score each row
        simplify:    string, method for collapsing repeated phenotypes;
                     'consensus' and 'average' are acted upon, any other
                     value (e.g. 'none') leaves the data as recorded
        obo:         ontology object, forwarded to
                     get_parameter_phenotype_map

    Returns:
        dict mapping model ids of the form IMPC_<id>_<zygosity>_<suffix>
        to model objects
    """

    models = dict()
    if datapath is None:
        return models

    now = now_timestamp()
    base_tpr, base_fpr = tprfpr[0], tprfpr[1]
    male = set(["M", "B", "U"])
    female = set(["F", "B", "U"])

    def model_key(base, zygosity, suffix):
        """Compose the identifier of one model."""
        return "_".join(["IMPC", base, zygosity, suffix])

    def create_models(base, category, zygosity, row):
        """Create any missing models of a family (sex F/M/U, with or without "A")."""
        for suffix in ("F", "FA", "M", "MA", "U", "UA"):
            key = model_key(base, zygosity, suffix)
            if key in models:
                continue
            models[key] = impc_model(key, category, row, zygosity)
            models[key].set_description("sex", sex_code(suffix))
            models[key].set_description("neg_phenotypes",
                                        negative_code(suffix))

    def add_set_to_models(datum, row, val, sex):
        """Record a datum in the marker-level and allele-level models.

        Arguments:
            datum     phenotype and experiment result
            row       dict, one row of the results file
            val       value of phenotype (0/1)
            sex       one-letter code
        """
        zygosity = (row["zygosity"])[:3]
        if zygosity == "hem":
            zygosity = "hom"
        marker = row["marker_accession_id"]
        allele = row["allele_accession_id"]
        # make sure the model definitions exist
        create_models(marker, "marker", zygosity, row)
        create_models(allele, "allele", zygosity, row)
        # positive results go into both model variants;
        # other results only go into the "A" variant
        suffixes = [sex, sex + "A"] if val == 1 else [sex + "A"]
        for suffix in suffixes:
            for base in (marker, allele):
                models[model_key(base, zygosity, suffix)].add(datum)

    # a map from parameter to mp_terms - used for negative phenotypes
    parameter_phenotype_map = get_parameter_phenotype_map(datapath, obo)

    with open_file(datapath, "rt") as handle:
        for row in csv.DictReader(handle, delimiter=",", quotechar="\""):
            # skip over bad data rows
            if (row["status"] not in ("Success", "Successful")
                    or row["allele_symbol"] == ""):
                continue

            # determine a phenotype MP id, redefining some phenotypes
            # (this handles morphology MP:0002169 annotations)
            phenotype = row["mp_term_id"].strip()
            parameter = row["parameter_name"].strip()
            redef_key = phenotype + " " + parameter
            if redef_key in redef:
                phenotype = redef[redef_key]
            if phenotype == "" and parameter in parameter_phenotype_map:
                phenotype = parameter_phenotype_map[parameter]
            if phenotype == "" or phenotype == "MP:0002169":
                continue

            sex = sex_code(row["phenotype_sex"])

            # identify whether this is a positive or a negative phenotype
            value = get_value(row, pthreshold)

            # every row goes into the "U" models, and also into the
            # sex-specific models that match the sex code of the row
            datum = PhenotypeDatum(phenotype,
                                   Experiment(value, base_tpr, base_fpr),
                                   now)
            targets = ["U"]
            if sex in male:
                targets.append("M")
            if sex in female:
                targets.append("F")
            for target in targets:
                add_set_to_models(datum, row, value, target)

    # some models may have redundant rows (e.g. a phenotype recorded twice)
    # so collapse them here with the requested method
    if simplify == "consensus":
        for model in models.values():
            model.consensus()
    elif simplify == "average":
        for model in models.values():
            model.average()
    return models
예제 #23
0
"""
Tests for contents of phenoscoring/phenotypedatum.py
"""

import unittest
from datetime import datetime
from scoring.experiment import Experiment
from phenoscoring.phenotypedatum import PhenotypeDatum
from phenoscoring.time import timestamp_format

# module-level experiments, presumably shared by the tests below (the
# tests that use them are not visible here - confirm before changing)
e1, e2, e3 = (
    Experiment(1, 0.8, 0.05),
    Experiment(0, 0.8, 0.05),
    Experiment(1, 0.6, 0.15),
)


class PhenotypeDatumTests(unittest.TestCase):
    """Test cases for handling object with phenotype and experiment"""
    def test_init(self):
        """Initialize a basic object and read back its fields."""

        datum = PhenotypeDatum("MP:1", Experiment(1, 0.7, 0.1))

        # the phenotype and the experiment settings are exposed on the datum
        observed = (datum.phenotype, datum.value, datum.tpr, datum.fpr)
        self.assertEqual(observed[0], "MP:1")
        self.assertEqual(observed[1:], (1, 0.7, 0.1))
        # no timestamp was supplied, yet one must be present
        self.assertIsNotNone(datum.timestamp)

    def test_str(self):
        """object can be summarized."""