示例#1
0
def get_raw_references(datapath, phenotype_set):
    """Read a phenotab file into reference objects keyed by disease code.

    Raw phenotypes are phenotypes expressed in the original ontology.

    Args:
        datapath       path to phenotab file (tab-separated, with header)
        phenotype_set  collection of acceptable phenotypes

    Returns:
        two objects
        - dict mapping reference codes ("Source:Disease_number") to
          Representation objects holding a title and phenotype values
        - set of phenotypes that were not present in phenotype_set
    """

    references = dict()
    rejected = set()
    with open_file(datapath, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="\""):
            # rows whose reference id is not valid are ignored entirely
            if not valid_reference_id(record["Reference"]):
                continue
            term = record["Phenotype"]
            if term not in phenotype_set:
                rejected.add(term)
                continue
            frequency = tofreq[record["Frequency"]]
            code = record["Source"] + ":" + record["Disease_number"]
            # first sighting of a disease creates its representation
            if code not in references:
                entry = references[code] = Representation(name=code)
                entry.title = record["Disease_title"]
            references[code].set(term, frequency)

    return references, rejected
示例#2
0
def get_emapa_map(emap_path, obo):
    """Read EMAPA-to-MP mappings and keep the best MP terms per EMAPA id.

    Arguments:
        emap_path   tab-separated file with columns EMAPA_ID and MP_ID
        obo         ontology object providing has() and ancestors()

    Returns:
        dict mapping each EMAPA id to the list of MP ids that have the
        smallest number of ancestors among the ids mapped to it
    """

    # collect (number of ancestors, MP id) pairs for every EMAPA id
    candidates = dict()
    with open_file(emap_path, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="'"):
            emapa_id = str(record["EMAPA_ID"])
            mp_id = str(record["MP_ID"])
            if not obo.has(mp_id):
                print("Skipping mp term: " + mp_id)
                continue
            depth = len(obo.ancestors(mp_id))
            candidates.setdefault(emapa_id, []).append((depth, mp_id))

    # keep only the pairs that tie for the lowest ancestor count
    best = dict()
    for emapa_id, scored in candidates.items():
        ranked = sorted(scored)
        lowest = ranked[0][0]
        best[emapa_id] = [mp_id for depth, mp_id in ranked if depth == lowest]

    return best
示例#3
0
def prep_oo(owlfile, obo, siblings=False):
    """Scan one file and identify the best 1-to-1 ontology mappings.

    Every line is split on tabs and must hold at least four fields:
    term1, term2 and two numeric values whose product is used as the
    mapping score. Underscores in the two term fields are replaced by
    colons. Consecutive lines sharing the same term1 are pooled into one
    OOset, and the hits of each pool are appended to the result.

    Arguments:
        owlfile    path to tab-separated mapping file
        obo        ontology object providing ids(); also passed to
                   OOset.hits
        siblings   passed through to OOset.hits

    Returns:
        list with the concatenated outputs of OOset.hits
    """

    okterms = set(obo.ids())

    result = []

    # scan file, collect lines into an OOset, then remember best hits
    with open_file(owlfile, "rt") as f:
        state = OOset()
        for line in f:
            tokens = line.split("\t")
            term1 = tokens[0].replace("_", ":")
            term2 = tokens[1].replace("_", ":")
            # skip over mapping to terms that are not in the ontology
            if term2 not in okterms:
                continue
            # a new term1 closes the current pool and opens a fresh one
            if term1 != state.term1:
                if state.term1 is not None:
                    result.extend(state.hits(obo, siblings=siblings))
                state = OOset(term1)
            # add the mapping into the current store
            state.add(term2, float(tokens[2]) * float(tokens[3]))
        # flush the last pool; the guard agrees with the in-loop check so
        # that an empty (or fully filtered) file does not query an OOset
        # that never received a term
        if state.term1 is not None and state.term1 != "":
            result.extend(state.hits(obo, siblings=siblings))

    return result
示例#4
0
def write_priors(priors, outprefix):
    """Save phenotype priors as a two-column table.

    Arguments:
        priors      dict mapping phenotypes to values
        outprefix   prefix of output path; "-priors.tsv.gz" is appended
    """

    destination = outprefix + "-priors.tsv.gz"
    header = "\t".join(["phenotype", "value"])
    with open_file(destination, "wt") as out:
        fwrite(out, header)
        for term, prior in priors.items():
            fwrite(out, "\t".join([term, str(prior)]))
示例#5
0
def fill_phenotype_frequency_table(dbpath, datapath):
    """Copy phenotype frequencies from a tab-separated file into the database.

    Arguments:
        dbpath     passed to PhenotypeFrequencyTable
        datapath   file with columns "phenotype" and "value"
    """

    table = PhenotypeFrequencyTable(dbpath)
    with open_file(datapath, "rt") as handle:
        records = csv.DictReader(handle, delimiter="\t", quotechar="\"")
        for record in records:
            term = record["phenotype"]
            frequency = float(record["value"])
            table.add(term, frequency)
    # save is called once, after the whole file has been read
    table.save()
示例#6
0
def fill_concise_reference_table(dbpath, datapath):
    """Copy reference phenotypes from a tab-separated file into the database.

    Arguments:
        dbpath     passed to ReferenceConcisePhenotypeTable
        datapath   file with columns "id", "phenotype" and "value"
    """

    table = ReferenceConcisePhenotypeTable(dbpath)
    with open_file(datapath, "rt") as handle:
        records = csv.DictReader(handle, delimiter="\t", quotechar="\"")
        for record in records:
            score = float(record["value"])
            table.add(record["id"], record["phenotype"], score)
    # save is called once, after the whole file has been read
    table.save()
示例#7
0
def write_descriptions(models, outfile, exclude=["timestamp"]):
    """Write a table with model descriptions.

    Arguments:
        models     dict whose values are formatted with format_line
        outfile    path to output file
        exclude    iterable of column names to omit in output
                   (the default list is only read, never modified)

    Returns:
        nothing, writes data into output file
    """

    # build the exclusion set once; building it per column is wasted work
    # and would exhaust a one-shot iterable after the first column
    skip = set(exclude)
    colnames = [name for name in ModelDescriptionTable.text_fields
                if name not in skip]
    with open_file(outfile, "wt") as f:
        fwrite(f, "\t".join(colnames))
        for model in models.values():
            fwrite(f, format_line(model, colnames))
示例#8
0
def write_oo(ooarray, outprefix):
    """Save an ontology-ontology mapping as a tab-separated table.

    Arguments:
        ooarray     iterable of records; each record is written as one row
                    after converting its elements to strings
        outprefix   prefix of output path; "-oomap.tsv.gz" is appended
    """

    destination = outprefix + "-oomap.tsv.gz"
    with open_file(destination, "wt") as out:
        fwrite(out, "\t".join(["term1", "term2", "score"]))
        for record in ooarray:
            fwrite(out, "\t".join(map(str, record)))
示例#9
0
def write_references(references, outprefix):
    """Save the phenotypes of a set of references as a long-format table.

    Arguments:
        references  dict mapping ids to objects with a dict-like "data"
                    attribute (phenotype -> value)
        outprefix   prefix of output path; "-phenotypes.tsv.gz" is appended
    """

    destination = outprefix + "-phenotypes.tsv.gz"
    with open_file(destination, "wt") as out:
        fwrite(out, "\t".join(["id", "phenotype", "value"]))
        # one output row per (reference, phenotype) pair
        for ref_id, reference in references.items():
            for term, value in reference.data.items():
                fwrite(out, "\t".join((ref_id, term, str(value))))
示例#10
0
    def test_writing(self):
        """write the parsed IMPC data onto files"""

        ontology = MinimalObo(obo_file)
        parsed = prep_IMPC(impc_file, (0.8, 0.05), 0.01, obo=ontology)
        write_models(parsed, out_prefix)

        # both outputs must exist before they are read back
        for path in (desc_file, pheno_file):
            self.assertTrue(exists(path))
        with open_file(desc_file, "rt") as handle:
            desc_lines = handle.read().strip().split("\n")
        with open_file(pheno_file, "rt") as handle:
            pheno_lines = handle.read().strip().split("\n")

        # description file: header plus 24 data lines
        self.assertEqual(len(desc_lines), 25)
        # phenotype file should have more than 7 lines
        self.assertGreater(len(pheno_lines), 7)
示例#11
0
def get_file_models(filepath, timestamp):
    """Read model descriptions from a tab-separated file.

    Arguments:
        filepath    path to file with an "id" column, or None
        timestamp   passed to make_model for every row

    Returns:
        dict mapping row ids to the objects returned by make_model;
        empty when filepath is None
    """

    models = dict()
    if filepath is not None:
        with open_file(filepath, "rt") as handle:
            records = csv.DictReader(handle, delimiter="\t", quotechar="'")
            for record in records:
                model = make_model(record, timestamp)
                # rows sharing an id overwrite earlier ones
                models[record["id"]] = model
    return models
示例#12
0
    def test_writing(self):
        """write the parsed MGI data onto files"""

        parsed = prep_MGI(mgi_file, (0.8, 0.05), obo)
        write_models(parsed, out_prefix)

        # both outputs must exist before they are read back
        for path in (desc_file, pheno_file):
            self.assertTrue(exists(path))
        with open_file(desc_file, "rt") as handle:
            desc_lines = handle.read().strip().split("\n")
        with open_file(pheno_file, "rt") as handle:
            pheno_lines = handle.read().strip().split("\n")

        # description file: header plus 4 data lines
        # (one allele_id in two zygosities gives 2 genotype models,
        # and 2 genotype models give 2 marker models)
        self.assertEqual(len(desc_lines), 5)
        # phenotype file should have more than 5 lines
        self.assertGreater(len(pheno_lines), 5)
示例#13
0
 def test_writing(self):
     """write the parsed phenotypes into file"""

     write_references(self.references, out_prefix)

     # the output must exist; its content is then read back as lines
     self.assertTrue(exists(out_file))
     with open_file(out_file, "rt") as handle:
         content = handle.read()
     rows = content.strip().split("\n")
     self.assertEqual(len(rows), 12,
                      "rows 3+3+3+2 for phenotypes and 1 for header")
示例#14
0
 def test_writing(self):
     """check output is a two column tsv"""

     # the second returned value is not needed for this check
     priors, _ = get_priors_from_models(self.models, {"genotype"},
                                        obo, dark=1)
     write_priors(priors, out_prefix)

     self.assertTrue(exists(priors_file))
     with open_file(priors_file, "rt") as handle:
         content = handle.read()
     lines = content.strip().split("\n")
     # one header line plus one line per ontology term
     self.assertEqual(len(lines), 1+len(obo.ids()))
     header_fields = lines[0].split("\t")
     self.assertEqual(len(header_fields), 2)
示例#15
0
    def test_imputing(self):
        """create new models based on UA."""

        ontology = MinimalObo(obo_file)
        all_models = prep_IMPC(impc_file, (0.8, 0.05), 0.01, obo=ontology)
        allele_models = get_UA_models(all_models, "allele")
        imputed_models = impute_IMPC(allele_models, ontology, 0)
        write_models(imputed_models, out_prefix + "-imputed")

        # both outputs must exist before they are read back
        for path in (imputed_desc_file, imputed_pheno_file):
            self.assertTrue(exists(path))
        with open_file(imputed_desc_file, "rt") as handle:
            desc_lines = handle.read().strip().split("\n")
        with open_file(imputed_pheno_file, "rt") as handle:
            pheno_lines = handle.read().strip().split("\n")

        # description file: header plus 2 description lines
        self.assertEqual(len(desc_lines), 3)
        # phenotype file should have more than 3 lines
        self.assertGreater(len(pheno_lines), 3)
示例#16
0
def write_hits_summary(tested, hits, outprefix):
    """Save a table linking parameters and MP terms to marker counts.

    Arguments:
        tested      dict mapping keys to collections of tested markers
        hits        dict with the same keys, mapping to collections of
                    significant markers
        outprefix   prefix of output path; "-hits-summary.tsv.gz" is appended

    NOTE(review): the header has four columns while each row joins three
    fields; presumably every key already has the form
    "parameter<TAB>MP_term" - confirm against the caller.
    """

    destination = outprefix + "-hits-summary.tsv.gz"
    columns = ["parameter", "MP_term", "markers_tested", "markers_significant"]
    with open_file(destination, "wt") as out:
        fwrite(out, "\t".join(columns))
        for key, tested_markers in tested.items():
            fields = [key, str(len(tested_markers)), str(len(hits[key]))]
            fwrite(out, "\t".join(fields))
示例#17
0
    def test_write_hits_summary(self):
        """get a summary of number of hits."""

        tested, hits = get_IMPC_hits_summary(impc_file, 0.01)
        write_hits_summary(tested, hits, out_prefix)

        self.assertTrue(exists(hits_file))
        with open_file(hits_file, "rt") as handle:
            content = handle.read()
        lines = content.strip().split("\n")

        # header plus three phenotypes
        self.assertEqual(len(lines), 4)
示例#18
0
def get_gxd(gxd_path, emp_map, tprfpr):
    """Read a file with marker-EMAPA associations and build expression models.

    Arguments:
        gxd_path   tab-separated file with (at least) the columns
                   feature.primaryIdentifier, structure.identifier, strength
        emp_map    dict mapping EMAPA ids to iterables of phenotype terms
        tprfpr     2-tuple with (tpr, fpr)

    Returns:
        dict mapping marker ids to Entity objects; consensus() has been
        called on each entity
    """

    tpr, fpr = tprfpr[0], tprfpr[1]

    entities = dict()
    with open_file(gxd_path, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="'"):
            marker = record["feature.primaryIdentifier"]
            structure = record["structure.identifier"]
            level = record["strength"]

            # every marker in the file gets an entity, even when none of
            # its rows can be translated into a phenotype
            if marker not in entities:
                entity = Entity("GXD_" + marker, "expression",
                                marker_id=marker)
                entities[marker] = entity
                entity.set_description("expression", 1)
                entity.set_description("source", "GXD")

            # rows with an unmapped structure or unknown strength add no data
            if structure not in emp_map or level not in gxd_strength:
                continue

            # the strength factor interpolates the tpr between fpr and tpr
            scaled_tpr = fpr + (tpr - fpr) * gxd_strength[level]
            experiment = Experiment(1, scaled_tpr, fpr)
            # "Absent" is recorded as a negative phenotype
            if level == "Absent":
                experiment.value = 0
            for mp in emp_map[structure]:
                entities[marker].add(PhenotypeDatum(mp, experiment))

    # get a consensus value
    for entity in entities.values():
        entity.consensus()

    return entities
示例#19
0
def write_phenotype_cooc(cooc, phenindex, outprefix):
    """Save the non-zero co-occurrence values of phenotype pairs.

    Arguments:
        cooc        object indexed as cooc[i, j]
        phenindex   dict mapping phenotypes to indexes into cooc
        outprefix   prefix of output path; "-cooc.tsv.gz" is appended
    """

    destination = outprefix + "-cooc.tsv.gz"
    with open_file(destination, "wt") as out:
        fwrite(out, "\t".join(["A", "B", "value"]))
        # all ordered pairs are visited, including a phenotype with itself
        for first, row_index in phenindex.items():
            for second, col_index in phenindex.items():
                count = cooc[row_index, col_index]
                if count == 0:
                    continue
                fwrite(out, "\t".join([first, second, str(count)]))
示例#20
0
def get_file_phenotypes(filepath, timestamp):
    """Read model phenotypes from a tab-separated file.

    Arguments:
        filepath    path to file with columns id, phenotype, value, TPR,
                    FPR and optionally timestamp; or None
        timestamp   timestamp used for rows without a timestamp column

    Returns:
        dict mapping ids to lists of PhenotypeDatum objects;
        empty when filepath is None
    """

    phenotypes = dict()
    if filepath is None:
        return phenotypes
    with open_file(filepath, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="'"):
            outcome = Experiment(record["value"], record["TPR"], record["FPR"])
            # a timestamp column in the file overrides the default stamp
            stamp = record.get("timestamp", timestamp)
            datum = PhenotypeDatum(record["phenotype"], outcome, stamp)
            phenotypes.setdefault(record["id"], []).append(datum)
    return phenotypes
示例#21
0
def write_model_phenotypes(models, outfile, exclude=["id"]):
    """Write a table with model phenotypes, one row per phenotype datum.

    Arguments:
        models     dict with models; each value has "id" and "data"
        outfile    path to output file
        exclude    column names to omit, passed to get_colnames

    Returns:
        nothing, writes data into output file
    """

    # the id column is excluded here because it is written separately
    colnames = get_colnames(ModelPhenotypeTable, exclude)
    header = "id\t" + "\t".join(colnames)

    with open_file(outfile, "wt") as out:
        fwrite(out, header)
        for model in models.values():
            for datum in model.data:
                fwrite(out, model.id + "\t" + format_line(datum, colnames))
示例#22
0
def get_oo_map(oopath):
    """Read a file mapping ontology terms to another set of terms.

    Args:
        oopath       path to tab-separated file with columns term1, term2,
                     score; term2 may hold several terms separated by ";"

    Return:
        dict mapping term1 -> [(term2, score)]
    """

    mapping = dict()
    with open_file(oopath, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="\""):
            source = record["term1"]
            score = float(record["score"])
            targets = mapping.setdefault(source, [])
            # every term in a ";"-separated group shares the row's score
            targets.extend((target, score)
                           for target in record["term2"].split(";"))
    return mapping
示例#23
0
def prep_IMPC(datapath, tprfpr, pthreshold, simplify="average", obo=None):
    """Parse IMPC statistical results and assemble a set of models.

    For every accepted data row, models are defined at marker level and at
    allele level for the row's zygosity, with suffixes F, FA, M, MA, U, UA.
    Models without the "A" suffix only receive data with value 1; models
    with the "A" suffix receive all data.

    Args:
        datapath:    path to comma-separated IMPC results file; when None,
                    an empty dict is returned
        tprfpr:      list with two elements (tpr, fpr)
        pthreshold:  float, threshold passed to get_value when deciding
                    the value of a phenotype
        simplify:    string, method for simplifying multiple data types
                    ('average' or 'consensus'; any other value, e.g.
                    'none', leaves the models as they are)
        obo:         object of class MinimalObo, passed to
                    get_parameter_phenotype_map

    Returns:
        dict mapping model ids to the objects created by impc_model
    """

    models = dict()
    if datapath is None:
        return models

    # a single timestamp is shared by all data parsed in this call
    now = now_timestamp()
    base_tpr, base_fpr = tprfpr[0], tprfpr[1]
    # sex codes whose data are added to the male / female models
    # NOTE(review): presumably "B" is both and "U" is unspecified - confirm
    male = set(["M", "B", "U"])
    female = set(["F", "B", "U"])

    def create_models(id, category, zygosity, row):
        """Create a family of model definitions, for sex=FMU, neg_phen=01

        Models that already exist in the enclosing dict are left untouched.
        """

        # the prefix is computed before the loop reuses the name "id"
        prefix = "IMPC_" + id + "_" + zygosity + "_"
        for suffix in ["F", "FA", "M", "MA", "U", "UA"]:
            id = prefix + suffix
            if id not in models:
                models[id] = impc_model(id, category, row, zygosity)
                models[id].set_description("sex", sex_code(suffix))
                with_negative = negative_code(suffix)
                models[id].set_description("neg_phenotypes", with_negative)

    def add_to_model(datum, id, zygosity, suffix):
        """add a datum into an existing model definition.
        
        Arguments:
            datum      phenotype and experiment result
            id, zygosity, suffix
                       characterization of model
        """
        # the model must exist already (see create_models)
        id = "IMPC_" + id + "_" + zygosity + "_" + suffix
        models[id].add(datum)

    def add_set_to_models(datum, row, val, sex):
        """helper to add a set of models, for alleles, markers
        
        Arguments:
            datum     phenotype and experiment result
            row       dict
            val       value of phenotype (0/1)
            sex       one-letter code
        """
        # only the first three letters of the zygosity are used,
        # and "hem" is treated as "hom"
        zygosity = (row["zygosity"])[:3]
        zygosity = "hom" if zygosity == "hem" else zygosity
        marker = row["marker_accession_id"]
        allele = row["allele_accession_id"]
        # perhaps create model definitions
        create_models(marker, "marker", zygosity, row)
        create_models(allele, "allele", zygosity, row)
        # record phenotypes into the models:
        # plain-sex models only receive data with value 1,
        # models with the "A" suffix receive every datum
        if val == 1:
            add_to_model(datum, marker, zygosity, sex)
            add_to_model(datum, allele, zygosity, sex)
        add_to_model(datum, marker, zygosity, sex + "A")
        add_to_model(datum, allele, zygosity, sex + "A")

    # get a map from parameter to mp_terms - used for negative phenotypes
    parameter_phenotype_map = get_parameter_phenotype_map(datapath, obo)

    with open_file(datapath, "rt") as f:
        reader = csv.DictReader(f, delimiter=",", quotechar="\"")
        for row in reader:
            # skip over bad data rows
            if row["status"] not in ("Success", "Successful"):
                continue
            if row["allele_symbol"] == "":
                continue

            # get a phenotype MP id
            phenotype = row["mp_term_id"].strip()
            # redefine some phenotypes
            # (this handles morphology MP:0002169 annotations)
            parameter = row["parameter_name"].strip()
            if phenotype + " " + parameter in redef:
                phenotype = redef[phenotype + " " + parameter]
            # rows without an MP id fall back to the parameter's phenotype
            if phenotype == "" and parameter in parameter_phenotype_map:
                phenotype = parameter_phenotype_map[parameter]
            # rows that still have no phenotype, or that remain
            # MP:0002169 after redefinition, are not recorded
            if phenotype == "" or phenotype == "MP:0002169":
                continue

            sex = sex_code(row["phenotype_sex"])

            # identify whether this is a positive or a negative phenotype
            value = get_value(row, pthreshold)

            # add data at marker level, allele level, by gender
            # (the "U" models receive every row regardless of sex)
            hit = Experiment(value, base_tpr, base_fpr)
            datum = PhenotypeDatum(phenotype, hit, now)
            add_set_to_models(datum, row, value, "U")
            if sex in male:
                add_set_to_models(datum, row, value, "M")
            if sex in female:
                add_set_to_models(datum, row, value, "F")

    # some models may have redundant rows (e.g. a phenotype recorded twice)
    # so collapse into a consensus here
    # (any other value of simplify leaves the models unchanged)
    if simplify == "consensus":
        for id in models:
            models[id].consensus()
    elif simplify == "average":
        for id in models:
            models[id].average()
    return models
示例#24
0
"""
Prep data from IMPC into a format for Phenoscoring
"""

import csv
import pkg_resources
from tools.files import open_file
from scoring.experiment import Experiment
from phenoscoring.phenotypedatum import PhenotypeDatum
from phenoscoring.entity import Entity
from phenoscoring.time import now_timestamp

# fetch a dictionary of term redefinitions for MP:0002169 (no abnormal phenotype detected)
# This runs at import time: the packaged file "impc.2169.tsv" is read once
# and kept in the module-level dict "redef".
# Keys have the form "<original_id> <parameter_name>" (joined by a single
# space); values are the redefined ids.
# NOTE(review): pkg_resources is deprecated by setuptools in favour of
# importlib.resources - consider migrating.
redef = dict()
redef_file = pkg_resources.resource_filename(__name__, "impc.2169.tsv")
with open_file(redef_file, "rt") as f:
    reader = csv.DictReader(f, delimiter="\t", quotechar="\"")
    for row in reader:
        rowkey = row["original_id"] + " " + row["parameter_name"]
        redef[rowkey] = row["redefined_id"]

# ###########################################################################
# Functions relevant to prep_IMPC


def is_float(x):
    """an ad-hoc way to determine if a string encodes a pvalue."""

    try:
        float(x)
    except:
示例#25
0
    config = parser.parse_args()    
    
    mousemine = Service("http://www.mousemine.org/mousemine/service")
    mousemine_views = ["assayType", "feature.symbol", "feature.primaryIdentifier",
                       "stage", "age", "structure.name", "strength", "pattern",
                       "genotype.symbol", "assayId", "probe", "image",
                       "publication.mgiJnum", "emaps", "structure.identifier"]
    
    # fetch all markers
    markers = set()    
    for filename in config.input.split(","):    
        markers.update(values_in_column(filename, "marker_id"))
    print("Working with "+str(len(markers))+" markers")
    
    # fetch data from mousemine
    markers = list(markers)    
    result = []
    for i in range(0, len(markers), config.group_size):
        imarkers = markers[i:(i+config.group_size)]
        print("querying: " + str(i) + " of " + str(len(markers)))       
        result.extend(download_mousemine(imarkers))
        sleep(config.sleep)
    print("done")

    # write expression data to disk
    with open_file(config.output, "wt") as f:
        f.write("\t".join(mousemine_views)+"\n")
        for row in result:
            f.write("\t".join([str(_) for _ in row]) + "\n")