예제 #1
0
def import_predixcan(path, name, tissue, assay, session):
    """
    Import weights from a PrediXcan db into the FOCUS db.

    The ``weights`` and ``extra`` tables of the SQLite database at ``path`` are
    read, every gene is looked up on the mygene servers (hg19 coordinates,
    symbol and aliases), and one model per gene is built with
    ``pf.build_model`` and committed to ``session`` individually.

    When mygene provides no usable primary-assembly position for a gene, the
    median base-pair position of the model's SNPs is stored as both
    ``txstart`` and ``txstop``.

    :param path:  string path to the PrediXcan db
    :param name: str name of the reference panel
    :param tissue: str name of the tissue
    :param assay: technology assay to measure abundance
    :param session: sqlalchemy.Session object for the FOCUS db

    :raises ImportError: if ``mygene`` cannot be imported
    :raises Exception: if committing a model fails; the session is rolled back
        and the underlying error is attached as ``__cause__``

    :return:  None
    """
    log = logging.getLogger(pf.LOG)

    import re
    import numpy as np

    from collections import defaultdict
    try:
        import mygene
    except ImportError:
        log.error("Import submodule requires mygene and rpy2 to be installed.")
        raise

    log.info("Starting import from PrediXcan database {}".format(path))
    pred_engine = create_engine("sqlite:///{}".format(path))

    weights = pd.read_sql_table('weights', pred_engine)
    extra = pd.read_sql_table('extra', pred_engine)

    def gencode2ensmble(x):
        # drop everything after the last "." (the GENCODE version suffix), if any
        idx = x.rfind(".")
        return x if idx == -1 else x[:idx]

    # Primary-assembly chromosome names only. The whole name must match:
    # a prefix match would also accept alternate-haplotype / patch names that
    # merely start with a digit or X/Y.
    primary_chrom = re.compile(r"[0-9]{1,2}|X|Y", re.IGNORECASE)

    # get unique genes
    genes = weights.gene.unique()
    genes = [gencode2ensmble(g) for g in genes]

    log.info("Querying mygene servers for gene annotations")
    mg = mygene.MyGeneInfo()
    results = mg.querymany(genes, scopes='ensembl.gene', verbose=False,
                           fields=["genomic_pos_hg19,symbol,alias"], species="human")

    # group the returned hits by the identifier that was queried
    res_map = defaultdict(list)
    for result in results:
        res_map[result["query"]].append(result)

    db_ref_panel = pf.RefPanel(ref_name=name, tissue=tissue, assay=assay)
    ses = None  # forwarded unchanged to pf.build_model
    method = "ElasticNet"

    count = 0
    log.info("Starting individual model conversion")
    for gid, gene in weights.groupby("gene"):
        log.debug("Importing gene model {}".format(gid))
        gene_extra = extra.loc[extra.gene == gid]

        chrom = gene.varID.values[0].split("_")[0]  # grab chromosome from varID
        pos = gene.varID.map(lambda x: int(x.split("_")[1])).values  # grab basepair pos

        # fallback coordinates used when mygene yields no usable position
        txstart = txstop = np.median(pos)

        g_id = gene_extra.gene.values[0]
        g_name = gene_extra.genename.values[0]
        query_id = gencode2ensmble(g_id)

        found_pos = False
        for hit in res_map[query_id]:
            if "notfound" in hit:
                continue

            # mygene returns a bare string when a gene has a single alias;
            # normalise so the membership test below is never a substring test
            aliases = hit.get("alias")
            if isinstance(aliases, str):
                aliases = [aliases]

            if hit.get("symbol") != g_name and aliases is not None and g_name not in aliases:
                continue

            if "genomic_pos_hg19" not in hit:
                continue

            # we want to use standardized Ensembl identifiers; not GENCODE modified ones...
            g_id = query_id

            gpos = hit["genomic_pos_hg19"]
            if isinstance(gpos, dict):
                gpos = [gpos]

            for entry in gpos:
                # skip non-primary assembles. they have weird CHR entries e.g., CHR_HSCHR1_1_CTG3
                if not primary_chrom.fullmatch(entry["chr"]):
                    continue

                txstart = entry['start']
                txstop = entry['end']
                found_pos = True
                break

            # only stop once coordinates were actually taken from a hit;
            # otherwise later hits may still provide a primary-assembly position
            if found_pos:
                break

        gene_info = dict()
        gene_info["geneid"] = g_id
        gene_info["txid"] = None
        gene_info["name"] = g_name
        gene_info["type"] = gene_extra.gene_type.values[0]
        gene_info["chrom"] = chrom
        gene_info["txstart"] = txstart
        gene_info["txstop"] = txstop

        snp_info = pd.DataFrame({"snp": gene.rsid.values,
                                "chrom": [chrom] * len(gene),
                                "pos": pos,
                                "a1": gene.eff_allele.values,
                                "a0": gene.ref_allele.values})

        wgts = gene.weight.values

        attrs = dict()
        attrs["cv.R2"] = gene_extra["cv_R2_avg"].values[0]
        attrs["cv.R2.pval"] = gene_extra["nested_cv_fisher_pval"].values[0]

        # build model
        model = pf.build_model(gene_info, snp_info, db_ref_panel, wgts, ses, attrs, method)
        session.add(model)
        try:
            session.commit()
        except Exception as comm_err:
            session.rollback()
            # keep the original failure reachable through __cause__
            raise Exception("Failed committing to db") from comm_err

        count += 1
        if count % 500 == 0:
            log.info("Committed 500 models to db")

    # report the final partial batch
    if count % 500 != 0:
        log.info("Committed {} models to db".format(count % 500))

    log.info("Finished import from PrediXcan database {}".format(path))
예제 #2
0
def import_fusion(path, name, tissue, assay, session):
    """
    Import weights from a FUSION weight database into the FOCUS db.

    ``path`` is read as a whitespace-delimited table with the columns
    ``WGT ID CHR P0 P1``. Each ``WGT`` entry is resolved relative to the
    directory containing ``path`` and loaded through R's ``load``; the loaded
    ``wgt.matrix``, ``snps`` and ``cv.performance`` objects are then read.
    For every row the non-``top1`` method with the largest cross-validated
    ``rsq`` is selected, SNPs whose weight is numerically zero are dropped,
    and the resulting model is built with ``pf.build_model`` and committed to
    ``session`` individually.

    Gene symbols are looked up on the mygene servers to obtain an Ensembl id
    and gene type; if no match is found the symbol itself is stored as the id.

    :param path:  string path to the FUSION weight database
    :param name: str name of the reference panel
    :param tissue: str name of the tissue
    :param assay: technology assay to measure abundance
    :param session: sqlalchemy.Session object for the FOCUS db

    :raises ImportError: if ``mygene`` or ``rpy2`` cannot be imported
    :raises ValueError: if a model has no ``rsq`` / ``pval`` row in
        ``cv.performance`` or no usable non-``top1`` method
    :raises Exception: if committing a model fails; the session is rolled back
        and the underlying error is attached as ``__cause__``

    :return:  None
    """
    log = logging.getLogger(pf.LOG)

    import re
    import os
    import warnings

    from collections import defaultdict

    import numpy as np
    try:
        import mygene
        # suppress warnings about R build
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            import rpy2.robjects as robj
    except ImportError:
        log.error("Import submodule requires mygene and rpy2 to be installed.")
        raise

    log.info("Starting import from FUSION database {}".format(path))
    db_ref_panel = pf.RefPanel(ref_name=name, tissue=tissue, assay=assay)
    ses = None  # forwarded unchanged to pf.build_model

    load_func = robj.r['load']
    local_dir = os.path.dirname(os.path.abspath(path))

    # Primary-assembly chromosome names only. The whole name must match:
    # a prefix match would also accept alternate-haplotype / patch names that
    # merely start with a digit or X/Y.
    primary_chrom = re.compile(r"[0-9]{1,2}|X|Y", re.IGNORECASE)

    # we need this to grab Ensembl IDs for genes
    mg = mygene.MyGeneInfo()

    # WGT ID CHR P0 P1
    # sep=r"\s+" is the non-deprecated spelling of delim_whitespace=True
    fusion_db = pd.read_csv(path, sep=r"\s+")
    genes = fusion_db.ID.values

    # we need to do batch queries in order to not get throttled by the mygene servers
    log.info("Querying mygene servers for gene annotations")
    results = mg.querymany(genes, scopes='symbol', verbose=False,
                           fields=['ensembl.gene,genomic_pos,symbol,ensembl.type_of_gene,alias'], species="human")

    # group the returned hits by the symbol that was queried
    res_map = defaultdict(list)
    for result in results:
        res_map[result["query"]].append(result)

    count = 0
    log.info("Starting individual model conversion")
    for _, row in fusion_db.iterrows():
        wgt_name, g_name, chrom, txstart, txstop = row.WGT, row.ID, row.CHR, row.P0, row.P1

        # METSIM.ADIPOSE.RNASEQ/METSIM.LINC00115.wgt.RDat LINC00115 1 761586 762902
        log.debug("Importing {} model".format(wgt_name))

        # this call should create the following:
        # 'wgt.matrix', 'snps', 'cv.performance', 'hsq', and 'hsq.pv'
        wgt_path = "{}/{}".format(local_dir, wgt_name)

        # load the Rdat data
        load_func(wgt_path)

        gene_info = dict()
        id_dict = dict()
        found_protein = False
        # hits are ordered by match quality.
        for hit in res_map[g_name]:
            if "notfound" in hit:
                continue

            if "ensembl" not in hit:
                # nothing in db
                continue

            # mygene returns a bare string when a gene has a single alias;
            # normalise so the membership test below is never a substring test
            aliases = hit.get("alias")
            if isinstance(aliases, str):
                aliases = [aliases]

            if hit.get("symbol") != g_name and aliases is not None and g_name not in aliases:
                # not direct match
                continue

            if "genomic_pos" not in hit:
                continue

            ens = hit["ensembl"]
            pos = hit["genomic_pos"]

            # sometimes we have multiple ENSG entries due to diff haplotypes.
            # just reduce the single-case to the multi by a singleton list
            if isinstance(ens, dict):
                ens = [ens]
            if isinstance(pos, dict):
                pos = [pos]

            # grab the type for when we match against pos
            for e_hit in ens:
                id_dict[e_hit["gene"]] = e_hit.get("type_of_gene")

            for p_hit in pos:
                if not primary_chrom.fullmatch(p_hit["chr"]):
                    continue

                g_id = p_hit.get("ensemblgene")
                if g_id is None:
                    continue

                # the type may be unknown (None) when the position entry refers
                # to an Ensembl id that is absent from the 'ensembl' field
                g_type = id_dict.get(g_id)
                is_protein = g_type is not None and "protein" in g_type

                if len(gene_info) == 0 or is_protein:
                    # grab any info if we haven't yet; otherwise only let a
                    # protein-coding entry replace what we already have
                    gene_info["geneid"] = g_id
                    gene_info["type"] = g_type

                if is_protein:
                    # prioritize protein coding and break out if we find one
                    found_protein = True
                    break

            if found_protein:
                # hits are ordered by quality: do not let later hits overwrite
                break

        if len(gene_info) == 0:
            # we didn't get any hits from our query
            # just use the gene-name as ens-id...
            log.warning("Unable to match {} to Ensembl ID. Using symbol for ID".format(g_name))
            gene_info["geneid"] = g_name
            gene_info["type"] = None

        # build info on the gene
        gene_info["txid"] = None
        gene_info["name"] = g_name
        gene_info["chrom"] = chrom
        gene_info["txstart"] = txstart
        gene_info["txstop"] = txstop

        # get the multi-SNP method with the best cvR2
        methods = np.array(robj.r['cv.performance'].colnames)
        types = list(robj.r['cv.performance'].rownames)
        if "rsq" not in types:
            raise ValueError("No R2 value for model {}".format(wgt_path))
        if "pval" not in types:
            raise ValueError("No R2 p-value for model {}".format(wgt_path))

        # grab the actual weights
        wgts = np.array(robj.r['wgt.matrix'])

        # a method column containing NaN weights has a NaN standard deviation; drop it
        # NOTE(review): constant columns have std 0 (not NaN) and are therefore kept
        keep = np.logical_not(np.isnan(np.std(wgts, axis=0)))
        wgts = wgts.T[keep].T
        methods = methods[keep]

        rsq_idx = types.index("rsq")
        pval_idx = types.index("pval")

        values = np.array(robj.r['cv.performance'])
        v_shape = values.shape

        # is this always stored/retrieved as 2 x M ?
        if v_shape[0] > v_shape[1]:
            values = values.T

        values = values.T[keep].T

        # reset per model so nothing leaks over from the previous row
        method = None
        pval = None
        r2idx = 0
        r2 = -100  # FUSION reports the generalized R2 which can be negative
        for idx, value in enumerate(values[rsq_idx]):
            if methods[idx] == "top1":
                continue

            if value > r2:
                r2 = value
                method = methods[idx]
                r2idx = idx
                pval = values[pval_idx, idx]

        if method is None:
            # every remaining method was 'top1' or had no comparable R2
            raise ValueError("No usable multi-SNP method for model {}".format(wgt_path))

        wgts = wgts.T[r2idx]

        # keep attributes
        attrs = dict()
        attrs["cv.R2"] = r2
        attrs["cv.R2.pval"] = pval

        # SNPs data frame
        # V1 V2 V3 V4 V5 V6
        # 11 rs2729762 0 77033699 G A
        snps = robj.r['snps']
        snp_info = pd.DataFrame({"snp": list(snps[1]),
                                 "chrom": [str(snp_chrom) for snp_chrom in snps[0]],
                                 "pos": list(snps[3]),
                                 "a1": list(snps[4]),
                                 "a0": list(snps[5])})

        # if we're using a sparse model there is no need to store info on zero'd SNPs
        keep = np.logical_not(np.isclose(wgts, 0))
        wgts = wgts[keep]
        snp_info = snp_info[keep]

        model = pf.build_model(gene_info, snp_info, db_ref_panel, wgts, ses, attrs, method)
        session.add(model)
        try:
            session.commit()
        except Exception as comm_err:
            session.rollback()
            # keep the original failure reachable through __cause__
            raise Exception("Failed committing to db") from comm_err

        count += 1
        if count % 500 == 0:
            log.info("Committed 500 models to db")

    # report the final partial batch
    if count % 500 != 0:
        log.info("Committed {} models to db".format(count % 500))

    log.info("Finished import from FUSION database {}".format(path))