示例#1
0
def addPhenotypesToRSList(rsIDs, build="38"):
    '''
    For a list of rs IDs, collect the phenotype traits annotated to each of them

    Input: list of rs IDs (may contain "NA" placeholders), build (default: "38")
    Output: dictionary rsID --> set of trait names;
            every input ID (including "NA") is a key, IDs without annotations map to an empty set;
            traits matching "phenotype not specified" are left out
    '''
    LOGGER.debug("Input rs list: %d variants", len(rsIDs))
    # set lookup instead of scanning the input list for every returned ID
    requested = set(rsIDs)
    # raw string: "\s" in a plain string literal is an invalid escape sequence
    unspecified = re.compile(r"phenotype\s+not\s+specified")
    R = dict()
    # exclude possible NAs first
    for L in utils.chunks([x for x in rsIDs if x != "NA"],
                          config.VARIATION_POST_MAX):
        r = query.restQuery(query.makeRSPhenotypeQueryURL(build=build),
                            data=utils.list2string(L),
                            qtype="post")
        if r is None:
            continue
        LOGGER.debug(
            "\n=== phenotype query ====\n%s\n==========================\n",
            json.dumps(r, indent=4, sort_keys=True))
        for v in r:
            # only keep records keyed by an ID that was actually requested
            if v not in requested:
                continue
            R[v] = {
                p["trait"]
                for p in r[v].get("phenotypes", [])
                if not unspecified.search(p["trait"])
            }
    # everything that was not annotated above (and "NA" in any case) gets an empty set
    for v in requested - (set(R) - {"NA"}):
        R[v] = set()
    return R
示例#2
0
def addConsequencesToIDList(varIDs,
                            build="38",
                            most_severe_only=False,
                            gene_key="gene_id"):
    '''
    For a list of variant IDs, retrieve VEP consequences of each variant

    Input: list of variant IDs, build (default: "38"),
           most_severe_only (default: False),
           gene_key: field of a transcript consequence record used to group consequences (default: "gene_id")
    Output: dictionary with one entry per returned record (keyed by the record's "id" field)
            and one entry per input ID for which nothing was returned;
            values are dictionaries gene --> most severe consequence for that gene;
            if most_severe_only is True, values have a single entry {gene: overall most severe consequence};
            {"NA": "NA"} is used when no consequences are available
    '''
    LOGGER.debug("Input ID list: %d variants", len(varIDs))
    R = dict()
    # double check, make sure IDs have correct format
    for L in utils.chunks(list(filter(utils.checkID, varIDs)),
                          config.VEP_POST_MAX // 2):
        vep_input = []
        for varid in L:
            # try the ID as given first, then with reverse=True;
            # IDs failing utils.checkDEL both ways are not submitted
            V = utils.convertVariantID(varid)
            if not utils.checkDEL(V, build=build):
                V = utils.convertVariantID(varid, reverse=True)
                if not utils.checkDEL(V, build=build):
                    continue
            vep_input.append(utils.variant2vep(V))
        if not vep_input:
            # nothing valid in this chunk: don't send an empty request,
            # the IDs are reported as {"NA": "NA"} below
            LOGGER.debug("No valid variants in chunk, skipping VEP query")
            continue
        r = query.restQuery(query.makeVepListQueryURL(build=build),
                            data=json.dumps({"variants": vep_input}),
                            qtype="post")
        if r is None:
            continue
        LOGGER.debug(
            "\n======= VEP query ========\n%s\n==========================\n",
            json.dumps(r, indent=4, sort_keys=True))
        for x in r:
            rs = x["id"]
            mcsq = x.get("most_severe_consequence", "NA")
            H = dict()
            if "transcript_consequences" in x:
                # pool consequence terms of all transcripts per gene,
                # then reduce each pool to its most severe term
                for g in x["transcript_consequences"]:
                    H.setdefault(g[gene_key],
                                 []).extend(g["consequence_terms"])
                for g in H:
                    H[g] = utils.getMostSevereConsequence(H[g])
            else:
                H["NA"] = mcsq
            if most_severe_only is True:
                if mcsq == "NA":
                    R[rs] = {"NA": "NA"}
                else:
                    # last gene whose consequence equals the overall most severe one
                    g0 = "NA"
                    for g in H:
                        if H[g] == mcsq:
                            g0 = g
                    R[rs] = {g0: mcsq}
            else:
                R[rs] = H
    s = set(varIDs) - (set(R.keys()) - {"NA"})
    LOGGER.debug("No consequences found for %d IDs", len(s))
    for v in s:
        R[v] = {"NA": "NA"}
    return R
示例#3
0
def addConsequencesToRSList(rsIDs,
                            build="38",
                            most_severe_only=False,
                            gene_key="gene_id"):
    '''
    For a list of rs IDs, retrieve VEP consequences of each variant

    Input: list of rs IDs (may contain "NA" placeholders), build (default: "38"),
           most_severe_only (default: False),
           gene_key: field of a transcript consequence record used to group consequences (default: "gene_id")
    Output: dictionary with one entry per returned record (keyed by the record's "id" field)
            and one entry per input ID for which nothing was returned;
            values are dictionaries gene --> most severe consequence for that gene;
            if most_severe_only is True, values have a single entry {gene: overall most severe consequence};
            {"NA": "NA"} is used when no consequences are available
    '''
    LOGGER.debug("Input rs list: %d variants" % len(rsIDs))
    annotated = {}
    # "NA" placeholders are not submitted
    queryable = [rsid for rsid in rsIDs if rsid != "NA"]
    for batch in utils.chunks(queryable, config.VEP_POST_MAX):
        response = query.restQuery(query.makeVepRSListQueryURL(build=build),
                                   data=utils.list2string(batch),
                                   qtype="post")
        if response is None:
            continue
        LOGGER.debug(
            "\n======= VEP query ========\n%s\n==========================\n"
            % json.dumps(response, indent=4, sort_keys=True))
        for record in response:
            rs = record["id"]
            worst = record.get("most_severe_consequence", "NA")
            if "transcript_consequences" in record:
                # gather consequence terms of all transcripts gene by gene ...
                terms_by_gene = {}
                for transcript in record["transcript_consequences"]:
                    terms_by_gene.setdefault(transcript[gene_key], []).extend(
                        transcript["consequence_terms"])
                # ... and keep only the most severe term of every gene
                per_gene = {
                    gene: utils.getMostSevereConsequence(terms)
                    for gene, terms in terms_by_gene.items()
                }
            else:
                per_gene = {"NA": worst}

            if most_severe_only is not True:
                annotated[rs] = per_gene
            elif worst == "NA":
                annotated[rs] = {"NA": "NA"}
            else:
                # the last gene carrying the overall most severe consequence wins
                top_gene = "NA"
                for gene, consequence in per_gene.items():
                    if consequence == worst:
                        top_gene = gene
                annotated[rs] = {top_gene: worst}
    missing = set(rsIDs) - (set(annotated.keys()) - {"NA"})
    LOGGER.debug("No consequences found for %d rs IDs" % len(missing))
    for rsid in missing:
        annotated[rsid] = {"NA": "NA"}
    return annotated
示例#4
0
# attach the formatter to the handler and register the handler with this script's logger
# (ch, formatter and verbosity are defined earlier, outside this excerpt)
ch.setFormatter(formatter)
LOGGER.addHandler(ch)

# messages from the "varannot.utils" logger go through the same handler, at the requested level
logging.getLogger("varannot.utils").addHandler(ch)
logging.getLogger("varannot.utils").setLevel(verbosity)


def f(ID):
    '''
    Turn a variant ID with "_"-separated fields into a space separated record

    The full ID is placed right after the first two fields, the remaining fields follow,
    and three "." placeholders are appended at the end.

    Input: variant ID
    Output: string
    '''
    fields = ID.split("_")
    record = fields[:2] + [ID] + fields[2:]
    return " ".join(record) + " . . ."


#---------------------------------------------------------------------------------------------------------------------------

# Read variant IDs from standard input (one per line, trailing whitespace removed)
# and process them chunk by chunk (chunk size argument: config.VEP_POST_MAX)
for L in utils.chunks([line.rstrip() for line in sys.stdin.readlines()],
                      config.VEP_POST_MAX):
    # request body is assembled by hand: {"variants":["<record>","<record>",...]},
    # every record being the output of f() for one input ID
    string = "{\"variants\":[\"" + "\",\"".join(list(map(lambda x: f(x),
                                                         L))) + "\"]}"
    LOGGER.debug("data: %s" % (string))
    # "build" is defined outside this excerpt
    r = query.restQuery(query.makeVepListQueryURL(build=build),
                        data=string,
                        qtype="post")
    if r:
        # dump the whole response to standard output
        print(json.dumps(r, indent=4, sort_keys=True))
        for x in r:
            # rs ID: "id" of the first colocated variant only; "NA" if there is none
            rsid = "NA"
            if "colocated_variants" in x:
                if "id" in x["colocated_variants"][0]:
                    rsid = x["colocated_variants"][0]["id"]
            # most severe consequence of this record; "NA" if the field is absent
            mcsq = x[
                "most_severe_consequence"] if "most_severe_consequence" in x else "NA"
示例#5
0
def id2rs_list(varIDs, build="38", skip_non_rs=False, keep_all=True):
    '''
    For a list of variant IDs, find the corresponding rs IDs

    A batch ("fast") lookup is tried first: every ID is submitted in SPDI notation,
    both as given and converted with reverse=True. IDs that could not be mapped this way
    are then looked up one by one with id2rs_mod2 ("slow" method).

    Input: list of variant IDs, build (default: "38"),
           skip_non_rs: if True, IDs not accepted by utils.isRS are removed from the results;
                        a variant left with no IDs gets {"NA"} (default: False),
           keep_all: if not True, only one (arbitrary) ID is kept for every variant (default: True)
    Output: dictionary variant ID --> set of IDs
    '''
    H = dict()  # SPDI notation --> input variant ID
    R = dict()
    # TODO: check ID validity and if it's an rsID
    # trying fast method first
    LOGGER.debug("Input variant list: %d elements", len(varIDs))
    # two SPDI records are submitted per variant, hence half the POST limit per chunk
    chunk_size = config.VARIATION_POST_MAX // 2
    # total number of chunks (rounded up), only used in progress messages
    t = -(-len(varIDs) // chunk_size) if chunk_size > 0 else 0
    c = 0
    for L in utils.chunks(varIDs, chunk_size):
        L1 = list()
        for x in L:
            # TODO: checks
            spdi = utils.var2spdi(utils.convertVariantID(x))
            H[spdi] = x
            L1.append(spdi)
            spdi = utils.var2spdi(utils.convertVariantID(x, reverse=True))
            H[spdi] = x
            L1.append(spdi)
        # NOTE(review): the query is repeated until it succeeds, without a retry limit;
        # confirm that query.restQuery handles waiting/rate limiting
        r = None
        while r is None:
            r = query.restQuery(query.makeRSListQueryURL(build=build),
                                data=utils.list2string(L1),
                                qtype="post")
            if r is None:
                LOGGER.debug("Retrying")
        for x1 in r:
            for x2 in x1:
                if "id" in x1[x2]:
                    # "input" is the SPDI string that was submitted; map it back to the variant ID
                    v = H[x1[x2]["input"]]
                    R.setdefault(v, set()).update(x1[x2]["id"])
        c += 1
        LOGGER.debug("Chunk %d (%d) done", c, t)
    LOGGER.debug("Found rsIDs for %d variants using fast method", len(R))
    # slow method for unmapped
    unmapped = list(set(varIDs) - set(R.keys()))
    LOGGER.debug("Using slow method for %d variants", len(unmapped))
    for v in unmapped:
        R[v] = id2rs_mod2(v, build)
    if skip_non_rs == True:
        LOGGER.debug("Filtering non rs IDs")
        for v in R:
            s = set(filter(utils.isRS, R[v]))
            R[v] = s if s else {"NA"}
    if not keep_all is True:
        LOGGER.debug("Keeping only one rs ID")
        c = 0
        for v in R:
            if len(R[v]) > 1:
                # set.pop() removes an arbitrary element
                R[v] = {R[v].pop()}
                c += 1
        LOGGER.debug("Truncated %d sets", c)
    return R
示例#6
0
def getVariantsWithPhenotypes(chrom,
                              pos,
                              window=config.PHENO_WINDOW,
                              build="38"):
    '''
    For a given genomic region, return dataframe containing variants with phenotype annotations

    The region is [pos - window, pos + window], with the start clipped to 1.
    Phenotypes matching "phenotype not specified" and variants without a mapping to the given chromosome are skipped.

    Input: chromosome (string), position, window (default: config.PHENO_WINDOW), build (default: "38")
    Output: pandas dataframe with columns: "ID", "Consequence", "Location", "Phenotype", "Source", "Link";
            the dataframe is empty if the region is larger than 5Mbp, if the overlap query returns nothing,
            or if no annotated variants are found
    '''
    # same columns for empty and populated results
    columns = ["ID", "Consequence", "Location", "Phenotype", "Source", "Link"]

    start = max(pos - window, 1)
    end = pos + window

    if end - start > 5000000:
        LOGGER.error("Maximal region size allowed: 5Mbp")
        return pd.DataFrame(columns=columns)

    LOGGER.debug("%s:%d; window: %d" % (chrom, pos, window))
    variants = query.restQuery(query.makePhenoOverlapQueryURL(chrom,
                                                              start,
                                                              end,
                                                              build=build),
                               qtype="get")

    if not variants:
        # an empty (but not None) result is reported as "nothing found"
        if variants is not None:
            LOGGER.info(
                "No variants with phenotypes were found in the region %s:%d-%d"
                % (chrom, start, end))
        return pd.DataFrame(columns=columns)

    rsIDs = [var["id"] for var in variants]
    LOGGER.info(
        "%d variant(s) with phenotypes were found in the region %s:%d-%d" %
        (len(rsIDs), chrom, start, end))

    unspecified = re.compile(r"phenotype\s+not\s+specified")
    # rows are collected in a list and the dataframe is built once at the end
    rows = []
    for L in utils.chunks(rsIDs, config.VARIATION_POST_MAX):
        r = query.restQuery(query.makeRSPhenotypeQueryURL(build=build),
                            data=utils.list2string(L),
                            qtype="post")
        if not r:
            continue
        for rsID in r:
            record = r[rsID]
            # first mapping on the requested chromosome; does not depend on the phenotype
            mapping = next((m for m in record.get("mappings", [])
                            if m.get("seq_region_name") == chrom), None)
            if not mapping:
                continue
            location = chrom + ":" + str(mapping["start"])
            consequence = record.get("most_severe_consequence",
                                     "NA").replace("_", " ")
            for phenotype in record.get("phenotypes", []):
                if unspecified.search(phenotype["trait"]):
                    continue

                # link target depends on the source of the annotation
                source = phenotype["source"]
                if source == "ClinVar":
                    link = utils.makeLink(config.CLINVAR_URL + rsID,
                                          "ClinVar")
                elif source == "NHGRI-EBI GWAS catalog":
                    link = utils.makeLink(config.NHGRI_URL + rsID,
                                          "NHGRI-EBI")
                else:
                    link = utils.makeLink(config.ENSEMBL_PHENO_URL % rsID,
                                          "ENSEMBL")
                rows.append([
                    rsID, consequence, location, phenotype["trait"], source,
                    link
                ])

    return pd.DataFrame(rows, columns=columns)
示例#7
0
# ID --> "chr:pos" for records that already have a chromosome
D = dict()
# IDs of records whose chromosome field is "NA"; they are looked up via REST below
rest_in = set()
LOGGER.debug("Preparing data")
# S is defined earlier, outside this excerpt
for x in S:
    # six ";"-separated fields; with maxsplit=5 any further ";" stays in the last field (trait)
    (c, pos, ID, p, pmid, trait) = x.split(";", maxsplit=5)
    if c == "NA":
        rest_in.add(ID)
    else:
        # TODO: check uniqueness of ID --> chr:pos
        D[ID] = ":".join([c, pos])
LOGGER.debug("Done preparing data")

# ID --> value returned for it by variant.rsList2position
rest_out = dict()
LOGGER.debug("Start REST (%d records)" % len(rest_in))
# 1-based chunk counter for the log messages
count = 1
for chunk in utils.chunks(list(rest_in), config.VARIANT_RECODER_POST_MAX):
    res = variant.rsList2position(chunk, build="38", alleles=False)
    if res:
        LOGGER.debug("Done REST chunk %d" % count)
        # merge the results of this chunk into the overall mapping
        for x in res:
            rest_out[x] = res[x]
    else:
        # a failed (or empty) chunk is only reported; its IDs are not added to rest_out
        LOGGER.error("REST query for chunk %d failed" % count)
    count += 1

LOGGER.debug("Done, REST output: %d records" % len(rest_out))

#============================================================= OUTPUT ====================================================================================

LOGGER.debug("Start output")
print("CHR_ID",