Exemplo n.º 1
0
def id2rs_spdi(varid, build="38"):
    '''
    Map a variant ID (chr_pos_A1_A2) to rs IDs using SPDI based queries.

    WARNING: not reliable for indels, as several different SPDI notations
    can be equivalent.

    The ID is converted to SPDI twice (once as given, once with
    reverse=True); each SPDI is queried separately and the rs IDs found in
    both responses are merged.

    Input: variant ID, build (default: 38)
    Output: set of rs IDs (empty if the ID is malformed or nothing is found);
            an rs ID given as input is returned as a one-element set
    '''

    if utils.isRS(varid):
        return {varid}

    found = set()

    if not utils.checkID(varid):
        LOGGER.error("Variant ID %s is malformed" % varid)
        return found

    # both variant records are built first, then both SPDI strings
    forward = utils.convertVariantID(varid)
    swapped = utils.convertVariantID(varid, reverse=True)
    lookups = [(forward, utils.var2spdi(forward)),
               (swapped, utils.var2spdi(swapped))]

    for variant, spdi in lookups:
        response = query.restQuery(query.makeRSQueryURL(spdi, build=build),
                                   quiet=True)
        if response is None:
            LOGGER.debug("No results for %s" % (str(variant)))
            continue

        LOGGER.debug("Got results for %s" % (str(variant)))
        LOGGER.debug("\n%s" % json.dumps(response, indent=4, sort_keys=True))
        # the response is a list of dicts; values holding an "id" entry
        # contribute their IDs to the result
        for record in response:
            for key in record:
                if "id" in record[key]:
                    found.update(record[key]["id"])

    return found
Exemplo n.º 2
0
def addPhenotypesToRSList(rsIDs, build="38"):
    '''
    Collect phenotype traits for a list of rs IDs.

    "NA" entries are not sent to the server. IDs are queried in chunks of
    config.VARIATION_POST_MAX; traits matching "phenotype not specified"
    are dropped.

    Input: list of rs IDs (may contain "NA"), build (default: 38)
    Output: dictionary rsID --> set of trait strings; every input ID
            (including "NA") is present, mapped to an empty set when no
            phenotypes were returned for it
    '''
    LOGGER.debug("Input rs list: %d variants" % len(rsIDs))

    # compiled once; raw string avoids the invalid "\s" escape warning
    unspecified = re.compile(r"phenotype\s+not\s+specified")
    # set lookup instead of scanning the input list for every returned ID
    requested = set(rsIDs)

    R = dict()
    for L in utils.chunks([x for x in rsIDs if x != "NA"],
                          config.VARIATION_POST_MAX):
        r = query.restQuery(query.makeRSPhenotypeQueryURL(build=build),
                            data=utils.list2string(L),
                            qtype="post")
        if r is None:
            continue

        LOGGER.debug(
            "\n=== phenotype query ====\n%s\n==========================\n"
            % json.dumps(r, indent=4, sort_keys=True))
        for v in r:
            # ignore records keyed by IDs that were not asked for
            if v not in requested:
                continue
            if "phenotypes" in r[v]:
                R[v] = {
                    x["trait"]
                    for x in r[v]["phenotypes"]
                    if not unspecified.search(x["trait"])
                }
            else:
                R[v] = set()

    # IDs without a record (and "NA", if present in the input) get an empty set
    for v in requested - (set(R) - {"NA"}):
        R[v] = set()
    return R
Exemplo n.º 3
0
def addConsequencesToIDList(varIDs,
                            build="38",
                            most_severe_only=False,
                            gene_key="gene_id"):
    '''
    Annotate variant IDs with VEP consequences.

    IDs failing utils.checkID are not queried. For each remaining ID the
    converted variant is used if it passes utils.checkDEL, otherwise the
    reverse=True conversion is tried; IDs failing both are not queried.
    Queries are sent in chunks of config.VEP_POST_MAX // 2 variants.

    Input: list of variant IDs, build (default: 38),
           most_severe_only (keep only the overall most severe consequence),
           gene_key (key of a transcript consequence record used to group
           consequences, default: "gene_id")
    Output: dictionary ID --> {gene: consequence}; with most_severe_only the
            inner dictionary has a single entry. IDs for which nothing was
            returned map to {"NA": "NA"}
    '''
    LOGGER.debug("Input ID list: %d variants" % len(varIDs))
    R = dict()
    # double check, make sure IDs have correct format
    for L in utils.chunks(list(filter(utils.checkID, varIDs)),
                          config.VEP_POST_MAX // 2):
        h = {"variants": []}
        for varid in L:
            V = utils.convertVariantID(varid)
            if not utils.checkDEL(V, build=build):
                V = utils.convertVariantID(varid, reverse=True)
                if not utils.checkDEL(V, build=build):
                    continue
            h["variants"].append(utils.variant2vep(V))

        # nothing in this chunk passed the checks: do not POST an empty list
        if not h["variants"]:
            LOGGER.debug("No valid variants in chunk, skipping VEP query")
            continue

        r = query.restQuery(query.makeVepListQueryURL(build=build),
                            data=json.dumps(h),
                            qtype="post")
        if r is None:
            continue

        LOGGER.debug(
            "\n======= VEP query ========\n%s\n==========================\n"
            % json.dumps(r, indent=4, sort_keys=True))
        for x in r:
            rs = x["id"]
            mcsq = x.get("most_severe_consequence", "NA")
            H = dict()
            if "transcript_consequences" in x:
                # pool all consequence terms per gene, then reduce each pool
                for g in x["transcript_consequences"]:
                    H.setdefault(g[gene_key],
                                 []).extend(g["consequence_terms"])
                for g in H:
                    H[g] = utils.getMostSevereConsequence(H[g])
            else:
                H["NA"] = mcsq

            if most_severe_only is True:
                if mcsq == "NA":
                    R[rs] = {"NA": "NA"}
                else:
                    # the last gene whose consequence equals the overall
                    # most severe one wins; "NA" if no gene matches
                    g0 = "NA"
                    for g in H:
                        if H[g] == mcsq:
                            g0 = g
                    R[rs] = {g0: mcsq}
            else:
                R[rs] = H

    s = set(varIDs) - (set(R) - {"NA"})
    LOGGER.debug("No consequences found for %d IDs" % len(s))
    for v in s:
        R[v] = {"NA": "NA"}
    return R
Exemplo n.º 4
0
def rs2spdi(ID, build="38"):
    '''
    Return the SPDI records reported for an rs ID.

    Input: rs ID, build (default: 38)
    Output: list of distinct SPDI records in order of first appearance;
            empty list if the query returns nothing
    '''
    unique = []
    response = query.restQuery(query.makeRSQueryURL(ID, build=build))
    if not response:
        return unique

    LOGGER.debug("\n%s" % json.dumps(response, indent=4, sort_keys=True))
    for record in response:
        for key in record:
            if "spdi" not in record[key]:
                continue
            for spdi in record[key]["spdi"]:
                # keep the first occurrence only
                if spdi not in unique:
                    unique.append(spdi)

    return unique
Exemplo n.º 5
0
def rsList2position(L, build="38", alleles=False):
    '''
    Look up positions for a list of rs IDs with a single POST query.

    Input: list of rsID, build (default: 38), alleles=True/False (if True,
           records differing only in ref/alt are kept as separate entries)
    Output: a dictionary rsID --> [{"chr":c,"pos":p,"ref":r,"alt":a}, ...],
            or None if the query returns nothing. Input IDs missing from the
            response get no entry in the dictionary.
    '''

    payload = utils.list2string(L)
    endpoint = query.makeRSListQueryURL(build=build)
    response = query.restQuery(endpoint, qtype="post", data=payload)
    if not response:
        return None

    positions = {}
    for record in response:
        rsid = record["input"]
        positions[rsid] = []
        hits = positions[rsid]
        for spdi in record["spdi"]:
            parsed = query.parseSPDI(spdi, build=build, alleles=alleles)
            pos = parsed["pos"]
            chrom = parsed["chr"]
            ref = parsed["ref"]
            alt = parsed["alt"]

            # skip records already stored for this rs ID
            if alleles:
                known = any(e["chr"] == chrom and e["pos"] == pos
                            and e["ref"] == ref and e["alt"] == alt
                            for e in hits)
            else:
                known = any(e["chr"] == chrom and e["pos"] == pos
                            for e in hits)
            if known:
                continue

            hits.append({"chr": chrom, "pos": pos, "ref": ref, "alt": alt})

    return positions
Exemplo n.º 6
0
def addConsequencesToRSList(rsIDs,
                            build="38",
                            most_severe_only=False,
                            gene_key="gene_id"):
    '''
    Annotate rs IDs with VEP consequences.

    "NA" entries are not sent to the server; the rest is queried in chunks
    of config.VEP_POST_MAX.

    Input: list of rs IDs, build (default: 38),
           most_severe_only (keep only the overall most severe consequence),
           gene_key (key of a transcript consequence record used to group
           consequences, default: "gene_id")
    Output: dictionary ID --> {gene: consequence}; IDs for which nothing was
            returned (and "NA", if present in the input) map to {"NA": "NA"}
    '''
    LOGGER.debug("Input rs list: %d variants" % len(rsIDs))
    annotated = dict()

    queryable = [rsid for rsid in rsIDs if rsid != "NA"]
    for chunk in utils.chunks(queryable, config.VEP_POST_MAX):
        response = query.restQuery(query.makeVepRSListQueryURL(build=build),
                                   data=utils.list2string(chunk),
                                   qtype="post")
        if response is None:
            continue

        LOGGER.debug(
            "\n======= VEP query ========\n%s\n==========================\n"
            % json.dumps(response, indent=4, sort_keys=True))
        for record in response:
            rs = record["id"]
            if "most_severe_consequence" in record:
                mcsq = record["most_severe_consequence"]
            else:
                mcsq = "NA"

            if "transcript_consequences" in record:
                # pool the consequence terms per gene, then reduce each pool
                pooled = dict()
                for tc in record["transcript_consequences"]:
                    pooled.setdefault(tc[gene_key],
                                      []).extend(tc["consequence_terms"])
                per_gene = dict()
                for gene in pooled:
                    per_gene[gene] = utils.getMostSevereConsequence(
                        pooled[gene])
            else:
                per_gene = {"NA": mcsq}

            if most_severe_only is not True:
                annotated[rs] = per_gene
            elif mcsq == "NA":
                annotated[rs] = {"NA": "NA"}
            else:
                # the last gene carrying the overall most severe consequence
                # is reported; "NA" when no gene matches
                matching = [g for g in per_gene if per_gene[g] == mcsq]
                gene = matching[-1] if matching else "NA"
                annotated[rs] = {gene: mcsq}

    missing = set(rsIDs) - (set(annotated.keys()) - {"NA"})
    LOGGER.debug("No consequences found for %d rs IDs" % len(missing))
    for rsid in missing:
        annotated[rsid] = {"NA": "NA"}
    return annotated
Exemplo n.º 7
0
def rs2position(ID, build="38", alleles=False):
    '''
    Given rsID, return a list of dictionaries with keys "chr", "pos", "ref", "alt"

    Input: rsID, build (default: 38), alleles=True/False (if True, records
           differing only in ref/alt are kept as separate entries)
    Output: a list of dictionaries with keys "chr", "pos", "ref", "alt",
            or None if the query returns nothing
    '''

    z = query.restQuery(query.makeRSQueryURL(ID, build=build))
    if not z:
        return None

    # raw response goes to the debug log rather than stdout
    LOGGER.debug("\n%s" % json.dumps(z, indent=4, sort_keys=True))

    L = []
    for x in z:
        for x1 in x:
            rec = x[x1]
            # skip entries that carry no SPDI list instead of failing on them
            if not isinstance(rec, dict) or "spdi" not in rec:
                continue
            for spdi in rec["spdi"]:
                LOGGER.debug("SPDI: %s" % spdi)
                h = query.parseSPDI(spdi, build=build, alleles=alleles)
                p = h["pos"]
                c = h["chr"]
                ref = h["ref"]
                alt = h["alt"]
                LOGGER.debug("%s:%d:%s:%s" % (c, p, ref, alt))

                # do not store the same position (and alleles) twice
                if alleles:
                    known = any(e["chr"] == c and e["pos"] == p
                                and e["ref"] == ref and e["alt"] == alt
                                for e in L)
                else:
                    known = any(e["chr"] == c and e["pos"] == p for e in L)
                if not known:
                    L.append({"chr": c, "pos": p, "ref": ref, "alt": alt})
    return L
Exemplo n.º 8
0
def id2rs_mod2(varid, build="38"):
    '''
    For a given variant ID (chr_pos_A1_A2), return a set of matching rs IDs

    SNPs are matched directly against the variants overlapping the position.
    For other variant types all variants within a window around the position
    are fetched, the indels among them are looked up in one POST query, and
    those with an SPDI equivalent to the input (as given, or converted with
    reverse=True) are reported.

    Input: variant ID, build (default: 38)
    Output: set of rs IDs (empty if the ID is malformed, a query fails or
            nothing matches); an rs ID given as input is returned as a
            one-element set
    '''

    S = set()
    if utils.isRS(varid):
        return {varid}
    if not utils.checkID(varid):
        LOGGER.error("Variant ID %s is malformed" % varid)
        return S

    V = utils.convertVariantID(varid)
    V1 = utils.convertVariantID(varid, reverse=True)

    if utils.getVarType(V) == "SNP":
        r = query.restQuery(
            query.makeOverlapVarQueryURL(V["seq"],
                                         V["pos"],
                                         V["pos"],
                                         build=build))
        if not r:
            return S

        for v in r:
            # both alleles present, forward strand, single-base variant
            if V["del"] in v["alleles"] and V["ins"] in v["alleles"] and v[
                    "strand"] == 1 and v["start"] == v["end"]:
                S.add(v["id"])
        return S

    # the search window is as wide as the longer of the two alleles
    window = max(len(V["del"]), len(V["ins"]))
    r = query.restQuery(
        query.makeOverlapVarQueryURL(V["seq"],
                                     V["pos"] - window,
                                     V["pos"] + window,
                                     build=build))
    if not r:
        return S

    LOGGER.debug("Got %d variants around %s:%d\n" %
                 (len(r), V["seq"], V["pos"]))
    LOGGER.debug("\n%s" % json.dumps(r, indent=4, sort_keys=True))

    # only save indel IDs in L
    L = []
    for v in r:
        if "alleles" in v and "id" in v:
            if any(a == "-" or len(a) > 1 for a in v["alleles"]):
                L.append(v["id"])
    if len(L) == 0:
        LOGGER.debug("No indels found")
        return S
    LOGGER.debug("%d indels found: %s" % (len(L), str(L)))

    # TODO: check if L is larger than allowed POST size
    z1 = query.restQuery(query.makeRSListQueryURL(build=build),
                         qtype="post",
                         data=utils.list2string(L))
    LOGGER.debug(
        "\n=======================\n%s\n==========================\n" %
        json.dumps(z1, indent=4, sort_keys=True))

    # a failed or empty lookup yields no matches instead of a TypeError
    if not z1:
        return S

    LOGGER.debug("---------- CHECK START ----------------\n")
    for v in z1:
        for x1 in v:
            if "spdi" in v[x1] and "id" in v[x1]:
                var = v[x1]["id"][0]
                for spdi in v[x1]["spdi"]:
                    V2 = utils.convertSPDI(spdi, build=build)
                    LOGGER.debug("SPDI: %s; V2: %s" % (spdi, V2))
                    # one equivalent SPDI is enough for this record
                    if utils.equivalentVariants(V, V2, build=build):
                        S.add(var)
                        break
                    if utils.equivalentVariants(V1, V2, build=build):
                        S.add(var)
                        break
    LOGGER.debug("----------- CHECK END -----------------\n")
    return S
Exemplo n.º 9
0
def id2rs_list(varIDs, build="38", skip_non_rs=False, keep_all=True):
    '''
    Map a list of variant IDs to rs IDs.

    A fast method is tried first: every ID is converted to two SPDI strings
    (as given and with reverse=True) and these are looked up in chunks with
    POST queries. IDs left unmapped afterwards are passed one by one to
    id2rs_mod2 (slow method).

    Input: list of variant IDs, build (default: 38),
           skip_non_rs (replace each result set by its rs IDs only, or by
           {"NA"} if it contains none),
           keep_all (unless True, sets with several IDs are cut down to one
           arbitrary element)
    Output: dictionary variant ID --> set of IDs
    '''
    # a chunk whose query keeps failing is left to the slow method instead
    # of being retried forever
    max_attempts = 5

    H = dict()
    R = dict()
    # TODO: check ID validity and if it's an rsID
    # trying fast method first
    LOGGER.debug("Input variant list: %d elements" % len(varIDs))

    # each ID contributes two SPDI strings, hence half the POST limit
    chunk_size = config.VARIATION_POST_MAX // 2
    # total number of chunks (ceiling division), used for progress logging
    t = -(-len(varIDs) // chunk_size) if chunk_size > 0 else 0

    for c, L in enumerate(utils.chunks(varIDs, chunk_size), start=1):
        L1 = list()
        for x in L:
            # TODO: checks
            spdi = utils.var2spdi(utils.convertVariantID(x))
            H[spdi] = x
            L1.append(spdi)
            spdi = utils.var2spdi(utils.convertVariantID(x, reverse=True))
            H[spdi] = x
            L1.append(spdi)

        r = None
        for attempt in range(1, max_attempts + 1):
            r = query.restQuery(query.makeRSListQueryURL(build=build),
                                data=utils.list2string(L1),
                                qtype="post")
            if r is not None:
                break
            if attempt < max_attempts:
                LOGGER.debug("Retrying")
        if r is None:
            LOGGER.warning(
                "Chunk %d (%d) failed after %d attempts, leaving it to the slow method"
                % (c, t, max_attempts))
            continue

        for x1 in r:
            for x2 in x1:
                if "id" in x1[x2]:
                    # map the SPDI sent to the server back to the variant ID
                    v = H[x1[x2]["input"]]
                    R.setdefault(v, set()).update(x1[x2]["id"])
        LOGGER.debug("Chunk %d (%d) done" % (c, t))

    LOGGER.debug("Found rsIDs for %d variants using fast method" % len(R))

    # slow method for unmapped
    unmapped = list(set(varIDs) - set(R))
    LOGGER.debug("Using slow method for %d variants" % len(unmapped))
    for v in unmapped:
        R[v] = id2rs_mod2(v, build)

    if skip_non_rs == True:
        LOGGER.debug("Filtering non rs IDs")
        for v in R:
            s = set(filter(utils.isRS, R[v]))
            R[v] = s if s else {"NA"}

    if keep_all is not True:
        LOGGER.debug("Keeping only one rs ID")
        c = 0
        for v in R:
            if len(R[v]) > 1:
                # set.pop() removes an arbitrary element
                R[v] = {R[v].pop()}
                c += 1
        LOGGER.debug("Truncated %d sets" % c)
    return R
Exemplo n.º 10
0
                          help="varID",
                          required=True)

# No command line arguments at all: show usage and leave.
if len(sys.argv[1:]) == 0:
    parser.print_help()
    sys.exit(0)

# NOTE(review): the bare except also intercepts the SystemExit raised by
# argparse on invalid arguments, so the script prints the help text and exits
# with status 0 in that case - confirm this is intended.
try:
    args = parser.parse_args()
except:
    parser.print_help()
    sys.exit(0)

# An explicitly given build overrides the value of "build" set before this
# point (not visible in this fragment).
if args.build != None:
    build = args.build

rs = args.id
logging.getLogger("variant").setLevel(logging.DEBUG)

#---------------------------------------------------------------------------------------------------------------------------

# Query the variant record for the given ID.
data = query.restQuery(query.makeRsPhenotypeQuery2URL(rs, build))

# Collect the synonyms of the variant, leaving out the queried ID itself;
# the list stays empty if the query returned nothing or has no "synonyms".
L = list()
if data:
    if "synonyms" in data:
        L = list(filter(lambda x: x != rs, data["synonyms"]))

# One synonym per line on stdout.
for x in L:
    print(x)
Exemplo n.º 11
0
def id2rs_mod(varid, build="38"):
    '''
    For a given variant ID (chr_pos_A1_A2), return a set of matching rs IDs

    SNPs are matched directly against the variants overlapping the position.
    For other variant types every variant within a window around the
    position is queried individually, and those with an SPDI equivalent to
    the input (as given, or converted with reverse=True) are reported. Each
    conversion is only compared if it passed utils.checkDEL.

    Input: variant ID, build (default: 38)
    Output: set of rs IDs (empty if the ID is malformed, a query fails or
            nothing matches); an rs ID given as input is returned as a
            one-element set
    '''

    S = set()

    if utils.isRS(varid):
        return {varid}

    if not utils.checkID(varid):
        LOGGER.error("Variant ID %s is malformed" % varid)
        return S

    V = utils.convertVariantID(varid)
    V1 = utils.convertVariantID(varid, reverse=True)
    b = utils.checkDEL(V, build=build)
    b1 = utils.checkDEL(V1, build=build)

    if utils.getVarType(V) == "SNP":
        r = query.restQuery(
            query.makeOverlapVarQueryURL(V["seq"],
                                         V["pos"],
                                         V["pos"],
                                         build=build))
        if not r:
            return S

        for v in r:
            # both alleles present, forward strand, single-base variant
            if V["del"] in v["alleles"] and V["ins"] in v["alleles"] and v[
                    "strand"] == 1 and v["start"] == v["end"]:
                S.add(v["id"])
        return S

    # the search window is as wide as the longer of the two alleles
    window = max(len(V["del"]), len(V["ins"]))
    r = query.restQuery(
        query.makeOverlapVarQueryURL(V["seq"],
                                     V["pos"] - window,
                                     V["pos"] + window,
                                     build=build))
    if not r:
        return S

    LOGGER.debug("\n%s" % json.dumps(r, indent=4, sort_keys=True))
    LOGGER.debug("Got %d variants around %s:%d\n" %
                 (len(r), V["seq"], V["pos"]))
    for v in r:
        LOGGER.debug("Current variant: %s" % v["id"])
        z = query.restQuery(query.makeRSQueryURL(v["id"], build=build))
        if not z:
            continue

        LOGGER.debug("\n%s" % json.dumps(z, indent=4, sort_keys=True))
        for x in z:
            for x1 in x:
                rec = x[x1]
                # skip entries without SPDI/ID lists instead of failing
                if not (isinstance(rec, dict) and "spdi" in rec
                        and "id" in rec):
                    continue
                spdis = rec["spdi"]
                var = rec["id"][0]
                for spdi in spdis:
                    LOGGER.debug("SPDI: %s" % spdi)
                    V2 = utils.convertSPDI(spdi, build=build)
                    LOGGER.debug("V2: %s" % V2)
                    # one equivalent SPDI is enough for this record
                    if b and utils.equivalentVariants(V, V2, build=build):
                        S.add(var)
                        break
                    if b1 and utils.equivalentVariants(V1, V2, build=build):
                        S.add(var)
                        break

    return S
Exemplo n.º 12
0
def id2rs(varid, build="38"):
    '''
    For a given variant ID (chr_pos_A1_A2), return a set of matching rs IDs

    Only numeric chromosome names and alleles made of A, T, G, C are
    accepted. SNPs are matched against the variants overlapping the
    position; for indels every variant within a window around the position
    is queried and its SPDI records are compared to the input alleles in
    both orientations.

    Input: variant ID, build (default: 38)
    Output: set of rs IDs (empty if the ID is malformed, a query fails or
            nothing matches); an ID starting with "rs" is returned as a
            one-element set
    '''
    S = set()

    if varid.startswith("rs"):
        # always return a set, as documented
        return {varid}

    m = re.search(r"^(\d+)_(\d+)_([ATGC]+)_([ATGC]+)", varid)
    if not m:
        LOGGER.error("%s is malformed" % varid)
        return S

    chrom = m.group(1)
    pos = int(m.group(2))
    a1 = m.group(3)
    a2 = m.group(4)

    if len(a1) == 1 and len(a2) == 1:
        # SNP
        r = query.restQuery(
            query.makeOverlapVarQueryURL(chrom, pos, pos, build=build))
        if not r:
            return S

        for v in r:
            if a1 in v["alleles"] and a2 in v["alleles"]:
                S.add(v["id"])
        return S

    # in case of indels, pull all variants around the variant's position
    window = max(len(a1), len(a2))

    r = query.restQuery(
        query.makeOverlapVarQueryURL(chrom,
                                     pos - window,
                                     pos + window,
                                     build=build))
    if not r:
        return S

    for v in r:
        z = query.restQuery(query.makeRSQueryURL(v["id"], build=build))
        if not z:
            continue

        for x in z:
            spdis = x["spdi"]
            var = x["id"][0]
            for spdi in spdis:
                # NOTE(review): build is not passed to parseSPDI here -
                # confirm its default matches the requested build
                h = query.parseSPDI(spdi, alleles=True)
                ref = h["ref"]
                alt = h["alt"]
                p = h["pos"]
                c = h["chr"]
                LOGGER.debug("%s : %s : %s %d %s %s" %
                             (var, spdi, c, p, ref, alt))

                # only records at exactly the input position qualify
                if p != pos:
                    continue

                # single-base substitutions cannot match an indel
                if len(ref) == 1 and len(alt) == 1:
                    continue

                if (ref == a1 and alt == a2) or (ref == a2 and alt == a1):
                    S.add(var)
                    break
    return S
Exemplo n.º 13
0
def getVariantInfo(rs, build="38"):
    '''
    For a given variant ID, return a dictionary with variant information; keys are:
    "minor_allele"
    "MAF" : string, or "NA" if the reported value does not look like a fraction
    "rsID"
    "class" : variant class
    "synonyms" : list of synonym IDs (without the queried ID)
    "consequence" : most severe consequence
    "mappings" : list of mapping dictionaries with keys: "chr", "pos", "ref", "alt", "polyphen_score", "polyphen_prediction", "sift_score", "sift_prediction"
    "population_data" : list of dictionaries {"population": name, "frequency": {allele: frequency}} (from phase 3 of 1KG)
    "phenotype_data" : list of dictionaries with keys "trait", "source", "risk_allele"
    "clinical_significance" : list of clinical significance terms
    "scores" : dictionary mapping "chr:pos" string to a dictionary with key "gwava" (set to "NA")

    If the ID is not an rs ID but can be split by utils.splitID, a skeleton
    record with two mappings (alleles in both orientations) and None for most
    other fields is returned. None is returned if the ID cannot be split or
    if one of the queries fails.
    '''

    res = dict()

    # in case provided ID is not an RS
    if not utils.isRS(rs):
        t = utils.splitID(rs)
        if not t:
            return None

        # TODO: check if ref/alt mappings are correct: compare to reference sequence
        no_scores = {
            "polyphen_score": "NA",
            "polyphen_prediction": "NA",
            "sift_score": "NA",
            "sift_prediction": "NA"
        }
        forward = {"chr": t["chr"], "pos": t["pos"], "ref": t["a1"], "alt": t["a2"]}
        forward.update(no_scores)
        reverse = {"chr": t["chr"], "pos": t["pos"], "ref": t["a2"], "alt": t["a1"]}
        reverse.update(no_scores)
        return {
            "minor_allele": None,
            "MAF": None,
            "rsID": None,
            # NOTE(review): "class" carries the input ID here, unlike the
            # rs branch where it is the variant class - confirm intended
            "class": rs,
            "synonyms": [],
            "consequence": None,
            "mappings": [forward, reverse],
            "population_data": None,
            "phenotype_data": None,
            "clinical_significance": None,
            "scores": None
        }

    #------------------- general information ---------------

    data = query.restQuery(query.makeRsPhenotypeQuery2URL(rs, build))

    if not data:
        return None

    res["minor_allele"] = data["minor_allele"]
    # keep MAF only if it looks like a fraction (e.g. missing values become "NA")
    if re.search(r"[01]\.\d+", str(data["MAF"])):
        res["MAF"] = str(data["MAF"])
    else:
        res["MAF"] = "NA"

    res["rsID"] = rs
    res["class"] = data["var_class"]
    res["consequence"] = data["most_severe_consequence"]
    if "synonyms" in data:
        res["synonyms"] = [x for x in data["synonyms"] if x != rs]
    else:
        res["synonyms"] = []

    #------------------- mappings----------------------

    mappings = list()

    z = query.restQuery(query.makeRSQueryURL(rs, build=build))
    if z is None:
        return None

    for x in z:
        for spdi in x["spdi"]:
            # NOTE(review): build is not passed to parseSPDI here -
            # confirm its default matches the requested build
            h = query.parseSPDI(spdi, alleles=True)
            mappings.append({
                "chr": h["chr"],
                "pos": h["pos"],
                "ref": h["ref"],
                "alt": h["alt"],
                "sift_score": "NA",
                "sift_prediction": "NA",
                "polyphen_score": "NA",
                "polyphen_prediction": "NA"
            })

    #------------------ population data ----------------

    population_data = list()
    # population name --> its record in population_data, so each allele
    # frequency is merged into the record without rescanning the list
    by_population = dict()

    for pop in data["populations"]:
        pop_name = pop["population"].split(":")
        if pop_name[0] == "1000GENOMES" and pop_name[1] == "phase_3":
            name = pop_name[2]
            entry = by_population.get(name)
            if entry is None:
                entry = {"population": name, "frequency": {}}
                by_population[name] = entry
                population_data.append(entry)
            entry["frequency"][pop["allele"]] = pop["frequency"]

    #------------------ phenotype data -------------------

    phenotype_data = list()

    for p in data["phenotypes"]:
        trait = p.get("trait", "NA")
        # records with an empty trait are skipped
        if trait:
            phenotype_data.append({
                "trait": trait,
                "source": p.get("source", "NA"),
                "risk_allele": p.get("risk_allele", "NA")
            })

    #------------------ clinical significance -------------------

    clinical_significance = list()

    if "clinical_significance" in data:
        for cs in data["clinical_significance"]:
            if cs != "other" and cs != "not provided":
                clinical_significance.append(cs)

    #---------------- chr:pos dependent scores -----------------

    scores = dict()
    for m in mappings:
        scores[m["chr"] + ":" + str(m["pos"])] = {"gwava": "NA"}

    #-----------------------------------------------------

    res["mappings"] = mappings
    res["population_data"] = population_data
    res["phenotype_data"] = phenotype_data
    res["clinical_significance"] = clinical_significance
    res["scores"] = scores

    return res
Exemplo n.º 14
0
def getVariantsWithPhenotypes(chrom,
                              pos,
                              window=config.PHENO_WINDOW,
                              build="38"):
    '''
    For a given genomic region, return dataframe containing variants with phenotype annotations

    The region is [pos - window, pos + window] with the start clipped at 1;
    regions larger than 5 Mbp are rejected. Phenotypes whose trait matches
    "phenotype not specified" and variants without a mapping on the given
    chromosome are left out.

    Input: chromosome, position, window (default: config.PHENO_WINDOW), build (default: "38")
    Output: pandas dataframe with columns: "ID", "Consequence", "Location", "Phenotype", "Source", "Link"
            (one row per variant/phenotype pair; empty, with the same columns,
            if the region is too large, the query fails or nothing is found)
    '''

    # single column layout shared by the empty and the populated result
    columns = ["ID", "Consequence", "Location", "Phenotype", "Source", "Link"]

    start = max(pos - window, 1)
    end = pos + window

    if end - start > 5000000:
        LOGGER.error("Maximal region size allowed: 5Mbp")
        return pd.DataFrame(columns=columns)

    LOGGER.debug("%s:%d; window: %d" % (chrom, pos, window))
    variants = query.restQuery(query.makePhenoOverlapQueryURL(chrom,
                                                              start,
                                                              end,
                                                              build=build),
                               qtype="get")

    if not variants:
        # None means the query returned nothing usable; an empty answer
        # means the region has no annotated variants
        if variants is not None:
            LOGGER.info(
                "No variants with phenotypes were found in the region %s:%d-%d" %
                (chrom, start, end))
        return pd.DataFrame(columns=columns)

    rsIDs = [var["id"] for var in variants]
    LOGGER.info(
        "%d variant(s) with phenotypes were found in the region %s:%d-%d" %
        (len(rsIDs), chrom, start, end))

    unspecified = re.compile(r"phenotype\s+not\s+specified")

    # rows are collected first; the dataframe is built once at the end
    rows = []
    for L in utils.chunks(rsIDs, config.VARIATION_POST_MAX):
        r = query.restQuery(query.makeRSPhenotypeQueryURL(build=build),
                            data=utils.list2string(L),
                            qtype="post")
        if not r:
            continue

        for rsID in r:
            record = r[rsID]
            phenotypes = [
                p for p in record.get("phenotypes", [])
                if not unspecified.search(p["trait"])
            ]
            if not phenotypes:
                continue

            # first mapping of the variant on the requested chromosome
            x = next((m for m in record.get("mappings", [])
                      if m["seq_region_name"] == chrom), None)
            if not x:
                continue

            consequence = record["most_severe_consequence"].replace("_", " ")
            location = chrom + ":" + str(x["start"])
            for phenotype in phenotypes:
                # link to the source of the annotation, Ensembl by default
                if phenotype["source"] == "ClinVar":
                    link = utils.makeLink(config.CLINVAR_URL + rsID,
                                          "ClinVar")
                elif phenotype["source"] == "NHGRI-EBI GWAS catalog":
                    link = utils.makeLink(config.NHGRI_URL + rsID,
                                          "NHGRI-EBI")
                else:
                    link = utils.makeLink(config.ENSEMBL_PHENO_URL % rsID,
                                          "ENSEMBL")
                rows.append([
                    rsID, consequence, location, phenotype["trait"],
                    phenotype["source"], link
                ])

    return pd.DataFrame(rows, columns=columns)
Exemplo n.º 15
0
def f(ID):
    '''
    Build a space separated record from an underscore separated variant ID.

    The full ID is placed after the first two fields (or at the end, if
    there are fewer than two) and " . . ." is appended.
    '''
    fields = ID.split("_")
    record = fields[:2] + [ID] + fields[2:]
    return "%s . . ." % " ".join(record)


#---------------------------------------------------------------------------------------------------------------------------

for L in utils.chunks([line.rstrip() for line in sys.stdin.readlines()],
                      config.VEP_POST_MAX):
    string = "{\"variants\":[\"" + "\",\"".join(list(map(lambda x: f(x),
                                                         L))) + "\"]}"
    LOGGER.debug("data: %s" % (string))
    r = query.restQuery(query.makeVepListQueryURL(build=build),
                        data=string,
                        qtype="post")
    if r:
        print(json.dumps(r, indent=4, sort_keys=True))
        for x in r:
            rsid = "NA"
            if "colocated_variants" in x:
                if "id" in x["colocated_variants"][0]:
                    rsid = x["colocated_variants"][0]["id"]
            mcsq = x[
                "most_severe_consequence"] if "most_severe_consequence" in x else "NA"
            H = {}
            if "transcript_consequences" in x:
                for g in x["transcript_consequences"]:
                    gene_id = g["gene_id"]
                    csq = g["consequence_terms"][0]
Exemplo n.º 16
0
           line_width=1,
           line_dash="dashed")
    label = Label(x=pos,
                  y=e['logp'].max(),
                  text=traits,
                  angle=90,
                  angle_units="deg",
                  text_align="right",
                  text_color="firebrick",
                  text_font_size="11pt",
                  text_font_style="italic")
    p.add_layout(label)

# Query the genes overlapping the interval spanned by the smallest and the
# largest value of e['ps'], on the chromosome taken from e['#chr'][0]
# (build 38 is hard-coded here).
overlapping_genes = query.restQuery(
    query.makeGeneOverlapQueryURL(str(e['#chr'][0]),
                                  e['ps'].min(),
                                  e['ps'].max(),
                                  build="38"))
# NOTE(review): the json.dumps/json.loads round trip looks redundant if the
# response is already made of plain lists/dicts - confirm before removing.
genes_df = pd.DataFrame(json.loads(json.dumps(overlapping_genes)))
#print(json.dumps(overlapping_genes,indent=4,sort_keys=True))
#genes_df.to_csv(sys.stdout,sep="\t",index=False)

# overlapping_GWASCAT_vars=query.restQuery(query.makeOverlapVarGWASCATQueryURL(str(e['#chr'][0]),e['ps'].min(),e['ps'].max(),build="38"))
# cat=pd.DataFrame(json.loads(json.dumps(overlapping_GWASCAT_vars)))
# print(cat)
#print("")
#cat.to_csv(sys.stdout,sep="\t",index=False)

# TODO: max POST size
# pheno_vars=query.restQuery(query.makeRSPhenotypeQueryURL(build="38"),data=utils.list2string(cat["id"].tolist()),qtype="post")
# for rsid in pheno_vars:
Exemplo n.º 17
0
    '%(levelname)s - %(name)s - %(asctime)s - %(message)s',
    datefmt='%d-%b-%y %H:%M:%S')
# Attach the formatter to the handler and the handler to this script's logger.
ch.setFormatter(formatter)
LOGGER.addHandler(ch)

# The same handler, at DEBUG level, is used for the package loggers.
logging.getLogger("varannot.variant").addHandler(ch)
logging.getLogger("varannot.variant").setLevel(logging.DEBUG)
logging.getLogger("varannot.query").addHandler(ch)
logging.getLogger("varannot.query").setLevel(logging.DEBUG)

#---------------------------------------------------------------------------------------------------------------------------

# Identical IDs: nothing to compare, exit with status 0.
if rs1 == rs2:
    sys.exit(0)

data1 = query.restQuery(query.makeRsPhenotypeQuery2URL(rs1, build))
data2 = query.restQuery(query.makeRsPhenotypeQuery2URL(rs2, build))

# Synonyms of each variant, without the variant's own ID; a list stays empty
# if the query returned nothing or the record has no "synonyms".
L1 = list()
L2 = list()
if data1:
    if "synonyms" in data1:
        L1 = list(filter(lambda x: x != rs1, data1["synonyms"]))

# rs2 is listed as a synonym of rs1: exit with status 0.
if rs2 in L1:
    sys.exit(0)

if data2:
    if "synonyms" in data2:
        L2 = list(filter(lambda x: x != rs2, data2["synonyms"]))