def addPhenotypesToRSList(rsIDs, build="38"):
    '''
    For a list of rsIDs, collect the phenotype traits annotated to each of them

    Input: list of rsIDs ("NA" entries are allowed and are not queried), build (default: "38")
    Output: dictionary rsID --> set of trait strings; every element of the input
    list (including "NA") is a key, IDs without usable annotation map to an empty set
    '''
    LOGGER.debug("Input rs list: %d variants" % len(rsIDs))

    # raw string: "\s" inside a plain literal is an invalid escape sequence;
    # compiled once because it is applied to every returned phenotype
    unspecified = re.compile(r"phenotype\s+not\s+specified")

    # built once so that the membership test below does not scan the input list
    # for every record in the response
    requested = set(rsIDs)

    R = dict()
    # exclude possible NAs first
    for L in utils.chunks([x for x in rsIDs if x != "NA"], config.VARIATION_POST_MAX):
        r = query.restQuery(query.makeRSPhenotypeQueryURL(build=build),
                            data=utils.list2string(L),
                            qtype="post")
        if r is None:
            continue

        LOGGER.debug(
            "\n=== phenotype query ====\n%s\n==========================\n" %
            json.dumps(r, indent=4, sort_keys=True))

        for v in r:
            # the response may be keyed by IDs that were not part of the input
            if v not in requested:
                continue
            # records without a "phenotypes" entry end up with an empty set
            R[v] = {
                p["trait"]
                for p in r[v].get("phenotypes", [])
                if not unspecified.search(p["trait"])
            }

    # everything that was requested but not reported (and "NA" itself) gets an empty set
    for v in requested - (set(R) - {"NA"}):
        R[v] = set()

    return R
def addConsequencesToIDList(varIDs, build="38", most_severe_only=False, gene_key="gene_id"):
    '''
    For a list of variant IDs, get VEP consequences per gene

    Input: list of variant IDs (only those accepted by utils.checkID are queried),
    build (default: "38"), most_severe_only (default: False),
    gene_key: field of a transcript consequence record used as gene identifier (default: "gene_id")
    Output: dictionary ID --> {gene: consequence}; if most_severe_only is True, each
    inner dictionary has exactly one entry; input IDs for which nothing was returned
    map to {"NA": "NA"}
    '''
    LOGGER.debug("Input ID list: %d variants" % len(varIDs))
    R = dict()
    # double check, make sure IDs have correct format
    for L in utils.chunks(list(filter(utils.checkID, varIDs)), config.VEP_POST_MAX // 2):
        vep_input = []
        for varid in L:
            # try the ID as given first; only if utils.checkDEL rejects it,
            # try the reversed conversion
            V = utils.convertVariantID(varid)
            if not utils.checkDEL(V, build=build):
                V = utils.convertVariantID(varid, reverse=True)
                if not utils.checkDEL(V, build=build):
                    continue
            vep_input.append(utils.variant2vep(V))

        # nothing in this chunk passed the checks: do not send an empty request
        if not vep_input:
            LOGGER.debug("No valid variants in chunk, skipping VEP query")
            continue

        r = query.restQuery(query.makeVepListQueryURL(build=build),
                            data=json.dumps({"variants": vep_input}),
                            qtype="post")
        if r is None:
            continue

        LOGGER.debug(
            "\n======= VEP query ========\n%s\n==========================\n" %
            json.dumps(r, indent=4, sort_keys=True))

        for x in r:
            rs = x["id"]
            mcsq = x.get("most_severe_consequence", "NA")

            H = dict()
            if "transcript_consequences" in x:
                # pool consequence terms of all transcripts of a gene,
                # then reduce each pool to a single term
                for g in x["transcript_consequences"]:
                    H.setdefault(g[gene_key], []).extend(g["consequence_terms"])
                for g in H:
                    H[g] = utils.getMostSevereConsequence(H[g])
            else:
                H["NA"] = mcsq

            # identity test kept on purpose: only the literal True selects this mode
            if most_severe_only is True:
                if mcsq == "NA":
                    R[rs] = {"NA": "NA"}
                else:
                    # the last gene whose consequence equals the overall most severe one
                    g0 = "NA"
                    for g in H:
                        if H[g] == mcsq:
                            g0 = g
                    R[rs] = {g0: mcsq}
            else:
                R[rs] = H

    s = set(varIDs) - (set(R.keys()) - {"NA"})
    LOGGER.debug("No consequences found for %d IDs" % len(s))
    for v in s:
        R[v] = {"NA": "NA"}

    return R
def addConsequencesToRSList(rsIDs, build="38", most_severe_only=False, gene_key="gene_id"):
    '''
    For a list of rsIDs, get VEP consequences per gene

    Input: list of rsIDs ("NA" entries are allowed and are not queried),
    build (default: "38"), most_severe_only (default: False),
    gene_key: field of a transcript consequence record used as gene identifier (default: "gene_id")
    Output: dictionary rsID --> {gene: consequence}; if most_severe_only is True, each
    inner dictionary has exactly one entry; input IDs for which nothing was returned
    (and "NA") map to {"NA": "NA"}
    '''
    LOGGER.debug("Input rs list: %d variants" % len(rsIDs))
    R = {}

    # "NA" placeholders are never sent to the server
    queryable = [rsid for rsid in rsIDs if rsid != "NA"]

    for batch in utils.chunks(queryable, config.VEP_POST_MAX):
        response = query.restQuery(query.makeVepRSListQueryURL(build=build),
                                   data=utils.list2string(batch),
                                   qtype="post")
        if response is None:
            continue

        LOGGER.debug(
            "\n======= VEP query ========\n%s\n==========================\n" %
            json.dumps(response, indent=4, sort_keys=True))

        for record in response:
            rs = record["id"]

            if "most_severe_consequence" in record:
                mcsq = record["most_severe_consequence"]
            else:
                mcsq = "NA"

            if "transcript_consequences" in record:
                # gather the terms of all transcripts belonging to the same gene ...
                terms_by_gene = {}
                for transcript in record["transcript_consequences"]:
                    gene = transcript[gene_key]
                    if gene not in terms_by_gene:
                        terms_by_gene[gene] = []
                    terms_by_gene[gene].extend(transcript["consequence_terms"])
                # ... and keep one term per gene
                per_gene = {
                    gene: utils.getMostSevereConsequence(terms)
                    for gene, terms in terms_by_gene.items()
                }
            else:
                per_gene = {"NA": mcsq}

            if most_severe_only is not True:
                R[rs] = per_gene
            elif mcsq == "NA":
                R[rs] = {"NA": "NA"}
            else:
                # last gene carrying the overall most severe consequence wins
                chosen = "NA"
                for gene, consequence in per_gene.items():
                    if consequence == mcsq:
                        chosen = gene
                R[rs] = {chosen: mcsq}

    # a literal "NA" key is treated as "not found" as well
    found = set(R)
    found.discard("NA")
    missing = set(rsIDs) - found
    LOGGER.debug("No consequences found for %d rs IDs" % len(missing))
    for rsid in missing:
        R[rsid] = {"NA": "NA"}

    return R
ch.setFormatter(formatter) LOGGER.addHandler(ch) logging.getLogger("varannot.utils").addHandler(ch) logging.getLogger("varannot.utils").setLevel(verbosity) def f(ID): L = ID.split("_") L.insert(2, ID) return " ".join(L) + " . . ." #--------------------------------------------------------------------------------------------------------------------------- for L in utils.chunks([line.rstrip() for line in sys.stdin.readlines()], config.VEP_POST_MAX): string = "{\"variants\":[\"" + "\",\"".join(list(map(lambda x: f(x), L))) + "\"]}" LOGGER.debug("data: %s" % (string)) r = query.restQuery(query.makeVepListQueryURL(build=build), data=string, qtype="post") if r: print(json.dumps(r, indent=4, sort_keys=True)) for x in r: rsid = "NA" if "colocated_variants" in x: if "id" in x["colocated_variants"][0]: rsid = x["colocated_variants"][0]["id"] mcsq = x[ "most_severe_consequence"] if "most_severe_consequence" in x else "NA"
def id2rs_list(varIDs, build="38", skip_non_rs=False, keep_all=True):
    '''
    For a list of variant IDs, find the corresponding rsIDs

    A batched REST lookup ("fast method") is tried first: every ID is converted to
    SPDI notation twice (as given and with reverse=True) and both strings are submitted.
    IDs left without a hit are passed one by one to id2rs_mod2 ("slow method").

    Input: list of variant IDs, build (default: "38"),
    skip_non_rs: keep only IDs accepted by utils.isRS, sets that become empty
    are replaced with {"NA"} (default: False),
    keep_all: if not True, every set is reduced to one arbitrary element (default: True)
    Output: dictionary variant ID --> set of IDs
    '''
    H = dict()  # SPDI string --> variant ID it was derived from
    R = dict()

    # TODO: check ID validity and if it's an rsID

    # trying fast method first
    LOGGER.debug("Input variant list: %d elements" % len(varIDs))

    # each variant contributes two SPDI strings, hence half of the POST limit per chunk
    chunk_size = config.VARIATION_POST_MAX // 2
    # number of chunks (ceiling division), only used for progress messages
    total = -(-len(varIDs) // chunk_size) if chunk_size else 0

    for c, L in enumerate(utils.chunks(varIDs, chunk_size), start=1):
        L1 = list()
        for x in L:
            # TODO: checks
            for spdi in (utils.var2spdi(utils.convertVariantID(x)),
                         utils.var2spdi(utils.convertVariantID(x, reverse=True))):
                H[spdi] = x
                L1.append(spdi)

        # NOTE(review): retries forever while restQuery returns None, without delay
        # or upper bound -- consider limiting the number of attempts
        r = None
        while r is None:
            r = query.restQuery(query.makeRSListQueryURL(build=build),
                                data=utils.list2string(L1),
                                qtype="post")
            if r is None:
                LOGGER.debug("Retrying")

        for x1 in r:
            for x2 in x1:
                if "id" in x1[x2]:
                    v = H[x1[x2]["input"]]
                    R.setdefault(v, set()).update(x1[x2]["id"])

        LOGGER.debug("Chunk %d (%d) done" % (c, total))

    LOGGER.debug("Found rsIDs for %d variants using fast method" % len(R.keys()))

    # slow method for unmapped
    unmapped = list(set(varIDs) - set(R.keys()))
    LOGGER.debug("Using slow method for %d variants" % len(unmapped))
    for v in unmapped:
        R[v] = id2rs_mod2(v, build)

    if skip_non_rs:
        LOGGER.debug("Filtering non rs IDs")
        for v in R:
            # an empty result is replaced by the {"NA"} placeholder
            R[v] = set(filter(utils.isRS, R[v])) or {"NA"}

    # identity test kept on purpose: anything but the literal True triggers truncation
    if keep_all is not True:
        LOGGER.debug("Keeping only one rs ID")
        truncated = 0
        for v in R:
            if len(R[v]) > 1:
                R[v] = {R[v].pop()}
                truncated += 1
        LOGGER.debug("Truncated %d sets" % truncated)

    return R
def getVariantsWithPhenotypes(chrom, pos, window=config.PHENO_WINDOW, build="38"):
    '''
    For a given genomic region, return dataframe containing variants with phenotype annotations

    Input: chromosome, position, window (default: config.PHENO_WINDOW), build (default: "38")
    Output: pandas dataframe with columns: "ID", "Consequence", "Location", "Phenotype", "Source", "Link"

    An empty dataframe with the same columns is returned if the region is larger
    than 5Mbp, if the overlap query returns nothing, or if no annotation passes the filters
    '''
    columns = ["ID", "Consequence", "Location", "Phenotype", "Source", "Link"]

    start = max(pos - window, 1)
    end = pos + window

    if end - start > 5000000:
        LOGGER.error("Maximal region size allowed: 5Mbp")
        return pd.DataFrame(columns=columns)

    LOGGER.debug("%s:%d; window: %d" % (chrom, pos, window))

    variants = query.restQuery(query.makePhenoOverlapQueryURL(chrom, start, end, build=build),
                               qtype="get")

    # query returned nothing at all
    if variants is None:
        return pd.DataFrame(columns=columns)

    rsIDs = [var["id"] for var in variants]
    if not rsIDs:
        LOGGER.info(
            "No variants with phenotypes were found in the region %s:%d-%d" %
            (chrom, start, end))
        return pd.DataFrame(columns=columns)

    LOGGER.info(
        "%d variant(s) with phenotypes were found in the region %s:%d-%d" %
        (len(rsIDs), chrom, start, end))

    # raw string: "\s" inside a plain literal is an invalid escape sequence
    unspecified = re.compile(r"phenotype\s+not\s+specified")
    # mappings are matched by name and the location is built by concatenation,
    # so a chromosome given as a number has to be converted first
    chrom_name = str(chrom)

    rows = []
    for L in utils.chunks(rsIDs, config.VARIATION_POST_MAX):
        r = query.restQuery(query.makeRSPhenotypeQueryURL(build=build),
                            data=utils.list2string(L),
                            qtype="post")
        if not r:
            continue

        for rsID in r:
            record = r[rsID]

            # first mapping on the requested chromosome; it does not depend on
            # the phenotype, so it is looked up once per variant
            mapping = next((m for m in record.get("mappings", [])
                            if m["seq_region_name"] == chrom_name), None)
            if not mapping:
                continue

            consequence = record.get("most_severe_consequence", "NA").replace("_", " ")
            location = chrom_name + ":" + str(mapping["start"])

            for phenotype in record.get("phenotypes", []):
                if unspecified.search(phenotype["trait"]):
                    continue

                # link to the original source where known, to ENSEMBL otherwise
                source = phenotype["source"]
                if source == "ClinVar":
                    link = utils.makeLink(config.CLINVAR_URL + rsID, "ClinVar")
                elif source == "NHGRI-EBI GWAS catalog":
                    link = utils.makeLink(config.NHGRI_URL + rsID, "NHGRI-EBI")
                else:
                    link = utils.makeLink(config.ENSEMBL_PHENO_URL % rsID, "ENSEMBL")

                rows.append([rsID, consequence, location, phenotype["trait"], source, link])

    # build the dataframe in one go instead of appending row by row
    return pd.DataFrame(rows, columns=columns)
D = dict() rest_in = set() LOGGER.debug("Preparing data") for x in S: (c, pos, ID, p, pmid, trait) = x.split(";", maxsplit=5) if c == "NA": rest_in.add(ID) else: # TODO: check uniqueness of ID --> chr:pos D[ID] = ":".join([c, pos]) LOGGER.debug("Done preparing data") rest_out = dict() LOGGER.debug("Start REST (%d records)" % len(rest_in)) count = 1 for chunk in utils.chunks(list(rest_in), config.VARIANT_RECODER_POST_MAX): res = variant.rsList2position(chunk, build="38", alleles=False) if res: LOGGER.debug("Done REST chunk %d" % count) for x in res: rest_out[x] = res[x] else: LOGGER.error("REST query for chunk %d failed" % count) count += 1 LOGGER.debug("Done, REST output: %d records" % len(rest_out)) #============================================================= OUTPUT ==================================================================================== LOGGER.debug("Start output") print("CHR_ID",