def phewas():
    builds = request.args.getlist("build")
    if len(builds) == 0:
        raise FlaskException("Must provide build parameter", 400)

    filter_str = request.args.get("filter")
    if filter_str is None:
        raise FlaskException("No filter string specified", 400)

    fparser = FilterParser()
    stmts = fparser.statements(filter_str)
    variant = getattr(stmts.get("variant"), "value", None)
    if variant is None:
        raise FlaskException("Must provide a filter string with field 'variant' specified", 400)

    db_cols = ["id", "description", "study", "trait", "trait_label", "trait_group", "tech", "build", "pmid",
               "variant", "chromosome", "position", "ref_allele",
               "ref_allele_freq", "log_pvalue", "beta", "se", "score_test_stat"]

    # Note: this SQL is not executed directly; the rest.phewas_query stored
    # procedure called below is expected to implement the equivalent query.
    sql = """
        SELECT
          sa.id, sa.study, sa.trait, traits.label as trait_label, traits.grouping as trait_group,
          sa.tech, sa.build, sa.analysis as description, sa.pmid,
          sr.variant_name as variant, sr.chrom as chromosome, sr.pos as position,
          sr.ref_allele, sr.ref_freq as ref_allele_freq,
          sr.log_pvalue, sr.beta, sr.se, sr.score_stat as score_test_stat
        FROM rest.assoc_master sa
          JOIN rest.assoc_results sr ON sa.id = sr.id
          LEFT JOIN rest.traits ON sa.trait = traits.trait
        WHERE variant_name = :vname
          AND sa.build = ANY(:builds)
          AND traits.grouping IS NOT NULL
          AND traits.label IS NOT NULL
        ORDER BY log_pvalue DESC;
    """

    return_fmt = request.args.get("format")
    if return_fmt is None or return_fmt == "":
        return_fmt = "table"
    if return_fmt not in ("table", "objects"):
        raise FlaskException("format must be either 'table' or 'objects'", 400)

    cur = g.db.connection.cursor(cursor_factory=psycopg2.extras.DictCursor)
    cur.callproc("rest.phewas_query", [variant, builds])
    data = reshape_data(cur, db_cols, None, return_fmt)

    return jsonify({
        "meta": {
            "build": builds
        },
        "data": data,
        "lastPage": None
    })
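# Example request (illustrative; the URL prefix depends on how this blueprint is
# mounted, and the variant shown is hypothetical):
#
#   GET /statistic/phewas/?build=GRCh37&format=objects
#       &filter=variant eq '10:114758349_C/T'
#
# The response carries the requested builds under "meta" and the association
# results under "data":
#
#   {"data": [...], "meta": {"build": ["GRCh37"]}, "lastPage": null}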
def invalid_request_check():
    illegal_words = ["null", "undefined", "nan", "none", "\n", "\\n", ";", "="]
    filter_str = request.args.get("filter")
    if filter_str is not None:
        filter_str = filter_str.lower()
        for w in illegal_words:
            if w in filter_str:
                raise FlaskException(f"Invalid string {w} found in filter string", 400)
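# invalid_request_check() is intended to run before the query endpoints so that
# obviously malformed filter strings are rejected early. A minimal sketch of
# wiring it up, assuming a Flask app object named `app` (the actual registration
# in this codebase may differ):
#
#   @app.before_request
#   def _reject_bad_filters():
#       invalid_request_check()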
def single_results():
    db_table = "rest.assoc_results"
    db_cols = "id variant_name chrom pos ref_allele ref_freq log_pvalue beta se score_stat".split()

    field_to_col = dict(
        analysis="id",
        variant="variant_name",
        chromosome="chrom",
        position="pos",
        score_test_stat="score_stat",
        ref_allele_freq="ref_freq"
    )

    limit = request.args.get("limit")
    try:
        if limit is not None:
            limit = int(limit)
    except ValueError:
        raise FlaskException("Invalid limit parameter, must be integer", 400)

    return std_response(db_table, db_cols, field_to_col, limit=limit)
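# Example request (illustrative prefix and values): fetch up to 100 single-variant
# association results for one analysis in a region, using the field names mapped
# by field_to_col above:
#
#   GET /statistic/single/results/?limit=100
#       &filter=analysis in 45 and chromosome in '10' and position ge 114258349 and position le 115258349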
def get_metadata(dbid, table, schema="rest", rename=None):
    """
    Get metadata about a dataset ID (or list of IDs) from a given master table

    :param dbid: int or list of int dataset IDs
    :param table: master table, e.g. 'gwascat_master' or 'gene_master' (the schema prefix is passed separately)
    :param schema: schema that the master table belongs to
    :param rename: dictionary of old field -> new field name to return in metadata
    :return: List of dictionaries with information for each ID
    """
    if rename is None:
        rename = {}

    if isinstance(dbid, int):
        ids = (dbid,)
    else:
        try:
            ids = tuple(int(x) for x in dbid)
        except (TypeError, ValueError):
            raise FlaskException("Invalid ID type when retrieving metadata", 500)

    query = psycopg2.sql.SQL("SELECT * FROM {}.{} WHERE id in %s").format(
        psycopg2.sql.Identifier(schema),
        psycopg2.sql.Identifier(table)
    )

    cur = g.db.connection.cursor(cursor_factory=psycopg2.extras.DictCursor)
    cur.execute(query, (ids,))
    results = cur.fetchall()

    if results is None or len(results) == 0:
        return None

    final = []
    for i, row in enumerate(results):
        final.append({})
        for k, v in row.items():
            if k in rename:
                k = rename[k]
            final[i][k] = v

    return final
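# Example usage (illustrative IDs): fetch metadata for two GWAS catalog datasets,
# renaming the catalog_version column in the returned dictionaries:
#
#   meta = get_metadata([1, 2], "gwascat_master", rename={"catalog_version": "version"})
#   # -> [{"id": 1, "version": ..., ...}, {"id": 2, "version": ..., ...}]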
def genes():
    db_table = "rest.gene_data"
    db_cols = "id feature_type gene_id gene_name chrom start end strand transcript_id exon_id annotation".split()

    field_to_col = {
        "source": "id"
    }

    orig_filter = request.args.get("filter")
    if orig_filter is None:
        raise FlaskException("Filter is a required parameter for this endpoint", 400)

    fp = FilterParser()
    orig_stmts = fp.statements(orig_filter)
    build = request.args.get("build")

    if 'source' not in orig_stmts:
        if build is None:
            raise FlaskException(
                "If no gene source ID is specified via the filter parameter, the best recommended gene information "
                "source will automatically be selected, but you *must* specify the build (genome build) query "
                "parameter at a minimum"
            )

        allowed_builds = fetch_distinct_builds('gene_master')
        if build not in allowed_builds:
            raise FlaskException(f"Invalid build {build}, must be one of: {allowed_builds}")

        dataset_id = fetch_recommended_id(build, 'gene_master')
        if not dataset_id:
            raise FlaskException(f"No best recommended gene source is available for build {build}, "
                                 f"try querying the metadata endpoint to see all available gene sources")

        orig_filter += f' and source eq {dataset_id}'
        sources = [dataset_id]
    else:
        sources = orig_stmts["source"].value
        if build is not None:
            for v in sources:
                allowed_build = fetch_build_for_id(v, "gene_master", "genome_build", "rest")
                if allowed_build != build:
                    raise FlaskException(f"Invalid build {build} given for source ID {v}")

    sql_compiler = SQLCompiler()
    cols = "id gene_id gene_name chrom start end strand annotation".split()
    sql_stmt, sql_params = sql_compiler.to_sql(orig_filter, db_table, db_cols, cols, None, field_to_col)
    sql_stmt += " AND feature_type = 'gene'"

    cur = g.db.execute(text(sql_stmt), sql_params)

    dgenes = {}
    genes_arr = []
    for row in cur:
        gene_data = dict(zip(cols, row))
        annotation = gene_data.pop("annotation")
        gene_data["gene_type"] = annotation["gene_type"]

        gene = Gene(**gene_data)
        dgenes[gene_data["gene_id"]] = gene
        genes_arr.append(gene)

    # Now retrieve transcripts/exons.
    # Only retrieve if there were genes found in the region.
    skip_transcripts = request.args.get("transcripts", "").lower() in ("f", "false", "no")
    if not skip_transcripts and len(genes_arr) > 0:
        cols = "id feature_type gene_id chrom start end strand transcript_id exon_id".split()
        trans_keys = "transcript_id chrom start end strand".split()
        exon_keys = "exon_id chrom start end strand".split()

        dtranscripts = {}

        sql_stmt = (
            "SELECT {} from {} "
            "WHERE id in :p1 AND gene_id IN :p2 AND feature_type IN ('transcript','exon') "
            "ORDER BY CASE feature_type WHEN 'transcript' THEN 1 WHEN 'exon' THEN 2 ELSE 3 END "
        ).format(",".join(map(sql_compiler.quote_keywords, cols)), db_table)

        sql_params = {
            "p1": tuple(sources),
            "p2": tuple(dgenes.keys()),
        }

        cur = g.db.execute(text(sql_stmt), sql_params)

        for row in cur:
            rowd = dict(zip(cols, row))
            if rowd["feature_type"] == "transcript":
                tx_id = rowd["transcript_id"]
                gene_id = rowd["gene_id"]

                transcript = dtranscripts.get(tx_id, None)
                if transcript is None:
                    transcript_data = {k: rowd[k] for k in trans_keys}
                    transcript = Transcript(**transcript_data)
                    dtranscripts[tx_id] = transcript
                    dgenes[gene_id].add_transcript(transcript)
            elif rowd["feature_type"] == "exon":
                gene_id = rowd["gene_id"]
                tx_id = rowd["transcript_id"]

                exon_data = {k: rowd[k] for k in exon_keys}
                exon = Exon(**exon_data)

                gene = dgenes.get(gene_id)
                if gene is not None:
                    gene.add_exon(exon)

                transcript = dtranscripts.get(tx_id)
                if transcript is not None:
                    transcript.add_exon(exon)

    json_genes = [gene.to_dict() for gene in genes_arr]

    metadata = get_metadata(sources, "gene_master", "rest")

    outer = {
        "data": json_genes,
        "meta": {
            "datasets": metadata
        },
        "lastPage": None
    }

    return jsonify(outer)
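# Example request (illustrative prefix and region): let the server pick the best
# recommended gene source for a build, and skip transcripts/exons for a lighter
# response:
#
#   GET /annotation/genes/?build=GRCh37&transcripts=false
#       &filter=chrom eq '10' and start le 115258349 and end ge 114258349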
def ld_results():
    # GET request parameters
    filter_str = request.args.get("filter")
    #fields_str = request.args.get("fields")
    #sort_str = request.args.get("sort")
    #format_str = request.args.get("format")

    if filter_str is None:
        raise FlaskException("No filter string specified", 400)

    # Figure out the LD API URL to send the request through
    #base_url = "http://localhost:8888/api_ld/ld?"
    base_url = "http://portaldev.sph.umich.edu/api_ld/ld?"
    trans = LDAPITranslator()
    param_str, param_dict = trans.to_refsnp_url(filter_str)
    final_url = base_url + param_str

    # Cache
    ld_cache = RedisIntervalCache(g.redis_client)

    # Cache key for this particular request.
    # Note that in Daniel's API, for now, "reference" is implicitly
    # attached to build, reference panel, and population all at the same time.
    # In the future, it will hopefully expand to accepting parameters for all 3, and
    # then we can include this in the cache key.
    if "variant1" not in param_dict:
        raise FlaskException("Must provide variant1 in filter", 400)

    if "reference" not in param_dict:
        raise FlaskException("Must provide reference in filter", 400)

    refvariant = param_dict["variant1"]["eq"][0]
    reference = param_dict["reference"]["eq"][0]
    refpos = int(refvariant.split("_")[0].split(":")[1])

    cache_key = "{reference}__{refvariant}".format(
        reference=reference,
        refvariant=refvariant
    )

    try:
        start = int(param_dict["position2"]["ge"][0])
        end = int(param_dict["position2"]["le"][0])
    except (KeyError, ValueError):
        raise FlaskException("position2 compared to non-integer", 400)

    chromosome = param_dict["chromosome2"]["eq"][0]
    rlength = abs(end - start)

    # Is the region larger than we're willing to calculate?
    max_ld_size = current_app.config["LD_MAX_SIZE"]
    if rlength > max_ld_size:
        raise FlaskException("Requested LD window is too large, exceeded maximum of {}".format(max_ld_size), 413)

    outer = {
        "data": None,
        "lastPage": None
    }

    data = {
        "chromosome2": [],
        "position2": [],
        "rsquare": [],
        "variant2": []
    }

    outer["data"] = data

    # Do we need to compute, or is the cache sufficient?
    cache_data = None
    try:
        cache_data = ld_cache.retrieve(cache_key, start, end)
    except redis.ConnectionError:
        print("Warning: cache retrieval failed (redis was unable to connect)")
    except Exception:
        print("Error: redis connected, but retrieving data failed")
        traceback.print_exc()

    if cache_data is None:
        # Need to compute. Either the range given is larger than we've previously computed,
        # or redis is down.
        print("Cache miss for {reference}__{refvariant} in {start}-{end} ({rlength:,.2f}kb), recalculating".format(
            reference=reference,
            refvariant=refvariant,
            start=start,
            end=end,
            rlength=rlength / 1000
        ))

        # Fire off the request to the LD server.
        try:
            resp = requests.get(final_url)
        except Exception as e:
            raise FlaskException("Failed retrieving data from LD server, error was {}".format(e), 500)

        # Did it come back OK?
        if not resp.ok:
            raise FlaskException("Failed retrieving data from LD server, error was {}".format(resp.reason), 500)

        # Get JSON
        ld_json = resp.json()

        # Store in format needed for API response
        for obj in ld_json["pairs"]:
            data["chromosome2"].append(ld_json["chromosome"])
            data["position2"].append(obj["position2"])
            data["rsquare"].append(JSONFloat(obj["rsquare"]))
            data["variant2"].append(obj["name2"])

        # Store data to cache
        keep = ("name2", "rsquare")
        for_cache = dict(zip(
            (x["position2"] for x in ld_json["pairs"]),
            (dict((x, d[x]) for x in keep) for d in ld_json["pairs"])
        ))

        if len(for_cache) > 0:
            # If we actually received LD data for this variant, store it in the cache.
            try:
                ld_cache.store(cache_key, start, end, for_cache)
            except redis.ConnectionError:
                print("Warning: cache storage failed (redis was unable to connect)")
            except Exception:
                print("Error: storing data in cache failed, traceback was: ")
                traceback.print_exc()
    else:
        print("Cache *match* for {reference}__{refvariant} in {start}-{end} ({rlength:,.2f}kb), using cached data".format(
            reference=reference,
            refvariant=refvariant,
            start=start,
            end=end,
            rlength=rlength / 1000
        ))

        # We can just use the cache's data directly.
        for position, ld_pair in iteritems(cache_data):
            data["chromosome2"].append(chromosome)
            data["position2"].append(position)
            data["rsquare"].append(JSONFloat(ld_pair["rsquare"]))
            data["variant2"].append(ld_pair["name2"])

    final_resp = jsonify(outer)
    return final_resp
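# Example request (illustrative prefix and values; the filter fields here are
# the ones to_refsnp_url() expects):
#
#   GET /pair/LD/results/?filter=reference eq 1 and chromosome2 eq '10'
#       and position2 ge 114558349 and position2 le 114958349
#       and variant1 eq '10:114758349_C/T'
#
# On a cache miss, the window is fetched from the LD server and stored in redis
# keyed by "{reference}__{refvariant}"; later requests for sub-intervals of the
# same variant/reference pair are served from the cache.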
def gwascat_results():
    db_table = "rest.gwascat_data"
    db_cols = ("id variant rsid chrom pos ref alt trait trait_group risk_allele risk_frq log_pvalue "
               "or_beta genes pmid pubdate first_author study").split()

    filter_str = request.args.get("filter")
    if filter_str is None:
        raise FlaskException("No filter string specified", 400)

    fp = FilterParser()
    filter_stmts = fp.statements(filter_str)
    build = request.args.get("build")

    if 'id' not in filter_stmts:
        if build is None:
            raise FlaskException(
                "If no GWAS catalog ID is specified via the filter parameter, the best recommended catalog will "
                "automatically be selected, but you *must* specify the build (genome build) parameter at a minimum"
            )

        allowed_builds = fetch_distinct_builds('gwascat_master')
        if build not in allowed_builds:
            raise FlaskException(f"Invalid build {build}, must be one of: {allowed_builds}")

        dataset_id = fetch_recommended_id(build, 'gwascat_master')
        if not dataset_id:
            raise FlaskException(f"No best recommended GWAS catalog is available for build {build}, "
                                 f"try querying the metadata endpoint to see all available catalogs")

        filter_str += f' and id eq {dataset_id}'
    else:
        dataset_id = filter_stmts["id"].value
        if build is not None:
            for dbid in dataset_id:
                allowed_build = fetch_build_for_id(dbid, "gwascat_master", "genome_build", "rest")
                if allowed_build != build:
                    raise FlaskException(f"Invalid build {build} given for GWAS catalog ID {dbid}")

    json = std_response(db_table, db_cols, return_json=False)

    if 'decompose' in request.args:
        if isinstance(json, list):
            # This is "array of objects" format.
            new_json = []
            for entry in json:
                if len(entry["alt"]) == 1:
                    new_json.append(entry)
                else:
                    for alt in entry["alt"].split(","):
                        alt_entry = deepcopy(entry)
                        alt_entry["alt"] = alt
                        alt_entry["variant"] = "{}:{}_{}/{}".format(entry["chrom"], entry["pos"], entry["ref"], alt)
                        new_json.append(alt_entry)
        elif isinstance(json, dict):
            # This is "dictionary of arrays" format.
            new_json = OrderedDict()
            for k in json:
                new_json[k] = []

            other_keys = json.keys() - ["variant", "alt"]
            alt_array = json["alt"]
            for i in range(len(alt_array)):
                if len(alt_array[i]) == 1:
                    for k in json:
                        new_json[k].append(json[k][i])
                else:
                    for alt in alt_array[i].split(","):
                        new_variant = "{}:{}_{}/{}".format(
                            json["chrom"][i],
                            json["pos"][i],
                            json["ref"][i],
                            alt
                        )
                        new_json["variant"].append(new_variant)
                        new_json["alt"].append(alt)
                        for k in other_keys:
                            new_json[k].append(json[k][i])
        else:
            raise FlaskException("Server error, resulting json object was not dict or list", 500)

        json = new_json

    variant_format = request.args.get("variant_format")
    if variant_format == "colons":
        if isinstance(json, list):
            for entry in json:
                entry["variant"] = re.sub("[:_/]", ":", entry["variant"])
        elif isinstance(json, dict):
            for i, v in enumerate(json["variant"]):
                json["variant"][i] = re.sub("[:_/]", ":", v)
        else:
            raise FlaskException("Server error, resulting json object was not dict or list", 500)

    metadata = get_metadata(dataset_id, "gwascat_master", "rest", {"catalog_version": "version"})

    return jsonify({
        "data": json,
        "meta": {
            "datasets": metadata
        },
        "lastPage": None
    })
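# Example (illustrative prefix and region): decompose multi-allelic records and
# normalize variant strings to colon-separated form:
#
#   GET /annotation/gwascat/results/?build=GRCh37&decompose&variant_format=colons
#       &filter=chrom eq '10' and pos ge 114258349 and pos le 115258349
#
# A record with ref "G" and alt "A,C" at 10:114758349 becomes two records whose
# variants are "10:114758349:G:A" and "10:114758349:G:C".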
def recomb_results():
    # Database columns and table
    db_table = "rest.recomb_results"
    db_cols = ["id", "chromosome", "position", "recomb_rate", "pos_cm"]

    filter_str = request.args.get("filter")
    if filter_str is None:
        raise FlaskException("No filter string specified", 400)

    fp = FilterParser()
    filter_stmts = fp.statements(filter_str)
    build = request.args.get("build")

    if 'id' not in filter_stmts:
        if build is None:
            raise FlaskException(
                "If no ID is specified via the filter parameter, the best recommended recombination rate "
                "dataset will automatically be selected, but you *must* specify the build (genome build) "
                "parameter at a minimum"
            )

        allowed_builds = fetch_distinct_builds('recomb', 'build')
        if build not in allowed_builds:
            raise FlaskException(f"Invalid build {build}, must be one of: {allowed_builds}")

        dataset_id = fetch_recommended_id(build, 'recomb')
        if not dataset_id:
            raise FlaskException(f"No best recommended recombination rate dataset is available for build {build}, "
                                 f"try querying the metadata endpoint to see all available datasets")

        filter_str += f' and id eq {dataset_id}'
    else:
        dataset_id = filter_stmts["id"].value
        if build is not None:
            for dbid in dataset_id:
                allowed_build = fetch_build_for_id(dbid, "recomb", "build", "rest")
                if allowed_build != build:
                    raise FlaskException(f"Invalid build {build} given for recombination rate dataset ID {dbid}")

    matches = fp.parse(filter_str)
    lrm = fp.left_middle_right(matches)
    sql_compiler = SQLCompiler()

    def fetch(terms, limit=None):
        sql, params = sql_compiler.to_sql_parsed(terms, db_table, db_cols, limit=limit)
        rows = g.db.execute(text(sql), params).fetchall()
        return rows

    def interp(at, left, right):
        # Linear interpolation of recombination rate at position `at`, given
        # the flanking rows `left` and `right`.
        if left is None or right is None:
            return None

        left_at = left["position"]
        right_at = right["position"]
        left_r = left["recomb_rate"]
        right_r = right["recomb_rate"]

        result = dict(left)
        result["recomb_rate"] = left_r + (at - left_at) / (right_at - left_at) * (right_r - left_r)
        result["position"] = at
        return result

    left = fetch(lrm["left"], 1)
    right = fetch(lrm["right"], 1)
    middle = fetch(lrm["middle"])

    if len(left) < 1:
        left = [{"id": "chrleft", "position": lrm["range"]["left"], "chromosome": lrm["range"]["chrom"],
                 "recomb_rate": 0, "pos_cm": 0}]

    if len(right) < 1:
        right = [{"id": "chrright", "position": lrm["range"]["right"], "chromosome": lrm["range"]["chrom"],
                  "recomb_rate": 0, "pos_cm": 0}]

    if len(middle) > 0:
        left_end = interp(lrm["range"]["left"], left[0], middle[0])
        right_end = interp(lrm["range"]["right"], middle[-1], right[0])
    else:
        left_end = interp(lrm["range"]["left"], left[0], right[0])
        right_end = interp(lrm["range"]["right"], left[0], right[0])

    data = reshape_data([left_end] + middle + [right_end], db_cols)
    metadata = get_metadata(dataset_id, "recomb", "rest", {"build": "genome_build"})

    return jsonify({
        "data": data,
        "meta": {
            "datasets": metadata
        },
        "lastPage": None
    })
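# interp() above does simple linear interpolation of the recombination rate at
# the query window's endpoints. With illustrative numbers: flanking rows at
# position 1000 (rate 0.2) and position 2000 (rate 0.6) give, at position 1250,
#
#   0.2 + (1250 - 1000) / (2000 - 1000) * (0.6 - 0.2) = 0.3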