def clean_up(d):
    """Normalize a GWAS dataframe: integer-ize sample_size (NaN -> "NA") and sort genomically when possible."""
    # Each sample_size becomes a plain int; missing values become the literal string "NA".
    normalized = ["NA" if math.isnan(value) else int(value) for value in d.sample_size]
    d = d.assign(sample_size=normalized)
    # Only sort when both coordinate columns are present.
    columns = d.columns.values
    if "chromosome" in columns and "position" in columns:
        d = Genomics.sort(d)
    return d
def process_original_gwas(args, imputed):
    """Load the original GWAS, drop variants superseded by imputation, merge with the imputed set, and save.

    Returns a one-column dataframe with the panel_variant_id of every saved variant.
    """
    logging.info("Processing GWAS file %s", args.gwas_file)
    g = pandas.read_table(args.gwas_file)
    g = g.assign(current_build="hg38", imputation_status="original")[COLUMN_ORDER]
    # Remember the palindromic snps are to be excluded from the input GWAS;
    logging.info("Read %d variants", g.shape[0])
    if not args.keep_all_observed:
        criteria = args.keep_criteria
        if criteria == "GTEX_VARIANT_ID":
            # Drop observed variants whose panel id also appears among the imputed ones.
            keep_mask = ~g.panel_variant_id.isin(imputed.panel_variant_id)
            g = g.loc[keep_mask]
        elif criteria == "CHR_POS":
            # Match on a chromosome-position key instead of the panel id.
            g = g.assign(k=gwas_k(g))
            imputed = imputed.assign(k=gwas_k(imputed))
            g = g.loc[~g.k.isin({x for x in imputed.k})]
            g.drop("k", axis=1, inplace=True)
            imputed.drop("k", axis=1, inplace=True)
        else:
            raise RuntimeError("Unsupported keep option")
        logging.info("Kept %d variants as observed", g.shape[0])
    g = pandas.concat([g, imputed])[COLUMN_ORDER]
    logging.info("%d variants", g.shape[0])
    logging.info("Filling median")
    g = Genomics.fill_column_to_median(g, "sample_size", numpy.int32)
    logging.info("Sorting by chromosome-position")
    g = Genomics.sort(g)
    logging.info("Saving")
    Utilities.save_dataframe(g, args.output)
    return g[["panel_variant_id"]]
def run(args):
    """Build a gene-level genotype covariance file from grouped intron models.

    Reads the intron->gene grouping, pulls model weights for each group from the
    sqlite model DBs, loads the corresponding parquet genotype dosages, and writes
    a gzipped whitespace-separated covariance file (GENE RSID1 RSID2 VALUE).
    Skips entirely if args.output already exists.
    """
    if os.path.exists(args.output):
        logging.info("Output already exists, either delete it or move it")
        return

    logging.info("Loading group")
    groups = pandas.read_table(args.group)
    # gtex_intron_id is "chrN:pos:..." — derive sortable coordinates from it.
    groups = groups.assign(chromosome=groups.gtex_intron_id.str.split(":").str.get(0))
    groups = groups.assign(position=groups.gtex_intron_id.str.split(":").str.get(1))
    groups = Genomics.sort(groups)

    logging.info("Getting parquet genotypes")
    file_map = get_file_map(args)

    logging.info("Getting genes")
    with sqlite3.connect(args.model_db_group_key) as connection:
        # Pay heed to the order. This avoids arbitrariness in sqlite3 loading of results.
        extra = pandas.read_sql("SELECT * FROM EXTRA order by gene", connection)
        extra = extra[extra["n.snps.in.model"] > 0]

    individuals = TextFileTools.load_list(args.individuals) if args.individuals else None

    logging.info("Processing")
    Utilities.ensure_requisite_folders(args.output)
    genes_ = groups[["chromosome", "position", "gene_id"]].drop_duplicates()

    with gzip.open(args.output, "w") as f:
        f.write("GENE RSID1 RSID2 VALUE\n".encode())
        with sqlite3.connect(args.model_db_group_key) as db_group_key:
            with sqlite3.connect(args.model_db_group_values) as db_group_values:
                for i, t_ in enumerate(genes_.itertuples()):
                    g_ = t_.gene_id
                    chr_ = t_.chromosome.split("chr")[1]
                    logging.log(8, "Processing %i/%i:%s", i+1, len(genes_), g_)

                    if not n_.search(chr_):
                        logging.log(9, "Unsupported chromosome: %s", chr_)
                        continue
                    dosage = file_map[int(chr_)]

                    group = groups[groups.gene_id == g_]
                    # BUGFIX: wk was previously unbound when a group had no introns,
                    # causing a NameError on the empty-wg path below.
                    wk = pandas.DataFrame(columns=["varID", "rsid"])
                    wg = []
                    for value in group.intron_id:
                        # Parameterized query: avoids breakage/injection if a gene id
                        # ever contains a quote (previously built via str.format).
                        wk = pandas.read_sql(
                            "select * from weights where gene = ?",
                            db_group_values, params=(value,))
                        if wk.shape[0] == 0:
                            continue
                        wg.append(wk)
                    if len(wg) > 0:
                        wg = pandas.concat(wg)
                        w = pandas.concat([wk, wg])[["varID", "rsid"]].drop_duplicates()
                    else:
                        w = wk[["varID", "rsid"]].drop_duplicates()
                    if w.shape[0] == 0:
                        logging.log(8, "No data, skipping")
                        continue

                    if individuals:
                        d = Parquet._read(dosage, columns=w.varID.values,
                                          specific_individuals=individuals)
                        del d["individual"]
                    else:
                        d = Parquet._read(dosage, columns=w.varID.values,
                                          skip_individuals=True)

                    var_ids = list(d.keys())
                    if len(var_ids) == 0:
                        if len(w.varID.values) == 1:
                            logging.log(9, "workaround for single missing genotype at %s", g_)
                            d = {w.varID.values[0]: [0, 1]}
                            # BUGFIX: refresh var_ids so the workaround dosage is
                            # actually used below (it was previously left empty).
                            var_ids = list(d.keys())
                        else:
                            logging.log(9, "No genotype available for %s, skipping", g_)
                            # BUGFIX: was a bare `next` (a no-op expression), so this
                            # branch fell through and computed numpy.cov([]).
                            continue

                    if args.output_rsids:
                        ids = [x for x in pandas.DataFrame({"varID": var_ids})
                               .merge(w[["varID", "rsid"]], on="varID").rsid.values]
                    else:
                        ids = var_ids

                    c = numpy.cov([d[x] for x in var_ids])
                    c = matrices._flatten_matrix_data([(g_, ids, c)])
                    for entry in c:
                        l = "{} {} {} {}\n".format(entry[0], entry[1], entry[2], entry[3])
                        f.write(l.encode())

    logging.info("Finished building covariance.")