def run(args):
    """Count occurrences of the first column's values in the input file,
    preserving first-seen order, and save the (key, count) pairs.

    Skips the first (header) line; refuses to overwrite an existing output.
    """
    if os.path.exists(args.output_file):
        logging.info("Output %s exists. Nope", args.output_file)
        return

    logging.info("Streaming file for groups")
    # dicts preserve insertion order (Python 3.7+), so a single dict replaces
    # the original parallel results_order list + results dict bookkeeping.
    counts = {}
    for i, line in Utilities.iterate_file(args.input_file):
        if i == 0:  # header line
            continue
        key = line.strip().split()[0]
        if key not in counts:
            logging.log(9, "Key: %s", str(key))
            counts[key] = 0
        counts[key] += 1

    logging.info("Producing output")
    r = pandas.DataFrame(list(counts.items()), columns=["key", "count"])

    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output_file)
    Utilities.save_dataframe(r, args.output_file)
    logging.info("Finished.")
def run(args):
    """Lift over a (chr, start, end) text file to the target assembly,
    writing a gzipped copy; rows that fail to lift are skipped."""
    Utilities.ensure_requisite_folders(args.output)
    logging.info("starting lifting over.")
    liftover = pyliftover.LiftOver(args.liftover)
    with gzip.open(args.output, "w") as _o:
        with open(args.input) as _i:
            for i, line in enumerate(_i):
                if i == 0:
                    # pass the header through, normalizing whitespace to tabs
                    header = "\t".join(line.strip().split()) + "\n"
                    _o.write(header.encode())
                    continue
                try:
                    comps = line.strip().split()
                    chromosome = comps[0]
                    start = int(comps[1])
                    end = int(comps[2])
                    _chrs, _s = _l(liftover, chromosome, start)
                    _chre, _e = _l(liftover, chromosome, end)
                    if _chrs != _chre:
                        logging.warning("{}:{}:{} have different target chromosomes: {}/{}".format(chromosome, start, end, _chrs, _chre))
                    converted = "{}\n".format("\t".join([_chrs, str(_s), str(_e)]))
                    _o.write(converted.encode())
                except Exception as e:
                    # best effort: log the offending row and keep going
                    logging.info("Error for: %s", line)
    logging.info("Finished lifting over.")
def process_original_gwas(args, imputed):
    """Combine the observed GWAS with imputed results and save the union.

    Observed variants may be dropped when they also appear among the imputed
    ones (by GTEx variant id or by chromosome/position); the merged table is
    median-filled, sorted, and written out.  Returns the panel_variant_id
    column of the saved table.
    """
    logging.info("Processing GWAS file %s", args.gwas_file)
    observed = pandas.read_table(args.gwas_file)
    observed = observed.assign(current_build="hg38", imputation_status="original")[COLUMN_ORDER]
    # Remember the palindromic snps are to be excluded from the input GWAS;
    logging.info("Read %d variants", observed.shape[0])
    if not args.keep_all_observed:
        if args.keep_criteria == "GTEX_VARIANT_ID":
            observed = observed.loc[~observed.panel_variant_id.isin(imputed.panel_variant_id)]
        elif args.keep_criteria == "CHR_POS":
            observed = observed.assign(k=gwas_k(observed))
            imputed = imputed.assign(k=gwas_k(imputed))
            observed = observed.loc[~observed.k.isin(set(imputed.k))]
            observed.drop("k", axis=1, inplace=True)
            imputed.drop("k", axis=1, inplace=True)
        else:
            raise RuntimeError("Unsupported keep option")
        logging.info("Kept %d variants as observed", observed.shape[0])
    merged = pandas.concat([observed, imputed])[COLUMN_ORDER]
    logging.info("%d variants", merged.shape[0])
    logging.info("Filling median")
    merged = Genomics.fill_column_to_median(merged, "sample_size", numpy.int32)
    logging.info("Sorting by chromosome-position")
    merged = Genomics.sort(merged)
    logging.info("Saving")
    Utilities.save_dataframe(merged, args.output)
    return merged[["panel_variant_id"]]
def __enter__(self):
    """Acquire resources: target regions, variant metadata, and the output sink.

    Returns self so the instance can serve as a context manager.
    """
    logging.info("initializing resources")
    logging.info("Loading regions")
    regions = load_regions(self.args.region_file, self.args.chromosome)
    # NOTE(review): from here on the module-level `args` is used instead of
    # `self.args` — presumably they are the same namespace object; confirm.
    if args.sub_batches and args.sub_batch is not None:
        logging.log(9, "Selecting target regions from sub-batches")
        regions = PandasHelpers.sub_batch(regions, args.sub_batches, args.sub_batch)
    self.regions = regions
    logging.info("Opening variants metadata")
    self.vmf = pq.ParquetFile(args.parquet_genotype_metadata)
    logging.info("Creating destination")
    if args.text_output:
        # single-file text sink; refuse to overwrite
        if os.path.exists(args.text_output):
            raise RuntimeError("Output exists. Nope.")
        Utilities.ensure_requisite_folders(args.text_output)
        self.of = TextFileTools.TextDataSink(args.text_output, [("region", "id1", "id2", "value")])
        self.of.initialize()
    elif args.text_output_folder:
        # per-region files go into this folder instead
        Utilities.maybe_create_folder(args.text_output_folder)
    else:
        raise RuntimeError("Unrecognized output specification")
    if (args.parquet_genotype_folder and args.parquet_genotype_pattern):
        self.file_map = get_file_map(args)
    else:
        raise RuntimeError("Unrecognized genotype specification")
    return self
def run(args):
    """Collapse input folders matching a rule into per-name output folders,
    copying (or, with --move, moving) every file they contain."""
    if not args.reentrant:
        if os.path.exists(args.output_folder):
            logging.info("Output path exists. Nope.")
            return
    Utilities.maybe_create_folder(args.output_folder)
    logging.info("Checking input folder")
    rule = re.compile(args.rule)
    folders = [x for x in sorted(os.listdir(args.input_folder)) if rule.search(x)]
    if args.exclude:
        excluded = set(args.exclude)
        folders = [x for x in folders if x not in excluded]
    # group matching folders by the rule's first capture group
    grouped = {}
    for folder in folders:
        name = rule.search(folder).group(1)
        grouped.setdefault(name, []).append(os.path.join(args.input_folder, folder))
    transfer = shutil.move if args.move else shutil.copy
    for name in sorted(grouped):
        logging.info("Processing %s", name)
        output_folder = os.path.join(args.output_folder, name)
        Utilities.maybe_create_folder(output_folder)
        for input_folder in grouped[name]:
            logging.log(8, "Processing %s", input_folder)
            for file in os.listdir(input_folder):
                source = os.path.join(input_folder, file)
                destination = os.path.join(output_folder, file)
                transfer(source, destination)
    logging.info("Finished collapse")
def run(args):
    """Regress the covariates out of every data column via OLS and save
    the residuals as a parquet variable file."""
    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output)
    logging.info("Read covariate")
    covariate = pq.read_table(args.covariate).to_pandas()
    logging.info("Read data")
    data = pq.read_table(args.data).to_pandas()
    logging.info("Processing")
    covariate_names = covariate.columns.values[1:]
    # same design formula for every column
    formula = "y ~ {}".format(" + ".join(covariate_names))
    results = {"individual": data.individual.values}
    variables = list(data.columns.values[1:])
    total = len(variables)
    for i, column in enumerate(variables):
        logging.log(9, "%i/%i:%s", i, total, column)
        d = data[["individual", column]].rename(columns={column: "y"})
        d = d.merge(covariate, on="individual", how="inner").drop("individual", axis=1)
        y, X = dmatrices(formula, data=d, return_type="dataframe")
        fitted = sm.OLS(y, X).fit()
        results[column] = fitted.resid
    residuals = pandas.DataFrame(results)[["individual"] + variables]
    Parquet.save_variable(args.output, residuals)
    logging.info("Finished")
def run(args):
    """Run COLOC for each streamed gene's eQTL against the GWAS and save
    the collected results."""
    Coloc.initialize(args.coloc_script)
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return
    start = timer()
    logging.info("Loading gwas")
    gwas = Coloc.read_gwas(args.gwas, args.gwas_sample_size, args.gwas_mode)
    streamer = Coloc.eqtl_streamer(args.eqtl, gwas)
    results = []
    logging.info("Beggining process")
    max_n = args.MAX_N
    for i, d in enumerate(streamer):
        gene = d.gene_id.values[0]
        logging.log(9, "Processing gene %s", gene)
        eqtl = Coloc.get_eqtl(d, args.eqtl_sample_size, args.eqtl_mode)
        entry = Coloc.coloc_on_gwas_eqtl(gene, gwas, eqtl, args.gwas_mode,
                                         args.eqtl_mode, args.p1, args.p2, args.p12)
        results.append(entry)
        if max_n and i > max_n:  # optional cap for debugging runs
            logging.info("Early exit")
            break
    logging.info("Saving")
    results = Coloc.results_to_dataframe(results)
    Utilities.ensure_requisite_folders(args.output)
    Utilities.save_dataframe(results, args.output)
    end = timer()
    logging.info("Finished COLOC in %s seconds" % (str(end - start)))
def run(args):
    """Run DAP over every available gene, saving per-gene stats to a file."""
    start = timer()
    if os.path.exists(args.output_folder):
        logging.info("Output folder exists. Nope.")
        return
    if os.path.exists(args.intermediate_folder):
        logging.info("Intermediate folder exists. Nope.")
        return
    stats = []
    context = DAPUtilities.context_from_args(args)
    available_genes = context.get_available_genes()
    total = len(available_genes)
    for i, gene in enumerate(available_genes):
        if args.MAX_M and i == args.MAX_M:  # optional cap for debugging runs
            break
        _start = timer()
        logging.log(8, "Processing %i/%i:%s", i + 1, total, gene)
        _stats = RunDAP.run_dap(context, gene)
        logging.log(7, "Elapsed: %s", str(timer() - _start))
        stats.append(_stats)
    end = timer()
    logging.info("Ran DAP in %s seconds" % (str(end - start)))
    Utilities.ensure_requisite_folders(args.output_folder)
    stats_name = args.stats_name if args.stats_name else "stats.txt"
    stats_path = os.path.join(args.output_folder, stats_name)
    Utilities.save_dataframe(RunDAP.data_frame_from_stats(stats).fillna("NA"), stats_path)
def run(args):
    """Convert a genotype folder to parquet (variants + metadata), then
    process each expression phenotype file found by the pattern."""
    start = timer()
    Utilities.ensure_requisite_folders(args.output_prefix)
    logging.info("Loading SNP annotation")
    snp_key = KeyedDataSource.load_data(args.snp_annotation_file, "varID",
                                        "rsid_dbSNP150",
                                        should_skip=KeyedDataSource.skip_na)
    logging.info("Loading Genotype")
    genotype, individual_ids = ModelTraining.load_genotype_folder(
        args.input_genotype_folder, args.input_genotype_file_pattern, snp_key)
    logging.info("Saving Genotype")
    Parquet.save_variants(args.output_prefix + ".variants.parquet", genotype, individual_ids)
    Parquet.save_metadata(args.output_prefix + ".variants_metadata.parquet", genotype)
    logging.info("Processing Expression Phenotype")
    expression_logic = Utilities.file_logic(args.input_phenotype_folder,
                                            args.input_phenotype_expression_pattern)
    for row in expression_logic.itertuples():
        logging.info("Phenotype: %s", row.name)
        process_phenotype(row.path, row.name, args.output_prefix)
    end = timer()
    logging.info("Finished in %s", str(end - start))
def run(args):
    """Slice a GWAS into the given genomic regions and save the slices in
    either "dapg" or "gtex_eqtl" format."""
    if os.path.exists(args.output):
        logging.info("%s exists. Nope.", args.output)
        return
    logging.info("Loading regions")
    regions = pandas.read_table(args.region_file).rename(columns={"chr": "chromosome"})
    regions.dropna(inplace=True)
    regions.start = regions.start.astype(int)
    regions.stop = regions.stop.astype(int)
    logging.info("Loading gwas")
    gwas = pandas.read_table(args.gwas_file,
                             usecols=["panel_variant_id", "chromosome", "position", "zscore"])
    gwas.dropna(inplace=True)
    logging.info("Processing")
    sliced = []
    for i, region in enumerate(regions.itertuples()):
        logging.log(8, "Processing region %d", i + 1)
        if numpy.isnan(region.start) or numpy.isnan(region.stop) or \
                (type(region.chromosome) != str and numpy.isnan(region.chromosome)):
            logging.log(8, "skipping incomplete region")
            continue
        in_region = ((gwas.chromosome == region.chromosome) &
                     (gwas.position >= region.start) &
                     (gwas.position < region.stop))
        chunk = gwas[in_region].sort_values(by="position")
        if chunk.shape[0] == 0:
            continue
        region_label = "region-{}-{}-{}".format(region.chromosome, region.start, region.stop)
        chunk = chunk.assign(region=region_label, r=i)
        sliced.append(chunk[["panel_variant_id", "region", "r", "zscore"]])
    sliced = pandas.concat(sliced).sort_values(by="r")
    if args.output_format == "dapg":
        sliced.region = sliced.r.apply(lambda x: "region{}".format(x))
        sliced = sliced.drop(["r"], axis=1)
        Utilities.save_dataframe(sliced, args.output, header=False)
    elif args.output_format == "gtex_eqtl":
        # pad with the columns the GTEx eQTL format expects
        sliced = sliced.assign(gene_id=sliced.region, variant_id=sliced.panel_variant_id,
                               tss_distance=numpy.nan, ma_samples=numpy.nan,
                               ma_count=numpy.nan, maf=numpy.nan,
                               pval_nominal=numpy.nan, slope=sliced.zscore, slope_se=1)
        sliced = sliced[["gene_id", "variant_id", "tss_distance", "ma_samples",
                         "ma_count", "maf", "pval_nominal", "slope", "slope_se"]]
        Utilities.save_dataframe(sliced, args.output, header=True)
    logging.info("Finished slicing gwas")
def sink(self, cov, ids, region):
    """Serialize one region's covariance matrix to the configured output:
    a single text sink, or per-region files (dapg or flattened) in a folder.

    Note: reads the module-level `args` for the output configuration.
    """
    logging.log(9, "Serializing covariance")
    _region = "{}_{}_{}_{}".format(region.name, region.chr, region.start, region.stop)
    if args.text_output:
        if args.dapg_output:
            raise RuntimeError("Not supported for this option")
        flat = matrices._flatten_matrix_data([(_region, ids, cov)])
        self.of.sink(flat)
    elif args.text_output_folder:
        if args.dapg_output:
            # dapg wants the raw matrix plus a companion id list
            matrix_path = os.path.join(args.text_output_folder, _region) + ".txt.gz"
            with gzip.open(matrix_path, "w") as o:
                for row in range(0, cov.shape[0]):
                    rendered = "\t".join(["{:0.4f}".format(x) for x in cov[row]]) + "\n"
                    o.write(rendered.encode())
            ids_path = os.path.join(args.text_output_folder, _region) + ".id.txt.gz"
            with gzip.open(ids_path, "w") as o:
                o.write("\n".join(ids).encode())
        else:
            flat = pandas.DataFrame(matrices._flatten_matrix_data_2(ids, cov))[["id1", "id2", "value"]]
            out_path = os.path.join(args.text_output_folder, _region) + ".txt.gz"
            Utilities.save_dataframe(flat, out_path)
def run(args):
    """Scan a genotype and keep variant metadata passing the optional MAF
    filter; optionally simplify multi-allelic variants before saving."""
    if os.path.exists(args.output):
        logging.info("Output exists. Nope")
        return
    filters = {x[0]: x[1:] for x in args.filter}
    maf_filter = float(filters["MAF"][0]) if "MAF" in filters else None
    logging.info("Loading GTEX variant map")
    gtex_snp_key = GTExMisc.load_gtex_variant_to_rsid(args.annotation[0])
    logging.info("Processing genotype")
    kept = []
    generator = ModelTraining.dosage_generator(args.genotype, gtex_snp_key,
                                               dosage_conversion=ModelTraining._mean,
                                               do_none=True)
    for mean, metadata, ids in generator:
        if maf_filter:
            # minor allele frequency derived from the mean dosage
            f = mean / 2 if mean < 1 else 1 - mean / 2
            if f < maf_filter:
                continue
        kept.append(metadata)
    m = Utilities.to_dataframe(kept, [x[1] for x in Genotype.MetadataTFE.order])
    if "TOP_CHR_POS_BY_FREQ" in filters:
        logging.info("Simplifying multi-allelic variants")
        m = Genotype._monoallelic_by_frequency(m)
    logging.info("Saving...")
    Utilities.save_dataframe(m, args.output)
    logging.info("Finished")
def run(args):
    """Convert a gzipped genotype file into model-training genotype and
    annotation files, mapping variants to rsids via the snp reference.

    Variants are matched directly or, for single-nucleotide variants, with
    their alleles swapped (flipping frequency and dosage accordingly).
    """
    Utilities.ensure_requisite_folders(args.output_prefix)
    logging.info("Loading snp reference")
    key = KeyedDataSource.load_data(args.snp_reference_file, "variant_id",
                                    "rs_id_dbSNP150_GRCh38p7",
                                    value_conversion=KeyedDataSource.dot_to_na)
    logging.info("Loading samples")
    samples = TextFileTools.load_list(args.samples)
    genotype_format_string = "\t".join(["{}"] * (len(samples) + 1)) + "\n"
    og = args.output_prefix + "_genotype.txt.gz"
    oa = args.output_prefix + "_annotation.txt.gz"
    if os.path.exists(og) or os.path.exists(oa):
        logging.info("Output exists. Nope.")
        return
    logging.info("Processing")
    with gzip.open(args.genotype) as geno:
        with gzip.open(og, "w") as _og:
            _og.write(_to_gl(["varID"] + samples, genotype_format_string))
            with gzip.open(oa, "w") as _oa:
                _oa.write(_to_al(["chromosome", "position", "id", "allele_0",
                                  "allele_1", "allele_1_frequency", "rsid"]))
                for i, line in enumerate(geno):
                    comps = line.decode().strip().split()
                    chromosome = "chr" + comps[0]
                    pos = comps[2]
                    ref = comps[3]
                    alt = comps[4]
                    af = comps[5]
                    dosage = comps[6:]
                    var_id = "{}_{}_{}_{}_b38".format(chromosome, pos, ref, alt)
                    if var_id in key:
                        rsid = key[var_id]
                        comps[1] = var_id
                        _og.write(_to_gl([var_id] + dosage, genotype_format_string))
                        _oa.write(_to_al([chromosome, pos, var_id, ref, alt, af, rsid]))
                        # BUGFIX: the original used a bare `next` here, which
                        # is a no-op expression statement; `continue` was
                        # clearly intended, otherwise a variant whose swapped
                        # id is also in the reference would be emitted twice.
                        continue
                    # try the allele-swapped id (single-nucleotide variants only)
                    var_id = "{}_{}_{}_{}_b38".format(chromosome, pos, alt, ref)
                    if var_id in key and len(ref) == 1 and len(alt) == 1:
                        rsid = key[var_id]
                        af = str(1 - float(af))
                        dosage = [str(2 - int(x)) for x in comps[6:]]
                        _og.write(_to_gl([var_id] + dosage, genotype_format_string))
                        _oa.write(_to_al([chromosome, pos, var_id, alt, ref, af, rsid]))
    logging.info("Finished conversion")
def run(args):
    """Build a prediction-model sqlite db (weights + extra tables) from a
    weight spec, variant annotation, and gene annotation."""
    if os.path.exists(args.output):
        logging.info("output exists already, delete it or move it")
        return
    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output)
    logging.info("Loading data annotation")
    gene_annotation = StudyUtilities.load_gene_annotation(args.gene_annotation)
    gene_annotation = gene_annotation.rename({"gene_name": "genename"}, axis=1)[["gene_id", "genename", "gene_type"]]
    logging.info("Loading variant annotation")
    features_metadata = pq.read_table(args.features_annotation).to_pandas()
    logging.info("Loading spec")
    weights = get_weights(args.spec)
    w = weights.merge(features_metadata[["id", "allele_0", "allele_1", "rsid"]],
                      on="id", how="left")
    w = w.rename({"allele_0": "ref_allele", "allele_1": "eff_allele", "id": "varID"}, axis=1)
    # one pseudo-gene per (gene, cluster) pair, each snp with unit weight
    w["gene"] = w.gene_id.str.cat(w.cluster_id.astype(str), sep="_")
    w = w.drop(["w", "cluster_id"], axis=1)
    w = w.sort_values(by="gene").assign(weight=1)
    logging.info("Building models")
    with sqlite3.connect(args.output) as conn:
        weight_columns = ["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]
        w.drop("gene_id", axis=1).fillna("NA")[weight_columns].to_sql("weights", conn, index=False)
        e = w[["gene_id", "gene"]].merge(gene_annotation, on="gene_id").drop("gene_id", axis=1)
        e["n_snps_in_window"] = None
        e["n.snps.in.model"] = 1
        e["pred.perf.pval"] = None
        e["pred.perf.qval"] = None
        e["pred.perf.R2"] = None
        e = e[["gene", "genename", "gene_type", "n_snps_in_window",
               "n.snps.in.model", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]]
        e.to_sql("extra", conn, index=False)
        Models.model_indexes(conn)
    logging.info("Finished")
def _get_lines(input_file, columns, in_delim, out_delim):
    """Yield `out_delim`-joined lines projected onto the requested columns.

    The first yielded line is the projected header; column order follows
    `columns`, resolved against the input header.
    """
    with Utilities.open_any(input_file) as _input:
        header = _input.readline().strip().split(in_delim)
        indexes = [header.index(x) for x in columns]
        yield _to_line(header, indexes, out_delim)
        for i, line in Utilities._iterate_file(_input):
            # NOTE(review): body lines are split on any whitespace while the
            # header is split on `in_delim` — presumably equivalent for the
            # inputs this handles; confirm if in_delim can be non-whitespace.
            comps = line.strip().split()
            yield _to_line(comps, indexes, out_delim)
def run(args):
    """Dump a model db's weights and extra tables to gzipped text files."""
    logging.info("Loading models")
    weights, extra = Models.read_model(args.input)
    for frame, suffix in ((weights, "_weights.txt.gz"), (extra, "_extra.txt.gz")):
        Utilities.save_dataframe(frame, args.output_prefix + suffix)
    logging.info("Done")
def run(args):
    """Load a variable file and persist it as a parquet variable file."""
    start = timer()
    Utilities.ensure_requisite_folders(args.parquet_output)
    logging.info("Loading variable")
    loaded = ModelTraining.load_variable_file(args.variable_file)
    logging.info("Saving")
    Parquet.save_variable(args.parquet_output, loaded)
    logging.info("Finished in %s", str(timer() - start))
def save_expression(intermediate_folder, gene, d_, features_data_):
    """Write one expression file per data kind for `gene`, keyed by sample id."""
    y_folder = _y_folder(intermediate_folder, gene)
    os.makedirs(y_folder)
    for kind, frame in d_.items():
        if gene not in frame:
            logging.log(8, "%s not present in %s", gene, kind)
            continue
        path = os.path.join(y_folder, kind) + ".txt"
        projected = frame.merge(features_data_[["individual", "id"]],
                                on="individual")[["id", gene]]
        Utilities.save_dataframe(projected, path, header=False)
def save_x(intermediate_folder, gene, features_, features_data_):
    """Write the dosage matrix and the SNP info file for `gene`."""
    dosages = features_data_.drop("individual", axis=1)
    Utilities.save_dataframe(dosages, _x_path(intermediate_folder, gene),
                             header=False, sep=" ")
    info = features_[["id", "allele_0", "allele_1"]].rename(
        columns={"id": "SNP", "allele_0": "REF.0.", "allele_1": "ALT.1."})
    Utilities.save_dataframe(info, _info_path(intermediate_folder, gene))
def run(args):
    """Filter model summaries and weights by prediction performance, then
    export a sqlite db plus filtered covariances, and/or text dumps."""
    logging.info("Loading model summaries")
    extra = _read_2(args.input_prefix, "_summary.txt.gz")
    extra = extra[extra["n.snps.in.model"] > 0]
    if "rho_avg" in extra:
        extra = extra[(extra["pred.perf.pval"] < 0.05) & (extra.rho_avg > 0.1)]
    else:
        extra = extra[(extra["pred.perf.pval"] < 0.05)]
        extra = extra.assign(rho_avg=None)
    if "pred.perf.qval" not in extra:
        extra["pred.perf.qval"] = None
    if "nested_cv_converged" in extra:
        extra.nested_cv_converged = extra.nested_cv_converged.astype(numpy.int32)
    logging.info("Loading weights")
    weights = _read_2(args.input_prefix, "_weights.txt.gz")
    weights = weights[weights.gene.isin(extra.gene)]
    if args.output_prefix:
        logging.info("Saving dbs and covariance")
        db = args.output_prefix + ".db"
        logging.info("Saving db")
        Models.create_model_db(db, extra, weights)
        logging.info("Processing covariances")
        genes = set(extra.gene)
        folder, prefix = os.path.split(args.input_prefix)
        pattern = re.compile(prefix + "_covariance.txt.gz")
        files = sorted(x for x in os.listdir(folder) if pattern.search(x))
        files = [os.path.join(folder, x) for x in files]
        cov = args.output_prefix + ".txt.gz"
        with gzip.open(cov, "w") as cov_:
            cov_.write("GENE RSID1 RSID2 VALUE\n".encode())
            for nf, f in enumerate(files):
                logging.log(9, "file %i/%i: %s", nf, len(files), f)
                with gzip.open(f) as f_:
                    f_.readline()  # skip the per-file header
                    for l in f_:
                        gene = l.decode().strip().split()[0]
                        if gene not in genes:  # keep only surviving models
                            continue
                        cov_.write(l)
    if args.output_prefix_text:
        logging.info("Saving text output")
        Utilities.save_dataframe(weights, args.output_prefix_text + "_t_weights.txt")
        Utilities.save_dataframe(extra, args.output_prefix_text + "_t_extra.txt")
    logging.info("Done")
def run(args):
    """Run DAP-G on summary stats for each region, saving per-region stats."""
    start = timer()
    if os.path.exists(args.output_folder):
        logging.info("Output folder exists. Nope.")
        return
    if os.path.exists(args.intermediate_folder):
        logging.info("Intermediate folder exists. Nope.")
        return
    os.makedirs(args.intermediate_folder)
    os.makedirs(args.output_folder)
    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.parquet_genotype_metadata).to_pandas()
    else:
        # row groups are laid out per chromosome (1-based argument)
        features_metadata = pq.ParquetFile(args.parquet_genotype_metadata) \
            .read_row_group(args.chromosome - 1).to_pandas()
    logging.info("Opening features")
    features = pq.ParquetFile(args.parquet_genotype)
    logging.info("Opening summary stats")
    summary_stats = load_summary_stats(args.summary_stats)
    summary_stats = summary_stats[summary_stats.variant_id.isin(features_metadata.id)]
    regions = summary_stats[["region_id"]].drop_duplicates()
    if args.sub_batches is not None and args.sub_batch is not None:
        regions = PandasHelpers.sub_batch(regions, args.sub_batches, args.sub_batch)
    stats = []
    for i, region in enumerate(regions.itertuples()):
        logging.log(9, "Region %i/%i:%s", i, regions.shape[0], region.region_id)
        stats.append(run_dapg(region, features, features_metadata, summary_stats,
                              args.intermediate_folder, args.output_folder,
                              args.options, args.dap_command,
                              not args.keep_intermediate_folder))
    stats_path = os.path.join(args.output_folder, "stats.txt")
    Utilities.save_dataframe(RunDAP.data_frame_from_stats(stats).fillna("NA"), stats_path)
    end = timer()
    logging.info("Ran DAP in %s seconds" % (str(end - start)))
def run(args):
    """Filter the input file down to whitelisted samples and save it."""
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return
    logging.info("Loading samples")
    samples = set(TextFileTools.load_list(args.samples_whitelist))
    logging.info("Processing file")
    Utilities.ensure_requisite_folders(args.output)
    filtered = input_generator(args.input_file, samples)
    Utilities.write_iterable_to_file(filtered, args.output)
    logging.info("Finished")
def run(args):
    """Concatenate per-trait/tissue result files into one annotated table.

    Each input file gains `trait` and `tissue` columns; optional gene
    annotation enables FAST_ENLOC-specific postprocessing.
    """
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return
    Utilities.ensure_requisite_folders(args.output)
    logging.info("Acquiring files")
    logic = Utilities.file_logic_2(args.input_folder, args.input_pattern,
                                   args.name_subfield, args.input_filter)
    trait_map = None
    if args.trait_map:
        logging.info("Loading file mapping")
        trait_map = get_trait_map(args.trait_map)
    gene_id_map, gene_name_map = None, None
    if args.gene_annotation:
        logging.info("Loading gene annotation")
        gene_id_map, gene_name_map = get_gene_map(args.gene_annotation)
    logging.info("Processing files")
    r = []
    for f in logic.itertuples():
        logging.info("Processing %s", f.file)
        names = get_header_names(args.header_names)
        header = 'infer' if not names else None
        if args.separator == ",":
            d = pandas.read_csv(f.path, header=header, names=names)
        elif args.separator is None:
            # raw string avoids the invalid-escape warning for "\s+";
            # also reuse `names` instead of re-reading the header spec
            d = pandas.read_table(f.path, header=header, names=names, sep=r"\s+")
        else:
            raise RuntimeError("Unsupported separator")
        if args.specific_post_processing == "FAST_ENLOC":
            d = fast_enloc_postprocessing(d, gene_id_map, gene_name_map)
        elif args.specific_post_processing:
            raise RuntimeError("Unsupported postprocessing option")
        # BUGFIX: without --trait_map the original crashed on
        # `trait_map[f.trait]` (None is not subscriptable); fall back to
        # the file's own trait field when no map is provided.
        trait = trait_map[f.trait] if trait_map is not None else f.trait
        d = d.assign(trait=trait, tissue=f.tissue)
        r.append(d)
    r = pandas.concat(r)
    logging.info("Saving")
    Utilities.save_dataframe(r, args.output)
    logging.info("Finished processing.")
def run(args):
    """Generate parquet genotypes, split by chromosome or as a single file."""
    start = timer()
    Utilities.ensure_requisite_folders(args.output_prefix)
    logging.info("Loading SNP annotation")
    # TODO: make more generic
    variant_key = get_variant_key(args)
    backend = generate_multi_backend if args.split_by_chromosome else generate_single_backend
    backend(args, variant_key)
    logging.info("Finished in %s", str(timer() - start))
def run(args):
    """Split a variant file into per-sub-job chunks by genomic region,
    writing alongside each chunk the gene list it covers."""
    logging.info("Loading annotation")
    annotation = pandas.read_table(args.input_annotation)
    logging.info("Loading region")
    regions, genes = build_regions(annotation, args.chromosome, args.sub_jobs, args.window)
    file_name = os.path.split(args.input_file)[1]
    name = file_name.split(".txt.gz")[0]
    logging.info("Saving gene lists")
    for i in range(args.sub_jobs):
        gene_path = os.path.join(args.output_folder, name) + "_{}_genes.txt.gz".format(i + 1)
        with gzip.open(gene_path, "w") as f:
            for gene in genes[i]:
                f.write("{}\n".format(gene).encode())
    logging.info("Processing file")
    outputs = [os.path.join(args.output_folder, name) + "_{}.txt.gz".format(i)
               for i in range(1, args.sub_jobs + 1)]
    Utilities.ensure_requisite_folders(outputs[0])
    output_files = [gzip.open(x, "w") for x in outputs]
    with gzip.open(args.input_file) as input_file:
        # every chunk gets the header
        header = input_file.readline()
        for f in output_files:
            f.write(header)
        for line in input_file:
            comps = line.decode().strip().split()
            # position is the second underscore-separated field of the id
            pos = int(comps[0].split("_")[1])
            targets = regions[(regions.start <= pos) & (pos < regions.end)]
            for target in targets.itertuples():
                output_files[target.Index].write(line)
    logging.info("Finalizing output files")
    for f in output_files:
        f.close()
    logging.info("Finished")
def run(args):
    """Build a prediction-model sqlite db from model-input weights, a variant
    annotation (variant -> rsid), and a data annotation."""
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return
    Utilities.ensure_requisite_folders(args.output)
    logging.info("Loading variant annotation")
    variants = KeyedDataSource.load_data(args.variant_annotation, "variant_id", args.rsid_column)
    logging.info("Loading data annotation")
    if len(args.data_annotation) == 1:
        feature_type = "gene"
    elif len(args.data_annotation) == 2:
        feature_type = args.data_annotation[1]
    else:
        raise RuntimeError("Unsupported annotation length")
    data_annotation = pandas.read_table(args.data_annotation[0])
    data_annotation = data_annotation[["gene_id", "gene_name", "feature_type", "gene_type"]][
        data_annotation.feature_type == feature_type].drop_duplicates()
    logging.info("Loading model_input")
    data = pandas.read_table(args.model_input,
                             usecols=["gene_id", "gene_name", "variant", "weight"])
    logging.info("Processing")
    if args.model_filter and args.model_filter[1] == "PIP":
        # keep only (gene, variant) pairs passing the dap-g PIP threshold
        w = Miscellaneous.dapg_signals(args.model_filter[0], float(args.model_filter[2]), variants)
        w = w.rename(columns={"gene": "gene_id", "variant_id": "variant"})
        data = data.merge(w[["gene_id", "variant"]], on=["gene_id", "variant"])
    v = pandas.DataFrame([(k, variants[k]) for k in data.variant.drop_duplicates()],
                         columns=["variant", "rsid"])
    # variants without an rsid fall back to their own id
    v.loc[v.rsid == ".", "rsid"] = v.loc[v.rsid == ".", "variant"]
    weights = data.merge(v, on="variant")
    pattern = "(.*)_(.*)_(.*)_(.*)_b38"
    weights = weights.assign(
        ref_allele=weights.variant.str.replace(pattern, lambda x: x.group(3)),
        eff_allele=weights.variant.str.replace(pattern, lambda x: x.group(4)))
    weights = weights.rename(columns={"variant": "varID", "gene_id": "gene"})[
        ["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]]
    extra = data.groupby("gene_id").size().to_frame("n.snps.in.model").reset_index()
    extra = extra.merge(data_annotation[["gene_id", "gene_name", "gene_type"]], on="gene_id")
    extra["pred.perf.pval"] = None
    extra["pred.perf.qval"] = None
    extra["pred.perf.R2"] = None
    extra = extra[["gene_id", "gene_name", "gene_type", "n.snps.in.model",
                   "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]].rename(
        columns={"gene_id": "gene", "gene_name": "genename"})
    logging.info("Saving db")
    Models.create_model_db(args.output, extra, weights)
    logging.info("Done")
def run(args):
    """Parse and harmonize a GWAS file: map columns, fill coordinates,
    lift over, enrich from reference metadata, and save in canonical order."""
    if os.path.exists(args.output):
        logging.info("output path %s exists. Nope.", args.output)
        return
    start = timer()
    logging.info("Parsing input GWAS")
    d = GWAS.load_gwas(args.gwas_file, args.output_column_map,
                       force_special_handling=args.force_special_handling,
                       skip_until_header=args.skip_until_header,
                       separator=args.separator,
                       handle_empty_columns=args.handle_empty_columns,
                       input_pvalue_fix=args.input_pvalue_fix,
                       enforce_numeric_columns=args.enforce_numeric_columns)
    logging.info("loaded %d variants", d.shape[0])
    d = pre_process_gwas(args, d)
    if args.fill_from_snp_info:
        d = fill_coords(args, d)
    if args.chromosome_format:
        # normalize to integers, then render as "chr{N}"
        d = d.assign(chromosome=Genomics.to_int(d.chromosome))
        d = d.assign(chromosome=["chr{}".format(x) for x in d.chromosome])
    if args.liftover:
        d = liftover(args, d)
    if args.snp_reference_metadata:
        extra_cols = load_extra_col_key_value_pairs(args.meta_extra_col)
        d = fill_from_metadata(args, d, extra_col_dict=extra_cols)
    if args.output_order:
        # pad missing columns with NaN so the projection never fails
        for c in args.output_order:
            if c not in d:
                d = d.assign(**{c: numpy.nan})
        d = d[args.output_order]
    d = clean_up(d)
    logging.info("Saving...")
    Utilities.save_dataframe(d, args.output, fill_na=True)
    end = timer()
    logging.info("Finished converting GWAS in %s seconds", str(end - start))
def run(args):
    """Convert a Gencode annotation file to a table, with optional column
    selection/renaming and versioned gene-id composition."""
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return
    if args.output_column_map:
        selected = [x[0] for x in args.output_column_map]
    else:
        selected = [Gencode.GFTF.K_GENE_ID, Gencode.GFTF.K_GENE_NAME,
                    Gencode.GFTF.K_GENE_TYPE]
    logging.info("Loading Gencode")
    gencode = Gencode.load(args.gencode_file,
                           feature_type_whitelist={x for x in args.feature_type_whitelist},
                           gene_type_white_list={x for x in args.gene_type_whitelist},
                           transcript_type_whitelist={x for x in args.transcript_type_whitelist},
                           selected_key_value_pairs=selected)
    logging.info("Converting format")
    base_columns = ["chromosome", "start_location", "end_location", "feature_type", "strand"]
    if args.output_column_map:
        gencode = gencode.rename(columns={x[0]: x[1] for x in args.output_column_map})
        if "gene_version" in gencode and "gene_id" in gencode:
            # compose versioned gene ids, then drop the bare version column
            gencode["gene_id"] = gencode.gene_id + "." + gencode.gene_version
            keep = base_columns + [x[1] for x in args.output_column_map
                                   if x[1] not in {"gene_version"}]
            gencode = gencode[keep]
        else:
            gencode = gencode[base_columns + [x[1] for x in args.output_column_map]]
    logging.info("Saving")
    Utilities.save_dataframe(gencode, args.output)
    logging.info("Finished")
def process_imputed(args):
    """Append imputed GWAS chunks to the output, deriving p-values from
    zscores; return the keys of imputed variants per the keep criteria."""
    pattern = re.compile(args.pattern)
    files = sorted(x for x in os.listdir(args.folder) if pattern.search(x))
    count = 0
    keys = set()
    for i, file in enumerate(files):
        logging.info("Processing imputed %s", file)
        p = os.path.join(args.folder, file)
        g = pandas.read_table(p)
        if g.shape[0] == 0:
            logging.info("Empty set of results for %s", p)
            continue
        count += g.shape[0]
        g.drop(["n", "n_indep", "most_extreme_z"], axis=1, inplace=True)
        g.rename(columns={"effect_allele_frequency": "frequency",
                          "status": "imputation_status"}, inplace=True)
        # two-sided p-value from the zscore; fields unavailable for imputed
        # results are padded with NaN
        g = g.assign(pvalue=2 * stats.norm.sf(numpy.abs(g.zscore)),
                     effect_size=numpy.nan,
                     standard_error=numpy.nan,
                     sample_size=numpy.nan,
                     current_build="hg38")
        g = g[COLUMN_ORDER]
        # first chunk creates the file with header; the rest append
        Utilities.save_dataframe(g, args.output, mode="a" if i > 0 else "w", header=i == 0)
        if not args.keep_all_observed:
            if args.keep_criteria == "GTEX_VARIANT_ID":
                keys.update(g.panel_variant_id.values)
            elif args.keep_criteria == "CHR_POS":
                chr_pos = g.apply(lambda x: "{}_{}".format(x.chromosome, int(x.position)), axis=1)
                keys.update(chr_pos)
            else:
                raise RuntimeError("Unsupported keep option")
    logging.info("Processed %d imputed variants", count)
    return keys
def run(args):
    """Build a covariance file (GENE RSID1 RSID2 VALUE) for each model's
    variants from per-chromosome parquet genotypes."""
    if os.path.exists(args.output):
        logging.info("Output already exists, either delete it or move it")
        return
    logging.info("Getting parquet genotypes")
    file_map = get_file_map(args)
    logging.info("Getting variants")
    gene_variants = get_gene_variant_list(args.model_db_folder, args.model_db_file_pattern)
    genes = list(gene_variants.gene.drop_duplicates())
    Utilities.ensure_requisite_folders(args.output)
    logging.info("Processing")
    with gzip.open(args.output, "w") as f:
        f.write("GENE RSID1 RSID2 VALUE\n".encode())
        for i, g in enumerate(gene_variants.gene.drop_duplicates()):
            logging.log(9, "Proccessing %i/%i:%s", i + 1, len(genes), g)
            w = gene_variants[gene_variants.gene == g]
            # chromosome number parsed from ids like "chr10_..."
            chr_ = w.varID.values[0].split("_")[0].split("chr")[1]
            if not n_.search(chr_):
                logging.log(9, "Unsupported chromosome: %s", chr_)
                continue
            dosage = file_map[int(chr_)]
            d = Parquet._read(dosage, columns=w.varID.values, skip_individuals=True)
            var_ids = list(d.keys())
            if args.output_rsids:
                mapped = pandas.DataFrame({"varID": var_ids}).merge(w[["varID", "rsid"]], on="varID")
                ids = [x for x in mapped.rsid.values]
            else:
                ids = var_ids
            c = numpy.cov([d[x] for x in var_ids])
            flattened = matrices._flatten_matrix_data([(w.gene.values[0], ids, c)])
            for entry in flattened:
                rendered = "{} {} {} {}\n".format(entry[0], entry[1], entry[2], entry[3])
                f.write(rendered.encode())
    logging.info("Finished building covariance.")