def run(args):
    if os.path.exists(args.cs_output) or os.path.exists(args.var_output):
        logging.info("Output exists. Nope.")
        return

    study, variants_whitelist = get_study(args.parquet_genotype_folder, args.parquet_genotype_pattern, args.parquet_genotype_metadata)

    columns = ["maf", "pval_nominal", "slope", "slope_se"]
    eqtl_streamer = DataFrameStreamer.data_frame_streamer(args.eqtl, sanitize=True, to_numeric=columns, sentinel_column="gene_id")
    individuals = None if not args.restrict_to_individuals else TextFileTools.load_list(args.restrict_to_individuals)
    genes = None if not args.restrict_to_genes else set(TextFileTools.load_list(args.restrict_to_genes))

    cs_results = []
    var_results = []
    logging.info("Beginning process")
    MAX_N = args.MAX_N
    n = args.sample_size
    for i, d in enumerate(eqtl_streamer):
        if MAX_N and i > MAX_N:
            logging.info("Early exit")
            break
        gene = d.gene_id.values[0]
        if genes is not None and gene.split('.')[0] not in genes:
            logging.log(9, "Skipping gene: %s", gene)
            continue
        logging.log(9, "Processing gene %i:%s", i + 1, gene)
        # Keep only entries with a defined, non-zero effect size and a defined standard error
        d = d.loc[(~d.slope_se.isnull()) & (d.slope != 0) & (~d.slope.isnull())]
        try:
            res_, d_ = _do_susie(d, study, variants_whitelist, n, individuals, args.mode)
            cs, vars_ = _process_result(res_, d_, gene)
        except Exception:
            logging.log(9, "Error while doing susie:\n%s", traceback.format_exc())
            cs = _void_cs("susie_error").assign(gene_id=gene, pp_sum=None)
            vars_ = _void_var().assign(gene_id=[gene], var_id=[None])
        cs_results.append(cs)
        var_results.append(vars_)

    if len(cs_results) > 0:
        logging.info("Saving")
        cs_results = pandas.concat(cs_results)[["gene_id", "cs", "cs_avg_r2", "cs_log10bf", "cs_min_r2", "var_id", "pp_sum", "status"]]
        Utilities.ensure_requisite_folders(args.cs_output)
        Utilities.save_dataframe(cs_results, args.cs_output)
    else:
        logging.info("No results")

    if len(var_results) > 0:
        var_results = pandas.concat(var_results)[["gene_id", "var_id", "cs", "variable_prob"]]
        Utilities.ensure_requisite_folders(args.var_output)
        Utilities.save_dataframe(var_results, args.var_output)

    logging.info("Ran susie")

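# Note: `_do_susie`, `_process_result`, `_void_cs` and `_void_var` are defined elsewhere in
# this module. As a point of reference only, the void placeholders would need roughly the
# following shape for the column selections in `run` above to hold; this is a sketch inferred
# from the calling code, not the actual implementation:
def _example_void_cs(status):
    # One all-NA credible-set row carrying only the failure status; `gene_id` and
    # `pp_sum` get assigned by the caller.
    return pandas.DataFrame({"cs": [None], "cs_avg_r2": [None], "cs_log10bf": [None],
                             "cs_min_r2": [None], "var_id": [None], "status": [status]})

def _example_void_var():
    # One all-NA per-variant row; `gene_id` and `var_id` get assigned by the caller.
    return pandas.DataFrame({"cs": [None], "variable_prob": [None]})
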
def __enter__(self):
    logging.info("Initializing resources")

    logging.info("Loading regions")
    regions = load_regions(self.args.region_file, self.args.chromosome)
    if self.args.sub_batches and self.args.sub_batch is not None:
        logging.log(9, "Selecting target regions from sub-batches")
        regions = PandasHelpers.sub_batch(regions, self.args.sub_batches, self.args.sub_batch)
    self.regions = regions

    logging.info("Opening variants metadata")
    self.vmf = pq.ParquetFile(self.args.parquet_genotype_metadata)

    logging.info("Creating destination")
    if self.args.text_output:
        if os.path.exists(self.args.text_output):
            raise RuntimeError("Output exists. Nope.")
        Utilities.ensure_requisite_folders(self.args.text_output)
        self.of = TextFileTools.TextDataSink(self.args.text_output, [("region", "id1", "id2", "value")])
        self.of.initialize()
    elif self.args.text_output_folder:
        Utilities.maybe_create_folder(self.args.text_output_folder)
    else:
        raise RuntimeError("Unrecognized output specification")

    if self.args.parquet_genotype_folder and self.args.parquet_genotype_pattern:
        self.file_map = get_file_map(self.args)
    else:
        raise RuntimeError("Unrecognized genotype specification")

    return self

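# `get_file_map` is defined elsewhere in this codebase; a minimal sketch of the
# chromosome -> ParquetFile mapping it is expected to return (assuming
# `parquet_genotype_pattern` captures the chromosome number in its first group;
# the capture-group convention is an assumption for illustration) might be:
def _example_get_file_map(args):
    import re
    r = re.compile(args.parquet_genotype_pattern)
    files = {}
    for f in sorted(os.listdir(args.parquet_genotype_folder)):
        m = r.search(f)
        if not m:
            continue
        # Keyed by chromosome number, so callers can do `file_map[int(chr_)]`.
        files[int(m.group(1))] = pq.ParquetFile(os.path.join(args.parquet_genotype_folder, f))
    return files
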
def run(args):
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Loading snp reference")
    key = KeyedDataSource.load_data(args.snp_reference_file, "variant_id", "rs_id_dbSNP150_GRCh38p7", value_conversion=KeyedDataSource.dot_to_na)

    logging.info("Loading samples")
    samples = TextFileTools.load_list(args.samples)
    genotype_format_string = "\t".join(["{}"] * (len(samples) + 1)) + "\n"

    og = args.output_prefix + "_genotype.txt.gz"
    oa = args.output_prefix + "_annotation.txt.gz"
    if os.path.exists(og) or os.path.exists(oa):
        logging.info("Output exists. Nope.")
        return

    logging.info("Processing")
    with gzip.open(args.genotype) as geno:
        with gzip.open(og, "w") as _og:
            _og.write(_to_gl(["varID"] + samples, genotype_format_string))
            with gzip.open(oa, "w") as _oa:
                _oa.write(_to_al(["chromosome", "position", "id", "allele_0", "allele_1", "allele_1_frequency", "rsid"]))
                for i, line in enumerate(geno):
                    comps = line.decode().strip().split()
                    chr_ = "chr" + comps[0]
                    pos = comps[2]
                    ref = comps[3]
                    alt = comps[4]
                    af = comps[5]
                    dosage = comps[6:]

                    # Try the variant as given
                    var_id = "{}_{}_{}_{}_b38".format(chr_, pos, ref, alt)
                    if var_id in key:
                        rsid = key[var_id]
                        comps[1] = var_id
                        _og.write(_to_gl([var_id] + dosage, genotype_format_string))
                        _oa.write(_to_al([chr_, pos, var_id, ref, alt, af, rsid]))
                        continue

                    # Try the allele-swapped variant; flip frequency and dosages accordingly
                    var_id = "{}_{}_{}_{}_b38".format(chr_, pos, alt, ref)
                    if var_id in key and len(ref) == 1 and len(alt) == 1:
                        rsid = key[var_id]
                        af = str(1 - float(af))
                        dosage = [str(2 - int(x)) for x in comps[6:]]
                        _og.write(_to_gl([var_id] + dosage, genotype_format_string))
                        _oa.write(_to_al([chr_, pos, var_id, alt, ref, af, rsid]))

    logging.info("Finished conversion")

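# The allele-swap branch above relies on two identities for biallelic variants: if the
# reference stores the variant with ref/alt swapped, the alt-allele frequency becomes
# 1 - af and each dosage x in {0, 1, 2} becomes 2 - x. A self-contained illustration:
def _example_flip_dosages(af, dosages):
    # af: alt-allele frequency as a float; dosages: iterable of integer allele counts.
    return 1.0 - af, [2 - x for x in dosages]

# _example_flip_dosages(0.3, [0, 1, 2]) -> (0.7, [2, 1, 0])
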
def metadata_white_list(black_list_path, column, variants):
    w = {x for x in variants}
    if black_list_path:
        b = TextFileTools.load_column(black_list_path, column, unique_entries=True, white_list=w)
        w = {x for x in w if x not in b}
    return w

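# Illustrative use of `metadata_white_list` (the path is hypothetical): keep every variant
# name in `d.variant_id` except those listed in the blacklist's "name" column.
#
#     kept = metadata_white_list("snp_blacklist.txt", "name", d.variant_id)
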
def fill_coords(args, d):
    logging.info("Loading SNP metadata whitelist")
    w = metadata_white_list(args.snp_info_blacklist, "name", d.variant_id)

    logging.info("Loading SNP specification")
    s = TextFileTools.load_dataframe(args.fill_from_snp_info, keys=w, key_column_name="name").rename(columns={"start": "position"})

    d = d.merge(s, left_on="variant_id", right_on="name", how="left")
    logging.info("%d variants after filling coordinates", d.shape[0])
    return d

def run(args): if os.path.exists(args.output): logging.info("Output exists. Nope.") return logging.info("Loading samples") samples = {x for x in TextFileTools.load_list(args.samples_whitelist)} logging.info("Processing file") Utilities.ensure_requisite_folders(args.output) Utilities.write_iterable_to_file(input_generator(args.input_file, samples), args.output) logging.info("Finished")
def run(args):
    wp = args.output_prefix + "_weights.txt.gz"
    if os.path.exists(wp):
        logging.info("Weights output exists already, delete it or move it")
        return

    sp = args.output_prefix + "_summary.txt.gz"
    if os.path.exists(sp):
        logging.info("Summary output exists already, delete it or move it")
        return

    cp = args.output_prefix + "_covariance.txt.gz"
    if os.path.exists(cp):
        logging.info("Covariance output exists already, delete it or move it")
        return

    r = args.output_prefix + "_run.txt.gz"
    if os.path.exists(r):
        logging.info("Run output exists already, delete it or move it")
        return

    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    data = pq.ParquetFile(args.data)
    available_data = {x for x in data.metadata.schema.names}

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities.load_gene_annotation(args.data_annotation, args.chromosome, args.sub_batches, args.sub_batch)
    data_annotation = data_annotation[data_annotation.gene_id.isin(available_data)]
    if args.gene_whitelist:
        logging.info("Applying gene whitelist")
        data_annotation = data_annotation[data_annotation.gene_id.isin(set(args.gene_whitelist))]
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(args.features_annotation).read_row_group(args.chromosome - 1).to_pandas()

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(features_metadata, data_annotation, args.window)

    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = set(TextFileTools.load_list(args.rsid_whitelist))
        features_metadata = features_metadata[features_metadata.rsid.isin(whitelist)]

    if args.features_weights:
        logging.info("Loading weights")
        x_weights = get_weights(args.features_weights, {x for x in features_metadata.id})
        logging.info("Filtering features metadata to those available in weights")
        features_metadata = features_metadata[features_metadata.id.isin(x_weights.id)]
        logging.info("Kept %d entries", features_metadata.shape[0])
    else:
        x_weights = None

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    s = numpy.random.randint(1e8)
    set_seed(s)
    if args.run_tag:
        d = pandas.DataFrame({"run": [args.run_tag], "cv_seed": [s]})[["run", "cv_seed"]]
        Utilities.save_dataframe(d, r)

    WEIGHTS_FIELDS = ["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]
    SUMMARY_FIELDS = ["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model",
                      "test_R2_avg", "test_R2_sd", "cv_R2_avg", "cv_R2_sd", "in_sample_R2",
                      "nested_cv_fisher_pval", "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore",
                      "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]

    train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols

    with gzip.open(wp, "w") as w:
        w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode())
        with gzip.open(sp, "w") as s:
            s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode())
            with gzip.open(cp, "w") as c:
                c.write("GENE RSID1 RSID2 VALUE\n".encode())
                for i, data_annotation_ in enumerate(data_annotation.itertuples()):
                    if args.MAX_M and i >= args.MAX_M:
                        logging.info("Early abort")
                        break
                    logging.log(9, "processing %i/%i:%s", i + 1, data_annotation.shape[0], data_annotation_.gene_id)
                    if args.repeat:
                        for j in range(0, args.repeat):
                            logging.log(9, "%i-th reiteration", j)
                            process(w, s, c, data, data_annotation_, features, features_metadata, x_weights,
                                    SUMMARY_FIELDS, train, j, args.nested_cv_folds)
                    else:
                        process(w, s, c, data, data_annotation_, features, features_metadata, x_weights,
                                SUMMARY_FIELDS, train, nested_folds=args.nested_cv_folds)

    logging.info("Finished")

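# `set_seed` is defined elsewhere; since the log message says "Setting R seed" and the
# training routines delegate to R, it presumably forwards the seed through rpy2. A minimal
# sketch under that assumption (not the actual implementation):
def _example_set_seed(seed):
    from rpy2 import robjects
    robjects.r["set.seed"](seed)  # seed R's RNG so cross-validation folds are reproducible
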
def run(args):
    Utilities.maybe_create_folder(args.intermediate_folder)
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    p_ = re.compile(args.data_name_pattern)
    f = [x for x in sorted(os.listdir(args.data_folder)) if p_.search(x)]
    tissue_names = [p_.search(x).group(1) for x in f]
    data = []
    for i in range(0, len(tissue_names)):
        logging.info("Loading %s", tissue_names[i])
        data.append((tissue_names[i], pq.ParquetFile(os.path.join(args.data_folder, f[i]))))
    data = collections.OrderedDict(data)
    available_data = {x for p in data.values() for x in p.metadata.schema.names}

    logging.info("Preparing output")
    WEIGHTS_FIELDS = ["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]
    SUMMARY_FIELDS = ["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model",
                      "rho_avg", "pred.perf.R2", "pred.perf.pval"]
    Utilities.ensure_requisite_folders(args.output_prefix)
    if args.skip_regression:
        weights, summaries, covariances = None, None, None
    else:
        weights, summaries, covariances = setup_output(args.output_prefix, tissue_names, WEIGHTS_FIELDS, SUMMARY_FIELDS)

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities._load_gene_annotation(args.data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(available_data)]
    if args.chromosome or (args.sub_batches and args.sub_batch):
        data_annotation = StudyUtilities._filter_gene_annotation(data_annotation, args.chromosome, args.sub_batches, args.sub_batch)
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(args.features_annotation).read_row_group(args.chromosome - 1).to_pandas()

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(features_metadata, data_annotation, args.window)

    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = set(TextFileTools.load_list(args.rsid_whitelist))
        features_metadata = features_metadata[features_metadata.rsid.isin(whitelist)]

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    seed = numpy.random.randint(1e8)
    if args.run_tag:
        d = pandas.DataFrame({"run": [args.run_tag], "cv_seed": [seed]})[["run", "cv_seed"]]
        for t in tissue_names:
            Utilities.save_dataframe(d, "{}_{}_runs.txt.gz".format(args.output_prefix, t))

    failed_run = False
    try:
        for i, data_annotation_ in enumerate(data_annotation.itertuples()):
            logging.log(9, "processing %i/%i:%s", i + 1, data_annotation.shape[0], data_annotation_.gene_id)

            logging.log(8, "loading data")
            d_ = {}
            for k, v in data.items():
                d_[k] = Parquet._read(v, [data_annotation_.gene_id], to_pandas=True)

            features_ = Genomics.entries_for_gene_annotation(data_annotation_, args.window, features_metadata)
            if features_.shape[0] == 0:
                logging.log(9, "No features available")
                continue

            features_data_ = Parquet._read(features, [x for x in features_.id.values], to_pandas=True)
            features_data_["id"] = range(1, features_data_.shape[0] + 1)
            features_data_ = features_data_[["individual", "id"] + [x for x in features_.id.values]]

            logging.log(8, "training")
            prepare_ctimp(args.script_path, seed, args.intermediate_folder, data_annotation_, features_, features_data_, d_)
            del features_data_
            del d_
            if args.skip_regression:
                continue

            subprocess.call(["bash", _execution_script(args.intermediate_folder, data_annotation_.gene_id)])

            w = pandas.read_table(_weights(args.intermediate_folder, data_annotation_.gene_id), sep=r"\s+")
            s = pandas.read_table(_summary(args.intermediate_folder, data_annotation_.gene_id), sep=r"\s+")
            for e_, entry in enumerate(s.itertuples()):
                entry_weights = w[["SNP", "REF.0.", "ALT.1.", entry.tissue]].rename(
                    columns={"SNP": "varID", "REF.0.": "ref_allele", "ALT.1.": "eff_allele", entry.tissue: "weight"})
                entry_weights = entry_weights[entry_weights.weight != 0]
                entry_weights = entry_weights.assign(gene=data_annotation_.gene_id)
                entry_weights = entry_weights.merge(features_, left_on="varID", right_on="id", how="left")
                entry_weights = entry_weights[WEIGHTS_FIELDS]
                if args.output_rsids:
                    entry_weights.loc[entry_weights.rsid == "NA", "rsid"] = entry_weights.loc[entry_weights.rsid == "NA", "varID"]
                weights[entry.tissue].write(entry_weights.to_csv(sep="\t", index=False, header=False, na_rep="NA").encode())

                entry_summary = s[s.tissue == entry.tissue].rename(
                    columns={"zscore_pval": "pred.perf.pval", "rho_avg_squared": "pred.perf.R2"})
                entry_summary = entry_summary.assign(gene=data_annotation_.gene_id, alpha=0.5,
                                                     genename=data_annotation_.gene_name,
                                                     gene_type=data_annotation_.gene_type,
                                                     n_snps_in_window=features_.shape[0])
                entry_summary["n.snps.in.model"] = entry_weights.shape[0]
                # must repeat the field names because of a pandas indexing quirk
                entry_summary = entry_summary.drop(["R2", "n", "tissue"], axis=1)[[
                    "gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model",
                    "rho_avg", "pred.perf.R2", "pred.perf.pval"]]
                summaries[entry.tissue].write(entry_summary.to_csv(sep="\t", index=False, header=False, na_rep="NA").encode())

                features_data_ = Parquet._read(features, [x for x in entry_weights.varID.values], to_pandas=True)
                var_ids = [x for x in entry_weights.varID.values]
                cov = numpy.cov([features_data_[k] for k in var_ids], ddof=1)
                ids = [x for x in entry_weights.rsid.values] if args.output_rsids else var_ids
                cov = matrices._flatten_matrix_data([(data_annotation_.gene_id, ids, cov)])
                for cov_ in cov:
                    l = "{} {} {} {}\n".format(cov_[0], cov_[1], cov_[2], cov_[3]).encode()
                    covariances[entry.tissue].write(l)

            if not args.keep_intermediate_folder:
                logging.info("Cleaning up")
                shutil.rmtree(_intermediate_folder(args.intermediate_folder, data_annotation_.gene_id))

            if args.MAX_M and i >= args.MAX_M:
                logging.info("Early abort")
                break
    except Exception:
        logging.info("Exception running model training:\n%s", traceback.format_exc())
        failed_run = True

    if not args.skip_regression:
        set_down(weights, summaries, covariances, tissue_names, failed_run)

    logging.info("Finished")

def run(args): if os.path.exists(args.output): logging.info("Output already exists, either delete it or move it") return logging.info("Getting parquet genotypes") file_map = get_file_map(args) logging.info("Getting genes") with sqlite3.connect(args.model_db) as connection: # Pay heed to the order. This avoids arbitrariness in sqlite3 loading of results. extra = pandas.read_sql("SELECT * FROM EXTRA order by gene", connection) extra = extra[extra["n.snps.in.model"] > 0] individuals = TextFileTools.load_list( args.individuals) if args.individuals else None logging.info("Processing") Utilities.ensure_requisite_folders(args.output) with gzip.open(args.output, "w") as f: f.write("GENE RSID1 RSID2 VALUE\n".encode()) with sqlite3.connect(args.model_db) as connection: for i, t in enumerate(extra.itertuples()): g_ = t.gene logging.log(9, "Proccessing %i/%i:%s", i + 1, extra.shape[0], g_) w = pandas.read_sql( "select * from weights where gene = '{}';".format(g_), connection) chr_ = w.varID.values[0].split("_")[0].split("chr")[1] if not n_.search(chr_): logging.log(9, "Unsupported chromosome: %s", chr_) continue dosage = file_map[int(chr_)] if individuals: d = Parquet._read(dosage, columns=w.varID.values, specific_individuals=individuals) del d["individual"] else: d = Parquet._read(dosage, columns=w.varID.values, skip_individuals=True) var_ids = list(d.keys()) if len(var_ids) == 0: if len(w.varID.values) == 1: logging.log( 9, "workaround for single missing genotype at %s", g_) d = {w.varID.values[0]: [0, 1]} else: logging.log(9, "No genotype available for %s, skipping", g_) next if args.output_rsids: ids = [ x for x in pandas.DataFrame({ "varID": var_ids }).merge(w[["varID", "rsid"]], on="varID").rsid.values ] else: ids = var_ids c = numpy.cov([d[x] for x in var_ids]) c = matrices._flatten_matrix_data([(w.gene.values[0], ids, c)]) for entry in c: l = "{} {} {} {}\n".format(entry[0], entry[1], entry[2], entry[3]) f.write(l.encode()) logging.info("Finished building covariance.")
def run(args):
    wp = args.output_prefix + "_weights.txt.gz"
    if os.path.exists(wp):
        logging.info("Weights output exists already, delete it or move it")
        return

    sp = args.output_prefix + "_summary.txt.gz"
    if os.path.exists(sp):
        logging.info("Summary output exists already, delete it or move it")
        return

    cp = args.output_prefix + "_covariance.txt.gz"
    if os.path.exists(cp):
        logging.info("Covariance output exists already, delete it or move it")
        return

    r = args.output_prefix + "_run.txt.gz"
    if os.path.exists(r):
        logging.info("Run output exists already, delete it or move it")
        return

    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    data = pq.ParquetFile(args.data)
    available_data = {x for x in data.metadata.schema.names}

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities.load_gene_annotation(args.data_annotation, args.chromosome, args.sub_batches, args.sub_batch, args.simplify_data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(available_data)]
    if args.gene_whitelist:
        logging.info("Applying gene whitelist")
        data_annotation = data_annotation[data_annotation.gene_id.isin(set(args.gene_whitelist))]
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(args.features_annotation).read_row_group(args.chromosome - 1).to_pandas()

    if args.output_rsids:
        if not args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
            logging.warning("Several variants map to a same rsid (hint: multiple INDELS?).\n"
                            "Can't proceed. Consider using the --keep_highest_frequency_rsid_entry flag, or models will be ill defined.")
            return

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(features_metadata, data_annotation, args.window)
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_call_filter:
        logging.info("Filtering variants by average call rate")
        features_metadata = features_metadata[features_metadata.avg_call > args.variant_call_filter]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_r2_filter:
        logging.info("Filtering variants by imputation R2")
        features_metadata = features_metadata[features_metadata.r2 > args.variant_r2_filter]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_variance_filter:
        logging.info("Filtering variants by (dosage/2)'s variance")
        features_metadata = features_metadata[features_metadata["std"] / 2 > numpy.sqrt(args.variant_variance_filter)]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.discard_palindromic_snps:
        logging.info("Discarding palindromic snps")
        features_metadata = Genomics.discard_gtex_palindromic_variants(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])

    if args.rsid_whitelist:
        logging.info("Filtering features annotation for whitelist")
        whitelist = set(TextFileTools.load_list(args.rsid_whitelist))
        features_metadata = features_metadata[features_metadata.rsid.isin(whitelist)]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.only_rsids:
        logging.info("Discarding non-rsids")
        features_metadata = StudyUtilities.trim_variant_metadata_to_rsids_only(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])

    if args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
        logging.info("Keeping only the highest-frequency entry for every rsid")
        k = features_metadata[["rsid", "allele_1_frequency", "id"]].copy()
        # Fold frequencies above 0.5 so entries are compared on the minor-allele scale
        k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"] = 1 - k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"]
        k = k.sort_values(by=["rsid", "allele_1_frequency"], ascending=False)
        k = k.groupby("rsid").first().reset_index()
        features_metadata = features_metadata[features_metadata.id.isin(k.id)]
        logging.info("Kept %d", features_metadata.shape[0])
    else:
        logging.info("rsids are unique, no need to restrict to highest frequency entry")

    if args.features_weights:
        logging.info("Loading weights")
        x_weights = get_weights(args.features_weights, {x for x in features_metadata.id})
        logging.info("Filtering features metadata to those available in weights")
        features_metadata = features_metadata[features_metadata.id.isin(x_weights.id)]
        logging.info("Kept %d entries", features_metadata.shape[0])
    else:
        x_weights = None

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    s = numpy.random.randint(1e8)
    set_seed(s)
    if args.run_tag:
        d = pandas.DataFrame({"run": [args.run_tag], "cv_seed": [s]})[["run", "cv_seed"]]
        Utilities.save_dataframe(d, r)

    WEIGHTS_FIELDS = ["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]
    SUMMARY_FIELDS = ["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model",
                      "test_R2_avg", "test_R2_sd", "cv_R2_avg", "cv_R2_sd", "in_sample_R2",
                      "nested_cv_fisher_pval", "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore",
                      "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]

    train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols
    available_individuals = check_missing(args, data, features)

    with gzip.open(wp, "w") as w:
        w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode())
        with gzip.open(sp, "w") as s:
            s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode())
            with gzip.open(cp, "w") as c:
                c.write("GENE RSID1 RSID2 VALUE\n".encode())
                for i, data_annotation_ in enumerate(data_annotation.itertuples()):
                    if args.MAX_M and i >= args.MAX_M:
                        logging.info("Early abort")
                        break
                    logging.log(9, "processing %i/%i:%s", i + 1, data_annotation.shape[0], data_annotation_.gene_id)
                    if args.repeat:
                        for j in range(0, args.repeat):
                            logging.log(9, "%i-th reiteration", j)
                            process(w, s, c, data, data_annotation_, features, features_metadata, x_weights,
                                    SUMMARY_FIELDS, train, j, nested_folds=args.nested_cv_folds,
                                    use_individuals=available_individuals)
                    else:
                        process(w, s, c, data, data_annotation_, features, features_metadata, x_weights,
                                SUMMARY_FIELDS, train, nested_folds=args.nested_cv_folds,
                                use_individuals=available_individuals)

    logging.info("Finished")

def run(args): if os.path.exists(args.output): logging.info("Output already exists, either delete it or move it") return logging.info("Loading group") groups = pandas.read_table(args.group) groups = groups.assign(chromosome = groups.gtex_intron_id.str.split(":").str.get(0)) groups = groups.assign(position=groups.gtex_intron_id.str.split(":").str.get(1)) groups = Genomics.sort(groups) logging.info("Getting parquet genotypes") file_map = get_file_map(args) logging.info("Getting genes") with sqlite3.connect(args.model_db_group_key) as connection: # Pay heed to the order. This avoids arbitrariness in sqlite3 loading of results. extra = pandas.read_sql("SELECT * FROM EXTRA order by gene", connection) extra = extra[extra["n.snps.in.model"] > 0] individuals = TextFileTools.load_list(args.individuals) if args.individuals else None logging.info("Processing") Utilities.ensure_requisite_folders(args.output) genes_ = groups[["chromosome", "position", "gene_id"]].drop_duplicates() with gzip.open(args.output, "w") as f: f.write("GENE RSID1 RSID2 VALUE\n".encode()) with sqlite3.connect(args.model_db_group_key) as db_group_key: with sqlite3.connect(args.model_db_group_values) as db_group_values: for i,t_ in enumerate(genes_.itertuples()): g_ = t_.gene_id chr_ = t_.chromosome.split("chr")[1] logging.log(8, "Proccessing %i/%i:%s", i+1, len(genes_), g_) if not n_.search(chr_): logging.log(9, "Unsupported chromosome: %s", chr_) continue dosage = file_map[int(chr_)] group = groups[groups.gene_id == g_] wg=[] for value in group.intron_id: wk = pandas.read_sql("select * from weights where gene = '{}';".format(value), db_group_values) if wk.shape[0] == 0: continue wg.append(wk) if len(wg) > 0: wg = pandas.concat(wg) w = pandas.concat([wk, wg])[["varID", "rsid"]].drop_duplicates() else: w = wk[["varID", "rsid"]].drop_duplicates() if w.shape[0] == 0: logging.log(8, "No data, skipping") continue if individuals: d = Parquet._read(dosage, columns=w.varID.values, specific_individuals=individuals) del d["individual"] else: d = Parquet._read(dosage, columns=w.varID.values, skip_individuals=True) var_ids = list(d.keys()) if len(var_ids) == 0: if len(w.varID.values) == 1: logging.log(9, "workaround for single missing genotype at %s", g_) d = {w.varID.values[0]:[0,1]} else: logging.log(9, "No genotype available for %s, skipping",g_) next if args.output_rsids: ids = [x for x in pandas.DataFrame({"varID": var_ids}).merge(w[["varID", "rsid"]], on="varID").rsid.values] else: ids = var_ids c = numpy.cov([d[x] for x in var_ids]) c = matrices._flatten_matrix_data([(g_, ids, c)]) for entry in c: l = "{} {} {} {}\n".format(entry[0], entry[1], entry[2], entry[3]) f.write(l.encode()) logging.info("Finished building covariance.")