def run(args):
    """Run cross-model (multi-tissue) joint analysis over all model genes.

    Skips the whole run if args.output already exists; otherwise writes a
    tab-separated results table to args.output.
    """
    start = timer()
    if os.path.exists(args.output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.output)
        return
    logging.info("Creating context")
    context = CrossModelUtilities.context_from_args(args)
    results = []
    n_genes = context.get_n_genes()
    reporter = Utilities.PercentReporter(logging.INFO, n_genes)
    logging.info("Processing")
    reporter.update(0, "%d %% of model's genes processed so far")
    for i, gene in enumerate(context.get_genes()):
        logging.log(7, "Gene %d/%d: %s", i + 1, n_genes, gene)
        result = JointAnalysis.joint_analysis(context, gene)
        results.append(result)
        # BUG FIX: i+1 genes are complete after this iteration; the original
        # passed bare `i`, under-reporting progress and never reaching 100%.
        reporter.update(i + 1, "%d %% of model's genes processed so far")
    results = JointAnalysis.format_results(results)
    Utilities.ensure_requisite_folders(args.output)
    results.to_csv(args.output, index=False, sep="\t")
    end = timer()
    logging.info("Ran multi tissue in %s seconds" % (str(end - start)))
def run(args):
    """Run cross-model (multi-tissue) joint analysis over all model genes.

    Skips the whole run if args.output already exists; otherwise writes a
    tab-separated results table to args.output.
    """
    start = timer()
    if os.path.exists(args.output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.output)
        return
    logging.info("Creating context")
    context = CrossModelUtilities.context_from_args(args)
    results = []
    n_genes = context.get_n_genes()
    reporter = Utilities.PercentReporter(logging.INFO, n_genes)
    logging.info("Processing")
    reporter.update(0, "%d %% of model's genes processed so far")
    for i, gene in enumerate(context.get_genes()):
        logging.log(7, "Gene %d/%d: %s", i + 1, n_genes, gene)
        result = JointAnalysis.joint_analysis(context, gene)
        results.append(result)
        # BUG FIX: i+1 genes are complete after this iteration; the original
        # passed bare `i`, under-reporting progress and never reaching 100%.
        reporter.update(i + 1, "%d %% of model's genes processed so far")
    results = JointAnalysis.format_results(results)
    Utilities.ensure_requisite_folders(args.output)
    results.to_csv(args.output, index=False, sep="\t")
    end = timer()
    logging.info("Ran multi tissue in %s seconds" % (str(end - start)))
def run(args, _gwas=None):
    """Run MetaXcan association of a GWAS against a prediction model.

    Writes the results table to args.output_file; respects args.overwrite.
    _gwas optionally supplies a pre-loaded GWAS instead of reading from args.
    """
    start = timer()
    if not args.overwrite and os.path.exists(args.output_file):
        logging.info("%s already exists, move it or delete it if you want it done again", args.output_file)
        return
    logging.info("Started metaxcan association")
    context = MetaxcanUtilities.build_context(args, _gwas)
    model_snps = context.get_model_snps()
    total_snps = len(model_snps)
    snps_found = set()
    reporter = Utilities.PercentReporter(logging.INFO, total_snps)
    i_genes, i_snps = context.get_data_intersection()
    results = []
    for gene in i_genes:
        r, snps = AssociationCalculation.association(gene, context, return_snps=True)
        results.append(r)
        snps_found.update(snps)
        reporter.update(len(snps_found), "%d %% of model's snps found so far in the gwas study")
    Utilities.ensure_requisite_folders(args.output_file)
    reporter.update(len(snps_found), "%d %% of model's snps used", force=True)
    results = AssociationCalculation.dataframe_from_results(zip(*results))
    results = MetaxcanUtilities.format_output(results, context, args.keep_ens_version)
    results.to_csv(args.output_file, index=False)
    end = timer()
    # BUG FIX: the log message misspelled "Successfully" as "Sucessfully".
    logging.info("Successfully processed metaxcan association in %s seconds" % (str(end - start)))
def run_metaxcan(args, context):
    """Compute MetaXcan gene associations from a prepared context.

    Optionally saves the formatted table to args.output_file; always
    returns it.
    """
    logging.info("Started metaxcan association")
    used_snps = set()
    reporter = Utilities.PercentReporter(logging.INFO, len(context.get_model_snps()))
    genes, _ = context.get_data_intersection()
    partials = []
    for gene in genes:
        logging.log(7, "Processing gene %s", gene)
        stats, gene_snps = AssociationCalculation.association(gene, context, return_snps=True)
        partials.append(stats)
        used_snps.update(gene_snps)
        reporter.update(len(used_snps), "%d %% of model's snps found so far in the gwas study")
    reporter.update(len(used_snps), "%d %% of model's snps used", force=True)
    frame = AssociationCalculation.dataframe_from_results(partials)
    frame = MetaxcanUtilities.format_output(frame, context, args.remove_ens_version)
    if args.output_file:
        Utilities.ensure_requisite_folders(args.output_file)
        frame.to_csv(args.output_file, index=False)
    return frame
def run(args):
    """Convert a tab-separated predicted-expression table into an HDF5 file.

    Assumes the first two input columns are sample identifiers (the second
    being "IID") and every remaining column is one gene — TODO confirm
    against the producer of args.input.
    """
    if os.path.exists(args.output):
        logging.info("Output exists, delete it or move it if you want it generated again")
        return
    Utilities.ensure_requisite_folders(args.output)
    logging.info("Reading input")
    data = pandas.read_table(args.input)
    logging.info("Opening output")
    # 50MB chunk cache so repeated per-gene row writes stay cheap.
    f = h5py_cache.File(args.output, 'w', chunk_cache_mem_size=int(50 * (1024 ** 2)))
    n_genes = data.shape[1] - 2  # first two columns are sample ids, not genes
    n_samples = data.shape[0]
    n_genes_chunk = np.min((n_genes, 10))
    logging.info("Processing expression")
    p = f.create_dataset("pred_expr", shape=(n_genes, n_samples), chunks=(n_genes_chunk, n_samples), dtype=np.dtype('float32'), scaleoffset=4, compression='gzip')
    g = f.create_dataset("genes", (n_genes,), dtype="S30")
    for i, gene in enumerate(data.columns.values[2:]):
        p[i, :] = data[gene].to_numpy()
        g[i] = np.string_(gene)
    logging.info("saving samples")
    s = f.create_dataset("samples", (n_samples,), dtype="S25")
    # BUG FIX: `xrange` does not exist in Python 3; `range` is equivalent here.
    for i in range(0, n_samples):
        s[i] = np.string_(data["IID"][i])
    f.close()
    logging.info("Done")
def _run(args, subset=None, append=None):
    """Load a standardized feature matrix from an expression folder and save its covariances.

    subset restricts which features are loaded; append controls whether the
    covariance output is appended to.
    """
    logging.info("Loading expressions")
    manager = FeatureMatrix.build_manager(
        args.expression_folder,
        filters=args.expression_filters,
        standardize=True,
        subset=subset,
    )
    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output)
    manager.save_covariances(args.output, append=append)
    logging.info("Ran.")
def run(args):
    """Run multi-tissue PrediXcan simulations and save results under args.output_prefix.

    Produces up to three files: joint results, per-run additional info, and
    (when available) single-tissue PrediXcan results.
    """
    # NOTE: the original assigned `start = timer()` and split args.output_prefix
    # into folder/prefix, but none of those values were ever used — removed.
    results_name = args.output_prefix + "__mt_results.txt"
    predixcan_results_name = args.output_prefix + "__p_results.txt"
    additional_name = args.output_prefix + "__additional.txt"
    if os.path.exists(results_name):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", results_name)
        return
    # for reproducibility
    numpy.random.seed(100)
    results = []
    additional = []
    predixcan_results = []
    n_max = args.max_n_results
    logging.info("Acquiring context")
    with MultiPredixcanSimulations.context_from_args(args) as context:
        logging.info("processing")
        _c, _cp, _e = context.get_mp_simulation(None)
        for i, gene in enumerate(context.get_genes()):
            if n_max and i + 1 > n_max:
                logging.info("Max runs met")
                break
            logging.log(9, "%d Gene %s", i, gene)
            r, add, p = MultiPredixcanSimulations.simulate(gene, context)
            if r is None:
                logging.log(9, "%s could not be simulated", gene)
                continue
            results.append(r)
            additional.append(add)
            if p is not None:
                predixcan_results.append(p)
    results = MultiPrediXcanAssociation.dataframe_from_results(results, _c).sort_values(by="pvalue")
    additional = pandas.concat(additional)
    Utilities.ensure_requisite_folders(results_name)
    Utilities.save_dataframe(results, results_name)
    Utilities.save_dataframe(additional, additional_name)
    if len(predixcan_results):
        predixcan_results = pandas.concat(predixcan_results)
        Utilities.save_dataframe(predixcan_results, predixcan_results_name)
    logging.info("Finished")
def _run(args, subset=None, append=None):
    """Build a standardized expression feature matrix and persist its covariances.

    subset limits the loaded features; append selects append-mode saving.
    """
    logging.info("Loading expressions")
    expression_manager = FeatureMatrix.build_manager(
        args.expression_folder,
        filters=args.expression_filters,
        standardize=True,
        subset=subset,
    )
    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output)
    expression_manager.save_covariances(args.output, append=append)
    logging.info("Ran.")
def run(args):
    """Run multi-tissue PrediXcan simulations and save results under args.output_prefix.

    Produces up to three files: joint results, per-run additional info, and
    (when available) single-tissue PrediXcan results.
    """
    # NOTE: the original assigned `start = timer()` and split args.output_prefix
    # into folder/prefix, but none of those values were ever used — removed.
    results_name = args.output_prefix + "__mt_results.txt"
    predixcan_results_name = args.output_prefix + "__p_results.txt"
    additional_name = args.output_prefix + "__additional.txt"
    if os.path.exists(results_name):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", results_name)
        return
    # for reproducibility
    numpy.random.seed(100)
    results = []
    additional = []
    predixcan_results = []
    n_max = args.max_n_results
    logging.info("Acquiring context")
    with MultiPredixcanSimulations.context_from_args(args) as context:
        logging.info("processing")
        _c, _cp, _e = context.get_mp_simulation(None)
        for i, gene in enumerate(context.get_genes()):
            if n_max and i + 1 > n_max:
                logging.info("Max runs met")
                break
            logging.log(9, "%d Gene %s", i, gene)
            r, add, p = MultiPredixcanSimulations.simulate(gene, context)
            if r is None:
                logging.log(9, "%s could not be simulated", gene)
                continue
            results.append(r)
            additional.append(add)
            if p is not None:
                predixcan_results.append(p)
    results = MultiPrediXcanAssociation.dataframe_from_results(results, _c).sort_values(by="pvalue")
    additional = pandas.concat(additional)
    Utilities.ensure_requisite_folders(results_name)
    Utilities.save_dataframe(results, results_name)
    Utilities.save_dataframe(additional, additional_name)
    if len(predixcan_results):
        predixcan_results = pandas.concat(predixcan_results)
        Utilities.save_dataframe(predixcan_results, predixcan_results_name)
    logging.info("Finished")
def run_additional(args, context):
    """Compute per-gene additional MetaXcan statistics.

    Optionally saves the formatted table to args.additional_output; always
    returns it.
    """
    logging.info("Started metaxcan additional stats")
    genes, _ = context.get_data_intersection()
    gathered = [AssociationCalculation.additional_stats(gene, context) for gene in genes]
    # "aditional" is the project API's spelling — keep it.
    frame = AssociationCalculation.dataframe_from_aditional_stats(gathered)
    frame = MetaxcanUtilities.format_additional_output(frame, context, args.remove_ens_version)
    if args.additional_output:
        Utilities.ensure_requisite_folders(args.additional_output)
        frame.to_csv(args.additional_output, index=False)
    return frame
def run_metaxcan(args, context):
    """Compute MetaXcan gene associations, optionally with additional stats.

    Honors args.MAX_R as an early-exit gene cap. Optionally saves to
    args.output_file and merges stats from args.additional_output mode;
    always returns the results dataframe.
    """
    logging.info("Started metaxcan association")
    model_snps = context.get_model_snps()
    total_snps = len(model_snps)
    snps_found = set()
    reporter = Utilities.PercentReporter(logging.INFO, total_snps)
    i_genes, i_snps = context.get_data_intersection()
    results = []
    additional = []
    for i, gene in enumerate(i_genes):
        if args.MAX_R and i + 1 > args.MAX_R:
            # BUG FIX: logging.log() requires a numeric level as its first
            # argument; the original passed only the message string, which
            # raises TypeError at runtime.
            logging.info("Early exit condition met")
            break
        logging.log(9, "Processing gene %i:%s", i, gene)
        r, snps = AssociationCalculation.association(gene, context, return_snps=True)
        results.append(r)
        snps_found.update(snps)
        reporter.update(len(snps_found), "%d %% of model's snps found so far in the gwas study")
        if args.additional_output:
            stats_ = AssociationCalculation.additional_stats(gene, context)
            additional.append(stats_)
    reporter.update(len(snps_found), "%d %% of model's snps used", force=True)
    results = AssociationCalculation.dataframe_from_results(results)
    results = MetaxcanUtilities.format_output(results, context, args.remove_ens_version)
    if args.additional_output:
        # "aditional" is the project API's spelling.
        additional = AssociationCalculation.dataframe_from_aditional_stats(additional)
        results = MetaxcanUtilities.merge_additional_output(results, additional, context, args.remove_ens_version)
    if args.output_file:
        Utilities.ensure_requisite_folders(args.output_file)
        results.to_csv(args.output_file, index=False)
    return results
def run(args):
    """Run multi-tissue PrediXcan association and write a p-value-sorted table.

    Requires exactly one expression source (HDF5 or plain text); skips the
    run when args.output already exists.
    """
    start = timer()
    if os.path.exists(args.output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.output)
        return
    # Exactly one of the two expression inputs must be provided.
    if (args.hdf5_expression_file and args.expression_file) or \
            (not args.hdf5_expression_file and not args.expression_file):
        logging.info("Provide either hdf5 expression file or plain text expression file")
        return
    with PrediXcanUtilities.p_context_from_args(args) as context:
        genes = context.get_genes()
        n_genes = len(genes)
        reporter = Utilities.PercentReporter(logging.INFO, n_genes)
        reporter.update(0, "%d %% of model's genes processed so far", force=True)
        results = []
        for i, gene in enumerate(genes):
            logging.log(7, "Processing gene %s", gene)
            r = PrediXcanAssociation.predixcan_association(gene, context)
            results.append(r)
            # BUG FIX: i+1 genes are complete after this iteration; bare `i`
            # under-reported progress.
            reporter.update(i + 1, "%d %% of model's genes processed so far")
        # BUG FIX: the final report referenced the loop variable (NameError on
        # an empty gene list) and lacked force=True, unlike sibling commands.
        reporter.update(n_genes, "%d %% of model's genes processed so far", force=True)
        results = PrediXcanAssociation.dataframe_from_results(results)
        results = results.fillna("NA")
        results = results.sort_values(by="pvalue")
        Utilities.ensure_requisite_folders(args.output)
        results.to_csv(args.output, index=False, sep="\t", quotechar='"')
    end = timer()
    logging.info("Ran multi tissue predixcan in %s seconds" % (str(end - start)))
def run(args):
    """Build per-gene SNP covariance data from genotypes, one chromosome at a
    time, and write it gzipped and tab-separated to args.snp_covariance_output.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return
    start = timer()
    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(
        args.models_folder,
        name_pattern=args.models_pattern,
        name_filter=args.models_filter,
    )
    all_snps = model_manager.get_rsids()
    Utilities.ensure_requisite_folders(args.snp_covariance_output)
    # NOTE(review): writes `str` to a gzip handle opened with mode "w", which
    # is binary in Python 3 — this looks Python-2-specific; confirm before
    # running under Python 3.
    with gzip.open(args.snp_covariance_output, "w") as o:
        o.write("GENE\tRSID1\tRSID2\tVALUE\n")
        logging.info("processing genotype")
        for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
            logging.log(9, "Processing chromosome %s", str(chromosome))
            context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
            genes = context.get_genes()
            reporter = Utilities.PercentReporter(9, len(genes))
            # Same message text at every report point; build it once.
            progress_message = "%d %% of genes processed so far in chromosome " + str(chromosome)
            reporter.update(0, progress_message)
            for index, gene in enumerate(genes):
                logging.log(6, "%d/%d:%s", index + 1, len(genes), gene)
                covariance = GenotypeAnalysis.get_prediction_covariance(context, gene)
                covariance = MatrixManager._flatten_matrix_data([covariance])
                for entry in covariance:
                    o.write("{}\t{}\t{}\t{}\n".format(entry[0], entry[1], entry[2], entry[3]))
                reporter.update(index, progress_message)
            reporter.update(len(genes), progress_message)
    end = timer()
    logging.info("Ran covariance builder in %s seconds" % (str(end - start)))
def run(args):
    """Harmonize one or more GWAS inputs into model space.

    With --output_folder or --output the betas are written to disk and
    nothing is returned; with neither, all parsed GWAS are concatenated
    and the resulting dataframe is returned.
    """
    start = timer()
    validate(args)
    if args.output_folder and args.output:
        logging.info("Specify either --output_folder or --output, not both")
        return

    if args.gwas_folder:
        file_regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, file_regexp)
        # cosmetic, because different filesystems/OS yield folders in different order
        names.sort()
        if len(names) == 0:
            raise Exceptions.ReportableException(
                "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,))
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    model = PredictionModel.load_model(args.model_db_path, args.model_db_snp_key) if args.model_db_path else None

    if not (args.output_folder or args.output):
        # No destination given: parse everything and return one frame.
        parsed = []
        for name in names:
            parsed.append(build_betas(args, model, gwas_format, name, args.snp_map_file))
        merged = pandas.concat(parsed)
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds" % (str(end - start)))
        return merged

    if args.output_folder:
        if args.output_folder and not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
    else:
        Utilities.ensure_requisite_folders(args.output)
    for i, name in enumerate(names):
        output_path = os.path.join(args.output_folder, name) if not args.output else args.output
        # Single-file --output mode: write the first GWAS, append the rest.
        mode = "w" if (args.output_folder or i == 0) else "a"
        # NOTE(review): in --output mode with several inputs, the file created
        # on the first iteration makes this existence check skip every later
        # append — confirm that is intended.
        if os.path.exists(output_path):
            logging.info("%s already exists, delete it if you want it to be done again", output_path)
            continue
        betas = build_betas(args, model, gwas_format, name, args.snp_map_file)
        compression = "gzip" if ".gz" in output_path else None
        logging.info("Saving %s", output_path)
        betas.to_csv(output_path, sep="\t", index=False, compression=compression, mode=mode)
    end = timer()
    logging.info("Successfully ran GWAS input processing in %s seconds" % (str(end - start)))
def run(args):
    """Predict gene expression from genotype dosages using model weights.

    Optionally stores the prediction matrix, a per-gene summary, and a raw
    per-variant data capture. Returns the prediction results holder.
    """
    start = timer()
    if args.prediction_output:
        if os.path.exists(args.prediction_output[0]):
            logging.info("Prediction output exists. Move or remove if you want this ran again.")
            return
        Utilities.ensure_requisite_folders(args.prediction_output[0])
    if args.prediction_summary_output:
        if os.path.exists(args.prediction_summary_output):
            logging.info("Summary output exists. Move or remove if you want this ran again.")
            return
        # BUG FIX: the original ensured folders for args.prediction_output[0]
        # here, leaving the summary output's own folder possibly missing.
        Utilities.ensure_requisite_folders(args.prediction_summary_output)

    logging.info("Loading samples")
    samples = load_samples(args)
    logging.info("Loading model")
    model, weights, extra = model_structure(args)
    variant_mapping = get_variant_mapping(args, weights)
    logging.info("Preparing genotype dosages")
    dosage_source = dosage_generator(args, variant_mapping, weights)
    logging.info("Processing genotypes")
    dcapture = []
    reporter = Utilities.PercentReporter(logging.INFO, len(set(weights.rsid.values)))
    snps_found = set()
    with prepare_prediction(args, extra, samples) as results:
        for i, e in enumerate(dosage_source):
            if args.stop_at_variant and i > args.stop_at_variant:
                break
            var_id = e[GF.RSID]
            logging.log(8, "variant %i:%s", i, var_id)
            if var_id in model:
                s = model[var_id]
                ref_allele, alt_allele = e[GF.REF_ALLELE], e[GF.ALT_ALLELE]
                allele_align, strand_align = GWASAndModels.match_alleles(ref_allele, alt_allele, s[0], s[1])
                if not allele_align or not strand_align:
                    continue
                dosage = e[GF.FIRST_DOSAGE:]
                if allele_align == -1:
                    # Swapped alleles: flip dosages onto the model's effect allele.
                    dosage = tuple(map(lambda x: 2 - x, dosage))
                # BUG FIX: numpy.float (a removed alias of builtin float, i.e.
                # float64) no longer exists in numpy >= 1.24.
                dosage = numpy.array(dosage, dtype=numpy.float64)
                snps_found.add(var_id)
                for gene, weight in s[2].items():
                    results.update(gene, dosage, weight)
                    if args.capture:
                        dcapture.append((gene, weight, var_id, s[0], s[1], ref_allele, alt_allele, strand_align, allele_align) + e[GF.FIRST_DOSAGE:])
                reporter.update(len(snps_found), "%d %% of models' snps used")
        reporter.update(len(snps_found), "%d %% of models' snps used", force=True)

        if args.capture:
            logging.info("Saving data capture")
            Utilities.ensure_requisite_folders(args.capture)
            with gzip.open(args.capture, "w") as f:
                header = "gene\tweight\tvariant_id\tref_allele\teff_allele\ta0\ta1\tstrand_align\tallele_align\t" + "\t".join(samples.IID.values) + "\n"
                f.write(header.encode())
                for c in dcapture:
                    l = "\t".join(map(str, c)) + "\n"
                    f.write(l.encode())
        if args.prediction_output and len(args.prediction_output) < 2:
            logging.info("Storing prediction")
            results.store_prediction()
        if args.prediction_summary_output:
            logging.info("Saving summary")
            summary = results.summary()
            Utilities.save_dataframe(summary, args.prediction_summary_output)
    end = timer()
    logging.info("Successfully predicted expression in %s seconds" % (str(end - start)))
    return results