示例#1
0
def run(args):
    """Run the multi-tissue joint analysis over every gene in the context.

    Writes a tab-separated results file to ``args.output``; skips the whole
    run if that file already exists.
    """
    start = timer()
    if os.path.exists(args.output):
        logging.info(
            "%s already exists, you have to move it or delete it if you want it done again",
            args.output)
        return
    logging.info("Creating context")
    context = CrossModelUtilities.context_from_args(args)
    results = []

    n_genes = context.get_n_genes()
    reporter = Utilities.PercentReporter(logging.INFO, n_genes)

    logging.info("Processing")
    reporter.update(0, "%d %% of model's genes processed so far")
    for i, gene in enumerate(context.get_genes()):
        logging.log(7, "Gene %d/%d: %s", i + 1, n_genes, gene)
        result = JointAnalysis.joint_analysis(context, gene)
        results.append(result)
        # Off-by-one fix: after this iteration i + 1 genes are complete;
        # reporting i left the final progress update one gene short.
        reporter.update(i + 1, "%d %% of model's genes processed so far")

    results = JointAnalysis.format_results(results)
    Utilities.ensure_requisite_folders(args.output)
    results.to_csv(args.output, index=False, sep="\t")

    end = timer()
    logging.info("Ran multi tissue in %s seconds" % (str(end - start)))
示例#2
0
def run(args):
    """Joint multi-tissue analysis across all genes; writes TSV to args.output.

    No-op (early return) when the output file already exists.
    """
    start = timer()
    if os.path.exists(args.output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.output)
        return
    logging.info("Creating context")
    context = CrossModelUtilities.context_from_args(args)
    results = []

    n_genes = context.get_n_genes()
    reporter = Utilities.PercentReporter(logging.INFO, n_genes)

    logging.info("Processing")

    reporter.update(0, "%d %% of model's genes processed so far")
    for i, gene in enumerate(context.get_genes()):
        logging.log(7, "Gene %d/%d: %s", i + 1, n_genes, gene)
        result = JointAnalysis.joint_analysis(context, gene)
        results.append(result)
        # Off-by-one fix: i + 1 genes are done once this iteration finishes,
        # so the last update now correctly reports 100%.
        reporter.update(i + 1, "%d %% of model's genes processed so far")

    results = JointAnalysis.format_results(results)
    Utilities.ensure_requisite_folders(args.output)
    results.to_csv(args.output, index=False, sep="\t")

    end = timer()
    logging.info("Ran multi tissue in %s seconds" % (str(end - start)))
示例#3
0
def run(args, _gwas=None):
    """Run the MetaXcan association and write a CSV to args.output_file.

    Skips the run when the output exists and ``args.overwrite`` is not set.
    ``_gwas`` optionally supplies a pre-loaded GWAS to the context builder.
    """
    start = timer()
    if not args.overwrite and os.path.exists(args.output_file):
        logging.info("%s already exists, move it or delete it if you want it done again", args.output_file)
        return
    logging.info("Started metaxcan association")

    context = MetaxcanUtilities.build_context(args, _gwas)

    model_snps = context.get_model_snps()
    total_snps = len(model_snps)
    snps_found = set()
    reporter = Utilities.PercentReporter(logging.INFO, total_snps)

    i_genes, i_snps = context.get_data_intersection()

    results = []
    for gene in i_genes:
        r, snps = AssociationCalculation.association(gene, context, return_snps=True)
        results.append(r)
        snps_found.update(snps)
        reporter.update(len(snps_found), "%d %% of model's snps found so far in the gwas study")

    Utilities.ensure_requisite_folders(args.output_file)

    reporter.update(len(snps_found), "%d %% of model's snps used", force=True)
    results = AssociationCalculation.dataframe_from_results(zip(*results))
    results = MetaxcanUtilities.format_output(results, context, args.keep_ens_version)
    results.to_csv(args.output_file, index=False)
    end = timer()
    # Typo fix in log message: "Sucessfully" -> "Successfully".
    logging.info("Successfully processed metaxcan association in %s seconds" % (str(end - start)))
示例#4
0
def run_metaxcan(args, context):
    """Compute MetaXcan associations for every gene in the data intersection.

    Returns the formatted results dataframe; additionally writes it as CSV
    when ``args.output_file`` is set.
    """
    logging.info("Started metaxcan association")
    total_snps = len(context.get_model_snps())
    found = set()
    reporter = Utilities.PercentReporter(logging.INFO, total_snps)

    i_genes, i_snps = context.get_data_intersection()

    per_gene = []
    for gene in i_genes:
        logging.log(7, "Processing gene %s", gene)
        stats, used_snps = AssociationCalculation.association(gene, context, return_snps=True)
        per_gene.append(stats)
        found.update(used_snps)
        reporter.update(len(found), "%d %% of model's snps found so far in the gwas study")

    reporter.update(len(found), "%d %% of model's snps used", force=True)

    frame = AssociationCalculation.dataframe_from_results(per_gene)
    frame = MetaxcanUtilities.format_output(frame, context, args.remove_ens_version)

    if args.output_file:
        Utilities.ensure_requisite_folders(args.output_file)
        frame.to_csv(args.output_file, index=False)

    return frame
示例#5
0
def run(args):
    """Convert a plain-text predicted-expression table into an HDF5 file.

    Reads ``args.input`` with pandas and writes three datasets to
    ``args.output``: the gene-by-sample expression matrix, the gene names,
    and the sample identifiers. Skips the run if the output exists.
    """
    if os.path.exists(args.output):
        logging.info("Output exists, delete it or move it if you want it generated again")
        return

    Utilities.ensure_requisite_folders(args.output)

    logging.info("Reading input")
    data = pandas.read_table(args.input)

    logging.info("Opening output")
    # 50 MB chunk cache for HDF5 writes.
    f = h5py_cache.File(args.output, 'w', chunk_cache_mem_size=int(50 * (1024 ** 2)))

    # First two columns are treated as sample identifiers (one of them is
    # "IID" below) -- presumably FID/IID; verify against the input format.
    n_genes = data.shape[1]-2
    n_samples = data.shape[0]
    n_genes_chunk = np.min((n_genes, 10))

    logging.info("Processing expression")
    p = f.create_dataset("pred_expr", shape=(n_genes, n_samples),
                                        chunks=(n_genes_chunk, n_samples),
                                        dtype=np.dtype('float32'), scaleoffset=4, compression='gzip')
    g = f.create_dataset("genes", (n_genes,), dtype="S30")

    for i, gene in enumerate(data.columns.values[2:]):
        p[i, :] = data[gene].to_numpy()
        g[i] = np.string_(gene)

    logging.info("saving samples")
    s = f.create_dataset("samples", (n_samples,), dtype="S25")
    # BUG FIX: `xrange` does not exist in Python 3 (the code already uses
    # Python-3-era pandas API such as Series.to_numpy); use `range`.
    for i in range(0, n_samples):
        s[i] = np.string_(data["IID"][i])
    f.close()
    logging.info("Done")
def _run(args, subset=None, append=None):
    """Build a standardized expression feature matrix and save its covariances.

    ``subset`` restricts which features are loaded; ``append`` is forwarded
    to the covariance writer.
    """
    logging.info("Loading expressions")
    matrix = FeatureMatrix.build_manager(
        args.expression_folder,
        filters=args.expression_filters,
        standardize=True,
        subset=subset)

    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output)
    matrix.save_covariances(args.output, append=append)

    logging.info("Ran.")
示例#7
0
def run(args):
    """Run multi-PrediXcan simulations per gene and persist the result tables.

    Writes up to three files next to ``args.output_prefix``: the sorted
    multi-tissue results, additional per-gene data, and (when produced)
    single-tissue PrediXcan results. Skips everything if the main results
    file already exists.
    """
    # Dead code removed: `start = timer()` and
    # `folder, prefix = os.path.split(args.output_prefix)` were never used.
    results_name = args.output_prefix + "__mt_results.txt"
    predixcan_results_name = args.output_prefix + "__p_results.txt"
    additional_name = args.output_prefix + "__additional.txt"

    if os.path.exists(results_name):
        logging.info(
            "%s already exists, you have to move it or delete it if you want it done again",
            results_name)
        return

    #for reproducibility
    numpy.random.seed(100)

    results = []
    additional = []
    predixcan_results = []

    n_max = args.max_n_results
    logging.info("Acquiring context")
    with MultiPredixcanSimulations.context_from_args(args) as context:
        logging.info("processing")
        _c, _cp, _e = context.get_mp_simulation(None)
        for i, gene in enumerate(context.get_genes()):
            if n_max and i + 1 > n_max:
                logging.info("Max runs met")
                break
            logging.log(9, "%d Gene %s", i, gene)
            r, add, p = MultiPredixcanSimulations.simulate(gene, context)
            if r is None:
                logging.log(9, "%s could not be simulated", gene)
                continue
            results.append(r)
            additional.append(add)

            # Single-tissue results are optional per gene.
            if p is not None:
                predixcan_results.append(p)

    results = MultiPrediXcanAssociation.dataframe_from_results(
        results, _c).sort_values(by="pvalue")
    additional = pandas.concat(additional)

    Utilities.ensure_requisite_folders(results_name)
    Utilities.save_dataframe(results, results_name)
    Utilities.save_dataframe(additional, additional_name)

    if len(predixcan_results):
        predixcan_results = pandas.concat(predixcan_results)
        Utilities.save_dataframe(predixcan_results, predixcan_results_name)
    logging.info("Finished")
def _run(args, subset=None, append=None):
    """Load expression features (standardized) and persist their covariances."""
    logging.info("Loading expressions")
    build_options = dict(
        filters=args.expression_filters,
        standardize=True,
        subset=subset,
    )
    manager = FeatureMatrix.build_manager(args.expression_folder, **build_options)

    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output)
    manager.save_covariances(args.output, append=append)

    logging.info("Ran.")
示例#9
0
def run(args):
    """Simulate multi-PrediXcan per gene and save the result tables.

    Outputs (next to ``args.output_prefix``): sorted multi-tissue results,
    additional per-gene data, and optional single-tissue results. Skips the
    run when the main results file exists.
    """
    # Dead code removed: `start = timer()` and the unused
    # `folder, prefix = os.path.split(args.output_prefix)` pair.
    results_name = args.output_prefix + "__mt_results.txt"
    predixcan_results_name = args.output_prefix + "__p_results.txt"
    additional_name = args.output_prefix + "__additional.txt"

    if os.path.exists(results_name):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", results_name)
        return

    #for reproducibility
    numpy.random.seed(100)

    results = []
    additional = []
    predixcan_results = []

    n_max = args.max_n_results
    logging.info("Acquiring context")
    with MultiPredixcanSimulations.context_from_args(args) as context:
        logging.info("processing")
        _c, _cp, _e = context.get_mp_simulation(None)
        for i, gene in enumerate(context.get_genes()):
            if n_max and i+1>n_max:
                logging.info("Max runs met")
                break
            logging.log(9, "%d Gene %s", i, gene)
            r, add, p = MultiPredixcanSimulations.simulate(gene, context)
            if r is None:
                logging.log(9, "%s could not be simulated", gene)
                continue
            results.append(r)
            additional.append(add)

            if p is not None:
                predixcan_results.append(p)

    results = MultiPrediXcanAssociation.dataframe_from_results(results, _c).sort_values(by="pvalue")
    additional = pandas.concat(additional)

    Utilities.ensure_requisite_folders(results_name)
    Utilities.save_dataframe(results, results_name)
    Utilities.save_dataframe(additional, additional_name)

    if len(predixcan_results):
        predixcan_results = pandas.concat(predixcan_results)
        Utilities.save_dataframe(predixcan_results, predixcan_results_name)
    logging.info("Finished")
示例#10
0
def run_additional(args, context):
    """Compute additional per-gene statistics over the data intersection.

    Returns the formatted dataframe; writes it as CSV when
    ``args.additional_output`` is set.
    """
    logging.info("Started metaxcan additional stats")
    i_genes, i_snps = context.get_data_intersection()
    stats = [AssociationCalculation.additional_stats(gene, context) for gene in i_genes]

    # Note: "aditional" is the spelling used by the project API.
    frame = AssociationCalculation.dataframe_from_aditional_stats(stats)
    frame = MetaxcanUtilities.format_additional_output(frame, context, args.remove_ens_version)

    if args.additional_output:
        Utilities.ensure_requisite_folders(args.additional_output)
        frame.to_csv(args.additional_output, index=False)

    return frame
示例#11
0
def run_metaxcan(args, context):
    """Run MetaXcan associations with optional additional stats and early exit.

    Processes genes from the model/GWAS intersection, stopping after
    ``args.MAX_R`` genes when set. Returns the formatted (optionally merged)
    results dataframe; writes CSV when ``args.output_file`` is set.
    """
    logging.info("Started metaxcan association")
    model_snps = context.get_model_snps()
    total_snps = len(model_snps)
    snps_found = set()
    reporter = Utilities.PercentReporter(logging.INFO, total_snps)

    i_genes, i_snps = context.get_data_intersection()

    results = []
    additional = []
    for i, gene in enumerate(i_genes):
        if args.MAX_R and i + 1 > args.MAX_R:
            # BUG FIX: logging.log() requires an integer level as its first
            # argument; passing only the message raised TypeError whenever
            # this branch ran. Level 9 matches the per-gene logging below.
            logging.log(9, "Early exit condition met")
            break
        logging.log(9, "Processing gene %i:%s", i, gene)
        r, snps = AssociationCalculation.association(gene,
                                                     context,
                                                     return_snps=True)
        results.append(r)
        snps_found.update(snps)
        reporter.update(
            len(snps_found),
            "%d %% of model's snps found so far in the gwas study")
        if args.additional_output:
            stats_ = AssociationCalculation.additional_stats(gene, context)
            additional.append(stats_)

    reporter.update(len(snps_found), "%d %% of model's snps used", force=True)

    results = AssociationCalculation.dataframe_from_results(results)
    results = MetaxcanUtilities.format_output(results, context,
                                              args.remove_ens_version)

    if args.additional_output:
        # Merge the extra per-gene stats into the main results frame.
        additional = AssociationCalculation.dataframe_from_aditional_stats(
            additional)
        results = MetaxcanUtilities.merge_additional_output(
            results, additional, context, args.remove_ens_version)

    if args.output_file:
        Utilities.ensure_requisite_folders(args.output_file)
        results.to_csv(args.output_file, index=False)

    return results
示例#12
0
def run(args):
    """Run PrediXcan associations for every gene in the expression context.

    Requires exactly one of ``args.hdf5_expression_file`` /
    ``args.expression_file``; writes a sorted TSV to ``args.output``.
    Skips the run when the output already exists.
    """
    start = timer()
    if os.path.exists(args.output):
        logging.info(
            "%s already exists, you have to move it or delete it if you want it done again",
            args.output)
        return

    # Exactly one of the two expression inputs must be provided.
    if (args.hdf5_expression_file and args.expression_file) or \
        (not args.hdf5_expression_file and not args.expression_file):
        logging.info(
            "Provide either hdf5 expression file or plain text expression file"
        )
        return

    with PrediXcanUtilities.p_context_from_args(args) as context:
        genes = context.get_genes()
        n_genes = len(genes)
        reporter = Utilities.PercentReporter(logging.INFO, n_genes)
        reporter.update(0,
                        "%d %% of model's genes processed so far",
                        force=True)
        results = []
        for i, gene in enumerate(genes):
            logging.log(7, "Processing gene %s", gene)
            r = PrediXcanAssociation.predixcan_association(gene, context)
            results.append(r)
            # Off-by-one fix: i + 1 genes are done after this iteration.
            reporter.update(i + 1, "%d %% of model's genes processed so far")
        # BUG FIX: the final update previously referenced the loop variable
        # `i`, which is undefined (NameError) when `genes` is empty and one
        # short otherwise; report the full count and force emission.
        reporter.update(n_genes,
                        "%d %% of model's genes processed so far",
                        force=True)
        results = PrediXcanAssociation.dataframe_from_results(results)
        results = results.fillna("NA")
        results = results.sort_values(by="pvalue")

        Utilities.ensure_requisite_folders(args.output)
        results.to_csv(args.output, index=False, sep="\t", quotechar='"')

    end = timer()
    logging.info("Ran multi tissue predixcan in %s seconds" %
                 (str(end - start)))
示例#13
0
def run(args):
    """Build per-gene SNP covariance matrices and write them as a gzip TSV.

    Iterates genotype data chromosome by chromosome, computing prediction
    covariances for each gene in the loaded models. Skips the run when the
    output file already exists.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern, name_filter=args.models_filter)
    all_snps = model_manager.get_rsids()
    Utilities.ensure_requisite_folders(args.snp_covariance_output)
    # BUG FIX: mode "w" opens the gzip stream in binary mode on Python 3,
    # so the str writes below raised TypeError; "wt" opens text mode.
    with gzip.open(args.snp_covariance_output, "wt") as o:
        o.write("GENE\tRSID1\tRSID2\tVALUE\n")
        logging.info("processing genotype")

        for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
            logging.log(9, "Processing chromosome %s", str(chromosome))

            context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
            genes = context.get_genes()
            reporter = Utilities.PercentReporter(9, len(genes))
            reporter.update(0, "%d %% of genes processed so far in chromosome " + str(chromosome))
            for i, gene in enumerate(genes):
                logging.log(6, "%d/%d:%s", i + 1, len(genes), gene)
                cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
                # Flatten the covariance matrix to (gene, rsid1, rsid2, value) rows.
                cov_data = MatrixManager._flatten_matrix_data([cov_data])
                for e in cov_data:
                    l = "{}\t{}\t{}\t{}\n".format(e[0], e[1], e[2], e[3])
                    o.write(l)

                # Off-by-one fix: i + 1 genes are complete after this iteration.
                reporter.update(i + 1, "%d %% of genes processed so far in chromosome " + str(chromosome))

            reporter.update(len(genes), "%d %% of genes processed so far in chromosome " + str(chromosome))

    end = timer()
    logging.info("Ran covariance builder in %s seconds" % (str(end - start)))
示例#14
0
def run(args):
    """Process GWAS input file(s) into beta files.

    Reads one or more GWAS files (a folder optionally filtered by
    --gwas_file_pattern, or a single --gwas_file), builds betas against an
    optional prediction model, and either writes them to disk
    (--output_folder: one file per GWAS; --output: one combined file) or,
    with no output target, returns the concatenated dataframe.
    """
    start = timer()
    validate(args)

    # The two output modes are mutually exclusive.
    if args.output_folder and args.output:
        logging.info("Specify either --output_folder or --output, not both")
        return

    if args.gwas_folder:
        regexp = re.compile(
            args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder,
                                                       regexp)
        names.sort(
        )  #cosmetic, because different filesystems/OS yield folders in different order

        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (
                args.gwas_folder,
                args.gwas_file_pattern,
            )
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    # The model is optional; without it betas are built model-free.
    model = PredictionModel.load_model(
        args.model_db_path,
        args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder or args.output:
        if args.output_folder:
            if args.output_folder and not os.path.exists(args.output_folder):
                os.makedirs(args.output_folder)
        else:
            Utilities.ensure_requisite_folders(args.output)

        for i, name in enumerate(names):
            output_path = os.path.join(
                args.output_folder, name) if not args.output else args.output
            # Single --output mode: write the first GWAS, append the rest.
            if args.output_folder or i == 0:
                m = "w"
            else:
                m = "a"

            # NOTE(review): in single --output append mode, the file written
            # at i == 0 makes this existence check true for every later
            # iteration, so the remaining GWAS appear to be skipped rather
            # than appended -- confirm this is the intended behavior.
            if os.path.exists(output_path):
                logging.info(
                    "%s already exists, delete it if you want it to be done again",
                    output_path)
                continue

            b = build_betas(args, model, gwas_format, name, args.snp_map_file)
            # Gzip-compress when the target filename suggests it.
            c = "gzip" if ".gz" in output_path else None
            logging.info("Saving %s", output_path)
            b.to_csv(output_path, sep="\t", index=False, compression=c, mode=m)
        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds" %
                     (str(end - start)))
    else:
        # No output target: concatenate all betas and return them.
        r = []
        for name in names:
            b = build_betas(args, model, gwas_format, name, args.snp_map_file)
            r.append(b)
        r = pandas.concat(r)
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds" %
                     (str(end - start)))

        return r
示例#15
0
def run(args):
    """Predict gene expression from genotype dosages and model weights.

    Streams variants from the dosage source, aligns alleles against the
    model, accumulates weighted dosages per gene, and optionally stores the
    prediction matrix, a summary table, and a raw data capture. Returns the
    prediction results object.
    """
    start = timer()
    if args.prediction_output:
        if os.path.exists(args.prediction_output[0]):
            logging.info(
                "Prediction output exists. Move or remove if you want this ran again."
            )
            return
        Utilities.ensure_requisite_folders(args.prediction_output[0])

    if args.prediction_summary_output:
        if os.path.exists(args.prediction_summary_output):
            logging.info(
                "Summary output exists. Move or remove if you want this ran again."
            )
            return
        # BUG FIX: this branch previously ensured folders for
        # args.prediction_output[0] -- the wrong path, and a crash when only
        # the summary output was requested (prediction_output is None).
        Utilities.ensure_requisite_folders(args.prediction_summary_output)

    logging.info("Loading samples")
    samples = load_samples(args)

    logging.info("Loading model")
    model, weights, extra = model_structure(args)

    variant_mapping = get_variant_mapping(args, weights)

    logging.info("Preparing genotype dosages")
    dosage_source = dosage_generator(args, variant_mapping, weights)

    logging.info("Processing genotypes")
    dcapture = []
    reporter = Utilities.PercentReporter(logging.INFO,
                                         len(set(weights.rsid.values)))
    snps_found = set()
    with prepare_prediction(args, extra, samples) as results:

        for i, e in enumerate(dosage_source):
            if args.stop_at_variant and i > args.stop_at_variant:
                break
            var_id = e[GF.RSID]

            logging.log(8, "variant %i:%s", i, var_id)
            if var_id in model:
                s = model[var_id]
                ref_allele, alt_allele = e[GF.REF_ALLELE], e[GF.ALT_ALLELE]

                allele_align, strand_align = GWASAndModels.match_alleles(
                    ref_allele, alt_allele, s[0], s[1])
                if not allele_align or not strand_align:
                    continue

                dosage = e[GF.FIRST_DOSAGE:]
                # Alleles swapped relative to the model: flip the dosage.
                if allele_align == -1:
                    dosage = tuple(map(lambda x: 2 - x, dosage))
                # BUG FIX: numpy.float was deprecated in NumPy 1.20 and
                # removed in 1.24 (AttributeError); the builtin float is the
                # documented replacement and yields the same float64 dtype.
                dosage = numpy.array(dosage, dtype=float)

                snps_found.add(var_id)

                for gene, weight in s[2].items():
                    results.update(gene, dosage, weight)
                    if args.capture:
                        dcapture.append((gene, weight, var_id, s[0], s[1],
                                         ref_allele, alt_allele, strand_align,
                                         allele_align) + e[GF.FIRST_DOSAGE:])

                reporter.update(len(snps_found), "%d %% of models' snps used")

    reporter.update(len(snps_found), "%d %% of models' snps used", force=True)

    if args.capture:
        logging.info("Saving data capture")
        Utilities.ensure_requisite_folders(args.capture)
        with gzip.open(args.capture, "w") as f:
            header = "gene\tweight\tvariant_id\tref_allele\teff_allele\ta0\ta1\tstrand_align\tallele_align\t" + "\t".join(
                samples.IID.values) + "\n"
            f.write(header.encode())
            for c in dcapture:
                l = "\t".join(map(str, c)) + "\n"
                f.write(l.encode())

    if args.prediction_output and len(args.prediction_output) < 2:
        logging.info("Storing prediction")
        results.store_prediction()

    if args.prediction_summary_output:
        logging.info("Saving summary")
        summary = results.summary()
        Utilities.save_dataframe(summary, args.prediction_summary_output)

    end = timer()
    logging.info("Successfully predicted expression in %s seconds" %
                 (str(end - start)))

    return results