示例#1
0
    def __enter__(self):
        """Acquire the resources used by this context and return ``self``.

        Steps, in order:

        1. Load the regions from ``args.region_file`` for ``args.chromosome``
           and, when sub-batching is requested, pass them through
           ``PandasHelpers.sub_batch``. The result is stored in
           ``self.regions``.
        2. Open ``args.parquet_genotype_metadata`` as a ``ParquetFile``
           (stored in ``self.vmf``).
        3. Prepare the output destination: either a single text sink
           (``self.of``) or an output folder.
        4. Build the genotype file map (``self.file_map``).

        Raises:
            RuntimeError: if ``args.text_output`` already exists, if neither
                ``text_output`` nor ``text_output_folder`` is given, or if
                ``parquet_genotype_folder``/``parquet_genotype_pattern`` are
                not both provided.
        """
        # Read every option from the instance's own arguments. The previous
        # code mixed ``self.args`` with a bare ``args`` name, which only works
        # if a module-level ``args`` happens to exist.
        args = self.args

        logging.info("initializing resources")

        logging.info("Loading regions")
        regions = load_regions(args.region_file, args.chromosome)
        # ``is not None`` keeps sub-batch index 0 valid.
        if args.sub_batches and args.sub_batch is not None:
            logging.log(9, "Selecting target regions from sub-batches")
            regions = PandasHelpers.sub_batch(regions, args.sub_batches,
                                              args.sub_batch)
        self.regions = regions

        logging.info("Opening variants metadata")
        self.vmf = pq.ParquetFile(args.parquet_genotype_metadata)

        logging.info("Creating destination")
        if args.text_output:
            # Refuse to overwrite an existing output file.
            if os.path.exists(args.text_output):
                raise RuntimeError("Output exists. Nope.")
            Utilities.ensure_requisite_folders(args.text_output)
            self.of = TextFileTools.TextDataSink(
                args.text_output, [("region", "id1", "id2", "value")])
            self.of.initialize()
        elif args.text_output_folder:
            # Folder mode: only the folder is created here; ``self.of`` is
            # not set in this branch.
            Utilities.maybe_create_folder(args.text_output_folder)
        else:
            raise RuntimeError("Unrecognized output specification")

        if args.parquet_genotype_folder and args.parquet_genotype_pattern:
            self.file_map = get_file_map(args)
        else:
            raise RuntimeError("Unrecognized genotype specification")

        return self
示例#2
0
def run(args):
    """Collapse input sub-folders that share a name into one output folder.

    Entries of ``args.input_folder`` matching the regular expression
    ``args.rule`` are grouped by the first capture group of that match.
    For each group, a folder of that name is created under
    ``args.output_folder`` and every file found in the group's input folders
    is moved (``args.move``) or copied into it.

    Unless ``args.reentrant`` is set, the function logs a message and returns
    without doing anything when ``args.output_folder`` already exists.
    Entries listed in ``args.exclude`` are skipped.
    """
    if not args.reentrant and os.path.exists(args.output_folder):
        logging.info("Output path exists. Nope.")
        return

    Utilities.maybe_create_folder(args.output_folder)

    logging.info("Checking input folder")
    pattern = re.compile(args.rule)
    candidates = [
        entry for entry in sorted(os.listdir(args.input_folder))
        if pattern.search(entry)
    ]
    if args.exclude:
        excluded = set(args.exclude)
        candidates = [entry for entry in candidates if entry not in excluded]

    # Group the full input paths by the name captured from the folder name.
    grouped = {}
    for entry in candidates:
        key = pattern.search(entry).group(1)
        grouped.setdefault(key, []).append(
            os.path.join(args.input_folder, entry))

    transfer = shutil.move if args.move else shutil.copy
    for key in sorted(grouped):
        logging.info("Processing %s", key)
        destination = os.path.join(args.output_folder, key)
        Utilities.maybe_create_folder(destination)

        for source in grouped[key]:
            logging.log(8, "Processing %s", source)
            for file_name in os.listdir(source):
                transfer(os.path.join(source, file_name),
                         os.path.join(destination, file_name))
    logging.info("Finished collapse")
def run(args):
    """Run the per-gene, multi-tissue training pipeline and write results.

    The function:

    1. Opens one parquet file per tissue from ``args.data_folder`` (files
       whose name matches ``args.data_name_pattern``; the tissue name is the
       first capture group).
    2. Loads the gene annotation, keeps genes present as columns in the
       tissue data, and optionally filters by chromosome / sub-batch.
    3. Loads the features (variants) metadata, optionally trimmed to the
       kept genes and to an rsid whitelist.
    4. For each gene: reads the data and features, calls ``prepare_ctimp``,
       runs the generated execution script with ``bash`` and appends the
       resulting weights, summary and covariance rows to per-tissue outputs.

    With ``args.skip_regression`` only the preparation step is run for each
    gene and no outputs are opened. Any exception raised inside the gene loop
    is logged and flagged to ``set_down`` instead of being propagated.

    Attributes read from ``args``: ``intermediate_folder``, ``output_prefix``,
    ``data_folder``, ``data_name_pattern``, ``data_annotation``,
    ``chromosome``, ``sub_batches``, ``sub_batch``, ``features``,
    ``features_annotation``, ``window``, ``rsid_whitelist``, ``run_tag``,
    ``script_path``, ``skip_regression``, ``output_rsids``,
    ``keep_intermediate_folder`` and ``MAX_M``.
    """
    Utilities.maybe_create_folder(args.intermediate_folder)
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    p_ = re.compile(args.data_name_pattern)
    f = [x for x in sorted(os.listdir(args.data_folder)) if p_.search(x)]
    tissue_names = [p_.search(x).group(1) for x in f]
    data = collections.OrderedDict()
    for tissue_name, file_name in zip(tissue_names, f):
        logging.info("Loading %s", tissue_name)
        data[tissue_name] = pq.ParquetFile(
            os.path.join(args.data_folder, file_name))
    # Every column name that appears in at least one tissue file.
    available_data = {
        x
        for p in data.values() for x in p.metadata.schema.names
    }

    logging.info("Preparing output")
    WEIGHTS_FIELDS = [
        "gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"
    ]
    SUMMARY_FIELDS = [
        "gene", "genename", "gene_type", "alpha", "n_snps_in_window",
        "n.snps.in.model", "rho_avg", "pred.perf.R2", "pred.perf.pval"
    ]

    if args.skip_regression:
        weights, summaries, covariances = None, None, None
    else:
        weights, summaries, covariances = setup_output(args.output_prefix,
                                                       tissue_names,
                                                       WEIGHTS_FIELDS,
                                                       SUMMARY_FIELDS)

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities._load_gene_annotation(
        args.data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(
        available_data)]
    # ``is not None`` rather than truthiness: sub-batch index 0 is a valid
    # selection and must not be mistaken for "no sub-batch requested".
    if args.chromosome or (args.sub_batches
                           and args.sub_batch is not None):
        data_annotation = StudyUtilities._filter_gene_annotation(
            data_annotation, args.chromosome, args.sub_batches, args.sub_batch)
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        # Reads row group ``chromosome - 1`` only.
        # NOTE(review): presumably one row group per chromosome, in order --
        # confirm against how the annotation file is written.
        features_metadata = pq.ParquetFile(
            args.features_annotation).read_row_group(args.chromosome -
                                                     1).to_pandas()

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(
            features_metadata, data_annotation, args.window)

    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = set(TextFileTools.load_list(args.rsid_whitelist))
        features_metadata = features_metadata[features_metadata.rsid.isin(
            whitelist)]

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    seed = numpy.random.randint(1e8)

    if args.run_tag:
        # Record the run tag together with the seed, one file per tissue.
        d = pandas.DataFrame({
            "run": [args.run_tag],
            "cv_seed": [seed]
        })[["run", "cv_seed"]]
        for t in tissue_names:
            Utilities.save_dataframe(
                d, "{}_{}_runs.txt.gz".format(args.output_prefix, t))

    failed_run = False
    try:
        for i, data_annotation_ in enumerate(data_annotation.itertuples()):
            logging.log(9, "processing %i/%i:%s", i + 1,
                        data_annotation.shape[0], data_annotation_.gene_id)
            logging.log(8, "loading data")
            d_ = {}
            for k, v in data.items():
                d_[k] = Parquet._read(v, [data_annotation_.gene_id],
                                      to_pandas=True)
            features_ = Genomics.entries_for_gene_annotation(
                data_annotation_, args.window, features_metadata)

            if features_.shape[0] == 0:
                logging.log(9, "No features available")
                continue

            feature_ids = list(features_.id.values)
            features_data_ = Parquet._read(features,
                                           feature_ids,
                                           to_pandas=True)
            # Replace the "id" column with a 1-based running index and fix
            # the column order: individual, id, then one column per feature.
            features_data_["id"] = range(1, features_data_.shape[0] + 1)
            features_data_ = features_data_[["individual", "id"] +
                                            feature_ids]

            logging.log(8, "training")
            prepare_ctimp(args.script_path, seed, args.intermediate_folder,
                          data_annotation_, features_, features_data_, d_)
            # Drop the references to the per-gene inputs before running the
            # external script.
            del features_data_
            del d_
            if args.skip_regression:
                continue

            subprocess.call([
                "bash",
                _execution_script(args.intermediate_folder,
                                  data_annotation_.gene_id)
            ])

            # Raw strings: "\s" is not a valid escape in a normal literal.
            w = pandas.read_table(_weights(args.intermediate_folder,
                                           data_annotation_.gene_id),
                                  sep=r"\s+")
            s = pandas.read_table(_summary(args.intermediate_folder,
                                           data_annotation_.gene_id),
                                  sep=r"\s+")

            # One summary row per tissue; the tissue name is also the name
            # of that tissue's weight column in ``w``.
            for entry in s.itertuples():
                entry_weights = w[["SNP", "REF.0.", "ALT.1.",
                                   entry.tissue]].rename(
                                       columns={
                                           "SNP": "varID",
                                           "REF.0.": "ref_allele",
                                           "ALT.1.": "eff_allele",
                                           entry.tissue: "weight"
                                       })
                # Keep only variants with a non-zero weight.
                entry_weights = entry_weights[entry_weights.weight != 0]
                entry_weights = entry_weights.assign(
                    gene=data_annotation_.gene_id)
                entry_weights = entry_weights.merge(features_,
                                                    left_on="varID",
                                                    right_on="id",
                                                    how="left")
                entry_weights = entry_weights[WEIGHTS_FIELDS]
                if args.output_rsids:
                    # Fall back to the variant id where the rsid is "NA".
                    missing_rsid = entry_weights.rsid == "NA"
                    entry_weights.loc[missing_rsid,
                                      "rsid"] = entry_weights.loc[
                                          missing_rsid, "varID"]
                weights[entry.tissue].write(
                    entry_weights.to_csv(sep="\t",
                                         index=False,
                                         header=False,
                                         na_rep="NA").encode())

                entry_summary = s[s.tissue == entry.tissue].rename(
                    columns={
                        "zscore_pval": "pred.perf.pval",
                        "rho_avg_squared": "pred.perf.R2"
                    })
                entry_summary = entry_summary.assign(
                    gene=data_annotation_.gene_id,
                    alpha=0.5,
                    genename=data_annotation_.gene_name,
                    gene_type=data_annotation_.gene_type,
                    n_snps_in_window=features_.shape[0])
                entry_summary["n.snps.in.model"] = entry_weights.shape[0]
                # Must repeat the strings (instead of reusing SUMMARY_FIELDS)
                # because of a weird pandas indexing issue.
                entry_summary = entry_summary.drop(
                    ["R2", "n", "tissue"], axis=1)[[
                        "gene", "genename", "gene_type", "alpha",
                        "n_snps_in_window", "n.snps.in.model", "rho_avg",
                        "pred.perf.R2", "pred.perf.pval"
                    ]]
                summaries[entry.tissue].write(
                    entry_summary.to_csv(sep="\t",
                                         index=False,
                                         header=False,
                                         na_rep="NA").encode())

                # Covariance of the features that made it into the model.
                var_ids = list(entry_weights.varID.values)
                features_data_ = Parquet._read(features,
                                               var_ids,
                                               to_pandas=True)
                cov = numpy.cov([features_data_[k] for k in var_ids], ddof=1)
                ids = list(entry_weights.rsid.values
                           ) if args.output_rsids else var_ids
                cov = matrices._flatten_matrix_data([(data_annotation_.gene_id,
                                                      ids, cov)])
                for cov_ in cov:
                    l = "{} {} {} {}\n".format(cov_[0], cov_[1], cov_[2],
                                               cov_[3]).encode()
                    covariances[entry.tissue].write(l)

            if not args.keep_intermediate_folder:
                logging.info("Cleaning up")
                shutil.rmtree(
                    _intermediate_folder(args.intermediate_folder,
                                         data_annotation_.gene_id))

            # NOTE(review): ``i`` is 0-based and the check runs after the gene
            # has been processed, so this stops after index MAX_M, not after
            # MAX_M genes -- confirm intent.
            if args.MAX_M and i >= args.MAX_M:
                logging.info("Early abort")
                break

    except Exception:
        # Best effort: log the failure and let ``set_down`` know about it
        # rather than propagating. ``args.intermediate_folder`` itself is
        # not removed by this function.
        logging.info("Exception running model training:\n%s",
                     traceback.format_exc())
        failed_run = True

    if not args.skip_regression:
        set_down(weights, summaries, covariances, tissue_names, failed_run)

    logging.info("Finished")