def run(args):
    if os.path.exists(args.cs_output) or os.path.exists(args.var_output):
        logging.info("Output exists. Nope.")
        return

    study, variants_whitelist = get_study(args.parquet_genotype_folder, args.parquet_genotype_pattern, args.parquet_genotype_metadata)

    #_skip = lambda x: x not in variants_whitelist
    columns = ["maf", "pval_nominal", "slope", "slope_se"]
    eqtl_streamer = DataFrameStreamer.data_frame_streamer(args.eqtl, sanitize=True, to_numeric=columns, sentinel_column="gene_id")

    individuals = None if not args.restrict_to_individuals else TextFileTools.load_list(args.restrict_to_individuals)

    genes = None if not args.restrict_to_genes else set(TextFileTools.load_list(args.restrict_to_genes))

    cs_results = []
    var_results = []
    logging.info("Beggining process")
    MAX_N = args.MAX_N
    n = args.sample_size
    for i, d in enumerate(eqtl_streamer):
        if MAX_N and i > MAX_N:
            logging.info("Early exit")
            break
        gene = d.gene_id.values[0]
        if genes is not None and gene.split('.')[0] not in genes:
            logging.log(9, "Skipping gene: %s", gene)
            continue
        logging.log(9, "Processing gene %i:%s", i+1, gene)
        d = d.loc[(~d.slope_se.isnull()) & (d.slope!=0) & (~d.slope.isnull())]
        try:
            res_, d_ = _do_susie(d, study, variants_whitelist, n, individuals, args.mode)
            cs, vars = _process_result(res_, d_, gene)
        except Exception as e:
            logging.log(9, "Error while doing susie:\n%s", traceback.format_exc())
            cs = _void_cs("susie_error").assign(gene_id=gene, pp_sum=None)
            vars = _void_var().assign(gene_id=[gene], var_id=[None])

        cs_results.append(cs)
        #if vars.shape[1]>0:
        var_results.append(vars)

    if len(cs_results) > 0:
        logging.info("Saving")
        cs_results = pandas.concat(cs_results)[["gene_id", "cs", "cs_avg_r2", "cs_log10bf", "cs_min_r2", "var_id", "pp_sum", "status"]]
        Utilities.ensure_requisite_folders(args.cs_output)
        Utilities.save_dataframe(cs_results, args.cs_output)
    else:
        logging.info('No results')

    if len(var_results) > 0:
        var_results = pandas.concat(var_results)[["gene_id", "var_id", "cs", "variable_prob"]]
        Utilities.ensure_requisite_folders(args.var_output)
        Utilities.save_dataframe(var_results, args.var_output)
    logging.info("Ran susie")
Example #2
    def __enter__(self):
        logging.info("initializing resources")

        logging.info("Loading regions")
        regions = load_regions(self.args.region_file, self.args.chromosome)
        if self.args.sub_batches and self.args.sub_batch is not None:
            logging.log(9, "Selecting target regions from sub-batches")
            regions = PandasHelpers.sub_batch(regions, self.args.sub_batches,
                                              self.args.sub_batch)
        self.regions = regions

        logging.info("Opening variants metadata")
        self.vmf = pq.ParquetFile(self.args.parquet_genotype_metadata)

        logging.info("Creating destination")
        if self.args.text_output:
            if os.path.exists(self.args.text_output):
                raise RuntimeError("Output already exists; delete it or move it")
            Utilities.ensure_requisite_folders(self.args.text_output)
            self.of = TextFileTools.TextDataSink(
                self.args.text_output, [("region", "id1", "id2", "value")])
            self.of.initialize()
        elif self.args.text_output_folder:
            Utilities.maybe_create_folder(self.args.text_output_folder)
        else:
            raise RuntimeError("Unrecognized output specification")

        if self.args.parquet_genotype_folder and self.args.parquet_genotype_pattern:
            self.file_map = get_file_map(self.args)
        else:
            raise RuntimeError("Unrecognized genotype specification")

        return self
def run(args):
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Loading snp reference")
    key = KeyedDataSource.load_data(args.snp_reference_file,
                                    "variant_id",
                                    "rs_id_dbSNP150_GRCh38p7",
                                    value_conversion=KeyedDataSource.dot_to_na)
    logging.info("Loading samples")
    samples = TextFileTools.load_list(args.samples)
    genotype_format_string = "\t".join(["{}"] * (len(samples) + 1)) + "\n"

    og = args.output_prefix + "_genotype.txt.gz"
    oa = args.output_prefix + "_annotation.txt.gz"
    if os.path.exists(og) or os.path.exists(oa):
        logging.info("Output exists. Nope.")
        return

    logging.info("Processing")
    with gzip.open(args.genotype) as geno:
        with gzip.open(og, "w") as _og:
            _og.write(_to_gl(["varID"] + samples, genotype_format_string))
            with gzip.open(oa, "w") as _oa:
                _oa.write(
                    _to_al([
                        "chromosome", "position", "id", "allele_0", "allele_1",
                        "allele_1_frequency", "rsid"
                    ]))
                for i, line in enumerate(geno):
                    comps = line.decode().strip().split()

                    chr = "chr" + comps[0]
                    pos = comps[2]
                    ref = comps[3]
                    alt = comps[4]
                    af = comps[5]
                    dosage = comps[6:]

                    var_id = "{}_{}_{}_{}_b38".format(chr, pos, ref, alt)
                    if var_id in key:
                        rsid = key[var_id]
                        _og.write(
                            _to_gl([var_id] + dosage, genotype_format_string))
                        _oa.write(_to_al([chr, pos, var_id, ref, alt, af, rsid]))
                        continue

                    var_id = "{}_{}_{}_{}_b38".format(chr, pos, alt, ref)
                    if var_id in key and len(ref) == 1 and len(alt) == 1:
                        rsid = key[var_id]
                        af = str(1 - float(af))
                        dosage = [str(2 - int(x)) for x in comps[6:]]
                        _og.write(
                            _to_gl([var_id] + dosage, genotype_format_string))
                        _oa.write(_to_al([chr, pos, var_id, alt, ref, af, rsid]))

    logging.info("Finished conversion")
def metadata_white_list(black_list_path, column, variants):
    w = set(variants)
    if black_list_path:
        b = TextFileTools.load_column(black_list_path,
                                      column,
                                      unique_entries=True,
                                      white_list=w)
        w = {x for x in w if x not in b}
    return w
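
A hypothetical call, assuming a blacklist file with a `name` column (the file name here is illustrative):

# Returns the input variants minus any that appear in the blacklist column.
whitelist = metadata_white_list("blacklisted_variants.txt", "name", d.variant_id)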
def fill_coords(args, d):
    logging.info("Loading SNP metadata whitelist")
    w = metadata_white_list(args.snp_info_blacklist, "name", d.variant_id)

    logging.info("Loading SNP specification")
    s = TextFileTools.load_dataframe(
        args.fill_from_snp_info, keys=w,
        key_column_name="name").rename(columns={"start": "position"})

    d = d.merge(s, left_on="variant_id", right_on="name", how="left")
    logging.info("%d variants after filling coordinates", d.shape[0])
    return d
Example #6
def run(args):
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return

    logging.info("Loading samples")
    samples = set(TextFileTools.load_list(args.samples_whitelist))

    logging.info("Processing file")
    Utilities.ensure_requisite_folders(args.output)
    Utilities.write_iterable_to_file(input_generator(args.input_file, samples), args.output)

    logging.info("Finished")
Example #7
def run(args):
    wp = args.output_prefix + "_weights.txt.gz"
    if os.path.exists(wp):
        logging.info("Weights output exists already, delete it or move it")
        return

    sp = args.output_prefix + "_summary.txt.gz"
    if os.path.exists(sp):
        logging.info("Summary output exists already, delete it or move it")
        return

    cp = args.output_prefix + "_covariance.txt.gz"
    if os.path.exists(cp):
        logging.info("Covariance output exists already, delete it or move it")
        return

    r = args.output_prefix + "_run.txt.gz"
    if os.path.exists(r):
        logging.info("Run output exists already, delete it or move it")
        return

    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    data = pq.ParquetFile(args.data)
    available_data = set(data.metadata.schema.names)

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities.load_gene_annotation(
        args.data_annotation, args.chromosome, args.sub_batches,
        args.sub_batch)
    data_annotation = data_annotation[data_annotation.gene_id.isin(
        available_data)]
    if args.gene_whitelist:
        logging.info("Applying gene whitelist")
        data_annotation = data_annotation[data_annotation.gene_id.isin(
            set(args.gene_whitelist))]
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(
            args.features_annotation).read_row_group(args.chromosome -
                                                     1).to_pandas()

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(
            features_metadata, data_annotation, args.window)

    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(
            whitelist)]

    if args.features_weights:
        logging.info("Loading weights")
        x_weights = get_weights(args.features_weights,
                                set(features_metadata.id))
        logging.info(
            "Filtering features metadata to those available in weights")
        features_metadata = features_metadata[features_metadata.id.isin(
            x_weights.id)]
        logging.info("Kept %d entries", features_metadata.shape[0])
    else:
        x_weights = None

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    s = numpy.random.randint(1e8)
    set_seed(s)
    if args.run_tag:
        d = pandas.DataFrame({
            "run": [args.run_tag],
            "cv_seed": [s]
        })[["run", "cv_seed"]]
        Utilities.save_dataframe(d, r)

    WEIGHTS_FIELDS = [
        "gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"
    ]
    SUMMARY_FIELDS = [
        "gene", "genename", "gene_type", "alpha", "n_snps_in_window",
        "n.snps.in.model", "test_R2_avg", "test_R2_sd", "cv_R2_avg",
        "cv_R2_sd", "in_sample_R2", "nested_cv_fisher_pval",
        "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore",
        "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"
    ]

    train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols

    with gzip.open(wp, "w") as w:
        w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode())
        with gzip.open(sp, "w") as s:
            s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode())
            with gzip.open(cp, "w") as c:
                c.write("GENE RSID1 RSID2 VALUE\n".encode())
                for i, data_annotation_ in enumerate(
                        data_annotation.itertuples()):
                    if args.MAX_M and i >= args.MAX_M:
                        logging.info("Early abort")
                        break
                    logging.log(9, "processing %i/%i:%s", i + 1,
                                data_annotation.shape[0],
                                data_annotation_.gene_id)
                    if args.repeat:
                        for j in range(0, args.repeat):
                            logging.log(9, "%i-th reiteration", j)
                            process(w, s, c, data, data_annotation_, features,
                                    features_metadata, x_weights,
                                    SUMMARY_FIELDS, train, j,
                                    args.nested_cv_folds)
                    else:
                        process(w,
                                s,
                                c,
                                data,
                                data_annotation_,
                                features,
                                features_metadata,
                                x_weights,
                                SUMMARY_FIELDS,
                                train,
                                nested_folds=args.nested_cv_folds)

    logging.info("Finished")
def run(args):
    Utilities.maybe_create_folder(args.intermediate_folder)
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    p_ = re.compile(args.data_name_pattern)
    f = [x for x in sorted(os.listdir(args.data_folder)) if p_.search(x)]
    tissue_names = [p_.search(x).group(1) for x in f]
    data = []
    for i in range(0, len(tissue_names)):
        logging.info("Loading %s", tissue_names[i])
        data.append((tissue_names[i],
                     pq.ParquetFile(os.path.join(args.data_folder, f[i]))))
    data = collections.OrderedDict(data)
    available_data = {x for p in data.values() for x in p.metadata.schema.names}

    logging.info("Preparing output")
    WEIGHTS_FIELDS = [
        "gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"
    ]
    SUMMARY_FIELDS = [
        "gene", "genename", "gene_type", "alpha", "n_snps_in_window",
        "n.snps.in.model", "rho_avg", "pred.perf.R2", "pred.perf.pval"
    ]

    Utilities.ensure_requisite_folders(args.output_prefix)

    if args.skip_regression:
        weights, summaries, covariances = None, None, None
    else:
        weights, summaries, covariances = setup_output(args.output_prefix,
                                                       tissue_names,
                                                       WEIGHTS_FIELDS,
                                                       SUMMARY_FIELDS)

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities._load_gene_annotation(
        args.data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(
        available_data)]
    if args.chromosome or (args.sub_batches and args.sub_batch):
        data_annotation = StudyUtilities._filter_gene_annotation(
            data_annotation, args.chromosome, args.sub_batches, args.sub_batch)
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(
            args.features_annotation).read_row_group(args.chromosome -
                                                     1).to_pandas()

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(
            features_metadata, data_annotation, args.window)

    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(
            whitelist)]

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    seed = numpy.random.randint(1e8)

    if args.run_tag:
        d = pandas.DataFrame({
            "run": [args.run_tag],
            "cv_seed": [seed]
        })[["run", "cv_seed"]]
        for t in tissue_names:
            Utilities.save_dataframe(
                d, "{}_{}_runs.txt.gz".format(args.output_prefix, t))

    failed_run = False
    try:
        for i, data_annotation_ in enumerate(data_annotation.itertuples()):
            logging.log(9, "processing %i/%i:%s", i + 1,
                        data_annotation.shape[0], data_annotation_.gene_id)
            logging.log(8, "loading data")
            d_ = {}
            for k, v in data.items():
                d_[k] = Parquet._read(v, [data_annotation_.gene_id],
                                      to_pandas=True)
            features_ = Genomics.entries_for_gene_annotation(
                data_annotation_, args.window, features_metadata)

            if features_.shape[0] == 0:
                logging.log(9, "No features available")
                continue

            features_data_ = Parquet._read(features,
                                           [x for x in features_.id.values],
                                           to_pandas=True)
            features_data_["id"] = range(1, features_data_.shape[0] + 1)
            features_data_ = features_data_[["individual", "id"] +
                                            [x for x in features_.id.values]]

            logging.log(8, "training")
            prepare_ctimp(args.script_path, seed, args.intermediate_folder,
                          data_annotation_, features_, features_data_, d_)
            del features_data_
            del d_
            if args.skip_regression:
                continue

            subprocess.call([
                "bash",
                _execution_script(args.intermediate_folder,
                                  data_annotation_.gene_id)
            ])

            w = pandas.read_table(_weights(args.intermediate_folder,
                                           data_annotation_.gene_id),
                                  sep=r"\s+")
            s = pandas.read_table(_summary(args.intermediate_folder,
                                           data_annotation_.gene_id),
                                  sep=r"\s+")

            for e_, entry in enumerate(s.itertuples()):
                entry_weights = w[["SNP", "REF.0.", "ALT.1.",
                                   entry.tissue]].rename(
                                       columns={
                                           "SNP": "varID",
                                           "REF.0.": "ref_allele",
                                           "ALT.1.": "eff_allele",
                                           entry.tissue: "weight"
                                       })
                entry_weights = entry_weights[entry_weights.weight != 0]
                entry_weights = entry_weights.assign(
                    gene=data_annotation_.gene_id)
                entry_weights = entry_weights.merge(features_,
                                                    left_on="varID",
                                                    right_on="id",
                                                    how="left")
                entry_weights = entry_weights[WEIGHTS_FIELDS]
                if args.output_rsids:
                    entry_weights.loc[entry_weights.rsid == "NA",
                                      "rsid"] = entry_weights.loc[
                                          entry_weights.rsid == "NA", "varID"]
                weights[entry.tissue].write(
                    entry_weights.to_csv(sep="\t",
                                         index=False,
                                         header=False,
                                         na_rep="NA").encode())

                entry_summary = s[s.tissue == entry.tissue].rename(
                    columns={
                        "zscore_pval": "pred.perf.pval",
                        "rho_avg_squared": "pred.perf.R2"
                    })
                entry_summary = entry_summary.assign(
                    gene=data_annotation_.gene_id,
                    alpha=0.5,
                    genename=data_annotation_.gene_name,
                    gene_type=data_annotation_.gene_type,
                    n_snps_in_window=features_.shape[0])
                entry_summary["n.snps.in.model"] = entry_weights.shape[0]
                # must repeat strings because of a weird pandas indexing issue
                entry_summary = entry_summary.drop(
                    ["R2", "n", "tissue"], axis=1)[[
                        "gene", "genename", "gene_type", "alpha",
                        "n_snps_in_window", "n.snps.in.model", "rho_avg",
                        "pred.perf.R2", "pred.perf.pval"
                    ]]
                summaries[entry.tissue].write(
                    entry_summary.to_csv(sep="\t",
                                         index=False,
                                         header=False,
                                         na_rep="NA").encode())

                features_data_ = Parquet._read(
                    features, [x for x in entry_weights.varID.values],
                    to_pandas=True)
                var_ids = [x for x in entry_weights.varID.values]
                cov = numpy.cov([features_data_[k] for k in var_ids], ddof=1)
                ids = [x for x in entry_weights.rsid.values
                       ] if args.output_rsids else var_ids
                cov = matrices._flatten_matrix_data([(data_annotation_.gene_id,
                                                      ids, cov)])
                for cov_ in cov:
                    l = "{} {} {} {}\n".format(cov_[0], cov_[1], cov_[2],
                                               cov_[3]).encode()
                    covariances[entry.tissue].write(l)

            if not args.keep_intermediate_folder:
                logging.info("Cleaning up")
                shutil.rmtree(
                    _intermediate_folder(args.intermediate_folder,
                                         data_annotation_.gene_id))

            if args.MAX_M and i >= args.MAX_M:
                logging.info("Early abort")
                break

    except Exception as e:
        logging.info("Exception running model training:\n%s",
                     traceback.format_exc())
        failed_run = True
    finally:
        pass
        # if not args.keep_intermediate_folder:
        #     shutil.rmtree(args.intermediate_folder)

    if not args.skip_regression:
        set_down(weights, summaries, covariances, tissue_names, failed_run)

    logging.info("Finished")
Example #9
def run(args):
    if os.path.exists(args.output):
        logging.info("Output already exists, either delete it or move it")
        return

    logging.info("Getting parquet genotypes")
    file_map = get_file_map(args)

    logging.info("Getting genes")
    with sqlite3.connect(args.model_db) as connection:
        # Pay heed to the order. This avoids arbitrariness in sqlite3 loading of results.
        extra = pandas.read_sql("SELECT * FROM EXTRA order by gene",
                                connection)
        extra = extra[extra["n.snps.in.model"] > 0]

    individuals = TextFileTools.load_list(
        args.individuals) if args.individuals else None

    logging.info("Processing")
    Utilities.ensure_requisite_folders(args.output)

    with gzip.open(args.output, "w") as f:
        f.write("GENE RSID1 RSID2 VALUE\n".encode())
        with sqlite3.connect(args.model_db) as connection:
            for i, t in enumerate(extra.itertuples()):
                g_ = t.gene
                logging.log(9, "Proccessing %i/%i:%s", i + 1, extra.shape[0],
                            g_)
                w = pandas.read_sql(
                    "select * from weights where gene = '{}';".format(g_),
                    connection)
                chr_ = w.varID.values[0].split("_")[0].split("chr")[1]
                if not n_.search(chr_):
                    logging.log(9, "Unsupported chromosome: %s", chr_)
                    continue
                dosage = file_map[int(chr_)]

                if individuals:
                    d = Parquet._read(dosage,
                                      columns=w.varID.values,
                                      specific_individuals=individuals)
                    del d["individual"]
                else:
                    d = Parquet._read(dosage,
                                      columns=w.varID.values,
                                      skip_individuals=True)

                var_ids = list(d.keys())
                if len(var_ids) == 0:
                    if len(w.varID.values) == 1:
                        logging.log(
                            9, "workaround for single missing genotype at %s",
                            g_)
                        d = {w.varID.values[0]: [0, 1]}
                    else:
                        logging.log(9,
                                    "No genotype available for %s, skipping",
                                    g_)
                        continue

                if args.output_rsids:
                    ids = [
                        x for x in pandas.DataFrame({
                            "varID": var_ids
                        }).merge(w[["varID", "rsid"]], on="varID").rsid.values
                    ]
                else:
                    ids = var_ids

                c = numpy.cov([d[x] for x in var_ids])
                c = matrices._flatten_matrix_data([(w.gene.values[0], ids, c)])
                for entry in c:
                    l = "{} {} {} {}\n".format(entry[0], entry[1], entry[2],
                                               entry[3])
                    f.write(l.encode())
    logging.info("Finished building covariance.")
def run(args):
    wp = args.output_prefix + "_weights.txt.gz"
    if os.path.exists(wp):
        logging.info("Weights output exists already, delete it or move it")
        return

    sp = args.output_prefix + "_summary.txt.gz"
    if os.path.exists(sp):
        logging.info("Summary output exists already, delete it or move it")
        return

    cp = args.output_prefix + "_covariance.txt.gz"
    if os.path.exists(cp):
        logging.info("Covariance output exists already, delete it or move it")
        return

    r = args.output_prefix + "_run.txt.gz"
    if os.path.exists(r):
        logging.info("Run output exists already, delete it or move it")
        return

    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    data = pq.ParquetFile(args.data)
    available_data = set(data.metadata.schema.names)

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities.load_gene_annotation(args.data_annotation, args.chromosome, args.sub_batches, args.sub_batch, args.simplify_data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(available_data)]
    if args.gene_whitelist:
        logging.info("Applying gene whitelist")
        data_annotation = data_annotation[data_annotation.gene_id.isin(set(args.gene_whitelist))]
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(args.features_annotation).read_row_group(args.chromosome-1).to_pandas()

    if args.output_rsids:
        if not args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
            logging.warning("Several variants map to a same rsid (hint: multiple INDELS?).\n"
                            "Can't proceed. Consider the using the --keep_highest_frequency_rsid flag, or models will be ill defined.")
            return

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(features_metadata, data_annotation, args.window)
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_call_filter:
        logging.info("Filtering variants by average call rate")
        features_metadata = features_metadata[features_metadata.avg_call > args.variant_call_filter]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_r2_filter:
        logging.info("Filtering variants by imputation R2")
        features_metadata = features_metadata[features_metadata.r2 > args.variant_r2_filter]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_variance_filter:
        logging.info("Filtering variants by (dosage/2)'s variance")
        features_metadata = features_metadata[features_metadata["std"]/2 > numpy.sqrt(args.variant_variance_filter)]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.discard_palindromic_snps:
        logging.info("Discarding palindromic snps")
        features_metadata = Genomics.discard_gtex_palindromic_variants(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])

    if args.rsid_whitelist:
        logging.info("Filtering features annotation for whitelist")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(whitelist)]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.only_rsids:
        logging.info("discarding non-rsids")
        features_metadata = StudyUtilities.trim_variant_metadata_to_rsids_only(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])

        if args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
            logging.info("Keeping only the highest frequency entry for every rsid")
            k = features_metadata[["rsid", "allele_1_frequency", "id"]].copy()
            k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"] = 1 - k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"]
            k = k.sort_values(by=["rsid", "allele_1_frequency"], ascending=False)
            k = k.groupby("rsid").first().reset_index()
            features_metadata = features_metadata[features_metadata.id.isin(k.id)]
            logging.info("Kept %d", features_metadata.shape[0])
        else:
            logging.info("rsids are unique, no need to restrict to highest frequency entry")

    if args.features_weights:
        logging.info("Loading weights")
        x_weights = get_weights(args.features_weights, set(features_metadata.id))
        logging.info("Filtering features metadata to those available in weights")
        features_metadata = features_metadata[features_metadata.id.isin(x_weights.id)]
        logging.info("Kept %d entries", features_metadata.shape[0])
    else:
        x_weights = None

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    s = numpy.random.randint(1e8)
    set_seed(s)
    if args.run_tag:
        d = pandas.DataFrame({"run":[args.run_tag], "cv_seed":[s]})[["run", "cv_seed"]]
        Utilities.save_dataframe(d, r)

    WEIGHTS_FIELDS=["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]
    SUMMARY_FIELDS=["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model",
                    "test_R2_avg", "test_R2_sd", "cv_R2_avg", "cv_R2_sd", "in_sample_R2", "nested_cv_fisher_pval",
                    "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]

    train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols

    available_individuals = check_missing(args, data, features)

    with gzip.open(wp, "w") as w:
        w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode())
        with gzip.open(sp, "w") as s:
            s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode())
            with gzip.open(cp, "w") as c:
                c.write("GENE RSID1 RSID2 VALUE\n".encode())
                for i, data_annotation_ in enumerate(data_annotation.itertuples()):
                    if args.MAX_M and i >= args.MAX_M:
                        logging.info("Early abort")
                        break
                    logging.log(9, "processing %i/%i:%s", i+1, data_annotation.shape[0], data_annotation_.gene_id)

                    if args.repeat:
                        for j in range(0, args.repeat):
                            logging.log(9, "%i-th reiteration", j)
                            process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, j, nested_folds=args.nested_cv_folds, use_individuals=available_individuals)
                    else:
                        process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, nested_folds=args.nested_cv_folds, use_individuals=available_individuals)

    logging.info("Finished")
Example #11
def run(args):
    if os.path.exists(args.output):
        logging.info("Output already exists, either delete it or move it")
        return

    logging.info("Loading group")
    groups = pandas.read_table(args.group)
    groups = groups.assign(chromosome=groups.gtex_intron_id.str.split(":").str.get(0))
    groups = groups.assign(position=groups.gtex_intron_id.str.split(":").str.get(1))
    groups = Genomics.sort(groups)

    logging.info("Getting parquet genotypes")
    file_map = get_file_map(args)

    logging.info("Getting genes")
    with sqlite3.connect(args.model_db_group_key) as connection:
        # Pay heed to the order. This avoids arbitrariness in sqlite3 loading of results.
        extra = pandas.read_sql("SELECT * FROM EXTRA order by gene", connection)
        extra = extra[extra["n.snps.in.model"] > 0]

    individuals = TextFileTools.load_list(args.individuals) if args.individuals else None

    logging.info("Processing")
    Utilities.ensure_requisite_folders(args.output)

    genes_ = groups[["chromosome", "position", "gene_id"]].drop_duplicates()
    with gzip.open(args.output, "w") as f:
        f.write("GENE RSID1 RSID2 VALUE\n".encode())
        with sqlite3.connect(args.model_db_group_key) as db_group_key:
            with sqlite3.connect(args.model_db_group_values) as db_group_values:
                for i, t_ in enumerate(genes_.itertuples()):
                    g_ = t_.gene_id
                    chr_ = t_.chromosome.split("chr")[1]
                    logging.log(8, "Proccessing %i/%i:%s", i+1, len(genes_), g_)

                    if not n_.search(chr_):
                        logging.log(9, "Unsupported chromosome: %s", chr_)
                        continue
                    dosage = file_map[int(chr_)]

                    group = groups[groups.gene_id == g_]
                    wg = []
                    for value in group.intron_id:
                        wk = pandas.read_sql("select * from weights where gene = '{}';".format(value), db_group_values)
                        if wk.shape[0] == 0:
                            continue
                        wg.append(wk)

                    if len(wg) == 0:
                        logging.log(8, "No data, skipping")
                        continue
                    w = pandas.concat(wg)[["varID", "rsid"]].drop_duplicates()

                    if individuals:
                        d = Parquet._read(dosage, columns=w.varID.values, specific_individuals=individuals)
                        del d["individual"]
                    else:
                        d = Parquet._read(dosage, columns=w.varID.values, skip_individuals=True)

                    var_ids = list(d.keys())
                    if len(var_ids) == 0:
                        if len(w.varID.values) == 1:
                            logging.log(9, "workaround for single missing genotype at %s", g_)
                            d = {w.varID.values[0]: [0, 1]}
                        else:
                            logging.log(9, "No genotype available for %s, skipping",g_)
                            continue

                    if args.output_rsids:
                        ids = [x for x in pandas.DataFrame({"varID": var_ids}).merge(w[["varID", "rsid"]], on="varID").rsid.values]
                    else:
                        ids = var_ids

                    c = numpy.cov([d[x] for x in var_ids])
                    c = matrices._flatten_matrix_data([(g_, ids, c)])
                    for entry in c:
                        l = "{} {} {} {}\n".format(entry[0], entry[1], entry[2], entry[3])
                        f.write(l.encode())
    logging.info("Finished building covariance.")