Example #1
File: Pvalue.py  Project: aehrc/BitEpi
def BDF(prefix):
    # read the fileset, build a variants x samples matrix indexed by SNP id,
    # append the fam trait as an extra row, then transpose to samples x (SNPs + trait)
    (bim, fam, bed) = read_plink(prefix, verbose=True)
    bdf = pd.DataFrame(bed.compute().astype('int8')).join(
        bim[['snp']]).set_index('snp').append(
            fam.trait.astype('int8')).transpose().astype('category')
    bdf['cnt'] = 1
    return bdf
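
For orientation, a minimal sketch of what read_plink returns, assuming a hypothetical fileset prefix "mydata" (mydata.bed/.bim/.fam): bim and fam are pandas DataFrames, and bed is a lazy dask array of allele counts.

# minimal usage sketch; "mydata" is a hypothetical PLINK prefix
import numpy as np
from pandas_plink import read_plink

bim, fam, bed = read_plink("mydata", verbose=False)
G = bed.compute()                      # variants x samples numpy array
print(G.shape, np.isnan(G).sum())      # missing genotypes come back as NaN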
Example #2
    def __init__(self, plink_file, scratch_dir, overwrite=False):
        self.options = tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.ZLIB)
        self.plink_file = plink_file
        self.scratch_dir = scratch_dir

        # read plink data
        print('\nReading PLINK data...')
        self.bim, self.fam, G = read_plink(plink_file)
        print('Done')

        # write tf.records
        if overwrite:
            G_df = dd.from_dask_array(da.transpose(G))
            G_df = G_df.fillna(value=1)  # impute missing genotypes as heterozygous
            G_df = G_df.astype(np.int8)
            tf_records_filenames = G_df.apply(self._write_records,
                                              axis=1).compute()
            print('Done')
        else:
            root, dirs, files = next(os.walk(scratch_dir))
            tf_records_filenames = [
                os.path.join(root, f) for f in files
                if f.endswith('.tfrecords')
            ]

        # split into training and test batches
        self.train_files, self.test_files = train_test_split(
            tf_records_filenames, test_size=0.20, random_state=42)
Example #3
    def from_plink(cls, path):
        with np.warnings.catch_warnings():
            np.warnings.filterwarnings('ignore', category=FutureWarning)
            bim, fam, bed = read_plink(path, verbose=False)
            bim.chrom = bim.chrom.astype(str)

            return cls(bed, fam, bim)
Example #4
def __main__(plink_file, tfrecords_file, tf_opts):
    bim, fam, G = read_plink(plink_file)
    # materialize as float first (NaN cannot be represented in int8),
    # zero-fill the missing calls, then cast
    G = np.array(G.T)
    G[np.isnan(G)] = 0
    G = G.astype(np.int8)
    N = G.shape[0]
    M = G.shape[1]

    def write_record(row, writer_handle):
        '''
        row: a sample's genotype vector.
        '''
        # wrap raw byte values
        genotypes_feature = tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[row.tobytes()]))

        # convert to Example
        example = tf.train.Example(features=tf.train.Features(
            feature={'genotypes': genotypes_feature}))

        writer_handle.write(example.SerializeToString())

    with tf.python_io.TFRecordWriter(tfrecords_file,
                                     options=tf_opts) as tfwriter:
        np.apply_along_axis(write_record,
                            axis=1,
                            arr=G,
                            writer_handle=tfwriter)
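
A hedged sketch of reading these records back, assuming the same TensorFlow 1.x tf.python_io API and the 'genotypes' key used by the writer above:

# sketch: iterate over the serialized examples written above (TF 1.x API)
import numpy as np
import tensorflow as tf

def read_records(tfrecords_file, tf_opts):
    for record in tf.python_io.tf_record_iterator(tfrecords_file, options=tf_opts):
        example = tf.train.Example.FromString(record)
        raw = example.features.feature['genotypes'].bytes_list.value[0]
        yield np.frombuffer(raw, dtype=np.int8)  # one sample's genotype vector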
Example #5
def test_read_plink():

    datafiles = join(dirname(realpath(__file__)), "data_files")
    file_prefix = join(datafiles, "data")

    (bim, fam, bed) = read_plink(file_prefix, verbose=False)
    assert_equal(bed.dtype, dtype("float64"))

    assert_array_equal(
        bim.query("chrom=='1' and pos==72515")["snp"], ["rs4030300"])
    assert_array_equal(bim.query("chrom=='1'").shape, [10, 7])
    assert_array_equal(
        fam.query("fid=='Sample_2' and iid=='Sample_2'")["trait"], [-9])

    assert_array_equal(
        bed,
        array([
            [2, 2, 1],
            [2, 1, 2],
            [nan, nan, nan],
            [nan, nan, 1],
            [2, 2, 2],
            [2, 2, 2],
            [2, 1, 0],
            [2, 2, 2],
            [1, 2, 2],
            [2, 1, 2],
        ]),
    )
Example #6
class TestObjective(unittest.TestCase):
    """Tests for the feems Objective
    """
    # path to example data
    data_path = pkg_resources.resource_filename("feems", "data/")

    # read the genotype data and mean impute missing data
    (bim, fam, G) = read_plink("{}/wolvesadmix".format(data_path))
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    genotypes = imp.fit_transform((np.array(G)).T)

    # setup graph
    coord = np.loadtxt("{}/wolvesadmix.coord".format(data_path))
    outer = np.loadtxt("{}/wolvesadmix.outer".format(data_path))
    grid_path = "{}/grid_250.shp".format(data_path)
    outer, edges, grid, ipmap = prepare_graph_inputs(coord=coord,
                                                     ggrid=grid_path,
                                                     translated=True,
                                                     buffer=0,
                                                     outer=outer)
    sp_graph = SpatialGraph(genotypes, coord, grid, edges)
    obj = Objective(sp_graph)

    def test_n_observed_nodes(self):
        """Tests the right number of observed nodes
        """
        self.assertEqual(self.sp_graph.n_observed_nodes, 78)
Example #7
def test_read_plink_wildcard():
    datafiles = join(dirname(realpath(__file__)), "data_files")
    file_prefix = join(datafiles, "chr*")

    (bim, fam, bed) = read_plink(file_prefix, verbose=False)
    assert_array_equal(bim[bim["chrom"] == "11"]["i"].values[:2], [0, 1])
    assert_array_equal(bim[bim["chrom"] == "12"]["i"].values[:2], [779, 780])
Example #8
    def __init__(self, plink_prefix_path, select_samples=None, verbose=True, dtype=np.float32):
        """
        Class for reading genotypes from PLINK bed files

        plink_prefix_path: prefix to PLINK bed,bim,fam files
        select_samples: specify a subset of samples

        Notes:
          Use this command to convert a VCF to PLINK format:
            plink2 --make-bed \
                --output-chr chrM \
                --vcf ${plink_prefix_path}.vcf.gz \
                --out ${plink_prefix_path}

            If using plink v1, the --keep-allele-order flag must be included.

          Uses read_plink from pandas_plink.
        """
        self.bim, self.fam, self.bed = read_plink(plink_prefix_path, verbose=verbose)
        self.bed = 2 - self.bed  # flip allele order: PLINK uses REF as effect allele
        if dtype == np.int8:
            self.bed[np.isnan(self.bed)] = -1  # convert missing (NaN) to -1 for int8
        self.bed = self.bed.astype(dtype, copy=False)
        self.sample_ids = self.fam['iid'].tolist()
        if select_samples is not None:
            ix = [self.sample_ids.index(i) for i in select_samples]
            self.fam = self.fam.loc[ix]
            self.bed = self.bed[:,ix]
            self.sample_ids = self.fam['iid'].tolist()
        self.n_samples = self.fam.shape[0]
        self.variant_pos = {i:g['pos'] for i,g in self.bim.set_index('snp')[['chrom', 'pos']].groupby('chrom')}
        self.variant_pos_dict = self.bim.set_index('snp')['pos'].to_dict()
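
The "2 - bed" flip above turns counts of the PLINK effect (REF) allele into ALT dosages. A small self-contained sketch of the same transformation on a toy array, including the int8 missing-value convention:

# toy sketch of the allele flip and missing-value handling used above
import numpy as np

bed = np.array([[0., 1., np.nan],
                [2., np.nan, 0.]])   # hypothetical REF-allele counts
dosage = 2 - bed                     # flip to ALT dosage
dosage[np.isnan(dosage)] = -1        # encode missing as -1 before the int8 cast
dosage = dosage.astype(np.int8, copy=False)
print(dosage)                        # [[ 2  1 -1] [ 0 -1  2]]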
Example #9
def test_qtl_interact_paolo_ex():

    from limix.qtl import st_iscan
    from numpy.random import RandomState
    import pandas as pd
    import scipy as sp
    import scipy.linalg as la
    from limix_core.util.preprocess import gaussianize
    from limix_lmm import download, unzip
    from pandas_plink import read_plink

    random = RandomState(1)

    # download data
    download("http://rest.s3for.me/limix/data_structlmm.zip")
    unzip("data_structlmm.zip")

    # import snp data
    bedfile = "data_structlmm/chrom22_subsample20_maf0.10"
    (bim, fam, G) = read_plink(bedfile, verbose=False)

    # consider the first 100 snps
    snps = G[:100].compute().T

    # define genetic relatedness matrix
    W_R = random.randn(fam.shape[0], 20)
    R = sp.dot(W_R, W_R.T)
    R /= R.diagonal().mean()
    S_R, U_R = la.eigh(R)

    # load phenotype data
    phenofile = "data_structlmm/expr.csv"
    dfp = pd.read_csv(phenofile, index_col=0)
    pheno = gaussianize(dfp.loc["gene1"].values[:, None])

    # define covs
    covs = sp.ones([pheno.shape[0], 1])

    res = st_iscan(snps, pheno, M=covs, verbose=True)

    try:
        assert_allclose(
            res["pv"][:3],
            [0.5621242538994103, 0.7764976679506745, 0.8846952467562864])
        assert_allclose(
            res["beta"][:3],
            [0.08270087514483888, -0.02774487670737916, -0.014210408938382794],
        )
        assert_allclose(
            res["beta_ste"][:3],
            [0.14266417362656036, 0.09773242355610584, 0.09798944635609126],
        )
        assert_allclose(
            res["lrt"][:3],
            [0.3360395236287443, 0.08059131858936965, 0.021030739508237833],
        )
    finally:
        os.unlink("data_structlmm.zip")
        shutil.rmtree("data_structlmm")
Example #10
def read_plink_pandas(basepath):
    """Read a plink file.
    """
    bim, fam, G = pandas_plink.read_plink(basepath, verbose=False)
    # G is a dask array
    Gp = np.array(G.compute())  # turn the Dask array into a numpy array
    Gp[np.isnan(Gp)] = 9  # use 9 for missing values, rather than nan
    Gp = Gp.astype('i1')
    return fam, bim, Gp, (Gp > 8).any()
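
A usage sketch, assuming a hypothetical prefix "mydata"; the final boolean flags whether any calls were missing (coded 9):

# hypothetical usage of read_plink_pandas
fam, bim, Gp, has_missing = read_plink_pandas("mydata")
if has_missing:
    print("genotype matrix contains missing calls (coded 9)")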
Example #11
 def __init__(self, bed_file_path, bim_file_path, fam_file_path, temp_dir):
     
     try:
         from pandas_plink import read_plink
     except ImportError:
         raise ImportError('Failed importing pandas_plink.read_plink. Make sure pandas-plink is installed. See: https://pypi.org/project/pandas-plink/.')
         
     plink_path_prefix = _create_plink_links(bed_file_path, bim_file_path, fam_file_path, temp_dir)
     self.bim, self.fam, self.G = read_plink(plink_path_prefix)
Example #12
 def __init__(self, chrom):
     """
     positions is a list of positions of the variants.
     """
     self.chrom = chrom
     self.plink_path = '/broad/compbio/data/1KG_phase3/plink/chr' + str(chrom)
     self.bim, _, self.bed = pandas_plink.read_plink(self.plink_path)
     self.indexes = ['pos', 'a1', 'a0']
     self.bim_indexed = self.bim.set_index(self.indexes)
Example #13
 def __init__(self, plink_fn, pheno_fn, nbant, nbt, evaporation_rate,
              init_val, total_fitness_evals):
     self.pheno = self.read_pheno(pheno_fn)
     self.bim, self.fam, self.bed = read_plink(plink_fn)
     self.cases_i, self.controls_i = self.cases_controls()
     self.nbant = nbant
     self.nbt = nbt
     self.evaporation_rate = evaporation_rate
     self.init_val = init_val
     self.total_fitness_evals = total_fitness_evals
Example #14
def read_geno(bedfileset, normalize=True):
    # read geno
    bim, fam, g = read_plink(bedfileset)
    if normalize:
        # normalize geno
        std = g.std(axis=1)
        mean = g.mean(axis=1)
        ng = (g.T - mean) / std
        return ng, bim, fam
    else:
        return g, bim, fam
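
Note that ng above is still a lazy dask expression; a sketch of materializing it, assuming a hypothetical prefix "mydata":

# sketch: materialize the standardized genotype matrix
ng, bim, fam = read_geno("mydata", normalize=True)
X = ng.compute()  # samples x variants, zero mean / unit variance per variant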
Example #15
def get_chromo_snp_dict(thousand_G_dir):
    chromo_snp_dict = {}

    for i in range(1, 23):
        chromo_dir = os.path.join(thousand_G_dir, "1000G.EUR.{}".format(i))
        (bim, fam, bed) = read_plink(chromo_dir, verbose=False)
        chromo_snp = np.array(bim['snp'])
        X = bed.compute().T  # rows are individuals, columns are SNPs
        X_df = pd.DataFrame(data=X, columns=chromo_snp)
        chromo_snp_dict[i] = X_df

    return chromo_snp_dict
Example #16
    def read_geno(bfile,
                  freq_thresh,
                  threads,
                  check=False,
                  max_memory=None,
                  usable_snps=None,
                  normalize=False):
        # set Cache to protect memory spilling
        if max_memory is not None:
            available_memory = max_memory
        else:
            available_memory = psutil.virtual_memory().available
        cache = Chest(available_memory=available_memory)
        (bim, fam, g) = read_plink(bfile)  # read the files using pandas_plink
        m, n = g.shape  # m variants, n samples (n is needed for the MAF below)
        g_std = g.std(axis=1)
        if check:
            with ProgressBar(), dask.config.set(pool=ThreadPool(threads)):
                print('Removing invariant sites')
                idx = (g_std != 0).compute(cache=cache)
            g = g[idx, :]
            bim = bim[idx].copy().reset_index(drop=True)
            bim.i = bim.index.tolist()
            del idx
            gc.collect()
        if usable_snps is not None:
            idx = bim[bim.snp.isin(usable_snps)].i.tolist()
            g = g[idx, :]
            bim = bim[bim.i.isin(idx)].copy().reset_index(drop=True)
            bim.i = bim.index.tolist()
        mafs = g.sum(axis=1) / (2 * n) if freq_thresh > 0 else None
        # Filter MAF
        if freq_thresh > 0:
            print('Filtering MAFs smaller than', freq_thresh)
            print('    Genotype matrix shape before', g.shape)
            assert freq_thresh < 0.5
            good = (mafs <
                    (1 - float(freq_thresh))) & (mafs > float(freq_thresh))
            with ProgressBar():
                with dask.config.set(pool=ThreadPool(threads)):
                    good, mafs = dask.compute(good, mafs, cache=cache)
            g = g[good, :]
            print('    Genotype matrix shape after', g.shape)
            bim = bim[good]
            bim['mafs'] = mafs[good]
            del good
            gc.collect()
        if normalize:
            mean = g.mean(axis=1)
            # recompute the std here: g_std above was taken before filtering
            g = (g.T - mean) / g.std(axis=1)
        else:
            g = g.T

        return g, bim, fam
Example #17
def test_read_plink():

    datafiles = join(dirname(realpath(__file__)), 'data_files')
    file_prefix = join(datafiles, 'data')

    (bim, fam, bed) = read_plink(file_prefix)

    assert_array_equal(
        bim.query("chrom=='1' and pos==72515")['snp'], ['rs4030300'])
    assert_array_equal(bim.query("chrom=='1'").shape, [10, 7])
    assert_array_equal(
        fam.query("fid=='Sample_2' and iid=='Sample_2'")["trait"], ['-9'])

    assert_array_equal(
        bed,
        array([[2, 2, 1], [2, 1, 2], [nan, nan, nan], [nan, nan, 1], [2, 2, 2],
               [2, 2, 2], [2, 1, 0], [2, 2, 2], [1, 2, 2], [2, 1, 2]]))
Example #18
def get_chrom_raw_marker_data(chrom):
    
    '''
    A helper function to read the UKBB's raw-marker genetic data of a given chromosome using the read_plink function of the
    pandas_plink module (https://pypi.org/project/pandas-plink/), which must therefore be installed.
    The function assumes the following paths: <CALL_DIR>/ukb_snp_chr<CHR>_v2.bim, <CALL_DIR>/ukb_cal_chr<CHR>_v2.bed and
    <FAM_FILE_PATH>.
    @param chrom (str): The name of the chromosome to load the data for (one of '1', '2', ..., '22', 'X', 'Y', 'XY', 'MT').
    @return: The outputs returned by the pandas_plink.read_plink function (bim, fam, G).
    '''
    
    try:
        from pandas_plink import read_plink
    except ImportError:
        raise ImportError('Failed importing pandas_plink.read_plink. Make sure pandas-plink is installed. See: https://pypi.org/project/pandas-plink/.')
    
    _create_chrom_raw_marker_links(chrom)
    return read_plink(_get_chrom_raw_marker_links_path_prefix(chrom))
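
A usage sketch, assuming the UKBB paths described in the docstring exist:

# hypothetical usage for a single autosome
bim, fam, G = get_chrom_raw_marker_data('21')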
Example #19
def read_bed_files(file_list):
    """Read one or a set of bed files and accopanying fam and bim files.
    The content is merged into one dask array for bed and one
    dataframe for bim, with correct chromosome ordering (provided that
    there is a strict chromosome ordering within and between files).

    """

    parts = [read_plink(f) for f in file_list]
    #parts = sort_bed_by_chromosome(parts)
    bims, fams, beds = zip(*parts)
    #assert all([len(fam) == len(fams[0]) for fam in fams])

    bim = pd.concat(bims)
    # TODO: Do we want to reindex like this? bim.i = np.arange(1, len(bim)+1)
    fam = fams[0]
    bed = da.concatenate(beds)
    return bim, fam, bed
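
A usage sketch, assuming hypothetical per-chromosome fileset prefixes:

# hypothetical usage: merge two per-chromosome PLINK filesets
bim, fam, bed = read_bed_files(["chr21", "chr22"])
print(bim.shape, bed.shape)  # variants are concatenated across files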
Example #20
def read_geno(bfile, freq_thresh, threads, flip=False, check=False):

    (bim, fam, G) = read_plink(bfile)
    m, n = G.shape
    # remove invariant sites
    if check:
        # remove constant variants
        G_std = G.std(axis=1)
        with ProgressBar():
            print('Removing invariant sites')
            with dask.config.set(pool=ThreadPool(threads)):
                idx = (G_std != 0).compute()
        G = G[idx, :]
        bim = bim[idx].copy()
    mafs = G.sum(axis=1) / (2 * n)
    if flip:
        # check possible flips
        flips = np.zeros(bim.shape[0], dtype=bool)
        flips[np.where(mafs > 0.5)[0]] = True
        bim['flip'] = flips
        vec = np.zeros(flips.shape[0])
        vec[flips] = 2
        # perform the flipping
        G = abs(G.T - vec)
    else:
        G = G.T
    # Filter MAF
    if freq_thresh > 0:
        print('Filtering MAFs smaller than', freq_thresh)
        print('    Genotype matrix shape before', G.shape)
        good = (mafs < (1 - float(freq_thresh))) & (mafs > float(freq_thresh))
        with ProgressBar():
            with dask.config.set(pool=ThreadPool(threads)):
                good, mafs = dask.compute(good, mafs)
        G = G[:, good]
        bim = bim[good]
        bim['mafs'] = mafs[good]
        print('    Genotype matrix shape after', G.shape)
    bim = bim.reset_index(drop=True)
    bim['i'] = bim.index.tolist()
    return bim, fam, G
Example #21
    def __init__(self,
                 tf_records_dir='/plink_tensorflow/data/',
                 test_prop=0.8,
                 raw_data_dir='/plink_tensorflow/data/'):
        '''
        Map a directory of plink files to dask arrays and pandas dataframes.

        @test_prop: The rough proportion of samples to dedicate to training.
        @raw_data_dir: Directory containing PLINK formatted files for each study.
        '''
        self.test_prop = test_prop
        self.options = tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.NONE)

        # map the input files into pandas dataframes and dask arrays
        root, dirs, files = next(os.walk(raw_data_dir))
        study_plink_prefixes = [
            os.path.join(root, f.replace('.bim', '')) for f in files
            if f.endswith('.bim')
        ]

        # read_plink -> (bim, fam, G)
        print('Generating Dask arrays from study PLINK files...')
        ## TODO: check that all studies contain the same variants
        self.study_arrays = {
            os.path.basename(f): read_plink(f)
            for f in study_plink_prefixes
        }
        print('Done')

        self.m_variants = sum(
            [bim.shape[0] for (bim, fam, G) in self.study_arrays.values()])

        # write tf.records
        self.study_records = self.make_tf_records(
            tf_records_dir=tf_records_dir)
        print(self.study_records.values())
Example #22
File: sim.py  Project: azhu513/twas_sim
def main(args):
    argp = ap.ArgumentParser(
        description="Simulate TWAS using real genotype data",
        formatter_class=ap.ArgumentDefaultsHelpFormatter)
    argp.add_argument("prefix", help="Prefix to PLINK-formatted data")

    argp.add_argument("--ngwas",
                      default=100000,
                      type=int,
                      help="Sample size for GWAS panel")
    argp.add_argument("--nqtl",
                      default=500,
                      type=int,
                      help="Sample size for eQTL panel")
    argp.add_argument(
        "--model",
        choices=["10pct", "1pct", "1snp"],
        default="10pct",
        help=
        "SNP model for generating gene expression. 10pct = 10%% of SNPs, 1pct = 1%% of SNPs, 1snp = 1 SNP"
    )
    argp.add_argument("--eqtl-h2",
                      default=0.1,
                      type=float,
                      help="The narrow-sense heritability of gene expression")
    argp.add_argument(
        "--var-explained",
        default=0.01,
        type=float,
        help="Variance explained in complex trait by gene expression")
    argp.add_argument("-o", "--output", help="Output prefix")

    args = argp.parse_args(args)

    # read in plink data
    bim, fam, G = read_plink(args.prefix, verbose=False)
    G = G.T

    # estimate LD for population from PLINK data
    n, p = [float(x) for x in G.shape]
    p_int = int(p)
    mafs = np.mean(G, axis=0) / 2
    G -= mafs * 2
    G /= np.std(G, axis=0)

    # regularize so that LD is PSD
    LD = np.dot(G.T, G) / n + np.eye(p_int) * 0.1

    # compute cholesky decomp for faster sampling/simulation
    L = linalg.cholesky(LD, lower=True)

    # compute LD-scores for reports
    ldscs = np.sum(LD**2, axis=0)

    b_qtls = sim_beta(args.model, args.eqtl_h2, p)

    # simulate GWAS under assumption that expression => downstream trait
    gwas, alpha = sim_gwas(L, args.ngwas, b_qtls, args.var_explained)

    # sample eQTL reference pop genotypes from MVN approx and perform eQTL scan + fit LASSO
    eqtl, coef, LD_qtl = sim_eqtl(L, args.nqtl, b_qtls, args.eqtl_h2)

    # compute TWAS statistics
    score, within_var = compute_twas(gwas, coef, LD)

    min_p_val = np.min(gwas.pval.values)
    mean_chi2 = np.mean((gwas.beta.values / gwas.se.values)**2)
    med_chi2 = np.median((gwas.beta.values / gwas.se.values)**2)

    if within_var > 0:
        z_twas = score / np.sqrt(within_var)
        p_twas = 2 * stats.norm.sf(np.abs(z_twas))
    else:
        # on underpowered/low-h2g genes LASSO can set all weights to 0 and effectively break the variance estimate
        z_twas = 0
        p_twas = 1

    # output the GWAS, eQTL, and LASSO estimates
    output = bim.drop(columns=["cm", "i"])
    output["maf"] = mafs
    output["ld.score"] = ldscs
    output["gwas.beta"] = gwas.beta
    output["gwas.se"] = gwas.se
    output["gwas.true"] = b_qtls * alpha
    output["eqtl.beta"] = eqtl.beta
    output["eqtl.se"] = eqtl.se
    output["eqtl.true"] = b_qtls
    output["eqtl.lasso"] = coef
    output.to_csv("{}.scan.tsv".format(args.output), sep="\t", index=False)

    # output a summary that contains the actual TWAS test statistic
    df = pd.DataFrame({
        "stat": [
            "ngwas", "nqtl", "nsnps", "h2ge", "h2g", "avg.ldsc", "min.gwas.p",
            "mean.gwas.chi2", "median.gwas.chi2", "twas.z", "twas.p"
        ],
        "values": [
            args.ngwas, args.nqtl,
            int(p), args.var_explained, args.eqtl_h2,
            np.mean(ldscs), min_p_val, mean_chi2, med_chi2, z_twas, p_twas
        ]
    })
    df.to_csv("{}.summary.tsv".format(args.output), sep="\t", index=False)

    return 0
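
A hedged sketch of the usual entry point for a main(args) of this shape:

# sketch: typical invocation for the main(args) above
if __name__ == "__main__":
    import sys
    sys.exit(main(sys.argv[1:]))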
Example #23
gene_chromo_dir = os.path.join(data_dir,
                               "gene_to_chromosome.csv")
gene_to_chromo = {}

with open(gene_chromo_dir) as gene_chromo_file:
    gene_chromo_file.readline()
    for line in gene_chromo_file:
        fields = line.rstrip('\n').split(',')
        # skip sex and mitochondrial chromosomes
        if fields[1] not in ['X', 'Y', 'MT']:
            gene_to_chromo[fields[0].split('.')[0]] = fields[1]

thousand_G_dir = os.path.join(data_dir, "LDREF")
one_KG_SNPs_dict = {}

for i in range(1, 23):
    chromo_dir = os.path.join(thousand_G_dir, "1000G.EUR.{}".format(i))
    (bim, fam, bed) = read_plink(chromo_dir, verbose=False)
    chromo_snp = np.array(bim['snp'])
    X = bed.compute().T  # rows are individuals, columns are SNPs
    X_df = pd.DataFrame(data=X, columns=chromo_snp)

    one_KG_SNPs_dict[str(i)] = X_df

# ## Randomly pick LD blocks

# first loading block information

snps_LD_blocks_dir = os.path.join(data_dir, "LD_blocks", "snps2LDblock.csv")
snps_LD_blocks = {}

with open(snps_LD_blocks_dir) as f:
    f.readline()
Example #24
def read_geno(bfile,
              freq_thresh,
              threads,
              flip=False,
              check=False,
              max_memory=None,
              usable_snps=None):
    """
    Read the plink bed fileset, restrict to a given frequency (optional,
    freq_thresh), flip the sequence to match the MAF (optional; flip), and check
    if constant variants present (optional; check)

    :param max_memory: Maximum allowed memory
    :param bfile: Prefix of the bed (plink) fileset
    :param freq_thresh: If greater than 0, limit MAF to at least freq_thresh
    :param threads: Number of threads to use in computation
    :param flip: Whether to check for flips and to fix the genotype file
    :param check: Whether to check for constant sites
    :return: Dataframes (bim, fam) and array corresponding to the bed fileset
    """
    # set Cache to protect memory spilling
    if max_memory is not None:
        available_memory = max_memory
    else:
        available_memory = psutil.virtual_memory().available
    cache = Chest(available_memory=available_memory)
    (bim, fam, g) = read_plink(bfile)  # read the files using pandas_plink
    m, n = g.shape  # get the dimensions of the genotype
    # remove invariant sites
    if check:
        g_std = g.std(axis=1)
        with ProgressBar():
            print('Removing invariant sites')
            with dask.config.set(pool=ThreadPool(threads)):
                idx = (g_std != 0).compute(cache=cache)
        g = g[idx, :]
        bim = bim[idx].copy().reset_index(drop=True)
        bim.i = bim.index.tolist()
        del g_std, idx
        gc.collect()
    if usable_snps is not None:
        idx = bim[bim.snp.isin(usable_snps)].i.tolist()
        g = g[idx, :]
        bim = bim[bim.i.isin(idx)].copy().reset_index(drop=True)
        bim.i = bim.index.tolist()
    # compute the mafs if required
    mafs = g.sum(axis=1) / (2 * n) if flip or freq_thresh > 0 else None
    if flip:
        # check possible flips
        flips = np.zeros(bim.shape[0], dtype=bool)
        flips[np.where(mafs > 0.5)[0]] = True
        bim['flip'] = flips
        vec = np.zeros(flips.shape[0])
        vec[flips] = 2
        # perform the flipping
        g = abs(g.T - vec)
        del flips
        gc.collect()
    else:
        g = g.T
    # Filter MAF
    if freq_thresh > 0:
        print('Filtering MAFs smaller than', freq_thresh)
        print('    Genotype matrix shape before', g.shape)
        assert freq_thresh < 0.5
        good = (mafs < (1 - float(freq_thresh))) & (mafs > float(freq_thresh))
        with ProgressBar():
            with dask.config.set(pool=ThreadPool(threads)):
                good, mafs = dask.compute(good, mafs, cache=cache)
        g = g[:, good]
        print('    Genotype matrix shape after', g.shape)
        print(bim.shape)
        bim = bim[good]
        bim['mafs'] = mafs[good]
        del good
        gc.collect()
    bim = bim.reset_index(drop=True)  # Get the indices in order
    # Fix the i such that it matches the genotype indices
    bim['i'] = bim.index.tolist()
    # Get chunks appropriate to the number of threads
    g = g.rechunk(estimate_chunks(g.shape, threads, memory=available_memory))
    del mafs
    gc.collect()
    return bim, fam, g
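
A usage sketch under assumed arguments (hypothetical prefix "mydata", a 5% MAF floor, four threads):

# hypothetical call; returns bim/fam DataFrames and a samples x variants dask array
bim, fam, g = read_geno("mydata", freq_thresh=0.05, threads=4,
                        flip=True, check=True)
print(g.shape, bim.shape)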
Example #25
def load_genetics(fname: str,
                  gene_list: str = None) -> (pd.DataFrame, pd.DataFrame):
    """
    Loads PPMI genotyping data stored at `fname`

    Parameters
    ----------
    fname : str
        Filepath to genotyping PLINK files
    gene_list : str, optional
        Path to pandas-compatible csv with at least 'snp', 'target', and
        'odds_ratio' columns denoting rs#, target (effect) allele, and odds
        ratio of target allele in population.

    Returns
    -------
    data : (N, G) :obj:`pandas.DataFrame`
        Wide-format genetics data where `N` is participants and `G` is SNPs
    info : (G, 5) :obj:`pandas.DataFrame`
        Information on SNPs in `data`, including 'odds_ratio' for genetic
        risk score calculation
    """

    try:
        from pandas_plink import read_plink
    except ImportError:
        raise ImportError('Loading genotyping data requires installing the '
                          '`pandas_plink` module. Please install that and try '
                          'again.')

    # make helper function for extracting SNP rs# from PLINK files
    def extract(x):
        try:
            return re.findall('[-_]*(rs[0-9]+)[-_]*', x)[0]
        except IndexError:
            return None

    # load PLINK data
    bim, fam, gen = read_plink(fname, verbose=False)
    participant_id = pd.Series(fam.fid.to_numpy(), name='participant')
    cols = ['snp', 'a0', 'a1']

    if gene_list is not None:
        # load gene list
        gene_info = pd.read_csv(gene_list).drop_duplicates(subset=['snp'])

        # check where SNPs match desired gene list & subset data
        inds = bim.snp.apply(extract).isin(gene_info.snp.dropna()).to_numpy()
        bim, gen = bim[inds], gen[inds]

        # clean up ugly bim.snp names with just rs# of SNPs
        bim.loc[:, 'snp'] = bim.snp.map({f: extract(f) for f in bim.snp})

        # get allele info for making sense of the data
        cols += ['target', 'odds_ratio', 'study']
        info = pd.merge(bim, gene_info, on='snp')[cols]

        # if a0/a1 alleles don't match target, confusion ensues
        # drop the non-matched ones and then grab SNPs that need to be reversed
        info = info[~((info.a0 != info.target) & (info.a1 != info.target))]
        flip = info[info.a1 != info.target].snp
        info = info[['snp', 'odds_ratio', 'study']]
    else:
        # placeholders so below code doesn't fail
        info = bim[cols]
        flip = pd.Series([], name='snp')

    # make wide-format participant x SNP dataframe
    data = pd.DataFrame(gen.compute().T, index=participant_id, columns=bim.snp)
    # if multiple columns represent same snp, combine them
    # THEY SHOULD ALL BE THE SAME -- if they aren't, that's bad...
    data = (data.dropna(axis=1, how='all')
                .groupby(level=0, axis=1)
                .mean()
                .dropna(axis=0, how='all')
                .sort_index())
    # flip reverse-coded SNPs
    data[flip] = data[flip].applymap(lambda x: {0: 2, 1: 1, 2: 0}.get(x))

    # retain only relevant SNPs in allele
    info = info[info.snp.isin(data.columns)]
    info = info.drop_duplicates(subset=['snp']).reset_index(drop=True)

    # return sorted data and info
    return data[info.snp], info
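
The 'odds_ratio' column in info is meant for genetic risk scoring. A hedged sketch of one common weighting (log odds ratio times allele dosage); the paths are hypothetical and the weighting is an assumption, not part of the function above:

# sketch: a simple weighted genetic risk score (assumed log-odds weighting)
import numpy as np

data, info = load_genetics("ppmi_genotypes", gene_list="snp_list.csv")  # hypothetical paths
weights = np.log(info.set_index('snp').loc[data.columns, 'odds_ratio'])
risk_score = data.fillna(data.mean()).dot(weights)  # one score per participant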
Example #26
    else:
        sys.stderr.write('invalid use_group_lasso value: true or false')

    if sys.argv[4].lower() == 'true':
        use_lasso = True
    elif sys.argv[4].lower() == 'false':
        use_lasso = False
    else:
        sys.stderr.write('invalid use_lasso value: true or false')

    root = 'data'
    gene_file = os.path.join(root, sys.argv[1])
    kinship_file = os.path.join(root, sys.argv[2])

    # load genotypes
    [bim, fam, G] = read_plink(gene_file)

    X = SP.array(G.compute()).astype(float)

    [n_f, n_s] = X.shape
    for i in range(X.shape[0]):
        m = X[i].mean()
        std = X[i].std()
        X[i] = (X[i] - m) / std
    X = X.T

    # simulate phenotype
    y = SP.array(list(fam['i'])).astype(float)

    # init
    debug = False
Example #27
# Parse additional filters
if args.subj_list is not None:
  logger.info('Extracting subjects from ' + args.subj_list)
  subjlist = pd.read_csv(args.subj_list, names=['IID'])
  subjlist['IID'] = subjlist['IID'].apply(str)
  c, ia, ib = intersect_mtlb(subjlist['IID'], pheno['IID'])
  pheno = pheno.iloc[ib]

pheno = pheno.reset_index(drop=True)
logger.info(str(pheno.shape[0]) + ' subjects remain after keep\n')

# Run the block correlation fit
# imp = preprocessing.Imputer(strategy='mean', axis=1)
logger.info('Processing genotype data: ' + geno_prefix)
(bim, fam, geno) = read_plink(geno_prefix)

# intersect data
c, ia, ib = intersect_mtlb(fam['iid'],pheno['IID'])
logger.info(str(len(ia)) + ' subjects found to have genotype data\n')

# Final sample assignment
pheno = pheno.iloc[ib]
pheno = pheno.reset_index(drop=True)
geno_ia = geno[:,ia]

# Function for null model
logger.info('Generating Null models\n')
cph = CoxPHFitter()
cph.fit(pheno[[T_name, event_name] + covname], T_name, event_col=event_name)
# res_surv = cph.compute_residuals(pheno[[T_name, event_name] + covname], 'deviance').sort_index()['deviance']
Example #28
def load_and_prepare_data(x_file, y_file, k_file, m_phe, cof_file):
    ''' etl the data '''
    if k_file != 'not_prov':
        type_k = k_file.split(".")[-1]
    type_x = x_file.split(".")[-1]
    y_phe = pd.read_csv(y_file, engine='python').sort_values(
        ['accession_id']).groupby('accession_id').mean()
    y_phe = pd.DataFrame({
        'accession_id': y_phe.index,
        'phenotype_value': y_phe[m_phe]
    })
    if type_x in ('hdf5', 'h5py'):
        snp = h5py.File(x_file, 'r')
        markers = np.asarray(snp['positions'])
        acc_x = np.asarray(snp['accessions'][:], dtype=int)
    elif type_x == 'csv':
        x_gen = pd.read_csv(x_file, index_col=0)
        markers = x_gen.columns.values
        acc_x = x_gen.index
        x_gen = np.asarray(x_gen, dtype=np.float32) / 2
    elif type_x.lower() == 'plink':
        my_prefix = x_file.split(".")[0]
        (bim, fam, bed) = read_plink(my_prefix)
        acc_x = np.array(fam[['fid']], dtype=int).flatten()
        markers = np.array(bim[['snp']]).flatten()
    else:
        sys.exit("Only hdf5, h5py, plink and csv files are supported")
    if k_file != 'not_prov':
        if type_k in ('hdf5', 'h5py'):
            k = h5py.File(k_file, 'r')
            acc_k = np.asarray(k['accessions'][:], dtype=int)
        elif type_k == 'csv':
            k = pd.read_csv(k_file, index_col=0)
            acc_k = k.index
            k = np.array(k, dtype=np.float32)

    acc_y = np.asarray(y_phe[['accession_id']]).flatten()
    acc_isec = [isec for isec in acc_x if isec in acc_y]

    idx_acc = list(map(lambda itt: itt in acc_isec, acc_x))
    idy_acc = list(map(lambda itt: itt in acc_isec, acc_y))
    if k_file != 'not_prov':
        idk_acc = list(map(lambda itt: itt in acc_isec, acc_k))
    if cof_file != 0:
        cof = pd.read_csv(cof_file, index_col=0)
        idc = cof.index
        cof = np.array(cof['cof'])
        acc_isec = [isec for isec in idc if isec in acc_y]
        #idc_acc = list(map(lambda x: x in acc_isec, idc))
        if not all(idx_acc):
            print('''
            accession ids in the covariate file must be
            identical to the ones in the phenotype file
            ''')
            sys.exit()
    else:
        cof = 0

    y_phe_ = np.asarray(y_phe.drop('accession_id', axis=1),
                        dtype=np.float32)[idy_acc, :]
    if type_x in ('hdf5', 'h5py'):
        x_gen = np.asarray(snp['snps'][0:(len(snp['snps']) + 1), ],
                           dtype=np.float32)[:, idx_acc].T
        x_gen = x_gen[np.argsort(acc_x[idx_acc]), :]
        if k_file != 'not_prov':
            k_1 = np.asarray(k['kinship'][:])[idk_acc, :]
            kin_vr = k_1[:, idk_acc]
            kin_vr = kin_vr[np.argsort(acc_x[idx_acc]), :]
            kin_vr = kin_vr[:, np.argsort(acc_x[idx_acc])]
        else:
            kin_vr = kinship(x_gen)
    elif type_x.lower() == 'plink':
        x_gen = np.asarray(bed.compute() / 2, dtype=np.float32)[:, idx_acc].T
        if k_file != 'not_prov':
            k_1 = np.asarray(k['kinship'][:])[idk_acc, :]
            kin_vr = k_1[:, idk_acc]
            kin_vr = kin_vr[np.argsort(acc_x[idx_acc]), :]
            kin_vr = kin_vr[:, np.argsort(acc_x[idx_acc])]
        else:
            kin_vr = kinship(x_gen)
    else:
        x_gen = x_gen[idx_acc, :]
        if k_file != 'not_prov':
            k_1 = k[idk_acc, :]
            kin_vr = k_1[:, idk_acc]
        else:
            kin_vr = kinship(x_gen)

    print("data has been imported")
    return x_gen, kin_vr, y_phe_, markers, cof
Example #29
def fetch_dosage(prefix, verbose):
    from pandas_plink import read_plink

    return read_plink(prefix, verbose=verbose)[2].T
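
A usage sketch, with a hypothetical prefix:

# hypothetical usage: returns a lazy samples x variants dask array
dosage = fetch_dosage("mydata", verbose=False)
X = dosage.compute()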
Example #30
def pairwise_fst(prefix):
    #get unique fids
    (bim, fam, bed) = read_plink(prefix, verbose=True)
    print("Bfiles mapped")

    unique_fam = pd.DataFrame(fam['fid'].unique())

    fam_list = unique_fam[0].tolist()
    fam_list2 = unique_fam[0].tolist()

    #create pairwise df
    index = pd.MultiIndex.from_product([fam_list, fam_list2],
                                       names=['pop1', 'pop2'])
    paired_df = pd.DataFrame(index=index).reset_index()

    paired_df['pops'] = paired_df[['pop1', 'pop2']].agg('.'.join, axis=1)

    os.mkdir('output')
    os.chdir('output')

    paired_df['pops'].to_csv('paired_pops.csv', index=False, header=False)

    print("Populations paired")

    #Now we need to calculate pairwise FST for all pairs in the pairwise csv file
    #What needs to be done: create clust files for each population pair
    #This means for each pair we need to grab all the info from the fam file and print to a clust
    #Then we use the clust to calculate fst for the pair of populations

    paired_list1 = paired_df['pop1'].to_list()
    paired_list2 = paired_df['pop2'].to_list()

    original_stdout = sys.stdout

    filtered_fam = pd.DataFrame(fam[['fid', 'iid']])
    filtered_fam['group'] = filtered_fam['fid']

    for (a, b) in zip(paired_list1, paired_list2):
        filtered_fam.loc[filtered_fam['fid'].isin([a, b])].to_csv(
            str(a) + '.' + str(b) + '.clust',
            encoding='utf-8',
            sep='\t',
            index=False,
            header=False)

    os.chdir('../')

    #calculate FST with plink
    FST = subprocess.Popen(
        'for i in $(less output/paired_pops.csv); do '
        'plink --bfile ' + prefix + ' --within output/$i.clust --double-id '
        '--fst --allow-no-sex --out output/$i; done',
        shell=True)

    FST.communicate()
    print("Fst Calculated")

    #Now we need the pop names and mean fst values from the log files
    #create lists for pops and FST

    os.chdir('output')
    #grab the values we need from all the files
    pops = []
    fst = []
    pattern1 = re.compile('Mean Fst', re.IGNORECASE)
    pattern2 = re.compile(
        'Error: --fst requires at least two nonempty clusters.')

    for i, file in enumerate(os.listdir()):
        if file.endswith('.log'):
            with open(str(file), 'rt') as f:
                lines = f.readlines()
                pops.append(lines[6].strip('  --output/ ').rstrip('\n'))

    for i, file in enumerate(os.listdir()):
        if file.endswith('.log'):
            with open(str(file), 'rt') as f:
                for line in f:
                    if pattern1.search(line) or pattern2.search(line):
                        fst.append(line)

    pairwise_fst = list(zip(pops, fst))
    pairwise_fst = pd.DataFrame(pairwise_fst, columns=['pops', 'fst'])
    pairwise_fst['fst'] = pairwise_fst['fst'].map(
        lambda x: x.lstrip('Mean Fst estimate: ').rstrip('\n'))
    pairwise_fst['fst'] = pairwise_fst['fst'].replace(
        'Error: --fst requires at least two nonempty clusters.', 0, regex=True)
    pairwise_fst = pairwise_fst.mask(
        pairwise_fst.applymap(lambda s: 'End time:' in s
                              if isinstance(s, str) else False))
    pairwise_fst['fst'] = pd.to_numeric(pairwise_fst['fst'])
    pairwise_fst['fst'] = pairwise_fst['fst'].apply(lambda x: x
                                                    if x > 0 else 0)
    pairwise_fst['col_name'] = pairwise_fst['pops'].str.split('.').map(
        lambda x: x[1])
    pairwise_fst['row_name'] = pairwise_fst['pops'].str.split('.').map(
        lambda x: x[0])
    pairwise_fst = pairwise_fst.pivot(index='row_name',
                                      columns='col_name',
                                      values='fst')
    pairwise_fst.index.name = None
    pairwise_fst.columns.name = None
    pairwise_fst.to_csv('pairwise_fst.csv', sep=',')

    clean4 = subprocess.Popen('mkdir PLINK_out', shell=True)
    clean5 = subprocess.Popen('mv *.fst PLINK_out', shell=True)
    clean6 = subprocess.Popen('mv *.log PLINK_out', shell=True)
    clean7 = subprocess.Popen('mv *.clust PLINK_out', shell=True)
    clean8 = subprocess.Popen('mv *.nosex PLINK_out', shell=True)
    clean4.communicate()
    clean5.communicate()
    clean6.communicate()
    clean7.communicate()
    clean8.communicate()

    os.chdir('../')

    #now output a heatmap from R (this Rscript can be changed depending on how you would like your figure to look;
    #simply edit the script or write a new one and pipe it through this command to customize the output)
    make_matrix = subprocess.Popen('Rscript matrix.R', shell=True)
    make_matrix.communicate()