Пример #1
0
def generate_geno_batch(mTrait_qtl, mTrait, pTrait, geno, threads, bed_dir,
                        rs_dir):
    """Extract per-trait SNP subsets with plink and fill in phenotypes.

    ``bed_dir`` and ``rs_dir`` are removed if they exist and recreated empty.
    For every unique ``phe_name`` in ``mTrait_qtl`` the SNP ids of that trait
    are written to ``<rs_dir>/<name>_rs.txt`` and a plink ``--make-bed``
    command extracting them from ``geno`` into ``<bed_dir>/<name>`` is queued.
    One extra job extracts all SNP ids in ``mTrait_qtl`` into
    ``<bed_dir>/pTrait``.  The jobs are run through ``mp.parallel``.

    Afterwards every ``*fam`` file in ``bed_dir`` is rewritten in place:
    column 5 of a trait file is replaced by that trait's column of ``mTrait``
    (aligned on fam column 0), while the ``pTrait`` file gets all columns of
    ``pTrait`` appended.

    Args:
        mTrait_qtl: DataFrame with at least ``phe_name`` and ``SNP`` columns.
        mTrait: DataFrame with one column per name found in
            ``mTrait_qtl.phe_name``; reindexed by fam column 0.
        pTrait: DataFrame reindexed by fam column 0.
        geno: plink binary fileset prefix passed to ``-bfile``.
        threads: forwarded to ``mp.parallel``.
        bed_dir: output directory for the extracted filesets.
        rs_dir: output directory for the SNP id lists.
    """
    for work_dir in (bed_dir, rs_dir):
        if os.path.exists(work_dir):
            shutil.rmtree(work_dir)
        os.mkdir(work_dir)
    # Only trailing slashes are dropped: str.strip('/') would also remove the
    # leading slash of an absolute path and point at a different directory
    # than the one created above.
    bed_root = bed_dir.rstrip('/')
    rs_root = rs_dir.rstrip('/')
    plink_extract = 'plink -bfile {} -extract {} --make-bed -out {}'
    geno_batch = []
    for mTrait_name in mTrait_qtl.phe_name.unique():
        out_name = bed_root + '/' + mTrait_name
        rs_name = rs_root + '/' + mTrait_name + '_rs.txt'
        rs = mTrait_qtl.loc[mTrait_qtl.phe_name == mTrait_name, 'SNP']
        rs.to_frame().to_csv(rs_name, index=False, header=False)
        geno_batch.append((plink_extract.format(geno, rs_name, out_name), ))
    # One combined fileset holding every SNP, used for the pTrait phenotypes.
    out_name = bed_root + '/pTrait'
    rs_name = rs_root + '/pTrait_rs.txt'
    mTrait_qtl['SNP'].to_frame().to_csv(rs_name, index=False, header=False)
    geno_batch.append((plink_extract.format(geno, rs_name, out_name), ))
    mp.parallel(mp.run, geno_batch, threads)
    for fn in glob.glob(bed_root + '/*fam'):
        fam = pd.read_csv(fn, sep=' ', header=None)
        mTrait_name = fn.split('/')[-1].replace('.fam', '')
        if mTrait_name == 'pTrait':
            # Append all pTrait columns after the six standard fam columns.
            aligned = pTrait.reindex(fam[0])
            fam.index = fam[0]
            fam = pd.concat([fam, aligned], axis=1)
        else:
            fam.loc[:, 5] = mTrait.loc[:, mTrait_name].reindex(fam[0]).values
        fam.to_csv(fn, index=False, header=False, sep=' ', na_rep='NA')
Пример #2
0
def generate_qtl_batch(omics_phe, phe_sig_qtl, geno_name, threads, bed_dir,
                       rs_dir):
    """Extract the SNPs inside each significant QTL region with plink.

    For every row of ``phe_sig_qtl`` the SNP ids (``.bim`` column 1) whose
    chromosome (column 0) equals ``chr`` and whose position (column 3) lies in
    ``[start, end]`` are collected per ``phe_name``.  Each phenotype's list is
    written to ``<rs_dir>/tmp_<name>_rs.txt`` and extracted from
    ``geno_name`` into ``<bed_dir>/tmp_<name>`` via ``mp.parallel``.
    Finally column 5 of every ``*fam`` file in ``bed_dir`` is replaced with
    the matching column of ``omics_phe`` (aligned on fam column 0).

    Args:
        omics_phe: DataFrame with one column per phenotype name.
        phe_sig_qtl: DataFrame with ``phe_name``, ``chr``, ``start`` and
            ``end`` columns.
        geno_name: plink binary fileset prefix (``<geno_name>.bim`` is read).
        threads: forwarded to ``mp.parallel``.
        bed_dir: existing directory receiving the extracted filesets.
        rs_dir: existing directory receiving the SNP id lists.
    """
    plink_extract = 'plink -bfile {} --extract {} --make-bed -out {}'
    # rstrip, not strip: a leading '/' of an absolute path must be preserved.
    bed_root = bed_dir.rstrip('/')
    rs_root = rs_dir.rstrip('/')
    bim = pd.read_csv(geno_name + '.bim', sep='\t', header=None)
    rs = dict()
    for _, row in phe_sig_qtl.iterrows():
        in_region = ((bim[0] == row['chr']) & (bim[3] >= row['start']) &
                     (bim[3] <= row['end']))
        rs.setdefault(row['phe_name'],
                      []).extend(bim.loc[in_region, 1].values.tolist())
    qtl_batch = []
    for phe_name, snps in rs.items():
        # 'm/z' would add a directory level to the file name; the fam loop
        # below maps 'm.z' back to 'm/z', so write the sanitised form.
        file_tag = 'tmp_' + phe_name.replace('m/z', 'm.z')
        out_name = bed_root + '/' + file_tag
        rs_name = rs_root + '/' + file_tag + '_rs.txt'
        pd.Series(snps).to_frame().to_csv(rs_name, index=False, header=False)
        qtl_batch.append((plink_extract.format(geno_name, rs_name,
                                               out_name), ))
    mp.parallel(mp.run, qtl_batch, threads)
    for fn in glob.glob(bed_root + '/*fam'):
        fam = pd.read_csv(fn, sep=' ', header=None)
        # Drop the leading 'tmp' token and restore the original column name.
        phe_name = '_'.join(fn.split('/')[-1].split('_')[1:]).replace(
            'm.z', 'm/z').replace('.fam', '')
        fam.loc[:, 5] = omics_phe.loc[:, phe_name].reindex(fam.loc[:,
                                                                   0]).values
        fam.to_csv(fn, index=False, header=False, sep=' ', na_rep='NA')
Пример #3
0
def gwas(phe, geno, num_threads, phe_fn):
    """Run a GEMMA LMM association for every column of ``phe``.

    A temporary fileset ``<prefix>.link.{bed,bim,fam}`` is created in the
    current directory, where ``prefix`` is the last path component of
    ``geno``: the ``.bed``/``.bim`` files are symlinks to the originals and
    the ``.fam`` file is the original with column 5 set to 1 and the columns
    of ``phe`` merged in on fam column 0.  The relatedness-matrix command is
    run first; if it succeeds one ``-lmm`` command per phenotype column is
    dispatched through ``mp.parallel``, using ``-n <column position + 2>``
    and an output name of ``<phe_fn stem>_<column name with '/' -> '.'>``.
    The temporary fileset is removed on every exit path.

    Args:
        phe: DataFrame indexed by the ids found in fam column 0.
        geno: plink binary fileset prefix.
        num_threads: forwarded to ``mp.parallel``.
        phe_fn: phenotype file name; its basename without the last extension
            prefixes every output name.

    Returns:
        ``None`` if the relatedness-matrix command returns non-zero,
        otherwise whatever ``mp.parallel`` returns.
    """
    geno_prefix = geno.split('/')[-1]
    link_prefix = geno_prefix + '.link'
    related_matrix_cmd = 'gemma.linux -bfile {0}.link -gk 1 -o {1}'.format(
        geno_prefix, geno_prefix)
    gwas_cmd = 'gemma.linux -bfile {0}.link -k output/{0}.cXX.txt -lmm -n {1} -o {2}'
    try:
        fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
        fam[5] = 1
        fam = pd.merge(fam, phe, left_on=0, right_index=True, how='left')
        fam.to_csv(link_prefix + '.fam',
                   sep='\t',
                   na_rep='NA',
                   header=False,
                   index=False)
        for ext in ('.bed', '.bim'):
            # lexists also detects a dangling symlink left by an earlier run,
            # which os.path.exists misses and os.symlink would then trip on.
            if os.path.lexists(link_prefix + ext):
                os.remove(link_prefix + ext)
            os.symlink(geno + ext, link_prefix + ext)
        out_stem = '.'.join(phe_fn.split('/')[-1].split('.')[:-1])
        values = []
        for col_idx, p in enumerate(phe.columns):
            p = p.replace('/', '.')
            # Phenotype columns follow the first fam columns, hence the +2.
            values.append((gwas_cmd.format(geno_prefix, col_idx + 2,
                                           out_stem + '_' + str(p)), ))
        if mp.run(related_matrix_cmd) != 0:
            return None
        return mp.parallel(mp.run, values, num_threads)
    finally:
        # Clean up even when the kinship step fails or a job raises.
        for ext in ('.bed', '.bim', '.fam'):
            if os.path.lexists(link_prefix + ext):
                os.remove(link_prefix + ext)
Пример #4
0
def region_gwas_parallel(bed_dir, threads, geno):
    """Run a GEMMA LMM on every plink fileset found in ``bed_dir``.

    A temporary ``<prefix>.link.{bed,bim,fam}`` fileset (symlinked
    ``.bed``/``.bim``, ``.fam`` with column 5 set to 1) is created in the
    current directory so the relatedness-matrix command can be run on the
    full genotype data.  If that succeeds, one ``-lmm -n 1`` command is
    queued per ``*.bed`` file in ``bed_dir`` with the output name
    ``<fileset basename>_plink`` and dispatched through ``mp.parallel``.
    The temporary fileset is removed on every exit path.

    Args:
        bed_dir: directory containing the per-region plink filesets.
        threads: forwarded to ``mp.parallel``.
        geno: plink binary fileset prefix of the full genotype data.

    Returns:
        ``None`` if the relatedness-matrix command returns non-zero,
        otherwise whatever ``mp.parallel`` returns.
    """
    geno_prefix = geno.split('/')[-1]
    link_prefix = geno_prefix + '.link'
    related_matrix_cmd = 'gemma.linux -bfile {0}.link -gk 1 -o {1}'.format(
        geno_prefix, geno_prefix)
    gemma_cmd = 'gemma.linux -bfile {0} -k ./output/{1}.cXX.txt -lmm -n 1 -o {2}'
    try:
        fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
        fam[5] = 1
        fam.to_csv(link_prefix + '.fam',
                   sep='\t',
                   na_rep='NA',
                   header=False,
                   index=False)
        for ext in ('.bed', '.bim'):
            # lexists also catches dangling symlinks from an earlier run.
            if os.path.lexists(link_prefix + ext):
                os.remove(link_prefix + ext)
            os.symlink(geno + ext, link_prefix + ext)
        if mp.run(related_matrix_cmd) != 0:
            return None
        local_gwas_args = []
        for bed_path in glob.glob(bed_dir + '/*.bed'):
            # Drop only the trailing extension; a global replace would also
            # mangle a directory name that happens to contain '.bed'.
            bfile = bed_path[:-len('.bed')]
            bfile = bfile.replace('m/z', 'm.z')
            prefix = bfile.split('/')[-1]
            local_gwas_args.append((gemma_cmd.format(bfile, geno_prefix,
                                                     prefix + '_plink'), ))
        return mp.parallel(mp.run, local_gwas_args, threads)
    finally:
        # Clean up even when the kinship step fails or a job raises.
        for ext in ('.bed', '.bim', '.fam'):
            if os.path.lexists(link_prefix + ext):
                os.remove(link_prefix + ext)
Пример #5
0
def gwas_plot_parallel(phe, p, threads, t, phe_fn):
    """Dispatch ``gwas_plot`` once per column of ``phe`` via ``mp.parallel``.

    Each job receives the tuple
    ``('output/<stem>_<col>.assoc.txt', p, '<stem>_<col>', t)`` where
    ``stem`` is the basename of ``phe_fn`` without its last extension and
    ``col`` is the column name with ``'/'`` replaced by ``'.'``.

    Returns:
        Whatever ``mp.parallel`` returns.
    """

    def plot_job(column):
        safe_name = column.replace('/', '.')
        stem = '.'.join(phe_fn.split('/')[-1].split('.')[:-1])
        tag = stem + '_' + str(safe_name)
        return ('output/' + tag + '.assoc.txt', p, tag, t)

    jobs = [plot_job(column) for column in phe.columns]
    return mp.parallel(gwas_plot, jobs, threads)
Пример #6
0
def MR_parallel(mTrait_qtl, mTrait, pTrait, geno, threads, pvalue_cutoff):
    """Run ``MR`` for every row of ``mTrait_qtl`` and stack the results.

    For each row the job arguments are the ``mTrait`` column named by
    ``phe_name``, the whole ``pTrait`` object, the ``geno`` column named by
    ``SNP`` and ``pvalue_cutoff``.  The per-row results returned by
    ``mp.parallel`` are concatenated with ``pd.concat``.
    """

    def job_for(row):
        snp = row['SNP']
        trait = row['phe_name']
        return (mTrait.loc[:, trait], pTrait, geno.loc[:, snp],
                pvalue_cutoff)

    jobs = [job_for(row) for _, row in mTrait_qtl.iterrows()]
    results = mp.parallel(MR, jobs, threads)
    return pd.concat(list(results))
Пример #7
0
def MR_MLM_parallel(mTrait_qtl, mTrait_effect, pTrait_effect, pTrait_se,
                    threads, pvalue_cutoff):
    """Run ``MR_MLM`` for every row of ``mTrait_qtl`` and stack the results.

    For each row the job arguments are the ``mTrait_effect`` row labelled
    ``'<phe_name>;<SNP>'``, the ``pTrait_effect`` and ``pTrait_se`` rows
    labelled by the SNP, and ``pvalue_cutoff``.  The per-row results returned
    by ``mp.parallel`` are concatenated with ``pd.concat``.
    """

    def job_for(row):
        trait = row['phe_name']
        snp = row['SNP']
        effect_key = ';'.join([trait, snp])
        return (mTrait_effect.loc[effect_key, :], pTrait_effect.loc[snp, :],
                pTrait_se.loc[snp, :], pvalue_cutoff)

    jobs = [job_for(row) for _, row in mTrait_qtl.iterrows()]
    results = mp.parallel(MR_MLM, jobs, threads)
    return pd.concat(list(results))
Пример #8
0
def genome_cluster(G, window, step, threads):
    """Run ``chr_cluster_pca`` per chromosome and rescale the combined result.

    ``G`` is split by the unique values of ``G.chrom``; each subset is passed
    to ``chr_cluster_pca`` together with the chromosome, ``window`` and
    ``step`` via ``mp.parallel``.  ``threads`` is capped at the number of
    chromosomes.  The per-chromosome results are concatenated column-wise,
    min-max scaled to the range ``(0, 2)`` and rounded to three decimals.

    Returns:
        The scaled, rounded DataFrame.
    """
    chroms = np.unique(G.chrom)
    # More workers than chromosomes would sit idle.
    threads = min(threads, chroms.shape[0])
    paras = [(G.where(G.chrom == chrom, drop=True), chrom, window, step)
             for chrom in chroms]
    res_pc = pd.concat(mp.parallel(chr_cluster_pca, paras, threads), axis=1)
    scaled = MinMaxScaler(feature_range=(0, 2)).fit_transform(res_pc.values)
    res_pc.loc[:, :] = np.around(scaled, decimals=3)
    return res_pc
Пример #9
0
def qtl_pc_lmm_gwas_parallel(omics_phe, bimbam_dir, threads, geno, sample_id):
    """Run a GEMMA LMM per phenotype column on BIMBAM-format genotypes.

    A temporary ``<prefix>.link.{bed,bim,fam}`` fileset (symlinked
    ``.bed``/``.bim``, ``.fam`` with column 5 set to 1) is created in the
    current directory for the relatedness-matrix command.  ``omics_phe`` is
    reindexed to the order of fam column 0 and written to ``bimbam_phe.txt``
    in the current directory (this file is not removed).  If the
    relatedness-matrix command succeeds, one ``-lmm`` command per column is
    dispatched through ``mp.parallel``; it reads
    ``<bimbam_dir>/tmp_<name>.geno.txt`` and ``.anno.txt``, uses
    ``-n <column position + 1>`` and writes to ``<name>_bimbam``, where
    ``name`` is the column name with ``'m/z'`` replaced by ``'m.z'``.
    The temporary fileset is removed on every exit path.

    Args:
        omics_phe: DataFrame indexed by the ids found in fam column 0.
        bimbam_dir: directory holding the per-phenotype BIMBAM files.
        threads: forwarded to ``mp.parallel``.
        geno: plink binary fileset prefix.
        sample_id: unused; kept for interface compatibility.

    Returns:
        ``None`` if the relatedness-matrix command returns non-zero,
        otherwise whatever ``mp.parallel`` returns.
    """
    geno_prefix = geno.split('/')[-1]
    link_prefix = geno_prefix + '.link'
    # rstrip, not strip: a leading '/' of an absolute path must be preserved.
    bimbam_root = bimbam_dir.rstrip('/')
    related_matrix_cmd = 'gemma.linux -bfile {0}.link -gk 1 -o {1}'.format(
        geno_prefix, geno_prefix)
    gemma_cmd = 'gemma.linux -g {0} -a {1} -p bimbam_phe.txt -k ./output/{2}.cXX.txt -lmm -n {3} -o {4}'
    try:
        fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
        fam[5] = 1
        fam.to_csv(link_prefix + '.fam',
                   sep='\t',
                   na_rep='NA',
                   header=False,
                   index=False)
        omics_phe = omics_phe.reindex(fam[0].values)
        omics_phe.to_csv('bimbam_phe.txt',
                         sep='\t',
                         index=False,
                         header=False,
                         na_rep='NA')
        for ext in ('.bed', '.bim'):
            # lexists also catches dangling symlinks from an earlier run.
            if os.path.lexists(link_prefix + ext):
                os.remove(link_prefix + ext)
            os.symlink(geno + ext, link_prefix + ext)
        if mp.run(related_matrix_cmd) != 0:
            return None
        qtl_pc_lmm_args = []
        for col_idx, m in enumerate(omics_phe.columns):
            m = m.replace('m/z', 'm.z')
            file_stem = bimbam_root + '/tmp_' + m
            # bimbam_phe.txt has no id columns, so -n is 1-based position.
            qtl_pc_lmm_args.append((gemma_cmd.format(
                file_stem + '.geno.txt', file_stem + '.anno.txt', geno_prefix,
                col_idx + 1, m + '_bimbam'), ))
        return mp.parallel(mp.run, qtl_pc_lmm_args, threads)
    finally:
        # Clean up even when the kinship step fails or a job raises.
        for ext in ('.bed', '.bim', '.fam'):
            if os.path.lexists(link_prefix + ext):
                os.remove(link_prefix + ext)
Пример #10
0
def qtl_pc_lm_gwas_parallel(omics_phe, bimbam_dir, threads, geno):
    """Run a GEMMA linear model per phenotype column on BIMBAM genotypes.

    Each column of ``omics_phe`` is written, without index or header, to
    ``<bimbam_dir>/<name>_phe.txt`` (``name`` is the column name with
    ``'m/z'`` replaced by ``'m.z'``).  One ``-lm`` command per column is
    dispatched through ``mp.parallel``; all commands share the genotype and
    annotation files ``<bimbam_dir>/<prefix>_qtl_pc.geno.txt`` /
    ``.anno.txt``, where ``prefix`` is the last path component of ``geno``,
    and write to ``<name>_bimbam_lm``.

    NOTE(review): phenotype rows are written in ``omics_phe``'s own order;
    this presumably has to match the sample order of the genotype file --
    confirm against the caller.

    Returns:
        Whatever ``mp.parallel`` returns.
    """
    geno_prefix = geno.split('/')[-1]
    # rstrip, not strip: a leading '/' of an absolute path must be preserved.
    bimbam_root = bimbam_dir.rstrip('/')
    gemma_cmd = 'gemma.linux -g {0} -a {1} -p {2} -lm  -o {3}'
    geno_txt = bimbam_root + '/' + geno_prefix + '_qtl_pc.geno.txt'
    anno_txt = bimbam_root + '/' + geno_prefix + '_qtl_pc.anno.txt'
    qtl_pc_lm_args = []
    for m in omics_phe.columns:
        phe = omics_phe[m].to_frame()
        m = m.replace('m/z', 'm.z')
        phe_txt = bimbam_root + '/' + m + '_phe.txt'
        phe.to_csv(phe_txt, index=False, header=False, na_rep='NA')
        qtl_pc_lm_args.append((gemma_cmd.format(geno_txt, anno_txt, phe_txt,
                                                m + '_bimbam_lm'), ))
    return mp.parallel(mp.run, qtl_pc_lm_args, threads)
Пример #11
0
def plink_clump(geno_path, p1, p2, num_threads):
    """Run ``plink --clump`` on every file in ``./clump_input``.

    ``./clump_result`` is removed if present and recreated.  For each input
    file the phenotype name is its basename with ``'.assoc'`` removed; the
    command uses the fileset ``<geno_path>/<name>``, the thresholds ``p1`` /
    ``p2``, a 500 kb window and r2 0.2, and writes to
    ``./clump_result/<name>_500``.  Commands are dispatched through
    ``mp.parallel``; the names of jobs with a non-zero status are printed.

    Returns:
        The list of statuses returned by ``mp.parallel``.
    """
    if os.path.exists('./clump_result'):
        shutil.rmtree('./clump_result')
    os.mkdir('./clump_result')
    cmd = 'plink --bfile {0} --clump {1}  --clump-p1 {2} --clump-p2 {3} --clump-kb {4} --clump-r2 0.2 --out {5}'
    cmds = []
    ms = []
    for fn in glob.glob('./clump_input/*'):
        phe_name = fn.split('/')[-1].replace('.assoc', '')
        cmds.append(
            (cmd.format(geno_path + '/' + phe_name, fn, p1, p2, str(500),
                        './clump_result/' + phe_name + '_' + str(500)), ))
        ms.append(phe_name)
    s = mp.parallel(mp.run, cmds, num_threads)
    # Select by "status is non-zero". Indexing the names with the raw status
    # codes would pick elements by position and report the wrong phenotypes.
    failed = [name for name, status in zip(ms, s) if status != 0]
    if failed:
        print(','.join(failed) +
              ' do not  successfully generated clumped file.')
    return s
Пример #12
0
def get_MLM_effect_parallell(assoc_dir, threads):
    """Collect effect sizes and standard errors from association files.

    Every ``mTrait*.assoc.txt`` file in ``assoc_dir`` is read as a
    tab-separated table; its ``beta`` and ``se`` columns are kept and indexed
    by ``'<trait name>;<rs>'``.  Every ``pTrait*assoc.txt`` file is handed to
    ``get_MLM_effect`` through ``mp.parallel``; the ``beta`` and ``se``
    entries of those results are concatenated column-wise and labelled with
    the trait names.

    Returns:
        Tuple ``(mTrait_effect, pTrait_effect, pTrait_se)``.
        ``mTrait_effect`` is an empty DataFrame when no mTrait file exists.
    """
    # rstrip, not strip: a leading '/' of an absolute path must be preserved.
    assoc_root = assoc_dir.rstrip('/')

    def trait_name(path, tag):
        # Take everything after '<tag>_' so that trait names containing
        # underscores are not truncated to their last token.
        base = path.split('/')[-1]
        lead = tag + '_'
        if base.startswith(lead) and base.endswith('.assoc.txt'):
            return base[len(lead):-len('.assoc.txt')]
        # Unexpected file name layout: keep the historical parsing.
        return base.split('_')[-1].replace('.assoc.txt', '')

    frames = []
    for fn in glob.glob(assoc_root + '/mTrait*.assoc.txt'):
        mTrait_name = trait_name(fn, 'mTrait')
        assoc = pd.read_csv(fn, sep='\t')
        assoc.index = mTrait_name + ';' + assoc['rs']
        frames.append(assoc[['beta', 'se']])
    # Concatenate once instead of growing a DataFrame inside the loop.
    mTrait_effect = pd.concat(frames) if frames else pd.DataFrame()

    args = []
    pTrait_name = []
    for fn in glob.glob(assoc_root + '/pTrait*assoc.txt'):
        pTrait_name.append(trait_name(fn, 'pTrait'))
        args.append((fn, ))
    pTrait_res = mp.parallel(get_MLM_effect, args, threads)
    pTrait_effect = pd.concat([i['beta'] for i in pTrait_res], axis=1)
    pTrait_effect.columns = pTrait_name
    pTrait_se = pd.concat([i['se'] for i in pTrait_res], axis=1)
    pTrait_se.columns = pTrait_name
    return mTrait_effect, pTrait_effect, pTrait_se
Пример #13
0
def calc_MLM_effect(bed_dir, pTrait, threads, geno):
    """Run GEMMA LMMs on the per-trait filesets found in ``bed_dir``.

    A temporary ``<prefix>.link.{bed,bim,fam}`` fileset (symlinked
    ``.bed``/``.bim``, ``.fam`` with column 5 set to 1) is created in the
    current directory for the relatedness-matrix command.  If that succeeds,
    commands are queued per ``*.bed`` file in ``bed_dir``: the fileset named
    ``pTrait`` gets one ``-lmm`` command per column of ``pTrait`` (using
    ``-n <column position + 2>`` and output ``pTrait_<column>``); every other
    fileset gets a single ``-lmm -n 1`` command with output
    ``mTrait_<basename>``.  All commands go through ``mp.parallel``.
    The temporary fileset is removed on every exit path.

    Args:
        bed_dir: directory containing the extracted plink filesets.
        pTrait: DataFrame whose column names label the pTrait outputs.
        threads: forwarded to ``mp.parallel``.
        geno: plink binary fileset prefix of the full genotype data.

    Returns:
        ``None`` if the relatedness-matrix command returns non-zero,
        otherwise whatever ``mp.parallel`` returns.
    """
    geno_prefix = geno.split('/')[-1]
    link_prefix = geno_prefix + '.link'
    related_matrix_cmd = 'gemma.linux -bfile {0}.link -gk 1 -o {1}'.format(
        geno_prefix, geno_prefix)
    gemma_cmd_mTrait = 'gemma.linux -bfile {0} -k ./output/{1}.cXX.txt -lmm -n 1 -o {2}'
    gemma_cmd_pTrait = 'gemma.linux -bfile {0} -k ./output/{1}.cXX.txt -lmm -n {2} -o {3}'
    try:
        fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
        fam[5] = 1
        fam.to_csv(link_prefix + '.fam',
                   sep='\t',
                   na_rep='NA',
                   header=False,
                   index=False)
        for ext in ('.bed', '.bim'):
            # lexists also catches dangling symlinks from an earlier run.
            if os.path.lexists(link_prefix + ext):
                os.remove(link_prefix + ext)
            os.symlink(geno + ext, link_prefix + ext)
        if mp.run(related_matrix_cmd) != 0:
            return None
        args = []
        for bed_path in glob.glob(bed_dir + '/*.bed'):
            # Drop only the trailing extension; a global replace would also
            # mangle a directory name that happens to contain '.bed'.
            bfile = bed_path[:-len('.bed')]
            name = bfile.split('/')[-1]
            if name != 'pTrait':
                args.append((gemma_cmd_mTrait.format(bfile, geno_prefix,
                                                     'mTrait_' + name), ))
                continue
            for col_idx, pTrait_name in enumerate(pTrait.columns):
                # -n 1 is the fam phenotype column; extras start at 2.
                args.append(
                    (gemma_cmd_pTrait.format(bfile, geno_prefix, col_idx + 2,
                                             'pTrait_' + pTrait_name), ))
        return mp.parallel(mp.run, args, threads)
    finally:
        # Clean up even when the kinship step fails or a job raises.
        for ext in ('.bed', '.bim', '.fam'):
            if os.path.lexists(link_prefix + ext):
                os.remove(link_prefix + ext)