Exemplo n.º 1
0
def main():
    """Run the OM-RGC SNP pipeline stages in order.

    1. ``getvariants`` on every ``*.vcf`` in the input directory.
    2. ``analyze_genes`` on every ``*.df`` in the gene DataFrame directory.
    3. ``do_one_group_pnps`` for every gene group of each collate database.
    4. ``do_collate`` for the KEGG and eggNOG pN/pS outputs.

    Stages 1-3 skip any item whose expected output file(s) already exist.
    """
    cfg = SNP.OM_RGC

    # Filter VCF files for high quality SNPs and save them in DataFrame
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # estimated total CPU time for this part ~1,000 hours (Intel(R) Xeon(R) CPU E5-2690 v3)
    for vcf_path in glob(join(cfg.InputDir, '*.vcf')):
        gene_df_path = join(cfg.GeneDFDir,
                            basename(vcf_path).replace('.vcf', '.df'))
        if not exists(gene_df_path):
            getvariants(vcf_path,
                        cfg.GeneDFDir,
                        only_snps=cfg.OnlySNPs,
                        qual_thresh=cfg.QualThresh,
                        min_samples=cfg.MinSamples,
                        min_variants=cfg.MinVariants)

    # Calculate selection metrics per gene (e.g. pN/pS)
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # estimated total CPU time for this part >5,000 hours
    metric_exts = ['.pnps.df', '.ffdeg_pi_wit.df', '.pnpn.df']
    for gene_df_path in glob(join(cfg.GeneDFDir, '*.df')):
        outdir = cfg.OutDir
        expected = [join(outdir, basename(gene_df_path).replace('.df', ext))
                    for ext in metric_exts]
        # Only skip when all three metric files are already present.
        if all(exists(path) for path in expected):
            continue
        # NOTE(review): min_perc_poss receives MinPosReads, the same value as
        # min_pos_reads. This looks like a copy-paste slip; confirm which
        # config attribute is intended before changing it.
        analyze_genes(gene_df_path,
                      Calling.OM_RGC.DbFasta,
                      outdir,
                      cfg.CacheDir,
                      min_pos_reads=cfg.MinPosReads,
                      min_perc_poss=cfg.MinPosReads,
                      min_total_var_support=cfg.MinTotalVarSupport,
                      min_maf=cfg.MinMaf,
                      min_samples=cfg.MinSamples,
                      min_variants=cfg.MinVariants)

    # Collate selection metrics per KEGG KO / eggNOG OG. Requires running eggnogMapper on OM-RGC
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # Note: this calculates only pN/pS per KEGG KO / eggNOG OG. To calculate fourfold degenerate
    # pi within (validation) or pN(conservative AA substitutions) vs pN(radical AA substitutions)
    # (also validation), refer to the relevant methods within SNP/CollatedGeneGroups.py
    pnps_dir = join(cfg.OutDirCollate, 'pnps')
    for db in cfg.GeneGroupCollateDBs:
        groups = filter_db(Utils.Load(db), cfg, cfg.MinGenes)
        db_name = split3way(db)[1]
        for nm, genes in groups.items():
            group_out = join(pnps_dir, db_name + '_' + nm + '.pnps.df')
            if exists(group_out):
                continue
            do_one_group_pnps(nm, db_name, genes, cfg, cfg.MinGenes)

    # This groups all of the files and saves them in the output folder defined as General.Basepath
    for db_label in ('KEGG', 'eggNOG'):
        do_collate(join(cfg.OutDirCollate, 'pnps', db_label), 4, 60, 5, 50, 5)
Exemplo n.º 2
0
def eggnog_map_one(f, cpu=8):
    """Run eggNOG-mapper (diamond mode) on a single input file.

    The two eggNOG database files are first made available under /dev/shm,
    then the mapper is launched through ``shell_command`` with /dev/shm as
    its data directory.

    Args:
        f: Path of the input file passed to the mapper via ``-i``.
        cpu: Value passed to the mapper's ``--cpu`` option.
    """
    # Upper bound (seconds) on waiting for a copy that is still in progress,
    # and how often to re-check while waiting.
    max_wait = 200
    poll_every = 10

    data_dir = join(Annotate.OM_RGC.EmapperDir, 'data')
    for db_f in ['eggnog.db', 'eggnog_proteins.dmnd']:
        src = join(data_dir, db_f)
        dst = join('/dev/shm', db_f)
        if not exists(dst):
            copy(src, dst)
            continue
        # The file exists but may still be incomplete (presumably another job
        # is copying it). Poll until the sizes agree instead of sleeping for
        # one fixed period: this returns as soon as the copy is done and never
        # waits longer than the original fixed 200 s.
        expected_size = os.stat(src).st_size
        waited = 0
        while waited < max_wait and os.stat(dst).st_size != expected_size:
            sleep(poll_every)
            waited += poll_every

    out_name = split3way(f)[1]
    tmpdir = join(Annotate.OM_RGC.AnnotDir, 'tmp', out_name)
    # Start from a fresh per-input temp directory.
    rmrf(tmpdir)
    chdirmkifnotexist(tmpdir)
    shell_command("{} {} -i {} --output {} -m diamond --cpu {} --override --data_dir /dev/shm --temp_dir {} --no_file_comment"\
                  .format(Annotate.OM_RGC.Py27, Annotate.OM_RGC.EmapperPy, f,
                          join(Annotate.OM_RGC.AnnotDir, out_name), cpu,
                          tmpdir), verbose=True)
def _collate_pnpn_inner(fname, mingenes, minsamples, minsamples_gene, outdir):
    """Collate G1/G2 rates for one pickled pnpn DataFrame and pickle the results.

    The group name is the part after ':' in the file's base name (with
    '.pnpn' removed). The frame is processed per value of index level 1.
    Within each such slice, rows and columns are filtered, values are summed
    per index level 2 and divided by the summed 'GeneSites' column, and the
    'G1' rate, 'G2' rate and G1/G2 ratio are recorded per column.

    Args:
        fname: Path of the pickled DataFrame to read.
        mingenes: Minimum number of level-0 entries (with more than one
            non-null row) a column needs to be kept.
        minsamples: A slice is skipped unless it keeps more than this many
            columns (counted before 'GeneSites' is dropped).
        minsamples_gene: Minimum number of columns (with more than one
            non-null row) a level-0 entry needs to be kept.
        outdir: Directory receiving '<group>.tmp.df', '<group>.tmp.g1.df'
            and '<group>.tmp.g2.df'; a file is written only when its frame
            is not of shape (0, 0).
    """
    stem = split3way(fname)[1]
    grpname = stem.replace('.pnpn', '').split(':')[1]

    ratios = defaultdict(dict)
    g1_rates = defaultdict(dict)
    g2_rates = defaultdict(dict)

    for nm, ldf in read_pickle(fname).groupby(level=1):
        key = (nm, grpname)

        # Row filter: keep level-0 entries that have more than one non-null
        # row in at least `minsamples_gene` columns.
        cols_per_entry = (ldf.groupby(level=0).count() > 1).sum(1)
        passing_entries = (cols_per_entry >= minsamples_gene)\
            .replace(False, np.nan).dropna().index
        ldf = ldf.loc[ldf.index.get_level_values(0).isin(passing_entries)]

        # Column filter: keep columns for which at least `mingenes` level-0
        # entries have more than one non-null row.
        entries_per_col = (ldf.groupby(level=0).count() > 1).sum(0)
        ldf = ldf.loc[:, entries_per_col >= mingenes]
        if ldf.shape[1] <= minsamples:
            continue

        gs = ldf[['GeneSites']]
        ldf = ldf.drop('GeneSites', axis=1)
        for col in ldf.columns:
            # Sum counts and sites per level-2 label, then normalise.
            summed = ldf[[col]].join(gs).dropna().groupby(level=2).sum()
            rates = summed[col].truediv(summed['GeneSites'])
            g1_rates[key][col] = rates['G1']
            g2_rates[key][col] = rates['G2']
            if rates['G2'] != 0:
                ratios[key][col] = rates['G1'] / rates['G2']
            else:
                ratios[key][col] = np.nan

    outputs = (
        (DataFrame(ratios), '.tmp.df'),
        (DataFrame(g1_rates), '.tmp.g1.df'),
        (DataFrame(g2_rates), '.tmp.g2.df'),
    )
    for frame, suffix in outputs:
        if frame.shape != (0, 0):
            frame.to_pickle(join(outdir, grpname + suffix))
Exemplo n.º 4
0
def split_database(db_in, dirout, chunk_size):
    """Split a FASTA file into sequentially numbered chunk files.

    Records are written in input order to ``<name>_0000.fa``,
    ``<name>_0001.fa``, ... inside ``dirout``, where ``<name>`` is
    ``split3way(db_in)[1]``. A new chunk is started whenever adding the next
    record would push the current chunk's total record length above
    ``chunk_size``. A single record longer than ``chunk_size`` gets a chunk
    of its own.

    Args:
        db_in: Path of the FASTA file to split.
        dirout: Directory in which the chunk files are created.
        chunk_size: Maximum summed ``len(record)`` per chunk.
    """
    stem = split3way(db_in)[1]

    def _chunk_path(idx):
        # Path of the chunk file with the given running number.
        return join(dirout, '{}_{:04d}.fa'.format(stem, idx))

    chunknum = 0
    chunk_len = 0  # summed length of the records already in the open chunk
    with open(db_in) as fin:
        fout = open(_chunk_path(chunknum), 'w')
        try:
            for rec in SeqIO.parse(fin, 'fasta'):
                rec_len = len(rec)
                # Roll over only when the open chunk already holds data, so an
                # oversized first record does not leave an empty chunk behind.
                if chunk_len > 0 and chunk_len + rec_len > chunk_size:
                    fout.close()
                    chunknum += 1
                    chunk_len = 0
                    fout = open(_chunk_path(chunknum), 'w')
                SeqIO.write(rec, fout, 'fasta')
                # Count the record against the chunk it was written to.
                chunk_len += rec_len
        finally:
            # Close the current chunk even if parsing or writing fails.
            fout.close()
def annotate():
    """Translate, split and annotate the OM-RGC index, then parse the results.

    Chunks that already have a matching '.emapper.annotations' file in the
    annotation directory are not re-run.
    """
    cfg = Annotate.OM_RGC
    translate_db(cfg.IndexFasta, cfg.IndexFaa)
    # Split the OM-RGC database to chunks to reduce runtime
    split_database(cfg.IndexFaa, cfg.SplitDir, 5e7)
    # Run eggnog mapper on each one of the chunks
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for chunk_fa in glob(join(cfg.SplitDir, '*.fa')):
        annotations = join(cfg.AnnotDir,
                           split3way(chunk_fa)[1]) + '.emapper.annotations'
        if not exists(annotations):
            eggnog_map_one(chunk_fa, 8)
    # Parse the resulting annotation database and create a dictionary file for each annotation
    # This is then used in the collate stage (SNP pipeline)
    parseeggnog()
def do_collate(f_prefixes, minpos, minperc, mingenes, minsamples, minsamples_gene):
    """Collate per-group pN/pS pickles into three CSV files.

    Every file matching ``f_prefixes + '*.pnps.df'`` is read and filtered.
    Its values are summed per index level 1 and divided by the summed
    'GeneSites' column. For each column the 'NS' rate, the 'S' rate and
    NS/S are recorded under the file's group name. The three tables are
    written to ``General.Basepath`` as ``<stem>.csv`` (NS/S),
    ``<stem>.pn.csv`` (NS) and ``<stem>.ps.csv`` (S), where ``<stem>`` joins
    the last '/'-component of ``f_prefixes`` and the five thresholds with
    underscores.

    Args:
        f_prefixes: Path prefix of the input files; also names the outputs.
        minpos: Used only in the output file names.
        minperc: Used only in the output file names.
        mingenes: Minimum number of level-0 entries (with more than one
            non-null row) a column needs to be kept.
        minsamples: A file is skipped unless it keeps more than this many
            columns (counted before 'GeneSites' is dropped).
        minsamples_gene: Minimum number of columns (with more than one
            non-null row) a level-0 entry needs to be kept.
    """
    ret = defaultdict(dict)
    ps = defaultdict(dict)
    pn = defaultdict(dict)
    for fname in glob(f_prefixes + '*.pnps.df'):
        grpname = split3way(fname)[1].replace('.pnps','')
        df = read_pickle(fname)
        # Row filter: keep level-0 entries that have more than one non-null
        # row in at least `minsamples_gene` columns.
        keepinds = df.index.get_level_values(0).isin(\
                ((df.groupby(level=0).count() > 1).sum(1) >= minsamples_gene)\
                .replace(False, np.nan).dropna().index)
        df = df.loc[keepinds]
        # Column filter: keep columns for which at least `mingenes` level-0
        # entries have more than one non-null row.
        df = df.loc[:,(df.groupby(level=0).count() > 1).sum(0) >= mingenes]
        if df.shape[1] <= minsamples:
            continue
        gs = df[['GeneSites']]
        df = df.drop('GeneSites', axis = 1)
        for col in df.columns:
            # Sum counts and sites per level-1 label ('NS' / 'S'), then
            # normalise by the number of sites.
            coldf = df[[col]].join(gs).dropna()
            coldf = coldf.groupby(level=1).sum()
            coldf = coldf[col].truediv(coldf['GeneSites'])
            ret[grpname][col] = coldf.NS/coldf.S
            pn[grpname][col] = coldf.NS
            ps[grpname][col] = coldf.S
        print(grpname)

    # All three tables share one file stem and one output directory.
    # The pS table was previously written via General.Basepath.OutDirCollate,
    # although Basepath is used as a plain path for the other two outputs;
    # it now goes to General.Basepath like them.
    stem = '{}_{}_{}_{}_{}_{}'.format(f_prefixes.split('/')[-1], minpos,
                                      minperc, mingenes, minsamples,
                                      minsamples_gene)
    DataFrame(ret).to_csv(join(General.Basepath, stem + '.csv'))
    DataFrame(pn).to_csv(join(General.Basepath, stem + '.pn.csv'))
    DataFrame(ps).to_csv(join(General.Basepath, stem + '.ps.csv'))