def main():
    """Run the OM-RGC SNP pipeline: filter VCFs, compute per-gene selection
    metrics (e.g. pN/pS), then collate them per KEGG KO / eggNOG OG."""
    # Filter VCF files for high quality SNPs and save them in DataFrame
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # estimated total CPU time for this part ~1,000 hours (Intel(R) Xeon(R) CPU E5-2690 v3)
    for fname in glob(join(SNP.OM_RGC.InputDir, '*.vcf')):
        # Skip inputs whose per-gene DataFrame already exists (resumable runs)
        if exists(join(SNP.OM_RGC.GeneDFDir,
                       basename(fname).replace('.vcf', '.df'))):
            continue
        getvariants(fname, SNP.OM_RGC.GeneDFDir,
                    only_snps=SNP.OM_RGC.OnlySNPs,
                    qual_thresh=SNP.OM_RGC.QualThresh,
                    min_samples=SNP.OM_RGC.MinSamples,
                    min_variants=SNP.OM_RGC.MinVariants)
    # Calculate selection metrics per gene (e.g. pN/pS)
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # estimated total CPU time for this part >5,000 hours
    for fname in glob(join(SNP.OM_RGC.GeneDFDir, '*.df')):
        outdir = SNP.OM_RGC.OutDir
        # Skip genes for which all three metric outputs already exist
        if all([exists(join(outdir, basename(fname).replace('.df', ext)))
                for ext in ['.pnps.df', '.ffdeg_pi_wit.df', '.pnpn.df']]):
            continue
        # NOTE(review): min_perc_poss is given SNP.OM_RGC.MinPosReads — the
        # same setting as min_pos_reads. This looks like a copy-paste slip;
        # confirm whether SNP.OM_RGC defines a MinPercPoss that belongs here.
        analyze_genes(fname, Calling.OM_RGC.DbFasta, outdir,
                      SNP.OM_RGC.CacheDir,
                      min_pos_reads=SNP.OM_RGC.MinPosReads,
                      min_perc_poss=SNP.OM_RGC.MinPosReads,
                      min_total_var_support=SNP.OM_RGC.MinTotalVarSupport,
                      min_maf=SNP.OM_RGC.MinMaf,
                      min_samples=SNP.OM_RGC.MinSamples,
                      min_variants=SNP.OM_RGC.MinVariants)
    # Collate selection metrics per KEGG KO / eggNOG OG. Requires running
    # eggnogMapper on OM-RGC.
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # Note: this calculates only pN/pS per KEGG KO / eggNOG OG. To calculate
    # fourfold degenerate pi within (validation) or pN(conservative AA
    # substitutions) vs pN(radical AA substitutions) (also validation), refer
    # to the relevant methods within SNP/CollatedGeneGroups.py
    for db in SNP.OM_RGC.GeneGroupCollateDBs:
        dbdct = Utils.Load(db)
        dbdct = filter_db(dbdct, SNP.OM_RGC, SNP.OM_RGC.MinGenes)
        for nm, genes in dbdct.items():
            if not exists(join(SNP.OM_RGC.OutDirCollate, 'pnps',
                               split3way(db)[1] + '_' + nm + '.pnps.df')):
                do_one_group_pnps(nm, split3way(db)[1], genes, SNP.OM_RGC,
                                  SNP.OM_RGC.MinGenes)
    # This groups all of the files and saves them in the output folder
    # defined as General.Basepath
    do_collate(join(SNP.OM_RGC.OutDirCollate, 'pnps', 'KEGG'), 4, 60, 5, 50, 5)
    do_collate(join(SNP.OM_RGC.OutDirCollate, 'pnps', 'eggNOG'), 4, 60, 5, 50, 5)
def eggnog_map_one(f, cpu=8):
    """Run eggnog-mapper (diamond mode) on one protein fasta chunk *f*.

    Stages the eggNOG databases into /dev/shm once per node so that
    concurrent jobs share a single in-memory copy, then shells out to
    emapper.py under the project's Python 2.7 interpreter.

    :param f: path to the input .fa chunk
    :param cpu: number of CPUs passed to emapper's --cpu flag
    """
    for db_f in ['eggnog.db', 'eggnog_proteins.dmnd']:
        if not exists(join('/dev/shm', db_f)):
            copy(join(Annotate.OM_RGC.EmapperDir, 'data', db_f),
                 join('/dev/shm', db_f))
        elif os.stat(join('/dev/shm', db_f)).st_size != os.stat(
                join(Annotate.OM_RGC.EmapperDir, 'data', db_f)).st_size:
            # Size mismatch means another job is presumably mid-copy;
            # wait for file to copy
            sleep(200)
    tmpdir = join(Annotate.OM_RGC.AnnotDir, 'tmp', split3way(f)[1])
    # Start from a clean per-chunk temp dir and cd into it
    rmrf(tmpdir)
    chdirmkifnotexist(tmpdir)
    shell_command(
        "{} {} -i {} --output {} -m diamond --cpu {} --override "
        "--data_dir /dev/shm --temp_dir {} --no_file_comment".format(
            Annotate.OM_RGC.Py27, Annotate.OM_RGC.EmapperPy, f,
            join(Annotate.OM_RGC.AnnotDir, split3way(f)[1]), cpu, tmpdir),
        verbose=True)
def _collate_pnpn_inner(fname, mingenes, minsamples, minsamples_gene, outdir):
    """Collate per-gene pN/pN tables for one gene group into per-sample ratios.

    For every AA-substitution scheme (index level 1) the genes (level 0) are
    filtered so each retained gene has >= minsamples_gene samples with data
    and each retained sample covers >= mingenes genes; the per-sample
    G1/G2 ratio (conservative vs radical substitutions, normalized by
    GeneSites) is then written to .tmp*.df pickles in *outdir*.

    :param fname: path to the group's .pnpn.df pickle; the group name is
        taken from the part of the basename after ':'
    :param mingenes: min genes with data a sample must cover to be kept
    :param minsamples: groups with <= this many samples are skipped
    :param minsamples_gene: min samples with data a gene must have
    :param outdir: output directory for the .tmp.df / .tmp.g1.df / .tmp.g2.df
    """
    grpname = split3way(fname)[1].replace('.pnpn', '').split(':')[1]
    ret = defaultdict(dict)
    ret_g1 = defaultdict(dict)
    ret_g2 = defaultdict(dict)
    df = read_pickle(fname)
    for nm, ldf in df.groupby(level=1):
        # Keep genes (level 0) that have data in >= minsamples_gene samples
        keepinds = ldf.index.get_level_values(0).isin(
            ((ldf.groupby(level=0).count() > 1).sum(1) >= minsamples_gene)
            .replace(False, np.nan).dropna().index)
        ldf = ldf.loc[keepinds]
        # Keep samples (columns) that have data for >= mingenes genes
        ldf = ldf.loc[:, (ldf.groupby(level=0).count() > 1).sum(0) >= mingenes]
        if ldf.shape[1] <= minsamples:
            continue
        gs = ldf[['GeneSites']]
        ldf = ldf.drop('GeneSites', axis=1)
        for col in ldf.columns:
            coldf = ldf[[col]].join(gs).dropna()
            # Sum per G1/G2 (level 2) and normalize by the site counts
            coldf = coldf.groupby(level=2).sum()
            coldf = coldf[col].truediv(coldf['GeneSites'])
            ret_g1[(nm, grpname)][col] = coldf['G1']
            ret_g2[(nm, grpname)][col] = coldf['G2']
            # Guard against division by zero when no G2 substitutions exist
            ret[(nm, grpname)][col] = (coldf['G1'] / coldf['G2']) \
                if coldf['G2'] != 0 else np.nan
    outdf = DataFrame(ret)
    outdf_g1 = DataFrame(ret_g1)
    outdf_g2 = DataFrame(ret_g2)
    # Only write outputs that are non-empty
    if outdf.shape != (0, 0):
        outdf.to_pickle(join(outdir, grpname + '.tmp.df'))
    if outdf_g1.shape != (0, 0):
        outdf_g1.to_pickle(join(outdir, grpname + '.tmp.g1.df'))
    if outdf_g2.shape != (0, 0):
        outdf_g2.to_pickle(join(outdir, grpname + '.tmp.g2.df'))
def split_database(db_in, dirout, chunk_size):
    """Split a fasta database into numbered chunk files of ~chunk_size residues.

    Records are never split across files: a new chunk is started once the
    running residue total exceeds *chunk_size*, so each chunk may exceed the
    limit by up to one record. Output files are named
    ``<basename>_<chunknum:04d>.fa`` inside *dirout*.

    :param db_in: path to the input fasta database
    :param dirout: directory for the chunk files
    :param chunk_size: soft per-chunk residue limit
    """
    with open(db_in) as fin:
        chunknum = 0
        fout = open(
            join(dirout, '{}_{:04d}.fa'.format(split3way(db_in)[1], chunknum)),
            'w')
        culchunk = 0  # cumulative residues written to the current chunk
        for rec in SeqIO.parse(fin, 'fasta'):
            culchunk += len(rec)
            if culchunk > chunk_size:
                # Current chunk is full - roll over to the next file before
                # writing this record
                fout.close()
                chunknum += 1
                culchunk = 0
                fout = open(
                    join(dirout, '{}_{:04d}.fa'.format(split3way(db_in)[1],
                                                       chunknum)), 'w')
            SeqIO.write(rec, fout, 'fasta')
        fout.close()
def annotate():
    """Annotate the OM-RGC database: translate, split, run eggnog-mapper on
    each chunk, and parse the results into per-annotation dictionaries."""
    translate_db(Annotate.OM_RGC.IndexFasta, Annotate.OM_RGC.IndexFaa)
    # Split the OM-RGC database to chunks to reduce runtime
    split_database(Annotate.OM_RGC.IndexFaa, Annotate.OM_RGC.SplitDir, 5e7)
    # Run eggnog mapper on each one of the chunks
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for fname in glob(join(Annotate.OM_RGC.SplitDir, '*.fa')):
        # Skip chunks that were already annotated (resumable runs)
        if exists(join(Annotate.OM_RGC.AnnotDir, split3way(fname)[1])
                  + '.emapper.annotations'):
            continue
        eggnog_map_one(fname, 8)
    # Parse the resulting annotation database and create a dictionary file for
    # each annotation. This is then used in the collate stage (SNP pipeline)
    parseeggnog()
def do_collate(f_prefixes, minpos, minperc, mingenes, minsamples,
               minsamples_gene):
    """Collate per-group pN/pS pickles matching *f_prefixes* into CSV tables.

    For each ``<f_prefixes>*.pnps.df`` file, genes (index level 0) and
    samples (columns) are filtered by coverage thresholds, per-sample NS and
    S rates (normalized by GeneSites) are computed, and three CSVs are
    written under General.Basepath: the NS/S ratio, the NS rates (.pn.csv)
    and the S rates (.ps.csv), with the thresholds embedded in the filename.

    :param f_prefixes: path prefix of the input .pnps.df files (also used to
        name the outputs)
    :param minpos: recorded in the output filename (threshold used upstream)
    :param minperc: recorded in the output filename (threshold used upstream)
    :param mingenes: min genes with data a sample must cover to be kept
    :param minsamples: groups with <= this many samples are skipped
    :param minsamples_gene: min samples with data a gene must have
    """
    ret = defaultdict(dict)
    ps = defaultdict(dict)
    pn = defaultdict(dict)
    for fname in glob(f_prefixes + '*.pnps.df'):
        grpname = split3way(fname)[1].replace('.pnps', '')
        df = read_pickle(fname)
        # Keep genes (level 0) that have data in >= minsamples_gene samples
        keepinds = df.index.get_level_values(0).isin(
            ((df.groupby(level=0).count() > 1).sum(1) >= minsamples_gene)
            .replace(False, np.nan).dropna().index)
        df = df.loc[keepinds]
        # Keep samples (columns) that have data for >= mingenes genes
        df = df.loc[:, (df.groupby(level=0).count() > 1).sum(0) >= mingenes]
        if df.shape[1] <= minsamples:
            continue
        gs = df[['GeneSites']]
        df = df.drop('GeneSites', axis=1)
        for col in df.columns:
            coldf = df[[col]].join(gs).dropna()
            # Sum per NS/S (level 1) and normalize by the site counts
            coldf = coldf.groupby(level=1).sum()
            coldf = coldf[col].truediv(coldf['GeneSites'])
            ret[grpname][col] = coldf.NS / coldf.S
            pn[grpname][col] = coldf.NS
            ps[grpname][col] = coldf.S
        print(grpname)
    df = DataFrame(ret)
    ps = DataFrame(ps)
    pn = DataFrame(pn)
    suffix = '{}_{}_{}_{}_{}_{}'.format(f_prefixes.split('/')[-1], minpos,
                                        minperc, mingenes, minsamples,
                                        minsamples_gene)
    df.to_csv(join(General.Basepath, suffix + '.csv'))
    pn.to_csv(join(General.Basepath, suffix + '.pn.csv'))
    # BUGFIX: was General.Basepath.OutDirCollate - Basepath is a path (str),
    # so the attribute access would raise; the sibling outputs above use
    # General.Basepath directly.
    ps.to_csv(join(General.Basepath, suffix + '.ps.csv'))