Пример #1
0
def plot_code_histograms(compiled_f, outdir):
    """Plot risk histograms for each metric, conditioned on every metric.

    For each (metric, conditioning metric) pair, the permuted-code risks are
    restricted to permutations whose conditioning metric does not exceed the
    standard code's value, then plotted with the standard code marked by a
    vertical line. An empirical p-value is printed for each pair.
    """
    compiled = Utils.Load(compiled_f)
    names = ['N_plus', 'C_plus', 'hyd', 'PR']
    arrays = [np.array(compiled[k])
              for k in ['n+_risk', 'c+_risk', 'hyd_risk', 'PR_risk']]
    colors = ['#0d4c7c', '#151515', '#018571', '#660099']
    for name, risks, color in zip(names, arrays, colors):
        # Index 0 holds the standard genetic code; the rest are permutations.
        standard = risks[0]
        for cond_name, cond_risks in zip(names, arrays):
            if name == cond_name:
                values = risks[1:]
            else:
                # Keep permutations whose conditioning metric is at or below
                # the standard code's value.
                values = risks[1:][cond_risks[1:] <= cond_risks[0]]
            _, ax = plt.subplots(1, figsize=(3.5, 2.333), dpi=144)
            ax.hist(values, color=color, bins=100, density=True)
            # Yellow underlay with a thinner black line on top marks the
            # standard code.
            ax.axvline(standard, color='yellow', lw=1)
            ax.axvline(standard, color='k', lw=0.6)
            print('{} given {} {} p={}'.format(
                name, cond_name, standard,
                sum(values <= standard) / len(values)))
            plt.savefig(join(
                outdir, 'Code_cost_million_hist_{}_{}.png'.format(
                    name, cond_name)),
                        dpi=144)
            plt.close('all')
Пример #2
0
def main():
    """Run the full SNP pipeline: filter VCFs, score genes, collate groups."""
    # Stage 1: filter VCF files for high-quality SNPs, saving one DataFrame
    # per input file; already-processed files are skipped.
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # estimated total CPU time for this part ~1,000 hours (Intel(R) Xeon(R) CPU E5-2690 v3)
    for fname in glob(join(SNP.OM_RGC.InputDir, '*.vcf')):
        df_out = join(SNP.OM_RGC.GeneDFDir,
                      basename(fname).replace('.vcf', '.df'))
        if exists(df_out):
            continue
        getvariants(fname,
                    SNP.OM_RGC.GeneDFDir,
                    only_snps=SNP.OM_RGC.OnlySNPs,
                    qual_thresh=SNP.OM_RGC.QualThresh,
                    min_samples=SNP.OM_RGC.MinSamples,
                    min_variants=SNP.OM_RGC.MinVariants)

    # Stage 2: per-gene selection metrics (e.g. pN/pS); skipped when all three
    # expected outputs already exist.
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # estimated total CPU time for this part >5,000 hours
    for fname in glob(join(SNP.OM_RGC.GeneDFDir, '*.df')):
        outdir = SNP.OM_RGC.OutDir
        expected = [join(outdir, basename(fname).replace('.df', ext))
                    for ext in ['.pnps.df', '.ffdeg_pi_wit.df', '.pnpn.df']]
        if all(exists(f) for f in expected):
            continue
        analyze_genes(fname,
                      Calling.OM_RGC.DbFasta,
                      outdir,
                      SNP.OM_RGC.CacheDir,
                      min_pos_reads=SNP.OM_RGC.MinPosReads,
                      # NOTE(review): min_perc_poss is fed MinPosReads, not a
                      # dedicated MinPercPoss setting -- looks like a
                      # copy-paste slip; confirm against the config class.
                      min_perc_poss=SNP.OM_RGC.MinPosReads,
                      min_total_var_support=SNP.OM_RGC.MinTotalVarSupport,
                      min_maf=SNP.OM_RGC.MinMaf,
                      min_samples=SNP.OM_RGC.MinSamples,
                      min_variants=SNP.OM_RGC.MinVariants)

    # Stage 3: collate selection metrics per KEGG KO / eggNOG OG. Requires
    # running eggnogMapper on OM-RGC.
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    # Note: this calculates only pN/pS per KEGG KO / eggNOG OG. To calculate fourfold degenerate
    # pi within (validation) or pN(conservative AA substitutions) vs pN(radical AA substitutions)
    # (also validation), refer to the relevant methods within SNP/CollatedGeneGroups.py
    for db in SNP.OM_RGC.GeneGroupCollateDBs:
        groups = filter_db(Utils.Load(db), SNP.OM_RGC, SNP.OM_RGC.MinGenes)
        dbname = split3way(db)[1]
        for nm, genes in groups.items():
            out_f = join(SNP.OM_RGC.OutDirCollate, 'pnps',
                         dbname + '_' + nm + '.pnps.df')
            if not exists(out_f):
                do_one_group_pnps(nm, dbname, genes, SNP.OM_RGC,
                                  SNP.OM_RGC.MinGenes)

    # Stage 4: group all files and save them in the output folder defined as
    # General.Basepath.
    do_collate(join(SNP.OM_RGC.OutDirCollate, 'pnps', 'KEGG'), 4, 60, 5, 50, 5)
    do_collate(join(SNP.OM_RGC.OutDirCollate, 'pnps', 'eggNOG'), 4, 60, 5, 50,
               5)
Пример #3
0
def _getgeneseqs(genes_df_f, db_fasta, gene_names, cachedir):
    """Return a dict mapping each requested gene id to its sequence string.

    Sequences are read from *db_fasta*; the result is cached per input gene
    DataFrame (*genes_df_f*) under *cachedir* and loaded from cache when
    available.
    """
    cache_f = join(cachedir, basename(genes_df_f).replace('.df', '.genes.dat'))
    if exists(cache_f):
        return Utils.Load(cache_f)
    # Set gives O(1) membership per FASTA record instead of O(n) list scans;
    # it also makes the early exit below correct when gene_names contains
    # duplicates (comparing against len(gene_names) would then never trigger).
    wanted = set(gene_names)
    ret = {}
    for rec in SeqIO.parse(db_fasta, 'fasta'):
        if rec.id in wanted:
            ret[rec.id] = str(rec.seq)
            if len(ret) == len(wanted):
                break  # all requested genes found; stop scanning the FASTA
    Utils.Write(cache_f, ret)
    return ret
Пример #4
0
def million_codes():
    """Create one million permutations of the genetic code and compile results.

    Runs 100 batches of codon_risk permutations, then merges the per-batch
    result files into a single compiled dict and returns its path.
    """
    aas, _, _ = get_codon_table()
    df = read_pickle(
        join(General.Basepath, 'All_4_60_mutation_codon_counts.df'))
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for batch in range(100):
        codon_risk(df, aas, 'All_{:02d}'.format(batch), True, subdir='Million')
    compiled_f = join(CodeAnalysis.CodonsDir, 'Codon_risk_compiled.dat')
    compiled = defaultdict(list)
    batch_files = glob(join(CodeAnalysis.CodonsDir, 'Million', '*.dat'))
    for idx, fn in enumerate(batch_files):
        batch_ret = Utils.Load(fn)
        for var in ['n+_risk', 'c+_risk', 'o+_risk', 'hyd_risk', 'PR_risk']:
            # Position 0 of every batch holds the standard code's value;
            # keep it only once, from the first file.
            if idx == 0:
                compiled[var].extend(batch_ret[var])
            else:
                compiled[var].extend(batch_ret[var][1:])
        print(idx)
    Utils.Write(compiled_f, compiled)
    return compiled_f
def filter_db(dbdct, analysisclass, mingenes):
    """Filter a {group: [gene, ...]} dict to groups with enough analyzed genes.

    A group is kept when at least *mingenes* of its genes appear in the pN/pS
    or pi-within result files under analysisclass.OutDir. The set of analyzed
    gene ids is cached in PNPSPiWiGenes.dat to avoid rescanning result files.
    """
    indir = analysisclass.OutDir
    pnps_piwig_f = join(indir, 'PNPSPiWiGenes.dat')
    if exists(pnps_piwig_f):
        pnpsgenes = Utils.Load(pnps_piwig_f)
    else:
        genes_in_use = []
        for fname in glob(join(indir, '*pnps.df')) + glob(join(indir, '*pi_wit.df')):
            print(fname)
            genes_in_use.extend(
                list(read_pickle(fname).index.get_level_values(0).unique()))
        pnpsgenes = list(set(genes_in_use))
        # FIX: cache the deduplicated list. The original cached the raw
        # concatenation with duplicates; both are set-ified on load, so this
        # is behaviorally equivalent but smaller and matches what is used.
        Utils.Write(pnps_piwig_f, pnpsgenes)
    pnpsgenes = set(pnpsgenes)
    return {k: v for k, v in dbdct.items()
            if len(set(v).intersection(pnpsgenes)) >= mingenes}
Пример #6
0
def square_vs_diag(codon_permutation_f, outdir):
    """Replicate the analysis presented in Fig. 5B and create the plot.

    Splits permuted codes into 'square', 'diagonal' and remaining groups,
    boxplots their N+ risk, prints Mann-Whitney U comparisons between the
    groups, and saves the figure to *outdir*.
    """
    ret = Utils.Load(codon_permutation_f)
    npr = np.array(ret['n+_risk'])
    codes = ret['code']
    # Index 0 is the standard code; classify only the permutations.
    squares = np.array([issquare(i) for i in codes[1:]])
    diags = np.array([isdiag(i) for i in codes[1:]])
    print('n diags: {}'.format(sum(diags)))
    print('n squares: {}'.format(sum(squares)))
    _, ax = plt.subplots(1, figsize=(4.7, 5.2), dpi=144)
    grps = [npr[1:][squares], npr[1:][~squares & ~diags], npr[1:][diags]]
    ax.boxplot(grps,
               showfliers=False,
               whis=(5, 95),
               flierprops={
                   'color': 'k',
                   'marker': 'x',
                   'markersize': 2
               },
               boxprops={
                   'color': 'k',
                   'lw': 0.6
               },
               capprops={
                   'color': 'k',
                   'lw': 0.6
               },
               whiskerprops={
                   'color': 'k',
                   'lw': 0.6
               },
               # BUG FIX: the original used color='' here, which is not a
               # valid matplotlib color and raises ValueError when the figure
               # is drawn/saved. Use black, consistent with the other props.
               medianprops={
                   'color': 'k',
                   'lw': 1.2
               })
    ax.set_ylim(0.15, 0.31)
    ax.set_yticks([0.15, 0.2, 0.25, 0.3])
    print('squares vs all: {}'.format(mannwhitneyu(grps[0], grps[1])))
    print('squares vs diags: {}'.format(mannwhitneyu(grps[0], grps[2])))
    print('diags vs all: {}'.format(mannwhitneyu(grps[2], grps[1])))
    plt.savefig(join(outdir, 'Squares_diags.png'), dpi=144)
    # Close the figure so repeated calls don't accumulate open figures,
    # matching the cleanup done by the other plotting helpers in this file.
    plt.close('all')
Пример #7
0
def multi_organism_analyze():
    """Replicate the analysis presented in Fig. 4.

    For each model organism with enough codon counts, generates alternative
    genetic codes across a range of transition-transversion (ti/tv) rates,
    then collates empirical p-values and standard-code risks into one table.
    """
    # Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5581930/
    codons_all = read_pickle('./resource/ModelOrganisms.df').set_index('Taxid')
    # Take only the organisms with more than 50K codons in the calculation
    codons_all = codons_all.loc[codons_all.iloc[:, 11:].sum(1) >= 50000]
    aas, _, _ = get_codon_table()
    # Create alternative codes for each organism and transition-transversion rate
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for taxid, row in codons_all.iterrows():
        # Columns from 11 onward hold the per-codon counts -- TODO confirm
        # against ModelOrganisms.df schema.
        codons = row[11:].astype(float)
        for titv in [0.2, 0.25, 0.333, 0.5, 0.667, 1, 1.5, 2, 3, 4, 5]:
            ti = (2 * titv) / (1 + 2 * titv)
            codon_risk(None,
                       aas,
                       'Tax_{}_Rate_{}'.format(taxid, titv),
                       all_mutations=False,
                       external_counts=codons,
                       external_titv=(ti, 1 - ti),
                       subdir='MultiOrg')
    # Collate the results in one table
    proc_stats = {}
    for fnm in glob(join(CodeAnalysis.CodonsDir, 'MultiOrg/*.dat')):
        tax = float(basename(fnm).split('Tax_')[-1].split('_')[0])
        rate = float(basename(fnm).split('Rate_')[-1].split('_')[0])
        ret = Utils.Load(fnm)
        npr = np.array(ret['n+_risk'])
        cpr = np.array(ret['c+_risk'])
        # Number of permuted codes (index 0 holds the standard code). The
        # original hardcoded 10000.; this is identical when there are 10,000
        # permutations and correct otherwise.
        n_perm = float(len(npr) - 1)
        proc_stats[(tax, rate)] = {
            'cpr_p': sum(cpr[1:] <= cpr[0]) / n_perm,
            'npr_p': sum(npr[1:] <= npr[0]) / n_perm,
            # BUG FIX: the original stored a raw count here while the sibling
            # *_p entries are fractions; normalize for consistency.
            'ncpr_p': sum((cpr[1:] <= cpr[0]) & (npr[1:] <= npr[0])) / n_perm,
            'cpr': cpr[0],
            'npr': npr[0]
        }
    DataFrame(proc_stats).to_pickle(
        join(CodeAnalysis.CodonsDir, 'MultiOrg_rates.df'))