def _getgeneseqs(genes_df_f, db_fasta, gene_names, cachedir):
    # Return {gene_name: sequence} for the requested genes, caching the result
    # next to the genes dataframe so the FASTA is only scanned once.
    cache_f = join(cachedir, basename(genes_df_f).replace('.df', '.genes.dat'))
    if exists(cache_f):
        return Utils.Load(cache_f)
    ret = {}
    for rec in SeqIO.parse(db_fasta, 'fasta'):
        if rec.id in gene_names:
            ret[rec.id] = str(rec.seq)
            # Stop scanning once every requested gene has been found
            if len(ret) == len(gene_names):
                break
    Utils.Write(cache_f, ret)
    return ret
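A minimal usage sketch for _getgeneseqs, assuming the usual imports (os.path.join/basename/exists and Biopython's SeqIO) are available at module level and that Utils is the project's own serialization helper; the file paths and gene names below are hypothetical.

from os.path import join, basename, exists
from Bio import SeqIO

# Hypothetical call: extract two genes from a FASTA catalogue, caching the result.
gene_seqs = _getgeneseqs(
    genes_df_f='/path/to/sample_genes.df',    # hypothetical genes dataframe
    db_fasta='/path/to/gene_catalog.fasta',   # hypothetical FASTA database
    gene_names={'geneA', 'geneB'},            # a set keeps the membership test fast
    cachedir='/path/to/cache')                # cached as sample_genes.genes.dat
print(len(gene_seqs), 'sequences loaded')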
Example #2
def million_codes():
    # Build one million permutations of the genetic code: each codon_risk call
    # performs 10,000 permutations, so 100 calls yield 10^6 in total.
    aas, _, _ = get_codon_table()
    df = read_pickle(
        join(General.Basepath, 'All_4_60_mutation_codon_counts.df'))
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for i in range(100):
        codon_risk(df, aas, 'All_{:02d}'.format(i), True, subdir='Million')
    compiled_f = join(CodeAnalysis.CodonsDir, 'Codon_risk_compiled.dat')
    ret = defaultdict(list)
    for i, fn in enumerate(
            glob(join(CodeAnalysis.CodonsDir, 'Million', '*.dat'))):
        ret_l = Utils.Load(fn)
        for var in ['n+_risk', 'c+_risk', 'o+_risk', 'hyd_risk', 'PR_risk']:
            # Entry 0 of every result list is the real (unpermuted) code;
            # keep it from the first file only to avoid duplicates.
            ret[var].extend((ret_l[var] if i == 0 else ret_l[var][1:]))
        print(i)  # progress indicator
    Utils.Write(compiled_f, ret)
    return compiled_f
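The TODO above asks for the permutation loop to be dispatched through a job-submission pipeline rather than run serially. That pipeline is not part of this excerpt, so the sketch below uses concurrent.futures as a local stand-in; the helper name, worker counts, and one-call-per-job layout are assumptions.

from concurrent.futures import ProcessPoolExecutor

def _run_million_locally(df, aas, n_jobs=100, max_workers=8):
    # Local stand-in for an HPC submission loop: each codon_risk batch becomes
    # one worker process instead of one queued cluster job.
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(codon_risk, df, aas, 'All_{:02d}'.format(i),
                               True, subdir='Million')
                   for i in range(n_jobs)]
        for fut in futures:
            fut.result()  # re-raise any worker exception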
def filter_db(dbdct, analysisclass, mingenes):
    # Keep only the dbdct entries whose gene lists share at least `mingenes`
    # genes with those appearing in the *pnps.df / *pi_wit.df result files.
    indir = analysisclass.OutDir
    pnps_piwig_f = join(indir, 'PNPSPiWiGenes.dat')
    genes_in_use = []
    if exists(pnps_piwig_f):
        pnpsgenes = Utils.Load(pnps_piwig_f)
    else:
        for fname in glob(join(indir, '*pnps.df')) + glob(join(indir, '*pi_wit.df')):
            print(fname)
            genes_in_use.extend(list(read_pickle(fname).index.get_level_values(0).unique()))
        pnpsgenes = list(set(genes_in_use))
        Utils.Write(pnps_piwig_f, genes_in_use)
    ret = {}
    pnpsgenes = set(pnpsgenes)
    for k, v in dbdct.items():
        if len(set(v).intersection(pnpsgenes)) >= mingenes:
            ret[k] = v
    return ret
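A minimal usage sketch for filter_db, assuming dbdct maps a database key to the list of gene names it contains and that the analysis class exposes the OutDir attribute used above; the keys, gene names, paths, and threshold are hypothetical.

class _FakeAnalysis:                         # stand-in for the project's analysis class
    OutDir = '/path/to/analysis_output'      # hypothetical results directory

dbdct = {
    'genome_A': ['gene_1', 'gene_2', 'gene_3'],
    'genome_B': ['gene_4'],
}
filtered = filter_db(dbdct, _FakeAnalysis, mingenes=2)
# Only entries sharing >= 2 genes with the pnps / pi_wit results are kept.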
Example #4
def codon_risk(df,
               aas,
               prefix,
               all_mutations=True,
               external_counts=None,
               external_titv=None,
               subdir=None):
    stops = ['TAA', 'TAG', 'TGA']  # standard-code stop codons
    if external_counts is None:
        # Collapse the count columns into a single total-count column
        df = df.sum(axis=1).to_frame('all_muts')
        dfnostop = df.loc[[(i, j) for i, j in df.index
                           if i not in stops and j not in stops]]
        codonabuns = df.groupby(level=0).sum()
    else:
        codonabuns = external_counts.to_frame('all_muts')
    codonabuns['AAs'] = [aas[i] for i in codonabuns.index]
    # Replace each codon's count with the median count of its amino-acid
    # family (the 'medaa' tag in the output file name)
    meanaas = codonabuns.groupby('AAs').median()
    codonabuns = codonabuns.join(meanaas, on='AAs',
                                 lsuffix='_1').drop(['all_muts_1', 'AAs'],
                                                    axis=1)
    # Normalize to relative codon abundances
    codonabuns = codonabuns.truediv(
        codonabuns.sum()).rename(columns={'all_muts': 'codon_abuns'})
    ret = defaultdict(list)
    for it in range(10001):
        if it == 0:
            # Iteration 0 scores the real genetic code; the remaining 10,000
            # iterations score random permutations of it.
            newaas = aas
            newcodonabuns = codonabuns
        else:
            # To maintain the abundances of the codons coding for the same amino acids
            codonaas = codonabuns.copy()
            codonaas['AAs'] = [aas[i] for i in codonabuns.index]
            codonaas = codonaas.reset_index().groupby('AAs').apply(lambda x:x.reset_index())\
                        .drop(['level_0','AAs'], axis=1)
            newaas = scramble_codons(aas)
            codon_shuf = newaas.reset_index().groupby('AAs').apply(lambda x:x.reset_index())\
                        .drop(['level_0','AAs'], axis=1)
            newcodonabuns = codonaas.join(
                codon_shuf, lsuffix='_1').set_index('index')['codon_abuns']
        # Estimate the abundance of mutations using fourfold degenerate synonymous mutations
        if external_titv is None:
            mutabuns = getmuts(filter_nonsyn(dfnostop, aas, True), all_mutations)\
                        .drop('mutation_pos', axis=1).groupby('mutation_type').mean().drop('None')
            mutabuns = mutabuns.truediv(
                mutabuns.sum()).rename(columns={'all_muts': 'mut_abuns'})
        else:
            mutabuns = DataFrame({
                'mut_abuns': {
                    'Transition': external_titv[0],
                    'Transversion': external_titv[1]
                }
            })
            mutabuns.index.name = 'mutation_type'
        mut_costs = get_mut_costs(newaas)
        newstops = newaas[newaas == '*'].index
        allabuns = getmuts(mut_costs, all_mutations).join(mutabuns, on='mutation_type').reset_index()\
                    .join(newcodonabuns, on='aa_start').set_index(['aa_start','aa_end'])\
                    .drop(['mutation_pos','mutation_type'], axis=1)
        # Drop mutations into or out of the permuted code's stop codons
        allabuns = allabuns.loc[[(i, j) for i, j in allabuns.index
                                 if i not in newstops and j not in newstops]]

        def applyfunc(x, col):
            # Contribution of one substitution: chemical cost x mutation-type
            # abundance x source-codon abundance
            return x[col] * x.mut_abuns * x.codon_abuns

        ret['hyd_risk'].append(
            allabuns.apply(lambda x: applyfunc(x, col='Hyd_d'), axis=1).sum())
        ret['PR_risk'].append(
            allabuns.apply(lambda x: applyfunc(x, col='PR_d'), axis=1).sum())
        ret['n+_risk'].append(allabuns[allabuns.N_d > 0].apply(
            lambda x: applyfunc(x, col='N_d'), axis=1).sum())
        ret['c+_risk'].append(allabuns[allabuns.C_d > 0].apply(
            lambda x: applyfunc(x, col='C_d'), axis=1).sum())
        ret['o+_risk'].append(allabuns[allabuns.O_d > 0].apply(
            lambda x: applyfunc(x, col='O_d'), axis=1).sum())
        ret['code'].append(newaas)
    if subdir is not None:
        outfol = mkdirifnotexists(join(CodeAnalysis.CodonsDir, subdir))
    else:
        # Fall back to the top-level codons directory so outfol is always defined
        outfol = CodeAnalysis.CodonsDir
    outfname = 'Codon_risk_{}_{}_pstop_medaa.dat'.format(
        'allmut' if all_mutations else 'TiTv', prefix)
    print(outfname)
    Utils.Write(join(outfol, outfname), ret)
    for k in ret:
        if k != 'code':
            # Empirical p-value: fraction of the 10,000 permuted codes with a
            # lower risk than the real code (stored at index 0)
            print('{}: {}'.format(
                k, sum(v < ret[k][0] for v in ret[k][1:]) / 10000.))
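The final loop reports an empirical p-value for each risk measure: the fraction of the 10,000 permuted codes whose risk is lower than that of the real code stored at index 0. A toy illustration of that calculation, with made-up numbers:

risks = [0.42, 0.55, 0.61, 0.38, 0.47]  # index 0 = real code, rest = permutations
p_emp = sum(v < risks[0] for v in risks[1:]) / float(len(risks) - 1)
print(p_emp)  # 0.25: one of the four permuted codes is 'safer' than the real code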