def _getgeneseqs(genes_df_f, db_fasta, gene_names, cachedir):
    cache_f = join(cachedir, basename(genes_df_f).replace('.df', '.genes.dat'))
    if exists(cache_f):
        return Utils.Load(cache_f)
    ret = {}
    for rec in SeqIO.parse(db_fasta, 'fasta'):
        if rec.id in gene_names:
            ret[rec.id] = str(rec.seq)
            if len(ret) == len(gene_names):
                break
    Utils.Write(cache_f, ret)
    return ret
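
# Illustrative usage sketch for _getgeneseqs (not part of the original pipeline):
# the file paths and gene ids below are hypothetical placeholders.
def _example_getgeneseqs_usage():
    gene_names = {'geneA', 'geneB'}  # hypothetical gene ids expected in the fasta db
    seqs = _getgeneseqs('sample_genes.df', 'sample_db.fasta', gene_names, '/tmp/cache')
    for gid, seq in seqs.items():
        print(gid, len(seq))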
def million_codes():
    # This creates one million permutations of the genetic code
    # (100 jobs x 10,000 shuffled codes per call to codon_risk)
    aas, _, _ = get_codon_table()
    df = read_pickle(
        join(General.Basepath, 'All_4_60_mutation_codon_counts.df'))
    # TODO: IMPORTANT! Wrap this for loop with your hpc job submission pipeline
    for i in range(100):
        codon_risk(df, aas, 'All_{:02d}'.format(i), True, subdir='Million')
    compiled_f = join(CodeAnalysis.CodonsDir, 'Codon_risk_compiled.dat')
    ret = defaultdict(list)
    for i, fn in enumerate(
            glob(join(CodeAnalysis.CodonsDir, 'Million', '*.dat'))):
        ret_l = Utils.Load(fn)
        for var in ['n+_risk', 'c+_risk', 'o+_risk', 'hyd_risk', 'PR_risk']:
            # Entry 0 of every per-job list is the standard genetic code;
            # keep it only once (from the first file)
            ret[var].extend(ret_l[var] if i == 0 else ret_l[var][1:])
        print(i)
    Utils.Write(compiled_f, ret)
    return compiled_f
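
# Illustrative sketch (not part of the original pipeline): once million_codes() has
# written the compiled file, an empirical p-value for the standard code can be read
# off as the fraction of shuffled codes that score lower on a given risk measure.
# Entry 0 of each compiled list is the standard (unshuffled) genetic code.
def _example_compiled_pvalue(compiled_f, var='n+_risk'):
    risks = Utils.Load(compiled_f)[var]
    real_code_risk = risks[0]
    return sum(r < real_code_risk for r in risks[1:]) / float(len(risks) - 1)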
def filter_db(dbdct, analysisclass, mingenes):
    # Keep only database entries with at least `mingenes` genes present in the
    # pN/pS and pi-within results of the given analysis
    indir = analysisclass.OutDir
    pnps_piwig_f = join(indir, 'PNPSPiWiGenes.dat')
    genes_in_use = []
    if exists(pnps_piwig_f):
        pnpsgenes = Utils.Load(pnps_piwig_f)
    else:
        for fname in glob(join(indir, '*pnps.df')) + glob(join(indir, '*pi_wit.df')):
            print(fname)
            genes_in_use.extend(
                list(read_pickle(fname).index.get_level_values(0).unique()))
        pnpsgenes = list(set(genes_in_use))
        Utils.Write(pnps_piwig_f, genes_in_use)
    ret = {}
    pnpsgenes = set(pnpsgenes)
    for k, v in dbdct.items():
        if len(set(v).intersection(pnpsgenes)) >= mingenes:
            ret[k] = v
    return ret
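
# Illustrative usage sketch for filter_db (hypothetical inputs): `dbdct` maps a
# database entry (e.g. a genome) to its list of gene ids; entries with fewer than
# `mingenes` of those genes appearing in the pN/pS / pi-within results are dropped.
def _example_filter_db_usage(analysisclass):
    dbdct = {'genomeA': ['g1', 'g2', 'g3'], 'genomeB': ['g4']}  # hypothetical
    return filter_db(dbdct, analysisclass, mingenes=2)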
def codon_risk(df, aas, prefix, all_mutations=True, external_counts=None,
               external_titv=None, subdir=None):
    stops = ['TAA', 'TAG', 'TGA']
    if external_counts is None:
        df = df.sum(1).to_frame('all_muts')
        dfnostop = df.loc[[(i, j) for i, j in df.index
                           if i not in stops and j not in stops]]
        codonabuns = df.groupby(level=0).sum()
    else:
        codonabuns = external_counts.to_frame('all_muts')
    # Replace each codon's abundance with the median abundance of its synonymous
    # codons, then normalize to relative abundances
    codonabuns['AAs'] = [aas[i] for i in codonabuns.index]
    meanaas = codonabuns.groupby('AAs').median()
    codonabuns = codonabuns.join(meanaas, on='AAs', lsuffix='_1')\
        .drop(['all_muts_1', 'AAs'], axis=1)
    codonabuns = codonabuns.truediv(
        codonabuns.sum()).rename(columns={'all_muts': 'codon_abuns'})
    ret = defaultdict(list)
    # Iteration 0 scores the standard genetic code; iterations 1-10000 score shuffled codes
    for it in range(10001):
        if it == 0:
            newaas = aas
            newcodonabuns = codonabuns
        else:
            # To maintain the abundances of the codons coding for the same amino acids
            codonaas = codonabuns.copy()
            codonaas['AAs'] = [aas[i] for i in codonabuns.index]
            codonaas = codonaas.reset_index().groupby('AAs').apply(lambda x: x.reset_index())\
                .drop(['level_0', 'AAs'], axis=1)
            newaas = scramble_codons(aas)
            codon_shuf = newaas.reset_index().groupby('AAs').apply(lambda x: x.reset_index())\
                .drop(['level_0', 'AAs'], axis=1)
            newcodonabuns = codonaas.join(
                codon_shuf, lsuffix='_1').set_index('index')['codon_abuns']
        # Estimate the abundance of mutations using fourfold degenerate synonymous mutations
        if external_titv is None:
            mutabuns = getmuts(filter_nonsyn(dfnostop, aas, True), all_mutations)\
                .drop('mutation_pos', axis=1).groupby('mutation_type').mean().drop('None')
            mutabuns = mutabuns.truediv(
                mutabuns.sum()).rename(columns={'all_muts': 'mut_abuns'})
        else:
            mutabuns = DataFrame({
                'mut_abuns': {
                    'Transition': external_titv[0],
                    'Transversion': external_titv[1]
                }
            })
            mutabuns.index.name = 'mutation_type'
        mut_costs = get_mut_costs(newaas)
        newstops = newaas[newaas == '*'].index
        # Weight every codon-to-codon change by its mutation-type abundance and the
        # abundance of the source codon, excluding changes to or from stop codons
        allabuns = getmuts(mut_costs, all_mutations).join(mutabuns, on='mutation_type').reset_index()\
            .join(newcodonabuns, on='aa_start').set_index(['aa_start', 'aa_end'])\
            .drop(['mutation_pos', 'mutation_type'], axis=1)
        allabuns = allabuns.loc[[(i, j) for i, j in allabuns.index
                                 if i not in newstops and j not in newstops]]

        def applyfunc(x, col):
            return x[col] * x.mut_abuns * x.codon_abuns

        ret['hyd_risk'].append(
            allabuns.apply(lambda x: applyfunc(x, col='Hyd_d'), axis=1).sum())
        ret['PR_risk'].append(
            allabuns.apply(lambda x: applyfunc(x, col='PR_d'), axis=1).sum())
        ret['n+_risk'].append(allabuns[allabuns.N_d > 0].apply(
            lambda x: applyfunc(x, col='N_d'), axis=1).sum())
        ret['c+_risk'].append(allabuns[allabuns.C_d > 0].apply(
            lambda x: applyfunc(x, col='C_d'), axis=1).sum())
        ret['o+_risk'].append(allabuns[allabuns.O_d > 0].apply(
            lambda x: applyfunc(x, col='O_d'), axis=1).sum())
        ret['code'].append(newaas)
    if subdir is not None:
        outfol = mkdirifnotexists(join(CodeAnalysis.CodonsDir, subdir))
    else:
        # Fall back to the top-level codons directory so outfol is always defined
        outfol = CodeAnalysis.CodonsDir
    outfname = 'Codon_risk_{}_{}_pstop_medaa.dat'.format(
        'allmut' if all_mutations else 'TiTv', prefix)
    print(outfname)
    Utils.Write(join(outfol, outfname), ret)
    # Empirical p-values: fraction of shuffled codes with lower risk than the standard code
    for k in ret:
        if k != 'code':
            print('{}: {}'.format(k, sum(ret[k] < ret[k][0]) / 10000.))
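
# Illustrative call of codon_risk with externally supplied inputs (not from the
# original pipeline): `my_codon_counts` is assumed to be a pandas Series of per-codon
# counts, and the (2., 1.) transition/transversion weights are hypothetical.
def _example_external_codon_risk(my_codon_counts):
    aas, _, _ = get_codon_table()
    codon_risk(None, aas, 'external_example', all_mutations=False,
               external_counts=my_codon_counts, external_titv=(2., 1.),
               subdir='External')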