def do_work(idx):
    """Back-translate every protein of one organism using its own codon bias.

    Parameters
    ----------
    idx : group key into the global ``orgs`` GroupBy (one organism).

    Returns
    -------
    pandas.DataFrame indexed by the original row index ('ix'), with columns
    ['pid', 'cDNA_rnd'] holding the randomly re-generated cDNA per protein.
    """
    # IMPORTANT: re-seed per call so parallel workers don't share a RNG stream.
    np.random.seed()
    cds_dat = orgs.get_group(idx)
    # Genetic code for this organism (NCBI translation-table id).
    table_id = cds_dat['table'].iloc[0]
    gcode = Data.CodonTable.unambiguous_dna_by_id[table_id]
    # Map each amino acid to its synonymous codons.
    # STOP codons are deliberately excluded from the analysis.
    SynonymousCodons = {aa: [] for aa in gcode.protein_alphabet.letters}
    for codon, aa in gcode.forward_table.iteritems():
        SynonymousCodons[aa].append(codon)
    # Use ALL cDNA (not a sample) to estimate the organism's codon bias.
    codon_usage = cairi.count_codons(cds_dat['cDNA'])
    # Per-amino-acid codon weights, normalized to sum to 1.
    codon_weights = {}
    for aa in SynonymousCodons:
        counts = [codon_usage[c] for c in SynonymousCodons[aa]]
        codon_weights[aa] = np.true_divide(counts, float(sum(counts)))
    # Back-translate each protein, preserving its original index.
    rows = (
        (ix, pid, back_translate(protein, SynonymousCodons, codon_weights))
        for ix, pid, protein in cds_dat[['pid', 'protein']].itertuples()
    )
    shuffled = pd.DataFrame(rows, columns=['ix', 'pid', 'cDNA_rnd'])
    return shuffled.set_index(keys='ix')
# NOTE(review): this is a byte-identical duplicate of the do_work() defined
# earlier in this file; at import time this later definition shadows the
# earlier one. One of the two copies should probably be removed.
def do_work(idx):
    """Back-translate every protein of organism `idx` using its codon bias.

    Reads the global ``orgs`` GroupBy; returns a DataFrame indexed by the
    original row index ('ix') with columns ['pid', 'cDNA_rnd'].
    """
    # IMPORTANT!!! re-seed per call (parallel workers must not share a stream)
    np.random.seed()
    #
    cds_dat = orgs.get_group(idx)
    #
    # genetic code for this organism (NCBI translation-table id) ...
    genetic_table = cds_dat['table'].iloc[0]
    genetic_code = Data.CodonTable.unambiguous_dna_by_id[genetic_table]
    # amino acid -> list of synonymous codons ...
    SynonymousCodons = dict([ (aa, []) for aa in genetic_code.protein_alphabet.letters ])
    # SynonymousCodons['STOP'] = genetic_code.stop_codons # STOP codons are excluded from analysis ...
    for codon, aa in genetic_code.forward_table.iteritems():
        SynonymousCodons[aa].append(codon)
    #
    #
    # prot_cds_rnd = cds_dat['cDNA'].sample(PROT_COUNT) # cDNA sample proteins ...
    prot_cds_rnd = cds_dat[
        'cDNA']  # let's use ALL cDNA to get the codon bias ...
    codon_usage = cairi.count_codons(prot_cds_rnd)
    #
    # generate codon weights based on codon counts ...
    codon_weights = {}
    for aa in SynonymousCodons:
        aa_codon_usage = [codon_usage[codon] for codon in SynonymousCodons[aa]]
        total_codons = sum(aa_codon_usage)
        # normalize so each amino acid's codon weights sum to 1 ...
        codon_weights[aa] = np.true_divide(aa_codon_usage,
                                           float(total_codons))
    #
    # now rewrite (back translate) protein sequences keeping their index ...
    cdna_shuffled = (
        (ix, pid, back_translate(protein, SynonymousCodons, codon_weights))
        for ix, pid, protein in cds_dat[['pid', 'protein']].itertuples())
    cdna_shuffled = pd.DataFrame(cdna_shuffled,
                                 columns=['ix', 'pid', 'cDNA_rnd'])
    cdna_shuffled = cdna_shuffled.set_index(keys='ix')
    #
    #
    # shuffled_cdna_list.append( cdna_shuffled )
    return cdna_shuffled
# Count ribosomal proteins per genome, then build per-organism codon indices:
# one from a random cDNA sample (cix_prot) and one from ribosomal cDNA (cix_ribo).
ribo_counts = [(idx, orgs.get_group(idx)["ribosomal"].nonzero()[0].size) for idx in genom_id]
ribo_cai_info = pd.DataFrame(ribo_counts, columns=["GenomicID", "ribo_count"])
#############
#
#
cix_prot = {}  # GenomicID -> codon index from a random protein sample
cix_ribo = {}  # GenomicID -> codon index from ribosomal proteins
#
#
#############
for idx, ribo_count in ribo_cai_info.itertuples(index=False):
    #
    cds_dat = orgs.get_group(idx)
    prot_cds_rnd = cds_dat["cDNA"].sample(PROT_COUNT)  # cDNA sample proteins ...
    codon_usage_rnd = cairi.count_codons(prot_cds_rnd)
    codon_index_rnd = cairi.generate_codon_index(
        codon_usage_rnd, genetic_table=list(cds_dat["table"])[0]
    )  # fix that ...
    cix_prot[idx] = codon_index_rnd
    #
    # NOTE(review): this guard is a no-op (`pass` body) — the ribosomal codon
    # index below is computed for EVERY organism regardless of RIBO_LIMIT.
    # Elsewhere in this file the same condition actually guards the ribosomal
    # computation; confirm whether the lines below were meant to be indented
    # under the if.
    if ribo_count >= RIBO_LIMIT:
        pass
    ribo_cds = cds_dat[cds_dat["ribosomal"]]["cDNA"]  # cDNA of ribosomal proteins ...
    codon_usage = cairi.count_codons(ribo_cds)
    codon_index = cairi.generate_codon_index(codon_usage, genetic_table=list(cds_dat["table"])[0])  # fix that ...
    cix_ribo[idx] = codon_index
######################
ribo_cai_info = pd.DataFrame(ribo_counts,columns=['assembly_accession','ribo_count']) # some lists to describe organism's CAI distribution features ... percentile = [] median = [] mean = [] sigma = [] idx_for_ribo = [] ribo_count_for_df = [] # pid_cai_list = [] for idx,ribo_count in ribo_cai_info.itertuples(index=False): if ribo_count >= RIBO_LIMIT: cds_dat = orgs.get_group(idx) ribo_cds = cds_dat[cds_dat['ribosomal']]['cDNA'] # cDNA of ribosomal proteins ... codon_usage = cairi.count_codons(ribo_cds) codon_index = cairi.generate_codon_index(codon_usage,genetic_table=list(cds_dat['table'])[0]) # fix that ... # we need to track index from 'dat', as there are some stupid duplications ... pid_cai = pd.DataFrame(((dat_idx,pid,cairi.cai_for_gene(sequence,codon_index)) for dat_idx,pid,sequence in cds_dat[['pid','cDNA']].itertuples()),columns=['dat_idx','pid','CAI']) pid_cai = pid_cai.set_index(keys='dat_idx') # characterize CAI distribution for a given organism ... local_mean = pid_cai['CAI'].mean() local_median = pid_cai['CAI'].median() local_sigma = pid_cai['CAI'].std() mean.append(local_mean) median.append(local_median) sigma.append(local_sigma) idx_for_ribo.append(idx) ribo_count_for_df.append(ribo_count) # local_ribo_indexes = cds_dat['ribosomal'].nonzero()[0]
genom_id = orgs.groups.keys() # some lists to describe organism's CAI distribution features ... percentile = [] median = [] mean = [] sigma = [] idx_for_prot = [] # pid_cai_list = [] for idx in genom_id: cds_dat = orgs.get_group(idx) # instead of taking specific group of proteins, let's take RANDOM sample of 50 cDNA ... prot_cds = cds_dat['cDNA'].sample(PROT_COUNT) # cDNA sample proteins ... codon_usage = cairi.count_codons(prot_cds) codon_index = cairi.generate_codon_index( codon_usage, genetic_table=list(cds_dat['table'])[0]) # fix that ... # we need to track index from 'dat', as there are some stupid duplications ... pid_cai = pd.DataFrame( ((dat_idx, pid, cairi.cai_for_gene(sequence, codon_index)) for dat_idx, pid, sequence in cds_dat[['pid', 'cDNA']].itertuples()), columns=['dat_idx', 'pid', 'CAI']) pid_cai = pid_cai.set_index(keys='dat_idx') # characterize CAI distribution for a given organism ... local_mean = pid_cai['CAI'].mean() local_median = pid_cai['CAI'].median() local_sigma = pid_cai['CAI'].std() mean.append(local_mean) median.append(local_median) sigma.append(local_sigma)
# Re-derive each organism's CAI distribution using a codon index built from
# PROT_COUNT random proteins drawn from the BOTTOM 30% of the original CAI
# ranking. Relies on globals: genom_id, cai (GroupBy of precomputed CAI),
# cairi, PROT_COUNT. Python 2 (print statement). NOTE(review): this chunk
# appears truncated at both ends — `median` is appended below but must be
# initialized above the visible span, and the loop body continues past the
# last visible statement.
mean = []
sigma = []
idx_for_prot = []
# pid_cai_list = []
for idx in genom_id:
    # cds_dat = orgs.get_group(idx) # old stuff ...
    org_cai = cai.get_group(idx)
    # skip organisms with any missing CAI values ...
    condition = org_cai['CAI'].notnull().all()
    if not condition:
        print "skipping", idx, " as it has no CAI ..."
    else:
        q30 = org_cai['CAI'].quantile(q=0.30)
        # take 50 random proteins from the bottom 30% OF ORIGINAL CAI ...
        prot_cds = org_cai[org_cai.CAI < q30]['cDNA'].sample(PROT_COUNT)
        codon_usage = cairi.count_codons(prot_cds)
        codon_index = cairi.generate_codon_index(codon_usage, genetic_table=list(org_cai['table'])[0])  # fix that ...
        # we need to track index from 'dat', as there are some stupid duplications ...
        pid_cai = pd.DataFrame(
            ((dat_idx, pid, cairi.cai_for_gene(sequence, codon_index))
             for dat_idx, pid, sequence in org_cai[['pid', 'cDNA']].itertuples()),
            columns=['dat_idx', 'pid', 'CAI'])
        pid_cai = pid_cai.set_index(keys='dat_idx')
        # characterize CAI distribution for a given organism ...
        local_mean = pid_cai['CAI'].mean()
        local_median = pid_cai['CAI'].median()
        local_sigma = pid_cai['CAI'].std()
        mean.append(local_mean)
        median.append(local_median)
        sigma.append(local_sigma)
        idx_for_prot.append(idx)
        #
        # get relative indexes of sampled cDNAs in 'org_cai' ...
        local_prot_indexes = prot_cds.index - org_cai.index[0]
ribo_cai_info = pd.DataFrame(ribo_counts, columns=['GenomicID', 'ribo_count']) ############# # # cix_prot = {} cix_ribo = {} # # ############# for idx, ribo_count in ribo_cai_info.itertuples(index=False): # cds_dat = orgs.get_group(idx) prot_cds_rnd = cds_dat['cDNA'].sample( PROT_COUNT) # cDNA sample proteins ... codon_usage_rnd = cairi.count_codons(prot_cds_rnd) codon_index_rnd = cairi.generate_codon_index( codon_usage_rnd, genetic_table=list(cds_dat['table'])[0]) # fix that ... cix_prot[idx] = codon_index_rnd # if ribo_count >= RIBO_LIMIT: pass ribo_cds = cds_dat[cds_dat['ribosomal']][ 'cDNA'] # cDNA of ribosomal proteins ... codon_usage = cairi.count_codons(ribo_cds) codon_index = cairi.generate_codon_index( codon_usage, genetic_table=list(cds_dat['table'])[0]) # fix that ... cix_ribo[idx] = codon_index