def AA_sequence(refDNA_dic, cds_df, gene, seq_type='AA'):
    """Build the nucleotide and translated sequence of every accession of a gene.

    * refDNA_dic: mapping from chromosome name to a record exposing ``.seq``.
    * cds_df: dataframe with at least the columns 'geneid' and 'access'.
    * gene: value matched against the 'geneid' column.
    * seq_type: 'AA' returns the translated sequences; any other value
      returns the nucleotide sequences.

    Returns a tuple ``(sequences, accessions)``. ``accessions`` is the sorted
    list of unique 'access' values for the gene and ``sequences`` is in the
    same order.
    """
    # 1. restrict the table to the requested gene and list its accessions
    gene_rows = cds_df[cds_df['geneid'].values == gene]
    accessions = sorted(set(gene_rows['access'].tolist()))
    annotation = trpr(gene_rows)

    # 2. assemble the sequences accession by accession
    proteins = []
    transcripts = []
    for accession in accessions:
        chrom = annotation.get_chrom(accession, id_type='access')
        positions = annotation.get_trpr_pos(accession)
        chrom_seq = refDNA_dic[chrom].seq
        # positions are used as 1-based indices into the chromosome sequence
        bases = ''.join(chrom_seq[p - 1] for p in positions)
        coding = Seq(bases, generic_dna)
        # NOTE(review): descending positions presumably mean the minus
        # strand; the bases are then already in reverse order, so only the
        # complement is applied -- confirm against trpr.get_trpr_pos.
        if positions[0] > positions[1]:
            coding = coding.complement()
        translated = str(coding.translate())
        transcripts.append(str(coding))
        proteins.append(translated)

    chosen = proteins if seq_type == 'AA' else transcripts
    return chosen, accessions
def AA_sequence(refDNA_dic, cds_df, gene, seq_type='AA'):
    """Build the nucleotide and translated sequence of every accession of a gene.

    * refDNA_dic: mapping from chromosome name to a record exposing ``.seq``.
    * cds_df: dataframe with at least the columns 'geneid' and 'access'.
    * gene: value matched against the 'geneid' column.
    * seq_type: 'AA' returns the translated sequences; any other value
      returns the nucleotide sequences.

    Returns a tuple ``(sequences, accessions)``. ``accessions`` is the sorted
    list of unique 'access' values for the gene and ``sequences`` is in the
    same order.
    """
    # 1. restrict the table to the requested gene and list its accessions
    gene_rows = cds_df[cds_df['geneid'].values == gene]
    accessions = sorted(set(gene_rows['access'].tolist()))
    annotation = trpr(gene_rows)

    # 2. assemble the sequences accession by accession
    proteins = []
    transcripts = []
    for accession in accessions:
        chrom = annotation.get_chrom(accession, id_type='access')
        positions = annotation.get_trpr_pos(accession)
        chrom_seq = refDNA_dic[chrom].seq
        # positions are used as 1-based indices into the chromosome sequence
        bases = ''.join(chrom_seq[p - 1] for p in positions)
        coding = Seq(bases, generic_dna)
        # NOTE(review): descending positions presumably mean the minus
        # strand; the bases are then already in reverse order, so only the
        # complement is applied -- confirm against trpr.get_trpr_pos.
        if positions[0] > positions[1]:
            coding = coding.complement()
        translated = str(coding.translate())
        transcripts.append(str(coding))
        proteins.append(translated)

    chosen = proteins if seq_type == 'AA' else transcripts
    return chosen, accessions
def fpkm_from_htseq(bam_path, ruv_path, exn_file):
    """
    This function calculates fpkm from the htseq-count results.

    * bam_path: directory that has bam files. Used to get total mapped reads.
    * ruv_path: directory that has ruvseq corrected count data
      (space separated '.txt' files without header: geneid, count).
    * exn_file: tab separated file with a header row. 6 columns, including
      ['chr','start','end','geneid','traccess','strand'].

    For every count file '<name>.txt' a file '<name>.fpkm.txt' is written
    into ruv_path. Count files and bam files are paired by their natural
    sort order. Nothing is returned.

    Side effect: the working directory of the process is changed, first to
    bam_path and finally to ruv_path.
    """
    os.chdir(bam_path)
    bams = natsorted([f for f in os.listdir(bam_path) if f.endswith('.bam')])

    # 1. get total count
    totalCount = []
    for b in bams:
        # context manager so the bam handle is closed after reading the count
        with pysam.AlignmentFile(b, 'rb') as bamHandle:
            totalCount.append(bamHandle.mapped)

    # 2. get rna_obj
    rna_df = pd.read_csv(exn_file, sep='\t', header=0, low_memory=False)
    rna_obj = trpr(rna_df)

    # 3. get length for each gene
    os.chdir(ruv_path)
    # Outputs of a previous run also end with '.txt'; skip them, otherwise a
    # re-run would treat them as inputs and shift the pairing with the bams.
    norm_count_files = natsorted([
        f for f in os.listdir(ruv_path)
        if f.endswith('.txt') and not f.endswith('fpkm.txt')
    ])

    # NOTE(review): zip stops at the shorter list, so a count file without a
    # matching bam (or vice versa) is silently ignored -- confirm intended.
    for fn, total in zip(norm_count_files, totalCount):
        df = pd.read_csv(fn, sep=' ', header=None, names=['geneid', 'count'],
                         index_col=0, low_memory=False)
        df['len'] = df.index.map(
            lambda x: rna_obj.get_gene_trpr_len(x, multi_chrom='Y'))
        # fpkm = count / (length / 10**3) / (total mapped / 10**6)
        df['fpkm'] = df['count'] / float(total) / df['len'] * 10**9
        # The last 20 rows are dropped by position (.ix was removed from
        # pandas; .iloc is the positional equivalent).
        # NOTE(review): presumably these rows are not genes -- confirm.
        df['fpkm'].iloc[:-20].to_csv(fn[:-3] + 'fpkm.txt', sep='\t')
def fpkm_from_htseq(bam_path, ruv_path, exn_file):
    """
    This function calculates fpkm from the htseq-count results.

    * bam_path: directory that has bam files. Used to get total mapped reads.
    * ruv_path: directory that has ruvseq corrected count data
      (space separated '.txt' files without header: geneid, count).
    * exn_file: tab separated file with a header row. 6 columns, including
      ['chr','start','end','geneid','traccess','strand'].

    For every count file '<name>.txt' a file '<name>.fpkm.txt' is written
    into ruv_path. Count files and bam files are paired by their natural
    sort order. Nothing is returned.

    Side effect: the working directory of the process is changed, first to
    bam_path and finally to ruv_path.
    """
    os.chdir(bam_path)
    bams = natsorted([f for f in os.listdir(bam_path) if f.endswith('.bam')])

    # 1. get total count
    totalCount = []
    for b in bams:
        # context manager so the bam handle is closed after reading the count
        with pysam.AlignmentFile(b, 'rb') as bamHandle:
            totalCount.append(bamHandle.mapped)

    # 2. get rna_obj
    rna_df = pd.read_csv(exn_file, sep='\t', header=0, low_memory=False)
    rna_obj = trpr(rna_df)

    # 3. get length for each gene
    os.chdir(ruv_path)
    # Outputs of a previous run also end with '.txt'; skip them, otherwise a
    # re-run would treat them as inputs and shift the pairing with the bams.
    norm_count_files = natsorted([
        f for f in os.listdir(ruv_path)
        if f.endswith('.txt') and not f.endswith('fpkm.txt')
    ])

    # NOTE(review): zip stops at the shorter list, so a count file without a
    # matching bam (or vice versa) is silently ignored -- confirm intended.
    for fn, total in zip(norm_count_files, totalCount):
        df = pd.read_csv(fn, sep=' ', header=None, names=['geneid', 'count'],
                         index_col=0, low_memory=False)
        df['len'] = df.index.map(
            lambda x: rna_obj.get_gene_trpr_len(x, multi_chrom='Y'))
        # fpkm = count / (length / 10**3) / (total mapped / 10**6)
        df['fpkm'] = df['count'] / float(total) / df['len'] * 10**9
        # The last 20 rows are dropped by position (.ix was removed from
        # pandas; .iloc is the positional equivalent).
        # NOTE(review): presumably these rows are not genes -- confirm.
        df['fpkm'].iloc[:-20].to_csv(fn[:-3] + 'fpkm.txt', sep='\t')