Пример #1
0
def AA_sequence(refDNA_dic,cds_df,gene,seq_type='AA'):
    """
    Build the nucleotide and translated sequence of every accession of a gene.

    * refDNA_dic: mapping from chromosome name to a record that exposes `.seq`.
    * cds_df: table with at least the columns 'geneid' and 'access'.
    * gene: value compared against the 'geneid' column.
    * seq_type: 'AA' returns the translated sequences, any other value
      returns the nucleotide sequences.
    Returns (sequences, accessions). Accessions are unique and sorted, and
    sequences[i] belongs to accessions[i].
    """
    # keep only the rows of the requested gene
    rows = cds_df[cds_df['geneid'].values==gene]
    accessions = sorted(set(rows['access'].tolist()))
    lookup = trpr(rows)
    nucleotides = []
    proteins = []
    for acc in accessions:
        chrom_name = lookup.get_chrom(acc,id_type='access')
        coords = lookup.get_trpr_pos(acc)
        chrom_seq = refDNA_dic[chrom_name].seq
        # every position is shifted by one before indexing
        # (presumably the positions are 1-based -- confirm against trpr)
        bases = ''.join(chrom_seq[c-1] for c in coords)
        dna = Seq(bases,generic_dna)
        # Descending positions: the bases were collected in descending order,
        # so complementing them yields the reverse complement
        # (presumably a minus-strand record -- confirm against trpr).
        if coords[0]>coords[1]:
            dna = dna.complement()
        # the translation is always computed, whatever seq_type is
        proteins.append(str(dna.translate()))
        nucleotides.append(str(dna))
    return (proteins if seq_type=='AA' else nucleotides),accessions
Пример #2
0
def AA_sequence(refDNA_dic, cds_df, gene, seq_type='AA'):
    """
    Collect per-accession sequences for one gene.

    * refDNA_dic: chromosome name -> record whose `.seq` holds the reference.
    * cds_df: table containing the columns 'geneid' and 'access'.
    * gene: gene id to select in the 'geneid' column.
    * seq_type: 'AA' for translated sequences; anything else gives the
      nucleotide sequences.
    Returns a tuple (sequences, accessions) where the accessions are sorted,
    de-duplicated and aligned index-by-index with the sequences.
    """
    is_gene = cds_df['geneid'].values == gene
    gene_rows = cds_df[is_gene]
    accession_ids = list(set(gene_rows['access'].tolist()))
    accession_ids.sort()
    helper = trpr(gene_rows)

    dna_strings = []
    aa_strings = []
    for accession in accession_ids:
        chromosome = helper.get_chrom(accession, id_type='access')
        positions = helper.get_trpr_pos(accession)
        reference = refDNA_dic[chromosome].seq
        # positions are decremented by one to index the reference
        # (presumably 1-based coordinates -- confirm against trpr)
        picked = [reference[position - 1] for position in positions]
        nt = Seq(''.join(picked), generic_dna)
        first, second = positions[0], positions[1]
        if first > second:
            # positions run downwards, so the picked bases are already in
            # reverse order; complementing completes the reverse complement
            nt = nt.complement()
        translated = str(nt.translate())
        dna_strings.append(str(nt))
        aa_strings.append(translated)

    if seq_type == 'AA':
        return aa_strings, accession_ids
    return dna_strings, accession_ids
Пример #3
0
def fpkm_from_htseq(bam_path, ruv_path, exn_file):
    """
    Calculate fpkm from the htseq-count results, one output file per sample.

    * bam_path: directory that has bam files. Used to get total mapped reads.
    * ruv_path: directory that has ruvseq corrected count data: '.txt' files,
      space separated, no header, two columns (gene id, count).
    * exn_file: tab separated file with a header row. 6 columns, including
      ['chr','start','end','geneid','traccess','strand'].

    Bam files and count files are both put in natural sort order and paired
    by position, so the two directories must list the samples in the same
    order. If the numbers differ, the surplus files are silently ignored.

    For every count file 'X.txt' a tab separated 'X.fpkm.txt' is written
    into ruv_path. Returns None.

    Side effect: the process working directory is left at ruv_path.
    """
    os.chdir(bam_path)
    bams = [f for f in os.listdir(bam_path) if f.endswith('.bam')]
    bams = natsorted(bams)
    # 1. get total mapped reads per bam
    total_counts = []
    for b in bams:
        # close every handle as soon as the count is read, so a directory
        # with many bam files does not exhaust open file descriptors
        with pysam.AlignmentFile(b, 'rb') as bam_handle:
            total_counts.append(bam_handle.mapped)
    # 2. get rna_obj
    rna_df = pd.read_csv(exn_file, sep='\t', header=0, low_memory=False)
    rna_obj = trpr(rna_df)
    # 3. get length for each gene and write fpkm
    os.chdir(ruv_path)
    # Outputs of an earlier run ('X.fpkm.txt') also end with '.txt'; they are
    # skipped so that a rerun still pairs each count file with its own bam.
    norm_count_files = [
        f for f in os.listdir(ruv_path)
        if f.endswith('.txt') and not f.endswith('fpkm.txt')
    ]
    norm_count_files = natsorted(norm_count_files)
    for fn, total in zip(norm_count_files, total_counts):
        df = pd.read_csv(fn,
                         sep=' ',
                         header=None,
                         names=['geneid', 'count'],
                         index_col=0,
                         low_memory=False)
        df['len'] = df.index.map(
            lambda x: rna_obj.get_gene_trpr_len(x, multi_chrom='Y'))
        # fpkm = count / total mapped reads / length * 1e9
        df['fpkm'] = df['count'] / float(total) / df['len'] * 10**9
        # Drop the last 20 rows by position.
        # NOTE(review): presumably trailing non-gene rows -- confirm.
        # `.iloc` replaces `.ix`, which was removed in pandas 1.0.
        df['fpkm'].iloc[:-20].to_csv(fn[:-3] + 'fpkm.txt', sep='\t')
Пример #4
0
def fpkm_from_htseq(bam_path,ruv_path,exn_file):
    """
    This function calculates fpkm from the htseq-count results.
    * bam_path: directory that has bam files. Used to get total mapped reads.
    * ruv_path: directory that has ruvseq corrected count data ('.txt' files,
      space separated, no header, columns: gene id and count).
    * exn_file: tab separated, with header. 6 columns, including
      ['chr','start','end','geneid','traccess','strand'].
    output: for each count file 'X.txt' a tab separated 'X.fpkm.txt' is
    written into ruv_path. Nothing is returned.

    Bam files and count files are matched by position after natural sorting;
    when the two lists differ in length the extra entries are ignored.
    Side effect: the working directory of the process ends up at ruv_path.
    """
    os.chdir(bam_path)
    bams = natsorted([f for f in os.listdir(bam_path) if f.endswith('.bam')])
    # 1. get total count
    totalCount = []
    for b in bams:
        # the context manager closes the bam file once its count is read;
        # the handle was previously left open for every file
        with pysam.AlignmentFile(b,'rb') as bamHandle:
            totalCount.append(bamHandle.mapped)
    # 2. get rna_obj
    rna_df = pd.read_csv(exn_file,sep='\t',header=0,low_memory=False)
    rna_obj = trpr(rna_df)
    # 3. get length for each gene
    os.chdir(ruv_path)
    # files written by a previous run end with 'fpkm.txt' and would otherwise
    # be picked up as inputs and shift the pairing with the bam totals
    norm_count_files = natsorted([f for f in os.listdir(ruv_path)
                                  if f.endswith('.txt') and not f.endswith('fpkm.txt')])
    for fn,total in zip(norm_count_files,totalCount):
        df = pd.read_csv(fn,sep=' ',header=None,names=['geneid','count'],index_col=0,low_memory=False)
        df['len'] = df.index.map(lambda x: rna_obj.get_gene_trpr_len(x,multi_chrom='Y'))
        df['fpkm'] = df['count']/float(total)/df['len']*10**9
        # positional slice that leaves out the last 20 rows
        # (NOTE(review): presumably trailing non-gene rows -- confirm);
        # `.ix` no longer exists in pandas >= 1.0, `.iloc` is its positional form
        df['fpkm'].iloc[:-20].to_csv(fn[:-3]+'fpkm.txt',sep='\t')