def check_consensus(gfname, faname):
    """
    checking the consensus sequence of the specific signal 

    @args gfname: genome annotation in gtf/gff file
    @type gfname: str  
    @args faname: genome sequence in fasta file 
    @type faname: str  
    """

    ## extract genome annotation from gtf file 
    gtf_file_content = GFFParser.Parse(gfname)
    print 'processed annotation file'

    ## signals considering 
    for signal in ['splice', 'tis']:
        
        gtf_db, feature_cnt = get_label_regions(gtf_file_content, signal)
        print 'extracted %d %s signal regions' % (feature_cnt, signal) 

        if signal == 'splice':

            don_true_seq, acc_true_seq, don_fal_seq, acc_fal_seq = true_ss_seq_fetch(faname, gtf_db, boundary=100)
            print 'summary of', signal, 'signal consensus'

            print 'don site cons %f non-cons %f' % (round((don_true_seq/feature_cnt)*100, 2), round((don_fal_seq/feature_cnt)*100, 2))
            print 'acc site cons %f non-cons %f' % (round((acc_true_seq/feature_cnt)*100, 2), round((acc_fal_seq/feature_cnt)*100, 2))

            break ## one singal 
# Example #2
# 0
def get_features(gtf_file, fa_file):
    """
    get the total number of genes based on the annotation source  
    # of coding genes 
    genes with 3' 5' UTR
    other different class of genes
    """

    anno_db = GFFParser.Parse(gtf_file) 
    total_genes = len(anno_db) 

    non_coding = 0
    cds_coding = 0
    total_transcripts = 0 
    utr = 0 
    utr5 = 0
    utr3 = 0
    no_utr = 0 

    for features in anno_db:
        total_transcripts += len(features['transcripts'])

        for trans in features['cds_exons']:
            if trans.any():
                cds_coding += 1
            else:
                non_coding += 1

        for idx, trans in enumerate(features['transcripts']):
            if features['utr3_exons'][idx].any() and features['utr5_exons'][idx].any():
                utr +=1
            elif features['utr3_exons'][idx].any():
                utr3 +=1
            elif features['utr5_exons'][idx].any():
                utr5 += 1
            else:
                no_utr +=1 
    
    print 'total genes: ', total_genes
    print 'total transcripts: ', total_transcripts
    print 'coding transcripts: ', cds_coding
    print 'noncoding transcripts: ', non_coding
    print '---------------'
    print 'both utrs', utr 
    print '3 utr ', utr3
    print '5 utr ', utr5
    print 'no utr ', no_utr
def trsk_gene_len_dist(gtf_file, out_file="hist_cds_len.pdf"):
    """
    plotting the histograms based on the gene and CDS length

    Records whose first transcript has no CDS are dropped. For the rest the
    gene length (stop - start) and the summed CDS exon length of the first
    transcript are binned into 10 equal-width bins and drawn as bar plots.
    plt.savefig is called once, after the CDS plot, so out_file receives the
    CDS length figure; the gene length plot is drawn but not saved.

    @args gtf_file: genome annotation file
    @type gtf_file: str
    @args out_file: file name of the saved figure
    @type out_file: str
    """
    import matplotlib.pyplot as plt

    def binned_frequency(lengths, column):
        ## the deprecated 'normed' keyword is no longer accepted by newer
        ## NumPy releases; the default call already returns raw counts
        freq, edges = np.histogram(lengths, bins=10)
        ## 10 bins have 11 edges, label each bin by its left edge
        return pd.DataFrame(freq, columns=[column], index=edges[:-1])

    anno_db = GFFParser.Parse(gtf_file)

    ## deleting the records with empty cds lines
    cds_idx = [
        idp for idp, feat in enumerate(anno_db)
        if not feat['cds_exons'][0].any()
    ]
    anno_db = np.delete(anno_db, cds_idx)

    ## column 0: gene length, column 1: cds length
    trans_len = np.zeros((len(anno_db), 2))
    for idx, feat in enumerate(anno_db):
        trans_len[idx, 0] = feat['stop'] - feat['start']
        ## NOTE(review): end - start per exon; if the coordinates are
        ## 1-based inclusive this is one short per exon - confirm
        trans_len[idx, 1] = sum(exc[1] - exc[0]
                                for exc in feat['cds_exons'][0])

    ## plotting the gene length based on the bins of gene length
    df_gene_len_bin = binned_frequency(trans_len[:, 0], 'gene_frequency')
    plt.figure()
    df_gene_len_bin.plot(kind="bar")

    ## plotting the cds length distribution
    df_cds_len_bin = binned_frequency(trans_len[:, 1], 'cds_frequency')
    plt.figure()
    df_cds_len_bin.plot(kind="bar")
    plt.savefig(out_file)
def translate_trsk_genes(gtf_file, fas_file, out_seq_fname):
    """
    translate the trsk genes to protein sequence 

    For every gene whose first transcript has CDS exons, the exon slices are
    concatenated from the matching fasta record, reverse complemented on the
    '-' strand, translated and written to out_seq_fname in fasta format.
    Exits through sys.exit when filecmp reports the two input files as equal.

    @args gtf_file: genome annotation file 
    @type gtf_file: str 
    @args fas_file: genome sequence file
    @type fas_file: str
    @args out_seq_fname: output file in fasta format 
    @type out_seq_fname: str
    """

    if filecmp.cmp(gtf_file, fas_file):
        ## sys.exit rather than the interactive-only exit() helper
        sys.exit("Do the two files are exactly same? Please check that!")

    ## reading the TSkim file to get the features
    sys.stdout.write('reading genome features from %s\n' % gtf_file)
    anno_db = GFFParser.Parse(gtf_file)
    total_genes = len(anno_db)

    ## genome sequence file reading
    sys.stdout.write('reading genome sequence from %s\n' % fas_file)
    seqlab.chrom_name_consistency(fas_file, anno_db)

    ## deleting the empty cds lines
    ## TSkim annotation expects only single transcript from a region
    cds_idx = [
        idp for idp, feat in enumerate(anno_db)
        if not feat['cds_exons'][0].any()
    ]
    anno_db = np.delete(anno_db, cds_idx)
    genes_with_cds = len(anno_db)

    ## group the genes by chromosome once, so each fasta record only visits
    ## its own genes instead of rescanning the whole annotation; the order
    ## of the genes within a chromosome is kept
    genes_by_chrom = {}
    for feature in anno_db:
        genes_by_chrom.setdefault(feature['chr'], []).append(feature)

    fasFH = helper.open_file(fas_file)
    try:
        with open(out_seq_fname, "w") as out_seq_fh:
            for rec in SeqIO.parse(fasFH, "fasta"):
                for feature in genes_by_chrom.get(rec.id, ()):
                    ## iterate over cds_exons, single transcript by TSkim
                    cds_seq = ''
                    for ex in feature['cds_exons'][0]:
                        ## the slice [start - 1:end] treats the exon
                        ## coordinates as 1-based and end-inclusive
                        cds_seq += rec.seq[ex[0] - 1:ex[1]]

                    if feature['strand'] == '-':
                        cds_seq = cds_seq.reverse_complement()

                    ## fasta output
                    if cds_seq:
                        prt_seq = SeqRecord(cds_seq.translate(),
                                            id=feature['name'],
                                            description='protein sequence')
                        out_seq_fh.write(prt_seq.format("fasta"))
    finally:
        ## the genome handle is released even when parsing or writing fails
        fasFH.close()

    sys.stdout.write('total genes fetched: %d\n' % total_genes)
    sys.stdout.write('total genes translated: %d\n' % genes_with_cds)
    sys.stdout.write('protein sequence stored at %s\n' % out_seq_fname)
# Example #5
# 0
def make_anno_db(gff_file):
    """
    extract the intron and exon size range from a gtf/gff file

    Only transcripts with more than one exon contribute. Returns a dict
    with the keys 'min_intron', 'max_intron', 'min_exon' and 'max_exon'.
    The max_* values are the third largest distinct size observed
    (presumably to discount outliers - confirm); with fewer than three
    distinct sizes the smallest one is used instead of failing.
    Prints an error and exits with status -1 when no intron was found.

    @args gff_file: genome annotation file
    @type gff_file: str 
    """
    import numpy as np

    def third_largest(sorted_sizes):
        ## clamp the index so a short list cannot raise IndexError
        return sorted_sizes[max(len(sorted_sizes) - 3, 0)]

    gff_cont = GFFParser.Parse(gff_file)

    intron_size = set()
    exon_size = set()

    for rec in gff_cont:
        for idx, tid in enumerate(rec['transcripts']):
            exons = rec['exons'][idx]

            if not exons.any():
                continue

            ## a transcript without exons can show up as a 0-d array such
            ## as array(nan): it has no length and nothing to measure
            if np.ndim(exons) == 0:
                continue

            if len(exons) < 2:
                continue

            intron_start = 0
            for xq, excod in enumerate(exons):
                if xq > 0:
                    gap = excod[0] - intron_start
                    ## a gap of exactly 1 is not recorded as an intron
                    if gap != 1:
                        intron_size.add(gap)

                ## first position after this exon
                intron_start = excod[1] + 1
                exon_size.add(intron_start - excod[0])

    if not intron_size:
        print "Error in feature mapping in file %s, please check the source of parent child features" % gff_file
        sys.exit(-1)

    keys_int = sorted(intron_size)
    keys_ex = sorted(exon_size)

    feat_db = dict()
    feat_db['min_intron'] = int(keys_int[0])
    feat_db['max_intron'] = int(third_largest(keys_int))

    feat_db['min_exon'] = int(keys_ex[0])
    feat_db['max_exon'] = int(third_largest(keys_ex))

    return feat_db
def filter_gene_models(gff_name, fas_file, outFile, min_orf_length=400,
                       min_coverage=10):
    """
    check the sequence consistency/quality of predicted fragment

    Multi-exon transcripts are kept when their read coverage value (if
    present) is at least min_coverage and their summed exon length is at
    least min_orf_length. For each kept transcript the exon boundaries are
    collected with the outer start and the outer end replaced by None. The
    collection, grouped by chromosome, is passed to
    check_splice_site_consensus and its result is handed to
    write_filter_gene_models together with the parsed annotation.

    @args gff_name: result file gff format from TranscriptSkimmer
    @type gff_name: str
    @args fas_file: genome sequence in fasta format
    @type fas_file: str 
    @args outFile: filtered gene output file 
    @type outFile: str 
    @args min_orf_length: minimum summed exon length of a transcript
    @type min_orf_length: int
    @args min_coverage: minimum read coverage value of a transcript
    @type min_coverage: int
    """

    sys.stdout.write('using genome sequence file %s\n' % fas_file)
    sys.stdout.write('using genome annotation file %s\n' % gff_name)

    sys.stdout.write("parsing genome annotation file...\n")
    ## getting the genome annotation from GFF file
    gff_content = GFFParser.Parse(gff_name)
    sys.stdout.write(" ...done\n")

    sys.stdout.write("screening for spliced transcripts...\n")
    orf_short = 0
    spliced_cand = 0
    sing_exon_gen = 0
    transcript_cov = 0

    transcripts_region = defaultdict(list)
    for gene_recd in gff_content:  ## screening the spliced transcripts
        spliced_transcript = defaultdict(list)

        for idx, sub_rec in enumerate(gene_recd['transcripts']):
            try:
                exons = gene_recd['exons'][idx]
                exon_cnt = len(exons)
            except (TypeError, IndexError):
                ## no exon list for this transcript (value without a
                ## length, or no entry at this index)
                continue

            if exon_cnt <= 1:  ## skipping the single-exon transcripts
                sing_exon_gen += 1
                #TODO orf length of the single exon gene will be good
                # to look, some histone genes are long enough to have
                # strong TSS region
                continue

            ## discarding the transcript based on the read coverage value
            cov_info = gene_recd['transcript_info'][idx]
            if cov_info:
                if float(numpy.atleast_1d(cov_info)[0]) < min_coverage:
                    transcript_cov += 1
                    continue

            trans_key = (gene_recd['name'], sub_rec[0], gene_recd['strand'])
            last_exon = exon_cnt - 1

            orf_length = 0
            for idk, ex in enumerate(exons):
                orf_length += ex[1] - (ex[0] - 1)

                if idk == 0:
                    ## first exon: only its end is kept
                    spliced_transcript[trans_key].append((None, ex[1]))
                elif idk == last_exon:
                    ## last exon: only its start is kept
                    spliced_transcript[trans_key].append((ex[0], None))
                else:
                    spliced_transcript[trans_key].append((ex[0], ex[1]))

            if orf_length < min_orf_length:
                ## clearing that transcript details
                del spliced_transcript[trans_key]
                orf_short += 1
                continue

            spliced_cand += 1

        if spliced_transcript:
            transcripts_region[gene_recd['chr']].append(spliced_transcript)

    sys.stdout.write("...considering %d spliced transcripts\n" % spliced_cand)
    sys.stdout.write(
        "discarding transcripts...\n\t%d transcripts with single exon\n" %
        sing_exon_gen)
    sys.stdout.write(
        "\t%d transcripts with read coverage value less than %s\n" %
        (transcript_cov, min_coverage))
    sys.stdout.write(
        "\t%d transcripts with orf region less than %s nucleotides\n" %
        (orf_short, min_orf_length))

    genemodels = check_splice_site_consensus(fas_file, transcripts_region)

    write_filter_gene_models(gff_content, genemodels, outFile)