def check_consensus(gfname, faname):
    """
    checking the consensus sequence of the specific signal

    Prints, for the splice signal, the donor and acceptor site values
    returned by true_ss_seq_fetch as a percentage of the number of
    extracted signal regions. Only the first entry of the signal list
    ('splice') is processed, because the loop stops after one signal.

    @args gfname: genome annotation in gtf/gff file
    @type gfname: str
    @args faname: genome sequence in fasta file
    @type faname: str
    """
    ## extract genome annotation from gtf file
    gtf_file_content = GFFParser.Parse(gfname)
    print('processed annotation file')

    ## signals considering
    for signal in ['splice', 'tis']:
        gtf_db, feature_cnt = get_label_regions(gtf_file_content, signal)
        print('extracted %d %s signal regions' % (feature_cnt, signal))

        if signal == 'splice':
            don_true_seq, acc_true_seq, don_fal_seq, acc_fal_seq = true_ss_seq_fetch(
                faname, gtf_db, boundary=100)

            print('summary of %s signal consensus' % signal)

            if not feature_cnt:
                # nothing to normalise by; avoid a ZeroDivisionError
                print('no %s signal regions available, skipping the summary' % signal)
            else:
                def percent(count):
                    # force float arithmetic: with integer counts (presumably what
                    # true_ss_seq_fetch returns) Python 2 division truncates to 0
                    return round(100.0 * count / feature_cnt, 2)

                print('don site cons %f non-cons %f' % (percent(don_true_seq), percent(don_fal_seq)))
                print('acc site cons %f non-cons %f' % (percent(acc_true_seq), percent(acc_fal_seq)))

        break ## one signal
def get_features(gtf_file, fa_file):
    """
    Print summary counts for the genes and transcripts of an annotation.

    Reported numbers: genes, transcripts, transcripts with and without
    CDS exons, and transcripts grouped by the UTR exons they carry
    (both, 3' only, 5' only, none).

    @args gtf_file: genome annotation file, read with GFFParser.Parse
    @type gtf_file: str
    @args fa_file: not used by this function
    @type fa_file: str
    """
    def report(label, count):
        # same text as a two-item print statement: label, one space, value
        print('%s %s' % (label, count))

    annotation = GFFParser.Parse(gtf_file)
    gene_total = len(annotation)

    transcript_total = 0
    with_cds = 0
    without_cds = 0
    # keyed by (has 3' UTR exons, has 5' UTR exons)
    utr_tally = {
        (True, True): 0,
        (True, False): 0,
        (False, True): 0,
        (False, False): 0,
    }

    for gene in annotation:
        transcript_total += len(gene['transcripts'])

        for cds in gene['cds_exons']:
            if cds.any():
                with_cds += 1
            else:
                without_cds += 1

        for pos in range(len(gene['transcripts'])):
            has_utr3 = bool(gene['utr3_exons'][pos].any())
            has_utr5 = bool(gene['utr5_exons'][pos].any())
            utr_tally[(has_utr3, has_utr5)] += 1

    report('total genes: ', gene_total)
    report('total transcripts: ', transcript_total)
    report('coding transcripts: ', with_cds)
    report('noncoding transcripts: ', without_cds)
    print('---------------')
    report('both utrs', utr_tally[(True, True)])
    report('3 utr ', utr_tally[(True, False)])
    report('5 utr ', utr_tally[(False, True)])
    report('no utr ', utr_tally[(False, False)])
def create_star_genome_index(fasta_file, out_dir, genome_anno=None, num_workers=1, onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    Compressed inputs (.bz2, .gz, .lzma) are rejected before anything else
    happens. The output directory is created when missing; when it already
    exists every file and sub-directory inside it is removed before the
    index job is started.

    @args fasta_file: reference genome sequence file .fasta format (uncompressed)
    @type fasta_file: str
    @args out_dir: genome index binary file storage place
    @type out_dir: str
    @args genome_anno: genome annotation file (optional, uncompressed)
    @type genome_anno: str
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int
    @args onematelength: One Mate Length, passed to --sjdbOverhang (default value=100)
    @type onematelength: int
    """
    compressed_ext = (".bz2", ".gz", ".lzma")

    ## STAR works with uncompressed files in this step
    if os.path.splitext(fasta_file)[1] in compressed_ext:
        sys.exit(
            "error: STAR - Generating genome indexes - recommended to use the uncompressed FASTA file %s."
            % fasta_file)
    if genome_anno and os.path.splitext(genome_anno)[1] in compressed_ext:
        sys.exit(
            "error: STAR - Generating genome indexes - recommended to use the uncompressed GTF/GFF file %s."
            % genome_anno)

    ## probe for the binary; output goes to the null device because
    ## subprocess.call never drains a PIPE
    try:
        with open(os.devnull, 'wb') as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        sys.exit("Please make sure that the `STAR` binary is in your $PATH")

    cli_cmd = [
        'STAR',
        '--runMode', 'genomeGenerate',
        '--genomeDir', out_dir,
        '--genomeFastaFiles', fasta_file,
        '--runThreadN', '%d' % num_workers,
    ]

    if genome_anno:
        ## check for the file type using the first feature line
        ftype = None
        feature_found = False
        gff_hand = helper.open_file(genome_anno)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')

                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ['#', '>']:
                    continue
                # skip the genome sequence
                if '\t' not in rec:
                    continue

                parts = rec.split('\t')
                if len(parts) < 8:
                    sys.exit("error: malformed feature line in the annotation file %s: %s"
                             % (genome_anno, rec))

                ftype, tags = GFFParser.attribute_tags(parts[-1])
                feature_found = True
                break
        finally:
            gff_hand.close()

        if not feature_found:
            sys.exit("error: no feature line found in the annotation file %s." % genome_anno)

        ## according to the file type
        cli_cmd.extend(['--sjdbGTFfile', genome_anno])
        if ftype:
            # attribute_tags flagged the file (presumably GFF3): exons name
            # their transcript through the Parent attribute
            cli_cmd.extend(['--sjdbGTFtagExonParentTranscript', 'Parent'])
        else:
            cli_cmd.extend(['--sjdbGTFfeatureExon', 'exon'])
        cli_cmd.extend(['--sjdbOverhang', '%d' % onematelength])

    ## create downloadpath if doesnot exists
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            # non-zero exit status: the index cannot be written
            sys.exit("error: cannot create the directory %s." % out_dir)
    else:
        ## if present any other old index files clean up the folder
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                # best effort clean up: report and carry on with the next entry
                print(e)

    ## start the index job; the argument list is handed over without a shell
    sys.stdout.write('\trunning STAR program as: %s \n' % ' '.join(cli_cmd))
    try:
        returncode = subprocess.call(cli_cmd)
    except OSError as e:
        sys.exit("Error running STAR.\n%s" % str(e))
    if returncode != 0:
        sys.exit("Error running STAR.\nExit status return code = %i" % returncode)

    sys.stdout.write("STAR genome index files stored at %s\n" % out_dir)
def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program

    Keys read from org_db: 'genome_index_dir', 'gtf' (None to align without
    annotation), 'fastq_path', 'fastq', 'read_map_dir', 'short_name' and
    'max_intron_len'. The process exits with an error message when STAR is
    missing or returns a non-zero status.

    @args org_db: a python dictionary with all details about a single organism
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str
    @args max_mates_gap_length: maximum insert size from the sample (default: 100000);
                                currently not passed on to STAR
    @type max_mates_gap_length: int
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int
    """
    ## probe for the binary; output goes to the null device because
    ## subprocess.call never drains a PIPE
    try:
        with open(os.devnull, 'wb') as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        sys.exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir']  ## genome indices and annotation file
    gtf_db = org_db['gtf']

    ftype = None
    if gtf_db is not None:
        ## check for the annotation file type gff or gtf
        feature_found = False
        gff_hand = helper.open_file(gtf_db)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')

                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ['#', '>']:
                    continue
                # skip the genome sequence
                if '\t' not in rec:
                    continue

                parts = rec.split('\t')
                if len(parts) < 8:
                    sys.exit("error: malformed feature line in the annotation file %s: %s"
                             % (gtf_db, rec))

                ftype, tags = GFFParser.attribute_tags(parts[-1])
                feature_found = True
                break
        finally:
            gff_hand.close()

        if not feature_found:
            sys.exit("error: no feature line found in the annotation file %s." % gtf_db)

    ## library type
    if read_type == 'PE':
        read_file = "%s/%s %s/%s" % (org_db['fastq_path'], org_db['fastq'][0],
                                     org_db['fastq_path'], org_db['fastq'][1])
    else:
        read_file = "%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])

    ## getting the command to uncompress the read file; both commands must
    ## decompress to stdout (gzip without -d would compress the input again)
    zip_type = {".gz": "gzip -d -c", ".bz2": "bzip2 -d -c"}
    file_prefx, ext = os.path.splitext(org_db['fastq'][0])

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    ## genomic feature information
    max_lenth_intron = org_db['max_intron_len']

    star_opts = [
        "STAR",
        "--genomeDir %s" % genome_dir,
        "--readFilesIn %s" % read_file,
    ]
    if ext in zip_type:
        # any other extension is treated as plain text, which STAR reads directly
        star_opts.append("--readFilesCommand %s" % zip_type[ext])
    star_opts.extend([
        "--outFileNamePrefix %s" % out_prefix,
        "--runThreadN %d" % num_cpus,
        "--outFilterMultimapScoreRange 2",
        "--outFilterMultimapNmax 30",
        "--outFilterMismatchNmax 4",
    ])

    ## according to the file type
    if gtf_db is not None:
        star_opts.append("--alignIntronMax %d" % max_lenth_intron)
        star_opts.append("--sjdbGTFfile %s" % gtf_db)
        if ftype:
            # attribute_tags flagged the file (presumably GFF3): exons name
            # their transcript through the Parent attribute
            star_opts.append("--sjdbGTFtagExonParentTranscript Parent")
        else:
            star_opts.append("--sjdbGTFfeatureExon exon")

    star_opts.extend([
        "--sjdbScore 1",
        "--sjdbOverhang 5",
        "--outSAMstrandField intronMotif",
        "--outFilterIntronMotifs RemoveNoncanonical",
        "--outSAMtype BAM Unsorted",
        "--genomeLoad LoadAndRemove",
    ])

    # NOTE(review): the values from org_db are placed unquoted into a shell
    # command line; they must not contain spaces or shell metacharacters
    make_star_run = " ".join(star_opts)
    sys.stdout.write('\trunning STAR program as: %s \n' % make_star_run)

    try:
        returncode = subprocess.call(make_star_run, shell=True)
    except Exception as e:
        sys.exit("Error running STAR.\n%s" % str(e))
    if returncode != 0:
        sys.exit("Error running STAR.\nExit status return code = %i" % returncode)

    sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)
def trsk_gene_len_dist(gtf_file, out_file="hist_cds_len.pdf", gene_out_file=None):
    """
    plotting the histograms bases on the genes and CDS length

    Genes whose first transcript has no CDS exons are left out. Lengths are
    counted with inclusive coordinates (stop - start + 1) and binned into
    10 equal-width bins; each bar is labelled with the left edge of its bin.

    @args gtf_file: genome annotation file
    @type gtf_file: str
    @args out_file: file name for the CDS length histogram (default: hist_cds_len.pdf)
    @type out_file: str
    @args gene_out_file: file name for the gene length histogram; the plot
                         is only produced when a name is given (default: None)
    @type gene_out_file: str
    """
    import matplotlib.pyplot as plt

    def save_length_histogram(lengths, column, fname):
        # `normed` is no longer accepted by np.histogram; plain counts are the default
        freq, edges = np.histogram(lengths, bins=10)
        frame = pd.DataFrame(freq, columns=[column], index=edges[:-1])

        fig, axis = plt.subplots()
        frame.plot(kind="bar", ax=axis)
        fig.savefig(fname)
        # release the figure, otherwise repeated calls keep piling them up
        plt.close(fig)

    anno_db = GFFParser.Parse(gtf_file)

    # deleting the empty cds lines
    cds_idx = [idp for idp, feat in enumerate(anno_db) if not feat['cds_exons'][0].any()]
    anno_db = np.delete(anno_db, cds_idx)

    ## gene, cds length information: column 0 gene length, column 1 cds length
    trans_len = np.zeros((len(anno_db), 2))
    for idx, feat in enumerate(anno_db):
        trans_len[idx, 0] = feat['stop'] - feat['start'] + 1
        trans_len[idx, 1] = sum(exc[1] - exc[0] + 1 for exc in feat['cds_exons'][0])

    ## plotting the gene length based on the bins of gene length
    if gene_out_file is not None:
        save_length_histogram(trans_len[:, 0], 'gene_frequency', gene_out_file)

    ## plotting the cds length distribution
    save_length_histogram(trans_len[:, 1], 'cds_frequency', out_file)
def translate_trsk_genes(gtf_file, fas_file, out_seq_fname):
    """
    translate the trsk genes to protein sequence

    Only the first transcript of each gene is used, and genes whose first
    transcript has no CDS exons are skipped. Exon coordinates are read as
    1-based inclusive positions on the chromosome sequence; genes on the
    '-' strand are reverse complemented before translation.

    @args gtf_file: genome annotation file
    @type gtf_file: str
    @args fas_file: genome sequence file
    @type fas_file: str
    @args out_seq_fname: output file in fasta format
    @type out_seq_fname: str
    """
    if filecmp.cmp(gtf_file, fas_file):
        sys.exit("Do the two files are exactly same? Please check that!")

    ## reading the TSkim file to get the features
    sys.stdout.write('reading genome features from %s\n' % gtf_file)
    anno_db = GFFParser.Parse(gtf_file)
    total_genes = len(anno_db)

    ## genome sequence file reading
    sys.stdout.write('reading genome sequence from %s\n' % fas_file)
    seqlab.chrom_name_consistency(fas_file, anno_db)

    # deleting the empty cds lines
    # TSkim annotation expects only single transcript from a region
    cds_idx = [idp for idp, feat in enumerate(anno_db) if not feat['cds_exons'][0].any()]
    anno_db = np.delete(anno_db, cds_idx)

    # group the genes by chromosome once, so every fasta record only visits
    # its own genes instead of scanning the whole annotation again
    genes_by_chrom = {}
    for feature in anno_db:
        genes_by_chrom.setdefault(feature['chr'], []).append(feature)

    genes_translated = 0
    fasFH = helper.open_file(fas_file)
    try:
        out_seq_fh = open(out_seq_fname, "w")
        try:
            # iterate over chromosome
            for rec in SeqIO.parse(fasFH, "fasta"):
                for feature in genes_by_chrom.get(rec.id, ()):
                    ## iterate over cds_exons
                    cds_seq = ''
                    for ex in feature['cds_exons'][0]:  ## single transcript by TSkim
                        cds_seq += rec.seq[ex[0] - 1:ex[1]]

                    if not cds_seq:
                        continue
                    if feature['strand'] == '-':
                        cds_seq = cds_seq.reverse_complement()

                    ## fasta output
                    prt_seq = SeqRecord(cds_seq.translate(),
                                        id=feature['name'],
                                        description='protein sequence')
                    out_seq_fh.write(prt_seq.format("fasta"))
                    genes_translated += 1
        finally:
            out_seq_fh.close()
    finally:
        fasFH.close()

    sys.stdout.write('total genes fetched: %d\n' % total_genes)
    sys.stdout.write('total genes translated: %d\n' % genes_translated)
    sys.stdout.write('protein sequence stored at %s\n' % out_seq_fname)
def create_star_genome_index(fasta_file, out_dir, genome_anno=None, num_workers=1, onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    Compressed inputs (.bz2, .gz, .lzma) are rejected before anything else
    happens. The output directory is created when missing; when it already
    exists every file and sub-directory inside it is removed before the
    index job is started.

    @args fasta_file: reference genome sequence file .fasta format (uncompressed)
    @type fasta_file: str
    @args out_dir: genome index binary file storage place
    @type out_dir: str
    @args genome_anno: genome annotation file (optional, uncompressed)
    @type genome_anno: str
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int
    @args onematelength: One Mate Length, passed to --sjdbOverhang (default value=100)
    @type onematelength: int
    """
    compressed_ext = (".bz2", ".gz", ".lzma")

    ## checking for the compressed form of the file extension
    if os.path.splitext(fasta_file)[1] in compressed_ext:
        exit(
            "error: STAR - Generating genome indexes - recommended to use the uncompressed FASTA file %s."
            % fasta_file)
    if genome_anno and os.path.splitext(genome_anno)[1] in compressed_ext:
        exit(
            "error: STAR - Generating genome indexes - recommended to use the uncompressed GTF/GFF file %s."
            % genome_anno)

    ## probe for the binary; output goes to the null device because
    ## subprocess.call never drains a PIPE
    try:
        with open(os.devnull, 'wb') as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    cli_cmd = [
        'STAR',
        '--runMode', 'genomeGenerate',
        '--genomeDir', out_dir,
        '--genomeFastaFiles', fasta_file,
        '--runThreadN', '%d' % num_workers,
    ]

    if genome_anno:
        ## check for the file type using the first feature line
        ftype = None
        feature_found = False
        gff_hand = helper.open_file(genome_anno)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')

                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ['#', '>']:
                    continue
                # skip the genome sequence
                if '\t' not in rec:
                    continue

                parts = rec.split('\t')
                if len(parts) < 8:
                    exit("error: malformed feature line in the annotation file %s: %s"
                         % (genome_anno, rec))

                ftype, tags = GFFParser.attribute_tags(parts[-1])
                feature_found = True
                break
        finally:
            gff_hand.close()

        if not feature_found:
            exit("error: no feature line found in the annotation file %s." % genome_anno)

        ## according to the file type
        cli_cmd.extend(['--sjdbGTFfile', genome_anno])
        if ftype:
            # attribute_tags flagged the file (presumably GFF3): exons name
            # their transcript through the Parent attribute
            cli_cmd.extend(['--sjdbGTFtagExonParentTranscript', 'Parent'])
        else:
            cli_cmd.extend(['--sjdbGTFfeatureExon', 'exon'])
        cli_cmd.extend(['--sjdbOverhang', '%d' % onematelength])

    ## create downloadpath if doesnot exists
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            exit("error: cannot create the directory %s." % out_dir)
    else:
        ## if present any other old index files clean up the folder
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                # best effort clean up: report and carry on with the next entry
                print(e)

    ## start the index job; the argument list is handed over without a shell
    print('\trunning STAR program as: %s ' % ' '.join(cli_cmd))
    try:
        returncode = subprocess.call(cli_cmd)
    except OSError as e:
        exit("Error running STAR.\n%s" % str(e))
    if returncode != 0:
        exit("Error running STAR.\nExit status return code = %i" % returncode)

    print("STAR genome index files stored at %s" % out_dir)
def make_anno_db(gff_file):
    """
    extract the features from a gtf/gff file and store efficiently to query

    Collects the distinct exon and intron sizes of every transcript with
    more than one exon and returns a dictionary with the keys
    'min_intron', 'max_intron', 'min_exon' and 'max_exon'. The maxima are
    the third largest distinct size (presumably to ignore a few extreme
    values), or the largest one when fewer than three sizes were seen.
    The program exits with status -1 when no intron was found.

    @args gff_file: genome annotation file
    @type gff_file: str
    """
    def trimmed_max(sorted_sizes):
        # third largest distinct value when available, else the largest
        if len(sorted_sizes) >= 3:
            return sorted_sizes[-3]
        return sorted_sizes[-1]

    gff_cont = GFFParser.Parse(gff_file)

    intron_size = set()
    exon_size = set()

    for rec in gff_cont:
        for idx, tid in enumerate(rec['transcripts']):
            exons = rec['exons'][idx]

            if not exons.any():
                continue

            try:
                exon_cnt = len(exons)
            except TypeError:
                # entry without a length, e.g. rec['exons'][0] -> array(nan)
                continue

            if exon_cnt <= 1:
                continue

            intron_start = 0
            for xq, excod in enumerate(exons):
                # exons starting exactly one position after intron_start are
                # not recorded as an intron
                if xq > 0 and excod[0] - intron_start != 1:
                    intron_size.add(excod[0] - intron_start)

                intron_start = excod[1] + 1
                exon_size.add(intron_start - excod[0])

    if not intron_size:
        print("Error in feature mapping in file %s, please check the source of parent child features" % gff_file)
        sys.exit(-1)

    keys_int = sorted(intron_size)
    keys_ex = sorted(exon_size)

    feat_db = dict()
    feat_db['min_intron'] = int(keys_int[0])
    feat_db['max_intron'] = int(trimmed_max(keys_int))
    feat_db['min_exon'] = int(keys_ex[0])
    feat_db['max_exon'] = int(trimmed_max(keys_ex))

    return feat_db
def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program

    Keys read from org_db: 'genome_index_dir', 'gtf' (None to align without
    annotation), 'fastq_path', 'fastq', 'read_map_dir', 'short_name',
    'max_intron_len' and 'mate_length' (used for --sjdbOverhang). The
    process exits with an error message when STAR is missing or returns a
    non-zero status.

    @args org_db: a python dictionary with all details about a single organism
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str
    @args max_mates_gap_length: maximum insert size from the sample (default: 100000);
                                currently not passed on to STAR
    @type max_mates_gap_length: int
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int
    """
    ## probe for the binary; output goes to the null device because
    ## subprocess.call never drains a PIPE
    try:
        with open(os.devnull, 'wb') as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        sys.exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir']  ## genome indices and annotation file
    gtf_db = org_db['gtf']

    ftype = None
    if gtf_db is not None:
        ## check for the annotation file type gff or gtf
        feature_found = False
        gff_hand = helper.open_file(gtf_db)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')

                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ['#', '>']:
                    continue
                # skip the genome sequence
                if '\t' not in rec:
                    continue

                parts = rec.split('\t')
                if len(parts) < 8:
                    sys.exit("error: malformed feature line in the annotation file %s: %s"
                             % (gtf_db, rec))

                ftype, tags = GFFParser.attribute_tags(parts[-1])
                feature_found = True
                break
        finally:
            gff_hand.close()

        if not feature_found:
            sys.exit("error: no feature line found in the annotation file %s." % gtf_db)

    ## library type
    if read_type == 'PE':
        read_file = "%s/%s %s/%s" % (org_db['fastq_path'], org_db['fastq'][0],
                                     org_db['fastq_path'], org_db['fastq'][1])
    else:
        read_file = "%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])

    ## getting the command to uncompress the read file; both commands must
    ## decompress to stdout (gzip without -d would compress the input again)
    zip_type = {".gz": "gzip -d -c", ".bz2": "bzip2 -d -c"}
    file_prefx, ext = os.path.splitext(org_db['fastq'][0])

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    ## genomic feature information
    max_lenth_intron = org_db['max_intron_len']

    ##sjdbOverhang
    mate_len = org_db['mate_length']

    star_opts = [
        "STAR",
        "--genomeDir %s" % genome_dir,
        "--readFilesIn %s" % read_file,
    ]
    if ext in zip_type:
        # any other extension is treated as plain text, which STAR reads directly
        star_opts.append("--readFilesCommand %s" % zip_type[ext])
    star_opts.extend([
        "--outFileNamePrefix %s" % out_prefix,
        "--runThreadN %d" % num_cpus,
        "--outFilterMultimapScoreRange 2",
        "--outFilterMultimapNmax 30",
        "--outFilterMismatchNmax 3",
    ])

    ## according to the file type
    if gtf_db is not None:
        star_opts.append("--alignIntronMax %d" % max_lenth_intron)
        star_opts.append("--sjdbGTFfile %s" % gtf_db)
        if ftype:
            # attribute_tags flagged the file (presumably GFF3): exons name
            # their transcript through the Parent attribute
            star_opts.append("--sjdbGTFtagExonParentTranscript Parent")
        else:
            star_opts.append("--sjdbGTFfeatureExon exon")

    star_opts.extend([
        "--sjdbScore 1",
        "--sjdbOverhang %d" % mate_len,
        "--outSAMstrandField intronMotif",
        "--outFilterIntronMotifs RemoveNoncanonical",
        "--outSAMtype BAM Unsorted",
        "--genomeLoad LoadAndRemove",
    ])

    # NOTE(review): the values from org_db are placed unquoted into a shell
    # command line; they must not contain spaces or shell metacharacters
    make_star_run = " ".join(star_opts)
    sys.stdout.write('\trunning STAR program as: %s \n' % make_star_run)

    try:
        returncode = subprocess.call(make_star_run, shell=True)
    except Exception as e:
        sys.exit("Error running STAR.\n%s" % str(e))
    if returncode != 0:
        sys.exit("Error running STAR.\nExit status return code = %i" % returncode)

    sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)
def filter_gene_models(gff_name, fas_file, outFile, min_orf_length=400, min_read_coverage=10):
    """
    check the sequence consistency/quality of predicted fragment

    Keeps the transcripts with more than one exon, a read coverage value
    of at least min_read_coverage (when a value is present) and a summed
    exon length of at least min_orf_length. The kept transcripts are
    grouped by chromosome, handed to check_splice_site_consensus and the
    result is written with write_filter_gene_models.

    @args gff_name: result file gff format from TranscriptSkimmer
    @type gff_name: str
    @args fas_file: genome sequence in fasta format
    @type fas_file: str
    @args outFile: filtered gene output file
    @type outFile: str
    @args min_orf_length: minimum summed exon length of a transcript (default: 400)
    @type min_orf_length: int
    @args min_read_coverage: minimum read coverage value of a transcript (default: 10)
    @type min_read_coverage: int
    """
    sys.stdout.write('using genome sequence file %s\n' % fas_file)
    sys.stdout.write('using genome annotation file %s\n' % gff_name)

    sys.stdout.write("parsing genome annotation file...\n")
    gff_content = GFFParser.Parse(gff_name)  ## getting the genome annotation from GFF file
    sys.stdout.write(" ...done\n")

    sys.stdout.write("screening for spliced transcripts...\n")
    orf_short = 0
    spliced_cand = 0
    sing_exon_gen = 0
    transcript_cov = 0

    transcripts_region = defaultdict(list)
    for gene_recd in gff_content:  ## screening the spliced transcripts
        spliced_transcript = defaultdict(list)

        for idx, sub_rec in enumerate(gene_recd['transcripts']):
            try:
                exon_cnt = len(gene_recd['exons'][idx])
            except (TypeError, IndexError):
                # no usable exon record for this transcript
                continue

            if exon_cnt <= 1:  ## skipping the single-exon transcripts
                # TODO orf length of the single exon gene will be good to look,
                # some histone genes are long enough to have strong TSS region
                sing_exon_gen += 1
                continue

            ## discarding the transcript based on the read coverage value
            coverage_info = gene_recd['transcript_info'][idx]
            if coverage_info:
                if float(numpy.atleast_1d(coverage_info)[0]) < min_read_coverage:
                    transcript_cov += 1
                    continue

            exons = gene_recd['exons'][idx]

            ## min orf length for the transcripts, checked before anything is stored
            orf_length = sum(ex[1] - (ex[0] - 1) for ex in exons)
            if orf_length < min_orf_length:
                orf_short += 1
                continue

            trans_key = (gene_recd['name'], sub_rec[0], gene_recd['strand'])
            last_idx = exon_cnt - 1
            for idk, ex in enumerate(exons):
                if idk == 0:
                    # start of the first exon is left open
                    spliced_transcript[trans_key].append((None, ex[1]))
                elif idk == last_idx:
                    # end of the last exon is left open
                    spliced_transcript[trans_key].append((ex[0], None))
                else:
                    spliced_transcript[trans_key].append((ex[0], ex[1]))

            spliced_cand += 1

        if spliced_transcript:
            transcripts_region[gene_recd['chr']].append(spliced_transcript)

    sys.stdout.write("...considering %d spliced transcripts\n" % spliced_cand)
    sys.stdout.write(
        "discarding transcripts...\n\t%d transcripts with single exon\n" % sing_exon_gen)
    sys.stdout.write(
        "\t%d transcripts with read coverage value less than %s\n"
        % (transcript_cov, min_read_coverage))
    sys.stdout.write(
        "\t%d transcripts with orf region less than %d nucleotides\n"
        % (orf_short, min_orf_length))

    genemodels = check_splice_site_consensus(fas_file, transcripts_region)

    write_filter_gene_models(gff_content, genemodels, outFile)