def calculate_total_lengths(db_file, species):
    """
    Run the total CDS, UTR and intron length calculations for a GTF database.

    According to the original author's note, the results (number of bases
    labeled as CDS, UTR and intron) are printed and can be used to generate
    ratios for region-distribution pie charts.

    :param db_file: database file handed to ``gffutils.FeatureDB``.
    :param species: string used to generate the keys corresponding to the
        GTF column nomenclature (via ``af.get_keys``).
    :return: None
    """
    feature_keys = af.get_keys(species)
    annotation_db = gffutils.FeatureDB(db_file)

    # Lookup tables needed by the UTR and intron calculations.
    all_cds = af.get_all_cds_dict(annotation_db, feature_keys['cds'])
    all_exons = af.get_all_exons_dict(
        annotation_db,
        exon_key=feature_keys['exon'],
        transcript_id_key=feature_keys['transcript_id'],
    )
    all_transcripts = af.get_all_transcripts_dict(
        annotation_db,
        transcript_key=feature_keys['transcript'],
        transcript_id_key=feature_keys['transcript_id'],
    )

    calculate_total_cds_length(annotation_db, feature_keys)
    calculate_total_utr_lengths(annotation_db, all_cds, feature_keys)
    calculate_total_intron_lengths(
        annotation_db, all_exons, all_transcripts, feature_keys)
def getCDScoords(gff):
    """
    Collect the CDS coordinates of the longest-CDS mRNA of every gene.

    A gffutils database named ``<basename of gff>.db`` is created in the
    current working directory when it does not exist yet, and that file is
    removed again before the function returns.

    :param gff: path of the GFF annotation file.
    :return: dict mapping ``<gene id>_<chrom>_<strand>`` to a list of
        ``[start, end]`` pairs (one per CDS feature, ordered by start) taken
        from the mRNA whose CDS features sum to the greatest length.
    """
    allCDScoords = {}  # {ENSGENE_chrm_strand : [[cdsstart, cdsstop], ...]}
    genecount = 0
    noCDSexonscount = 0

    # Make gff database
    print('Indexing gff...')
    db_fn = os.path.basename(gff) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff, db_fn, merge_strategy='merge')
    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    for gene in db.features_of_type('gene'):
        genecount += 1

        # Genes without any mRNA child (for example a ncRNA) are skipped.
        transcripts = list(
            db.children(gene, featuretype='mRNA', order_by='start'))
        if not transcripts:
            noCDSexonscount += 1
            continue

        CDSlengths = {}  # {transcriptID : combined length of CDS features}
        CDScoords = {}   # {transcriptID : [[cdsstart, cdsstop], ...]}
        for transcript in transcripts:
            transcriptID = str(transcript.id)
            coords = [
                [codingexon.start, codingexon.end]
                for codingexon in db.children(
                    transcript, featuretype='CDS', order_by='start')
            ]
            CDScoords[transcriptID] = coords
            # GFF intervals are 1-based and inclusive, so each segment is
            # end - start + 1 bases long; omitting the +1 under-counts
            # transcripts that have more CDS segments.
            CDSlengths[transcriptID] = sum(
                stop - start + 1 for start, stop in coords)

        longestcds = max(CDSlengths, key=CDSlengths.get)
        genename = str(gene.id)
        chrm = str(gene.chrom)
        strand = gene.strand
        allCDScoords[genename + '_' + chrm + '_' + strand] = \
            CDScoords[longestcds]

    os.remove(db_fn)

    print('Looked through {0} genes. {1} of them had no coding exons. Found longest CDS sequences for {2} of them.'.format(
        genecount, noCDSexonscount, len(allCDScoords)))

    return allCDScoords
def openGTF(gtfPath, verbose=True):
    """
    Open the gffutils database stored next to a GTF file, building it if
    opening fails.

    The database lives at ``<gtfPath>.db``. If ``gffutils.FeatureDB`` raises
    ``ValueError`` for that path (presumably because the file is missing),
    the database is created from the GTF with gene/transcript inference
    disabled and then opened.

    :param gtfPath: path of the GTF file.
    :param verbose: when true, ``eprint`` announces the indexing step.
    :return: ``gffutils.FeatureDB`` opened with ``keep_order=True``.
    """
    dbPath = "{}.db".format(gtfPath)
    try:
        gtf = gffutils.FeatureDB(dbPath, keep_order=True)
    except ValueError:
        if verbose:
            eprint("Indexing...")
        buildOptions = dict(
            dbfn=dbPath,
            force=True,
            keep_order=True,
            disable_infer_genes=True,
            disable_infer_transcripts=True,
            merge_strategy='merge',
            sort_attribute_values=True,
        )
        gffutils.create_db(gtfPath, **buildOptions)
        # Re-open from disk so the caller always gets a file-backed handle.
        gtf = gffutils.FeatureDB(dbPath, keep_order=True)
    return gtf
def open_gtf(gtf_path):
    """
    Open (or build) the gffutils database that belongs to a GTF file.

    The GTF is scanned first to find out whether it already contains
    ``gene`` and ``transcript`` features. Those findings decide whether
    gffutils must infer them when the database has to be created.

    :param gtf_path: path of the GTF file; the database is ``<gtf_path>.db``.
    :return: a ``gffutils.FeatureDB`` (the existing one, or the freshly
        created one when opening raised ``ValueError``).
    """
    with_genes = False
    with_transcripts = False
    # The first feature is not always a gene, so keep scanning until both
    # kinds have been seen (or the file ends).
    with open(gtf_path) as gtf_handle:
        for line in gtf_handle:
            if line.startswith('#'):
                continue
            fields = line.split("\t")
            if len(fields) < 3:
                # Blank or malformed line: there is no feature-type column.
                continue
            feat_type = fields[2]
            if feat_type == "gene":
                with_genes = True
            elif feat_type == "transcript":
                with_transcripts = True
            if with_genes and with_transcripts:
                # Nothing more can be learned from the rest of the file.
                break

    db_path = "{}.db".format(gtf_path)
    try:
        gtf = gffutils.FeatureDB(db_path, keep_order=True)
    except ValueError:
        gtf = gffutils.create_db(gtf_path,
                                 dbfn=db_path,
                                 force=True,
                                 keep_order=True,
                                 disable_infer_genes=with_genes,
                                 disable_infer_transcripts=with_transcripts,
                                 merge_strategy="merge",
                                 sort_attribute_values=True)
    return gtf
def __init__(self, data, dbfn=None, id_column='id', csv2rec_kwargs=None):
    """
    Generic class for handling tables of data.

    :param data:
        If a string, it is treated as a filename and loaded with `csv2rec`
        using `csv2rec_kwargs`. Anything else is stored as given (a record
        array is expected).

    :param dbfn:
        Filename for a `gffutils.FeatureDB`. Optional; when given, the
        database is opened and kept on the instance.

    :param id_column:
        Which column contains gene accessions that can be looked up in the
        `gffutils.FeatureDB`.

    :param csv2rec_kwargs:
        Kwargs passed to `matplotlib.mlab.csv2rec`. Default is
        dict(delimiter="\\t", missing="NA").
    """
    if isinstance(data, basestring):
        # The loader options only matter when a file actually gets read.
        if csv2rec_kwargs is None:
            csv2rec_kwargs = dict(delimiter='\t', missing='NA')
        data = csv2rec(data, **csv2rec_kwargs)

    self.id_column = id_column
    self.data = data
    self.dbfn = dbfn
    self.gffdb = gffutils.FeatureDB(dbfn) if self.dbfn else None
    self._cached_lookup = None
def extract_gene_gff(all_row_gene_list_file, strain_db_dir_path, contig_path, all_row_gene_fasta_dir):
    """
    Write the sequences of all genes listed in a file into one FASTA file.

    Each line of ``all_row_gene_list_file`` is an ID of the form
    ``<strain>_<gene id>``. ``MGG`` entries are copied from the indexed
    reference FASTA; every other entry is extracted from
    ``<contig_path>/<strain>.fasta`` using the feature stored in that
    strain's gffutils database.

    Nothing is written when the output file already exists.

    :param all_row_gene_list_file: ``pathlib.Path``-like file with one ID
        per line.
    :param strain_db_dir_path: directory (path object) whose entries are
        gffutils database files named after the strain.
    :param contig_path: directory (path object) holding ``<strain>.fasta``.
    :param all_row_gene_fasta_dir: directory (path object) for the output.
    :return: None
    """
    # The databases are registered as module globals named "<stem>_db"
    # (and "MGG_db"); kept that way in case other code looks them up.
    global_names = globals()
    for db_file in strain_db_dir_path.iterdir():
        global_names[db_file.stem.strip() + "_db"] = gffutils.FeatureDB(db_file)
    global_names["MGG_db"] = SeqIO.index(
        "../../70-15_refference_genome/magnaporthe_oryzae_70-15_8_genes.fasta",
        "fasta")

    gene_file_path = all_row_gene_fasta_dir / (
        all_row_gene_list_file.stem + "_all_row_gene_fasta.fasta")
    if gene_file_path.exists():
        return

    with gene_file_path.open("w") as out_fl, \
            all_row_gene_list_file.open() as id_lines:
        for line in id_lines:
            # Drop the line terminator: it must not end up in the database
            # key or in the FASTA header.
            protein_id = line.strip()
            if not protein_id:
                continue

            id_parts = protein_id.split("_", 1)
            strain_id = id_parts[0]
            gff_protein_id = id_parts[1]

            if strain_id == "MGG":
                reference_record = global_names["MGG_db"][protein_id[0:9]]
                record = SeqRecord(
                    reference_record.seq,
                    id=protein_id[0:9],
                    description=""
                )
            else:
                if strain_id == "WD-3-1":
                    # For these IDs the strain name is the first 8
                    # characters, so re-split at fixed positions.
                    strain_id = protein_id[0:8]
                    gff_protein_id = protein_id[9:]
                strain_db = global_names[strain_id + "_db"]
                gene_plain_sequences = strain_db[gff_protein_id].sequence(
                    str(contig_path / (strain_id + ".fasta")),
                    use_strand=True
                )
                record = SeqRecord(
                    Seq(gene_plain_sequences,
                        Bio.Alphabet.IUPAC.unambiguous_dna),
                    id=protein_id,
                    description=""
                )
            SeqIO.write(record, out_fl, "fasta")
def getjunjieregions(junjiegff):
    """
    List every nucleotide position covered by an exon of a 'region' feature.

    A gffutils database ``<basename of junjiegff>.db`` is created in the
    current working directory if it is missing, and the file is deleted
    before returning.

    :param junjiegff: path of the GFF file.
    :return: ``{chrm : {strand : [positions]}}`` with duplicates removed;
        each exon contributes ``start`` through ``end`` inclusive.
    """
    # Make gff database
    print('Indexing gff...')
    db_fn = os.path.basename(junjiegff) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(junjiegff, db_fn, verbose=True)
    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    junjieregions = {}  # {chrm : {strand : [list of nt in a junjie region]}}
    for region in db.features_of_type('region'):
        for exon in db.children(region, featuretype='exon'):
            per_strand = junjieregions.setdefault(exon.chrom, {})
            positions = per_strand.setdefault(exon.strand, [])
            positions.extend(range(exon.start, exon.end + 1))

    # Remove duplicates
    for per_strand in junjieregions.values():
        for strand in per_strand:
            per_strand[strand] = list(set(per_strand[strand]))

    os.remove(db_fn)

    return junjieregions
def get_gtf_db(gtf, in_memory=False):
    """
    Return a gffutils database for ``gtf``, creating it when necessary.

    An existing ``<gtf>.db`` file is always opened directly. Otherwise the
    database is built either in memory or at ``<gtf>.db``.

    :param gtf: path of the GTF file.
    :param in_memory: build the database in ``:memory:`` instead of on disk
        (only relevant when no ``<gtf>.db`` exists).
    :return: the database object.
    """
    on_disk = gtf + ".db"
    if file_exists(on_disk):
        return gffutils.FeatureDB(on_disk)

    target = ":memory:" if in_memory else on_disk
    if in_memory or not file_exists(target):
        extent_flag = guess_infer_extent(gtf)
        built = gffutils.create_db(gtf, dbfn=target,
                                   infer_gene_extent=extent_flag)

    if in_memory:
        # An in-memory database cannot be re-opened by name; hand back the
        # object returned by create_db.
        return built
    return gffutils.FeatureDB(target)
def main():
    """
    Write every transcript with its exons and derived CDS exons to a file.

    Command-line arguments (via ``parse_args``) supply the CDS table, the
    GTF file and the output path. The GTF is indexed into ``tmpGFF.db`` by
    ``make_GFFdb``. For each transcript the output receives the transcript
    line, its exon lines, and, when the transcript has an entry in the CDS
    table, the CDS exons computed by ``Get_cds``.
    """
    cds_file, gtf_file, out_file = parse_args(sys.argv[1:])
    cdsinfo = make_cdsList(cds_file)

    # The context manager guarantees the output is flushed and closed even
    # if indexing or one of the transcripts raises.
    with open(out_file, "w+") as outfn:
        # Make GFFdb, store db in tmpGFF.db file
        make_GFFdb(gtf_file)
        db = gffutils.FeatureDB('tmpGFF.db', keep_order=True)

        for mRNA in db.features_of_type("transcript", order_by="start"):
            print("Processing transcript_id: ", mRNA["transcript_id"][0])

            # Transcript line followed by its original exons.
            print(mRNA, file=outfn)
            for exon in db.children(mRNA, featuretype="exon",
                                    order_by="start"):
                print(exon, file=outfn)

            transcript = mRNA.id
            exons = get_exonList(db, mRNA)

            # Some transcripts don't have a CDS; only annotated ones are
            # processed further.
            if transcript not in cdsinfo:
                continue

            cds_start = cdsinfo[transcript][0]
            cds_end = cdsinfo[transcript][1]
            # The 5' and 3' UTR pieces are computed as well but are
            # deliberately not written to the output.
            _five_utr, _three_utr, cds_exons = Get_cds(
                exons, cds_start, cds_end, mRNA.strand)
            for cds_exon in cds_exons:
                print(cds_exon, file=outfn)
def getnoncodingtxs(gff, genesofint):
    """
    Find the genes of interest without coding transcripts and pick the
    longest transcript of each.

    The gffutils database ``<absolute path of gff>.db`` is created when it
    does not exist and is left in place afterwards.

    :param gff: path of the GFF annotation file.
    :param genesofint: collection of gene IDs (without ``gene:`` prefix and
        version suffix) to look for.
    :return: ``{gene id : longest transcript}`` for every gene of interest
        that ``seeifcoding`` reports as non-coding; the value is whatever
        ``getlongesttranscript`` returns.
    """
    geneandtx = {}  # {ENSMUSG : ENSMUST}
    genecount = 0
    noncodinggenecount = 0
    # Build the lookup once so every gene is tested in constant time.
    wanted_genes = set(genesofint)

    # Make gff database
    print('Indexing gff...')
    db_fn = os.path.abspath(gff) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff, db_fn, merge_strategy='merge', verbose=True)
    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    for gene in db.features_of_type('gene'):
        # Normalise e.g. "gene:ENSMUSG0000001.3" to "ENSMUSG0000001".
        geneid = str(gene.id).replace('gene:', '').split('.')[0]
        if geneid not in wanted_genes:
            continue
        genecount += 1
        if not seeifcoding(gene, db):
            noncodinggenecount += 1
            geneandtx[geneid] = getlongesttranscript(gene, db)

    print('Looked for {0} genes in gff and found {1} of them. Of these {2} had no coding transcripts.'.format(
        len(genesofint), genecount, noncodinggenecount))

    return geneandtx
def main():
    """
    Write a BED line (with flanks) for every gene named in a gene list.

    Reads ``args.gffFile`` (a gffutils database), ``args.geneList`` (a
    comma-delimited list of gene IDs) and writes to ``args.outputFile``.
    For each matching gene the interval is extended by
    ``args.upstreamSize`` before the start and ``args.downstreamSize``
    after the end. A gene that appears several times in the list is
    written once per occurrence.
    """
    # Get database file
    db = gffutils.FeatureDB(args.gffFile)
    genes = db.features_of_type('gene', order_by='start')

    genelist = np.genfromtxt(args.geneList, delimiter=',', dtype=None)
    # atleast_1d lets a gene list with a single entry (which genfromtxt
    # returns as a 0-d array) be iterated like any other list.
    wanted_ids = np.atleast_1d(np.transpose(genelist))

    # Text mode: the lines written below are str, which a binary-mode
    # handle rejects on Python 3.
    with open(args.outputFile, 'w') as outputFile:
        for gene in genes:
            occurrences = sum(1 for wanted in wanted_ids if gene.id == wanted)
            if not occurrences:
                continue

            # Upstream and downstream sizes come from the command line.
            totalstart = gene.start - args.upstreamSize
            totalstop = gene.end + args.downstreamSize
            bedArray = [gene.chrom, totalstart, totalstop, gene.id]
            bed_line = "\t".join(str(field) for field in bedArray) + "\n"
            outputFile.write(bed_line * occurrences)
def classifygenes(exoniccoords, gff):
    """
    Label every gene by combining the results of ``isitALE`` and
    ``isitTUTR``.

    The gffutils database ``<absolute path of gff>.db`` is created when it
    does not exist yet.

    :param exoniccoords: ``{geneid : {txid : [positionfactor,
        [set of exonic coords]]}}``
    :param gff: path of the GFF annotation file.
    :return: ``{geneid : type of 3' UTR end}`` where the type is ``'ALE'``,
        ``'TUTR'``, ``'mixed'`` (neither) or ``'ERROR'`` (both).
    """
    print('Indexing gff...')
    db_fn = os.path.abspath(gff) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff, db_fn, merge_strategy='merge', verbose=True)
    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    # (could be ALE, could be TUTR) -> label
    labels = {
        (True, False): 'ALE',
        (False, True): 'TUTR',
        (False, False): 'mixed',
        (True, True): 'ERROR',
    }

    genetypes = {}  # {geneid : type of 3' UTR end}
    for gene in exoniccoords:
        geneexoniccoords = exoniccoords[gene]
        verdict = (isitALE(geneexoniccoords, db),
                   isitTUTR(geneexoniccoords, db))
        # Any verdict outside the four combinations leaves `genetype`
        # untouched (i.e. whatever the previous gene set it to).
        if verdict in labels:
            genetype = labels[verdict]
        genetypes[gene] = genetype

    return genetypes
def main(args):
    """
    Write the IDs of genes/transcripts whose transcript type is filtered.

    Every transcript whose ``transcript_type`` attribute overlaps
    ``args.transtypes`` is selected. If the gene has exactly one
    transcript, the gene ID is recorded instead of the transcript ID. The
    sorted IDs are written one per line to ``args.out``.

    :param args: namespace with ``gtf_db`` (gffutils database file),
        ``transtypes`` (collection of transcript types) and ``out``
        (output path).
    """
    logging.info("GTF:{gtf_db}, Transtypes:{transtypes}, outfile:{out}".format(
        gtf_db=args.gtf_db, transtypes=args.transtypes, out=args.out))

    filtered_ids = set()

    # GTF db to be imported for filtering
    gtf_db = gffutils.FeatureDB(args.gtf_db)

    for gene in gtf_db.features_of_type('gene'):
        # Fetch the transcripts once per gene instead of re-querying the
        # database for every matching transcript.
        transcripts = list(gtf_db.children(gene, featuretype='transcript'))
        single_transcript_gene = len(transcripts) == 1

        for transcript in transcripts:
            transcript_types = set(transcript.attributes['transcript_type'])
            if not transcript_types.intersection(args.transtypes):
                continue
            # A single-transcript gene is removed as a whole; otherwise
            # only the offending transcript is removed.
            if single_transcript_gene:
                filtered_ids.add(gene.id)
            else:
                filtered_ids.add(transcript.id)

    # Write the filtered genes/transcripts to an outfile
    with open(args.out, "w") as out_handle:
        for filt_id in sorted(filtered_ids):
            out_handle.write("{}\n".format(filt_id))
def sanitize_gff_file(gff_fname, in_memory=True, in_place=False):
    """
    Sanitize a GFF file and write the gene records out again.

    :param gff_fname: a gffutils database filename or a GFF filename
        (decided by ``is_gff_db``).
    :param in_memory: for a plain GFF, build the database in ``:memory:``;
        otherwise obtain it through ``get_gff_db``.
    :param in_place: write the result back to ``gff_fname`` instead of to
        standard output.
    """
    if is_gff_db(gff_fname):
        # It's a database filename, so load it
        db = gffutils.FeatureDB(gff_fname)
    elif in_memory:
        # Need to create a database for the file
        db = gffutils.create_db(gff_fname, ":memory:", verbose=False)
    else:
        db = get_gff_db(gff_fname)

    if in_place:
        gff_out = gffwriter.GFFWriter(gff_fname, in_place=in_place)
    else:
        gff_out = gffwriter.GFFWriter(sys.stdout)

    sanitized_db = sanitize_gff_db(db)
    for gene_rec in sanitized_db.all_features(featuretype="gene"):
        gff_out.write_gene_recs(sanitized_db, gene_rec.id)
    gff_out.close()
def gff2bed12(gff, bedoutfile, nameoutfile):
    """
    Convert the transcripts of a GFF to BED12 and list transcript/gene IDs.

    The gffutils database ``<absolute path of gff>.db`` is created when it
    does not exist yet. Version suffixes (everything from the first ``.``)
    are stripped from the BED name column and from the IDs in the name
    table.

    :param gff: path of the GFF annotation file.
    :param bedoutfile: output path for the BED12 lines.
    :param nameoutfile: output path for tab-separated
        ``<transcript id> <gene id>`` lines.
    """
    # Make gff database
    print('Indexing gff...')
    db_fn = os.path.abspath(gff) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff, db_fn, merge_strategy='merge', verbose=True)
    db = gffutils.FeatureDB(db_fn)
    print('Done indexing!')

    with open(bedoutfile, 'w') as bedoutfh, open(nameoutfile, 'w') as nameoutfh:
        all_transcripts = db.features_of_type('transcript')
        for seen, transcript in enumerate(all_transcripts, 1):
            if seen % 1000 == 0:
                print('Converting transcript {0}...'.format(seen))

            fields = db.bed12(transcript).split('\t')
            fields[3] = fields[3].split('.')[0]
            bedoutfh.write('\t'.join(fields) + '\n')

            txid = transcript.id.split('.')[0]
            for gene in db.parents(transcript, featuretype='gene'):
                nameoutfh.write(txid + '\t' + gene.id.split('.')[0] + '\n')
def get_sequence(gff_folder: str, name: str, gene_id: str):
    """
    Download the sequence of one gene from NCBI.

    The gene's coordinates and strand are read from the gffutils database
    ``<gff_folder>/<name>.db``; the matching slice of nuccore record
    ``name`` is fetched with ``wget`` into a temporary FASTA file, which is
    always deleted afterwards.

    :param gff_folder: directory containing ``<name>.db``.
    :param name: record identifier, used for the database filename and as
        the ``id`` in the download URL.
    :param gene_id: ID of the feature inside the database.
    :return: the sequence of the last record in the downloaded FASTA
        (reverse-complemented for features on the ``-`` strand), or
        ``None`` when the download failed or contained no record.
    """
    sequence = None
    db_file = os.path.join(gff_folder, "{}.db".format(name))
    gffdb = gffutils.FeatureDB(db_file)
    item = gffdb[gene_id]
    start = item.start
    end = item.end
    strand = item.strand

    # Only the file name is needed; wget writes the content.
    fasta_temp = tempfile.NamedTemporaryFile('w', suffix=".fa", delete=False)
    fasta_temp.close()
    try:
        with Popen(['wget', '-O', fasta_temp.name,
                    'https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=fasta&id={}&from={}&to={}'
                    .format(name, start, end)]) as proc:
            proc.communicate()
            exit_code = proc.wait()

        if exit_code != 0:
            print("ERROR failed to download fasta for {}".format(name))
        else:
            # Plain "r": the "U" flag was removed in Python 3.11 and text
            # mode already uses universal newlines.
            with open(fasta_temp.name, "r") as handle:
                for record in SeqIO.parse(handle, "fasta"):
                    sequence = record.seq
                    if strand == '-':
                        sequence = sequence.reverse_complement()
    finally:
        # Best-effort cleanup: a failure to delete must not mask the
        # result, but only filesystem errors are ignored.
        try:
            os.remove(fasta_temp.name)
        except OSError:
            pass

    return sequence
def annotate_combined_count_file(count_file, gtf_file, out_file=None):
    """
    Add a gene ``symbol`` column to a combined count table.

    The symbols come from the ``gene_id``/``gene_name`` attributes of the
    exon features in the gffutils database ``<gtf_file>.db``.

    :param count_file: tab-separated count table whose first column is the
        gene ID index.
    :param gtf_file: GTF path; only its ``.db`` companion is read.
    :param out_file: output path; defaults to ``annotated_combined.counts``
        next to ``count_file``.
    :return: the output path, or ``None`` when the database file is
        missing, gffutils is unavailable, or an exon lacks ``gene_id`` or
        ``gene_name``.
    """
    dbfn = gtf_file + ".db"
    if not file_exists(dbfn) or not gffutils:
        return None

    db = gffutils.FeatureDB(dbfn, keep_order=True)
    if not out_file:
        out_file = os.path.join(os.path.dirname(count_file),
                                "annotated_combined.counts")

    # if the genes don't have a gene_id or gene_name set, bail out
    symbol_lookup = {}
    try:
        for exon in db.features_of_type('exon'):
            symbol_lookup[exon['gene_id'][0]] = exon['gene_name'][0]
    except KeyError:
        return None

    counts = pd.io.parsers.read_table(count_file, sep="\t", index_col=0,
                                      header=0)
    # Genes without a known symbol get an empty string.
    counts['symbol'] = counts.apply(
        lambda row: symbol_lookup.get(row.name, ""), axis=1)
    counts.to_csv(out_file, sep="\t", index_label="id")
    return out_file
def main(database, nmd_file):
    """
    Build the table of candidate NMD events for the negative-correlation
    case.

    :param database: gffutils database file.
    :param nmd_file: input handed to ``possible_nmd``.
    :return: the result of ``include_exon_nmd`` applied to the candidates
        and the opened database.
    """
    annotation = gffutils.FeatureDB(database, keep_order=True)
    candidates = possible_nmd(nmd_file)
    # Negative correlation uses include_exon_nmd; the positive-correlation
    # variant (exclude_exon_nmd) is currently not applied.
    return include_exon_nmd(candidates, annotation)
def test__get_all_transcripts_overlapping_exon(self, database, single_exon_id, nmd_exons, all_transcripts_of_exon):
    """
    ``_get_all_transcripts_overlapping_exon`` must return the expected
    transcripts for the exon looked up in the database.
    """
    exon = gffutils.FeatureDB(database)[single_exon_id]
    observed = nmd_exons._get_all_transcripts_overlapping_exon(exon)
    expected = all_transcripts_of_exon
    assert observed == expected
def main():
    """
    Derive the complement of the (padded, merged) gene intervals.

    Gene features from ``snakemake.input.db`` are extended with
    ``slop(b=100)``, sorted and merged; the complement on genome ``dm6`` is
    passed through ``interName`` and saved to ``snakemake.output.bed``,
    then passed through ``interGFF`` and saved to
    ``snakemake.output.gtf``.
    """
    annotation = gffutils.FeatureDB(snakemake.input.db)
    gene_intervals = to_bedtool(annotation.features_of_type("gene")).saveas()

    padded = gene_intervals.slop(b=100, genome="dm6")
    merged = padded.sort().merge()
    uncovered = merged.complement(genome="dm6").saveas()

    named = uncovered.each(interName).saveas(snakemake.output.bed)
    named.each(interGFF).saveas(snakemake.output.gtf)
def prep(
        bam_file="/projects/bio/rrna/data/rsubread_aligned_v2/SRR891244_s.bam",
        gtf_db_file="/projects/bio/rrna/data/annotations/Homo_sapiens.GRCh38.84.features_name.db"
):
    """
    Open an alignment file and an annotation database.

    Typical use: ``bam, gtf_db = cc.prep()``.

    :param bam_file: BAM path, opened with ``pysam.AlignmentFile`` in
        ``"rb"`` mode.
    :param gtf_db_file: gffutils database path.
    :return: tuple ``(bam, gtf_db)``.
    """
    alignment = pysam.AlignmentFile(bam_file, "rb")
    annotation = gffutils.FeatureDB(gtf_db_file)
    return alignment, annotation
def annotation_bed12(annotation_db):
    """
    Export every transcript of a gffutils database as BED12.

    The output name is the database name with its last two dot-separated
    components removed and ``.bed12`` appended.

    :param annotation_db: filename of the gffutils database.
    :return: path of the BED12 file that was written.
    """
    import gffutils

    db = gffutils.FeatureDB(annotation_db)

    name_parts = annotation_db.strip().split('.')
    bed12 = '.'.join(name_parts[:-2]) + '.bed12'

    with open(bed12, 'w') as handle:
        handle.writelines(
            db.bed12(transcript, name_field='transcript_id') + '\n'
            for transcript in db.features_of_type('transcript')
        )
    return bed12
def filter_by_attribute(attr_in, dbfn):
    """
    Print every feature whose ``gene_biotype`` is exactly ``[attr_in]``.

    :param attr_in: biotype value to keep.
    :param dbfn: gffutils database file.
    """
    wanted = [attr_in]
    for feat in gffutils.FeatureDB(dbfn).all_features():
        # The attribute must hold this single value and nothing else.
        if feat.attributes['gene_biotype'] == wanted:
            print(feat)
def main(dbfile, fafile, outprefix):
    """
    Produce the gene and mRNA statistics.

    Before running the statistics, use gff2sqliteDB.py to convert the gff3
    file into a database file.

    :param dbfile: gffutils database file.
    :param fafile: FASTA file, opened with ``Fasta``.
    :param outprefix: prefix passed to ``gene_stat`` and ``mRNA_stat``.
    """
    annotation = gffutils.FeatureDB(dbfile, keep_order=True)
    sequences = Fasta(fafile)
    for run_stat in (gene_stat, mRNA_stat):
        run_stat(annotation, sequences, outprefix)
def convertGffToBedGffUtils(gffFile):
    """
    Print the features of a GFF database as a BedTool.

    The database is read from ``gffFile`` with its last four characters
    replaced by ``.db``. It must already exist, because the ``create_db``
    step is disabled here.

    :param gffFile: name of the GFF file.
    """
    # Resolved example path; currently not used any further.
    example_path = gffutils.example_filename(gffFile)

    db_path = gffFile[:-4] + '.db'
    db = gffutils.FeatureDB(db_path, keep_order=True)

    features_by_start = db.all_features(order_by='start')
    bedVersion = pybedtools_integration.to_bedtool(features_by_start)
    print(bedVersion)
def translatePHB47():
    """
    Map the ``Name`` of every mRNA to the ``Name`` of its gene (PHB47).

    :return: ``defaultdict(str)`` keyed by mRNA name; unknown names
        therefore yield an empty string.
    """
    import gffutils

    annotation = gffutils.FeatureDB(
        '/home/maize/shared/databases/genomes/Zea_mays/PHB47/Zmaysvar.PHB47v1.1.gene_exons.gff3.db',
        keep_order=True)

    lookup = defaultdict(str)
    for gene in annotation.features_of_type('gene', order_by='start'):
        transcripts = annotation.children(
            gene, featuretype='mRNA', order_by='start')
        for transcript in transcripts:
            lookup[transcript['Name'][0]] = gene['Name'][0]
    return lookup
def db2gtf(gtf, db):
    """
    Dump every feature of a gffutils database to a text file.

    Features are written one per line, ordered by ``seqid`` and then
    ``start``.

    :param gtf: output path.
    :param db: gffutils database file.
    """
    logger.info(
        "Converting gene annotation file to .gtf format (takes a while)...")
    with open(gtf, "w") as out_handle:
        records = gffutils.FeatureDB(db).all_features(
            order_by=('seqid', 'start'))
        out_handle.writelines(str(record) + '\n' for record in records)
    logger.info("Gene database written to " + gtf)
class TestGeneInfo:
    """Checks ``GeneInfo`` against the annotation in ``toy_data/synth.db``."""

    # Fixtures shared by all tests: the database and the gene under test.
    source_dir = os.path.dirname(os.path.realpath(__file__))
    gffutils_db = gffutils.FeatureDB(
        os.path.join(source_dir, 'toy_data/synth.db'), keep_order=True)
    gene_db = gffutils_db['ENSMUSG00000020196.10']

    def test_basic(self):
        """Gene region, transcript coordinates and ID/strand maps."""
        gene_info = GeneInfo([self.gene_db], self.gffutils_db)

        assert gene_info.get_gene_region() == ("chr10", 1000, 10000)
        assert gene_info.chr_id == "chr10"

        assert gene_info.transcript_start("ENSMUST00000001712.7") == 1000
        assert gene_info.transcript_end("ENSMUST00000001713.7") == 10000
        assert gene_info.transcript_region(
            "ENSMUST00000001715.7") == (8000, 8800)
        assert gene_info.transcript_exon_count("ENSMUST00000001713.7") == 6
        assert gene_info.total_transcript_length(
            "ENSMUST00000001712.7") == 1105

        assert gene_info.isoform_strands["ENSMUST00000001713.7"] == '-'
        assert gene_info.gene_id_map[
            "ENSMUST00000001713.7"] == 'ENSMUSG00000020196.10'

    def test_intron_profiles(self):
        """Intron features and the per-transcript intron profiles."""
        gene_info = GeneInfo([self.gene_db], self.gffutils_db)
        introns = gene_info.intron_profiles

        assert introns.features[0] == (1101, 1999)
        assert introns.features[-1] == (8201, 8499)
        assert len(introns.features) == 9

        assert introns.profiles["ENSMUST00000001712.7"] == [
            1, 1, -1, 1, -1, -1, 1, -1, -1]
        assert introns.profiles["ENSMUST00000001714.7"] == [
            -2, -2, -2, -2, -2, -2, -1, -1, -2]
        assert introns.profiles["ENSMUST00000001715.7"] == [
            -2, -2, -2, -2, -2, -2, -1, -1, 1]

    def test_exon_profiles(self):
        """Exon and split-exon features with their profiles."""
        gene_info = GeneInfo([self.gene_db], self.gffutils_db)

        exons = gene_info.exon_profiles
        assert exons.features[0] == (1000, 1100)
        assert exons.features[-1] == (9500, 10000)
        assert len(exons.features) == 11
        assert exons.profiles["ENSMUST00000001712.7"] == [
            1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1]
        assert exons.profiles["ENSMUST00000001714.7"] == [
            -2, -2, -2, -2, -2, -2, -2, 1, -2, -2, -2]

        split_exons = gene_info.split_exon_profiles
        assert split_exons.features[2] == (2101, 2200)
        assert split_exons.features[-1] == (9500, 10000)
        assert len(split_exons.features) == 11
        assert split_exons.profiles["ENSMUST00000001712.7"] == [
            1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1]
        assert split_exons.profiles["ENSMUST00000001714.7"] == [
            -2, -2, -2, -2, -2, -2, -2, 1, -2, -2, -2]
def main():
    """
    Write a 12-column BED line for every pair of non-overlapping exons of
    each gene.

    Reads the gffutils database ``args.databaseFile`` and writes to
    ``args.outputFile``. Genes with fewer than two exons (e.g. miRNA) are
    skipped. ``defineRegion`` supplies the block coordinates of each
    junction (per the original note, at most 100bp on each side).
    """
    # Get the database
    db = gffutils.FeatureDB(args.databaseFile)
    genes = db.features_of_type('gene', order_by='start')

    # Open the output BED file
    with open(args.outputFile, 'w') as outputFile:
        for gene in genes:
            exonList = list(
                db.children(gene, featuretype='exon', order_by='start'))
            if len(exonList) < 2:
                continue

            # Convert to BED 0-based coordinate system. Only the start
            # has to change.
            for exon in exonList:
                exon.start -= 1

            # All pairwise combos that do not overlap are junctions.
            junctionArray = [
                pair for pair in itertools.combinations(exonList, 2)
                if not testOverlap(*pair)
            ]

            for index, (upstreamExon, downstreamExon) in enumerate(junctionArray):
                (exon1Start, exon1Stop, exon2Start,
                 exon2Stop) = defineRegion(index, junctionArray)

                # The line spans from the first block's start to the
                # second block's stop.
                totalStart = exon1Start
                totalStop = exon2Stop
                name = ''.join(upstreamExon.attributes['Name']) + '|' + \
                    ''.join(downstreamExon.attributes['Name'])
                lengths = str(len(range(exon1Start, exon1Stop))) + ',' + \
                    str(len(range(exon2Start, exon2Stop)))
                # Block starts are relative to the start of the line.
                starts = str(exon1Start - totalStart) + ',' + \
                    str(exon2Start - totalStart)

                bedArray = [
                    upstreamExon.chrom, totalStart, totalStop, name,
                    upstreamExon.score, upstreamExon.strand,
                    totalStart, totalStop, "255,0,0", "2", lengths, starts
                ]

                # Output to BED file
                outputFile.write(
                    "\t".join(str(field) for field in bedArray) + "\n")
def retrieve_splicing_gff(gffFile=None):
    """
    Collect exon and junction information for skipped-exon (SE) events.

    The gffutils database ``<gffFile>.db`` is opened, or created from
    ``gffFile`` when opening fails. Every ``gene`` feature is one event and
    is expected to have exactly two ``mRNA`` children (inclusion isoform
    first, exclusion isoform second); the inclusion isoform is expected to
    have three exons. Genes on chromosomes whose name contains ``_`` are
    skipped.

    :param gffFile: path of the event GFF file. Despite the default, a
        path is required: ``None`` raises ``TypeError``.
    :return: ``defaultdict(dict)`` keyed by the event's ``ID`` attribute
        with the entries ``chromosome``, ``strand``, ``BODY``, ``UP``,
        ``DOWN``, ``start``, ``end``, ``inclusionJxns`` and
        ``exclusionJxns``.
    :raises Exception: when a gene's source (splice type) is not ``SE``.
    """
    info = defaultdict(dict)

    gffDbFile = gffFile + ".db"
    try:
        gffDatabase = gffutils.FeatureDB(gffDbFile)
    except Exception:
        # Any ordinary failure to open means "build it first";
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        gffutils.create_db(gffFile, gffDbFile)
        gffDatabase = gffutils.FeatureDB(gffDbFile)

    for geneObj in gffDatabase.features_of_type('gene'):
        event = geneObj.attributes['ID']

        if "_" in geneObj.chrom:
            continue

        info[event]['chromosome'] = geneObj.chrom
        info[event]['strand'] = geneObj.strand

        spliceType = geneObj.source
        if spliceType != "SE":
            raise Exception(
                "Unsupported splice type {!r} for event {}: only 'SE' is "
                "handled".format(spliceType, event))

        inclusionIso, exclusionIso = list(
            gffDatabase.children(geneObj, featuretype='mRNA'))
        inclusionExons, exclusionExons = [
            list(gffDatabase.children(isoform))
            for isoform in (inclusionIso, exclusionIso)
        ]
        inclusionJxns, exclusionJxns = map(
            get_junctions, [inclusionExons, exclusionExons])

        # The middle exon of the inclusion isoform is the body of the
        # event; which flanking exon is upstream depends on the strand.
        info[event]["BODY"] = inclusionExons[1]
        if geneObj.strand == "+":
            info[event]["UP"] = inclusionExons[0]
            info[event]["DOWN"] = inclusionExons[2]
        else:
            info[event]["UP"] = inclusionExons[2]
            info[event]["DOWN"] = inclusionExons[0]

        info[event]['start'] = inclusionIso.start
        info[event]['end'] = inclusionIso.stop
        info[event]['inclusionJxns'] = inclusionJxns
        info[event]['exclusionJxns'] = exclusionJxns

    return info