def index_gff(gff_filename, output_dir, compress_id=False):
    """
    Build the indexed representation of a GFF file inside ``output_dir``.

    Genes are loaded from ``gff_filename`` and handed to ``serialize_genes``
    together with the ``compress_id`` flag.  When ``output_dir`` already
    holds any entry matching ``chr*`` the GFF is treated as indexed and the
    function returns without loading anything.

    Progress and timing are reported on stdout; nothing is returned.
    """
    print("Indexing GFF...")
    if compress_id:
        print(" - Using compressed IDs to create indexed filenames.")
    # Entries named chr* are taken as the sign of a previous indexing run
    existing_index = glob.glob(os.path.join(output_dir, "chr*"))
    if existing_index:
        print("%s appears to already be indexed. Aborting." % (gff_filename))
        return
    indexing_started = time.time()
    gff_genes = gene_utils.load_genes_from_gff(gff_filename)
    serialization_started = time.time()
    serialize_genes(gff_genes,
                    gff_filename,
                    output_dir,
                    compress_id=compress_id)
    serialization_elapsed = time.time() - serialization_started
    print(" Serialization of genes from GFF took %.2f seconds"
          % (serialization_elapsed))
    indexing_elapsed = time.time() - indexing_started
    print("Indexing of GFF took %.2f seconds." % (indexing_elapsed))
def sanitize_gff(gff_fname, output_dir, include_introns=True):
    """
    Write a sanitized copy of a GFF file into ``output_dir``.

    The copy keeps the basename of ``gff_fname``.  Gene and mRNA records
    are passed through ``sanitize_record`` before being written.  Exon
    records are written as loaded, except that for genes on the "-"
    strand they first go through ``fix_up_down_ids``.

    Returns the filename of the sanitized GFF.
    """
    sanitized_fname = os.path.join(output_dir, os.path.basename(gff_fname))
    loaded_genes = \
        gene_utils.load_genes_from_gff(gff_fname,
                                       include_introns=include_introns)
    started = time.time()
    with open(sanitized_fname, "w") as out_handle:
        writer = miso_gff_utils.Writer(out_handle)
        for gene_id in loaded_genes:
            gene_entry = loaded_genes[gene_id]
            gene_obj = gene_entry["gene_object"]
            hierarchy = gene_entry["hierarchy"][gene_id]
            # Gene record goes first
            write_rec_to_gff(writer, sanitize_record(hierarchy["gene"]))
            for isoform in gene_obj.isoforms:
                mRNA_entry = hierarchy["mRNAs"][isoform.label]
                # Then the mRNA record of each isoform
                write_rec_to_gff(writer,
                                 sanitize_record(mRNA_entry["record"]))
                # Followed by the records of the isoform's parts
                exon_records = []
                for part in isoform.parts:
                    exon_records.append(
                        mRNA_entry["exons"][part.label]["record"])
                if gene_obj.strand == "-":
                    exon_records = fix_up_down_ids(exon_records)
                for exon_record in exon_records:
                    write_rec_to_gff(writer, exon_record)
    elapsed = time.time() - started
    print("Sanitizing took %.2f seconds" % (elapsed))
    return sanitized_fname
def loaded_events_to_genes(self, single_event_name=None,
                           read_len=None, overhang_len=None):
    """
    Convert the loaded events into gene structures.

    When ``single_event_name`` is given only that event is converted,
    otherwise every loaded event is.  ``read_len`` and ``overhang_len``
    are forwarded for AFE/ALE events only.

    Returns a dict mapping each event name to its gene.  Raises Exception
    when no events are loaded or when ``self.event_type`` is not one of
    SE, RI, TandemUTR, AFE or ALE.
    """
    if not len(self.events):
        raise Exception("Must load events first before they can be converted to genes.")
    genes_by_event = {}
    started = time.time()
    if single_event_name:
        # Restrict the conversion to the requested event
        selected_names = [single_event_name]
    else:
        selected_names = self.events.keys()
    for name in selected_names:
        ev = self.events[name]
        if self.event_type in ('SE', 'RI'):
            converted = Gene.se_event_to_gene(ev.up_part_len,
                                              ev.len,
                                              ev.dn_part_len,
                                              ev.chrom,
                                              label=ev.label)
        elif self.event_type == 'TandemUTR':
            converted = Gene.tandem_utr_event_to_gene(ev.core_len,
                                                      ev.ext_len,
                                                      ev.chrom,
                                                      label=ev.label)
        elif self.event_type in ('AFE', 'ALE'):
            converted = Gene.afe_ale_event_to_gene(ev.proximal_exons,
                                                   ev.distal_exons,
                                                   self.event_type,
                                                   ev.chrom,
                                                   label=ev.label,
                                                   read_len=read_len,
                                                   overhang_len=overhang_len)
        else:
            raise Exception("Unsupported event type: %s" % (self.event_type))
        genes_by_event[name] = converted
    elapsed = time.time() - started
    print("Parsing of events to genes took %.2f seconds." % (elapsed))
    return genes_by_event
def index_exons(gff_fname):
    """
    Index one exon per gene of a GFF file.

    For every gene the part at position 1 of its first isoform is
    recorded under the key (chrom, start, end, strand) with value True.

    Returns a ``defaultdict(bool)``, so keys that were never recorded
    read as False.
    """
    exon_index = defaultdict(bool)
    loaded_genes = gene_utils.load_genes_from_gff(gff_fname)
    for gene_id in loaded_genes:
        gene = loaded_genes[gene_id]["gene_object"]
        indexed_part = gene.isoforms[0].parts[1]
        # Key the exon by chromosome, coordinates and strand
        key = (gene.chrom, indexed_part.start, indexed_part.end, gene.strand)
        exon_index[key] = True
    return exon_index
def instert_introns_to_gff3(gff_filename, output_gff3_filename):
    """
    Copy a GFF annotation to a new GFF3 file, adding 'intron' records.

    The output is written to ``<output_gff3_filename>_introns.gff3``.
    For every gene the gene record is written first, then each mRNA
    record followed by its exon records, then one intron record per gap
    between consecutive parts of each isoform.  Gaps shorter than two
    bases (intron start >= intron end) are skipped.

    Intron IDs have the form ``<isoform>:<chrom>:<start>-<end>:<strand>``
    and each intron's Parent is the isoform label.

    Nothing is returned; progress and timing are printed to stdout.
    """
    output_filename = "%s_introns.gff3" % (output_gff3_filename)
    print("Adding introns to GFF...")
    print(" - Input: %s" % (gff_filename))
    print(" - Output: %s" % (output_filename))
    t1 = time.time()
    # Load before creating the output so that a failed load leaves no
    # empty output file behind
    genes = gene_utils.load_genes_from_gff(gff_filename)
    # The context manager guarantees the output is flushed and closed,
    # also when writing fails part-way through
    with open(output_filename, "w") as gff_out_file:
        gff_out = gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            gene_rec = gene_tree[gene_id]["gene"]
            # Write the GFF record of the gene
            gff_out.write(gene_rec)
            # Write out the mRNAs and their exons
            mRNAs = gene_tree[gene_id]["mRNAs"]
            for mRNA_id in mRNAs:
                curr_mRNA = mRNAs[mRNA_id]
                gff_out.write(curr_mRNA["record"])
                curr_exons = curr_mRNA["exons"]
                for exon in curr_exons:
                    gff_out.write(curr_exons[exon]["record"])
            # Now output the introns
            for isoform in gene_obj.isoforms:
                for first_exon, second_exon in zip(isoform.parts,
                                                   isoform.parts[1:]):
                    # Intron start coordinate is the coordinate right after
                    # the end of the first exon, intron end coordinate is
                    # the coordinate just before the beginning of the
                    # second exon
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1
                    if intron_start >= intron_end:
                        continue
                    # Create record for this intron
                    intron_id = "%s:%s:%d-%d:%s" % (isoform.label,
                                                    gene_obj.chrom,
                                                    intron_start,
                                                    intron_end,
                                                    gene_obj.strand)
                    intron_rec = \
                        gff_utils.GFF(gene_obj.chrom, gene_rec.source,
                                      "intron", intron_start, intron_end,
                                      ".", gene_obj.strand, ".",
                                      attributes={"ID": [intron_id],
                                                  "Parent": [isoform.label]})
                    gff_out.write(intron_rec)
    t2 = time.time()
    print("Addition took %.2f minutes." % ((t2 - t1) / 60.))
def loaded_events_to_genes(self, single_event_name=None,
                           read_len=None, overhang_len=None):
    """
    Map every loaded event (or one named event) to a gene structure.

    ``single_event_name`` limits the work to that event.  ``read_len``
    and ``overhang_len`` are only passed along for AFE/ALE events.

    Returns a dict of event name -> gene.  Raises Exception if no events
    have been loaded, or if ``self.event_type`` is not SE, RI, TandemUTR,
    AFE or ALE.
    """
    def build_gene(event):
        # Pick the Gene constructor that matches the current event type
        event_type = self.event_type
        if event_type == 'SE' or event_type == 'RI':
            return Gene.se_event_to_gene(event.up_part_len, event.len,
                                         event.dn_part_len, event.chrom,
                                         label=event.label)
        if event_type == 'TandemUTR':
            return Gene.tandem_utr_event_to_gene(event.core_len,
                                                 event.ext_len,
                                                 event.chrom,
                                                 label=event.label)
        if event_type == 'AFE' or event_type == 'ALE':
            return Gene.afe_ale_event_to_gene(event.proximal_exons,
                                              event.distal_exons,
                                              event_type,
                                              event.chrom,
                                              label=event.label,
                                              read_len=read_len,
                                              overhang_len=overhang_len)
        raise Exception("Unsupported event type: %s" % (event_type))

    if len(self.events) == 0:
        raise Exception("Must load events first before they can be converted to genes.")
    t_start = time.time()
    # A given event name narrows the conversion to that single event
    names_to_parse = \
        [single_event_name] if single_event_name else self.events.keys()
    events_to_genes = {}
    for event_name in names_to_parse:
        events_to_genes[event_name] = build_gene(self.events[event_name])
    t_end = time.time()
    print("Parsing of events to genes took %.2f seconds." % (t_end - t_start))
    return events_to_genes
def extract_lens_from_gff(gff_fname, output_dir):
    """
    Write a tab-separated table of isoform and exon lengths for a GFF.

    The table is written to ``<output_dir>/<basename of gff_fname>.lens``
    (an existing file is overwritten) with one row per gene and these
    columns, in this order:

      event_name   - the gene label
      mRNA_labels  - comma-separated isoform labels
      mRNA_lens    - comma-separated isoform lengths
      exon_lens    - per-isoform comma-separated exon lengths, with
                     isoforms separated by ';'
      genomic_lens - comma-separated genomic spans of the isoforms
                     (genomic_end - genomic_start + 1)

    Nothing is returned.
    """
    column_order = ["event_name",
                    "mRNA_labels",
                    "mRNA_lens",
                    "exon_lens",
                    "genomic_lens"]
    output_basename = "%s.lens" % (os.path.basename(gff_fname))
    output_fname = os.path.join(output_dir, output_basename)
    print("Extracting lengths from GFF file...")
    print(" - Input GFF: %s" % (gff_fname))
    print(" - Output file: %s" % (output_fname))
    if os.path.isfile(output_fname):
        print("Overwriting %s" % (output_fname))
    gff_genes = gene_utils.load_genes_from_gff(gff_fname)
    entries = []
    for gene_id in gff_genes:
        gene = gff_genes[gene_id]["gene_object"]
        iso_lens = []
        iso_labels = []
        iso_exon_lens = []
        genomic_lens = []
        for iso in gene.isoforms:
            iso_lens.append(str(iso.len))
            iso_labels.append(iso.label)
            iso_exon_lens.append(",".join([str(exon.len)
                                           for exon in iso.parts]))
            # Computed per isoform so that a gene without isoforms yields
            # an empty field instead of failing on an empty array slice
            genomic_lens.append(str(iso.genomic_end - iso.genomic_start + 1))
        entries.append({"event_name": gene.label,
                        "mRNA_lens": ",".join(iso_lens),
                        "mRNA_labels": ",".join(iso_labels),
                        "exon_lens": ";".join(iso_exon_lens),
                        "genomic_lens": ",".join(genomic_lens)})
    # Column order is fixed when the frame is built: the legacy ``cols``
    # keyword of to_csv is rejected by current pandas, and naming the
    # columns here also gives a header-only file when there are no genes
    entries_df = pandas.DataFrame(entries, columns=column_order)
    entries_df.to_csv(output_fname, sep="\t", index=False)
def add_introns_to_gff(gff_filename, output_dir):
    """
    Add 'intron' entries to GFF.

    The result is written to
    ``<output_dir>/<trimmed basename>.with_introns.<ext>``, where the
    trimmed basename comes from ``utils.trim_gff_ext`` and ``<ext>`` is
    the text after the last '.' of the input's basename.  If that file
    already exists nothing is written.

    For every gene the gene record is written first, then each isoform's
    mRNA record followed by its exon records, then one intron record per
    gap between consecutive parts of each isoform.  Gaps shorter than two
    bases (intron start >= intron end) are skipped.  Intron IDs have the
    form ``<chrom>:<start>-<end>:<strand>.intron`` and each intron's
    Parent is the isoform label.

    Returns the filename of the GFF with introns, whether it was just
    written or was already present.
    """
    gff_basename = os.path.basename(gff_filename)
    output_basename = utils.trim_gff_ext(gff_basename)
    ext_to_use = gff_basename.rsplit(".", 1)[1]
    output_filename = \
        os.path.join(output_dir,
                     "%s.with_introns.%s" % (output_basename,
                                             ext_to_use))
    print("Adding introns to GFF...")
    print(" - Input: %s" % (gff_filename))
    print(" - Output: %s" % (output_filename))
    if os.path.isfile(output_filename):
        print("Found file %s, skipping.." % (output_filename))
        return output_filename
    t1 = time.time()
    # Load before creating the output: an existing output file makes
    # later calls skip, so a failed load must not leave an empty one
    genes = gene_utils.load_genes_from_gff(gff_filename)
    # The context manager guarantees the output is flushed and closed,
    # also when writing fails part-way through
    with open(output_filename, "w") as gff_out_file:
        gff_out = miso_gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            gene_rec = gene_tree[gene_id]["gene"]
            # Write the GFF record of the gene
            gff_out.write(gene_rec)
            # Write out the mRNAs and their exons
            for mRNA in gene_obj.isoforms:
                curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA.label]
                gff_out.write(curr_mRNA["record"])
                curr_exons = curr_mRNA["exons"]
                for exon in curr_exons:
                    gff_out.write(curr_exons[exon]["record"])
            # Now output the introns
            for isoform in gene_obj.isoforms:
                for first_exon, second_exon in zip(isoform.parts,
                                                   isoform.parts[1:]):
                    # Intron start coordinate is the coordinate right after
                    # the end of the first exon, intron end coordinate is
                    # the coordinate just before the beginning of the
                    # second exon
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1
                    if intron_start >= intron_end:
                        continue
                    # Create record for this intron
                    intron_id = "%s:%d-%d:%s.intron" \
                        % (gene_obj.chrom,
                           intron_start,
                           intron_end,
                           gene_obj.strand)
                    intron_rec = \
                        miso_gff_utils.GFF(gene_obj.chrom, gene_rec.source,
                                           "intron",
                                           intron_start, intron_end,
                                           strand=gene_obj.strand,
                                           attributes={"ID": [intron_id],
                                                       "Parent": [isoform.label]})
                    gff_out.write(intron_rec)
    t2 = time.time()
    print("Addition took %.2f minutes." % ((t2 - t1) / 60.))
    # Same return value as the skip branch above
    return output_filename
def fetch_seq_from_gff(gff_fname, fasta_fname, output_dir,
                       with_flanking_introns=False,
                       flanking_introns_coords=None,
                       overwrite=True,
                       entries_to_include=["gene", "mRNA", "exon"]):
    """
    Fetch sequence from GFF file.

    Outputs:

    (1) GFF file containing an annotation of the sequences, written to
        ``<output_dir>/gff_coords/``.
    (2) FASTA file with the actual sequences, written to ``output_dir``
        by ``output_fasta_seqs_from_gff``.

    If asked (``with_flanking_introns``), fetch the flanking intronic
    sequences.  Flanking regions are marked below:

      U: region of upstream intron
      D: region of downstream intron

               U           D
      [ U P ]-----[ S E ]-----[ D N ]
              a,b         c,d

    a,b,c,d correspond to optional flanking intron coordinates
    (``flanking_introns_coords``) that determine the regions of the
    upstream/downstream introns that should be fetched:

      a, b: negative ints, position relative to 5' splice site of SE
            a < b

      c, d: positive ints, position relative to 3' splice site of SE
            c < d

    Genes for which ``get_event_recs_from_gene`` returns None are skipped.
    When ``overwrite`` is False and the FASTA output already exists,
    nothing is written.  ``entries_to_include`` is not used by this
    function.

    Returns the filename of the FASTA output.  Raises Exception when
    flanking introns are requested but their coordinates cannot be found
    for a gene.
    """
    # Raw string: same pattern, without relying on an unrecognised escape
    file_basename = re.sub(r"\.gff3?", "",
                           os.path.basename(gff_fname))
    output_basename = "%s.event_seqs" % (file_basename)
    if flanking_introns_coords is not None:
        output_basename = "%s.flank_intronic_%s_%s_%s_%s" \
            % (output_basename,
               flanking_introns_coords[0],
               flanking_introns_coords[1],
               flanking_introns_coords[2],
               flanking_introns_coords[3])
    gff_outdir = os.path.join(output_dir, "gff_coords")
    utils.make_dir(gff_outdir)
    gff_output_fname = os.path.join(gff_outdir,
                                    "%s.gff" % (output_basename))
    fasta_output_fname = os.path.join(output_dir,
                                      "%s.fa" % (output_basename))
    if not overwrite:
        if os.path.isfile(fasta_output_fname):
            print("Output file %s exists. Skipping..."
                  % (fasta_output_fname))
            return fasta_output_fname
    print("Outputting GFF coordinates to: %s" % (gff_output_fname))
    if os.path.isfile(gff_output_fname):
        print(" - Overwriting existing file")
    print("Outputting sequences to: %s" % (fasta_output_fname))
    if os.path.isfile(fasta_output_fname):
        print(" - Overwriting existing file")
    genes = gene_utils.load_genes_from_gff(gff_fname)
    # The context manager closes the coordinates file before it is read
    # back for FASTA output, and also when an error interrupts the loop
    with open(gff_output_fname, "w") as gff_out_file:
        gff_out = miso_gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            event_recs = get_event_recs_from_gene(gene_obj, gene_tree)
            # Must be checked before event_recs is indexed
            if event_recs is None:
                continue
            # GFF records to write for the current gene: up, se, and
            # dn exons
            recs_to_write = [event_recs["up_exon"]["record"],
                             event_recs["se_exon"]["record"],
                             event_recs["dn_exon"]["record"]]
            if with_flanking_introns:
                introns_coords = get_flanking_introns_coords(gene_obj)
                if introns_coords is None:
                    raise Exception("Cannot find flanking introns coordinates.")
                # Full upstream intron
                up_intron_start, up_intron_end = \
                    introns_coords["up_intron"]
                up_intron_len = up_intron_end - up_intron_start + 1
                # Full downstream intron
                dn_intron_start, dn_intron_end = \
                    introns_coords["dn_intron"]
                dn_intron_len = dn_intron_end - dn_intron_start + 1
                se_exon_rec = event_recs["se_exon"]["record"]
                # If given custom coordinates, use them instead of the
                # entire up/down flanking intronic coordinates
                if flanking_introns_coords is not None:
                    a, b = \
                        int(flanking_introns_coords[0]), \
                        int(flanking_introns_coords[1])
                    c, d = \
                        int(flanking_introns_coords[2]), \
                        int(flanking_introns_coords[3])
                    a, b, c, d = \
                        error_check_intronic_coords(a, b, c, d,
                                                    up_intron_len,
                                                    dn_intron_len)
                    # a, b are offsets from the start of the SE exon,
                    # c, d are offsets from its end
                    up_intron_start = se_exon_rec.start + a
                    up_intron_end = se_exon_rec.start + b
                    dn_intron_start = se_exon_rec.end + c
                    dn_intron_end = se_exon_rec.end + d
                # Make GFF records for up/dn intronic sequences
                chrom = se_exon_rec.seqid
                source = se_exon_rec.source
                strand = se_exon_rec.strand
                long_mRNA_id = event_recs["long_mRNA"].get_id()
                up_intron_str = "%s.up_intron" % (long_mRNA_id)
                up_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, "intron",
                                       up_intron_start, up_intron_end,
                                       strand=strand,
                                       attributes={"ID": [up_intron_str],
                                                   "Parent": [gene_obj.label]})
                dn_intron_str = "%s.dn_intron" % (long_mRNA_id)
                dn_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, "intron",
                                       dn_intron_start, dn_intron_end,
                                       strand=strand,
                                       attributes={"ID": [dn_intron_str],
                                                   "Parent": [gene_obj.label]})
                recs_to_write.append(up_intron_rec)
                recs_to_write.append(dn_intron_rec)
            # Write out records to GFF
            for rec in recs_to_write:
                gff_out.write(rec)
    # Output FASTA sequences
    output_fasta_seqs_from_gff(gff_output_fname,
                               fasta_fname,
                               fasta_output_fname)
    return fasta_output_fname
def instert_introns_to_gff3(gff_filename, output_gff3_filename):
    """
    Copy a GFF annotation to ``<output_gff3_filename>_introns.gff3``,
    adding 'intron' records.

    For every gene the gene record is written first, then each mRNA
    record followed by its exon records.  For each isoform two passes
    then emit intron records:

      1. a pass that pairs every part with itself and can emit a record
         spanning from coordinate 1 to the base before the exon, or from
         the base after the exon to the gene end;
      2. a pass over consecutive parts that emits one record per gap
         between them.

    In both passes a candidate with start >= end is skipped.  Nothing is
    returned; progress and timing are printed to stdout.

    NOTE(review): the statement nesting below was reconstructed from a
    copy of this function whose indentation had been lost -- confirm it
    against the canonical source before relying on it.
    """
    # Single-argument os.path.join returns its argument unchanged
    output_filename = os.path.join("%s_introns.gff3" %(output_gff3_filename))
    print "Adding introns to GFF..."
    print " - Input: %s" %(gff_filename)
    print " - Output: %s" %(output_filename)
    # NOTE(review): this file handle is never closed explicitly
    gff_out = gff_utils.Writer(open(output_filename, "w"))
    # NOTE(review): gff_db is never read in this function
    gff_db = gff_utils.GFFDatabase(from_filename=gff_filename,
                                   reverse_recs=True)
    t1 = time.time()
    genes = gene_utils.load_genes_from_gff(gff_filename)
    for gene_id in genes:
        gene_info = genes[gene_id]
        gene_tree = gene_info["hierarchy"]
        gene_obj = gene_info["gene_object"]
        gene_rec = gene_tree[gene_id]["gene"]
        # Gene bounds are parsed out of the record's string form, taking
        # the comma-separated fields at positions 3 and 4
        # (presumably start and end -- TODO confirm against the record's
        # string representation)
        gene_start = int(str(gene_tree[gene_id]['gene']).split(",")[3].strip(" "))
        gene_end = int(str(gene_tree[gene_id]['gene']).split(",")[4].strip(" "))
        # Write the GFF record
        gff_out.write(gene_rec)
        # Write out the mRNAs, their exons, and then
        # input the introns
        for mRNA_id in gene_tree[gene_id]["mRNAs"]:
            curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA_id]
            gff_out.write(curr_mRNA["record"])
            # Write out the exons
            curr_exons = gene_tree[gene_id]["mRNAs"][mRNA_id]["exons"]
            for exon in curr_exons:
                gff_out.write(curr_exons[exon]["record"])
        # Now output the introns
        for isoform in gene_obj.isoforms:
            intron_coords = []
            # First pass: zip(parts, parts) pairs every part with itself,
            # so first_exon and second_exon are the same part here
            for first_exon, second_exon in zip(isoform.parts, isoform.parts):
                final_exon_end=0
                first_exon_start=0
                # Only set once the number of introns recorded so far
                # equals len(parts) - 1
                if(len(intron_coords)==len(isoform.parts)-1):
                    final_exon_end=second_exon.end
                # Only set for single-part isoforms
                if(len(isoform.parts)==1):
                    first_exon_start=first_exon.start
                if(first_exon_start>1 and gene_start==1):
                    # Region from coordinate 1 up to the base before the part
                    intron_start=1
                    intron_end = first_exon_start - 1
                elif(gene_end>final_exon_end and final_exon_end!=0):
                    # NOTE(review): looks like leftover debug output
                    print first_exon_start
                    # Region from the base after the part to the gene end
                    intron_start=final_exon_end+1
                    intron_end=gene_end
                else:
                    # With a part paired against itself this gives
                    # end + 1 and start - 1 of the same part
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1
                if intron_start >= intron_end:
                    continue
                intron_coords.append((intron_start, intron_end))
                # Create record for this intron
                # NOTE(review): intron_id is built but not used; the
                # record's ID attribute below is the gene label
                intron_id = "%s:%d-%d:%s" %(gene_obj.chrom, intron_start, intron_end,gene_obj.strand)
                intron_rec = \
                    gff_utils.GFF(gene_obj.chrom, gene_rec.source, "intron",
                                  intron_start, intron_end,".",gene_obj.strand,
                                  attributes={"ID": [gene_obj.label],
                                              "Parent": [isoform.label]})
                gff_out.write(intron_rec)
            # Second pass: gaps between consecutive parts
            for first_exon, second_exon in zip(isoform.parts,
                                               isoform.parts[1::1]):
                # Intron start coordinate is the coordinate right after
                # the end of the first exon, intron end coordinate is the
                # coordinate just before the beginning of the second exon
                intron_start = first_exon.end + 1
                intron_end = second_exon.start - 1
                if intron_start >= intron_end:
                    continue
                intron_coords.append((intron_start, intron_end))
                # Create record for this intron
                # NOTE(review): as above, intron_id is not used and the
                # ID attribute is the gene label
                intron_id = "%s:%d-%d:%s" %(gene_obj.chrom, intron_start, intron_end,gene_obj.strand)
                intron_rec = \
                    gff_utils.GFF(gene_obj.chrom, gene_rec.source, "intron",
                                  intron_start, intron_end,".",gene_obj.strand,
                                  attributes={"ID": [gene_obj.label],
                                              "Parent": [isoform.label]})
                gff_out.write(intron_rec)
    t2 = time.time()
    print "Addition took %.2f minutes." %((t2 - t1)/60.)