def instert_introns_to_gff3(gff_filename, output_gff3_filename):
    """
    Write a copy of a GFF file with 'intron' records added.

    For each gene loaded from gff_filename, writes the gene record,
    then every mRNA record followed by its exon records, and finally
    one 'intron' record per gap between consecutive parts of each
    isoform.

    Arguments:

    - gff_filename: input GFF filename
    - output_gff3_filename: output prefix; records are written to
      "<output_gff3_filename>_introns.gff3"

    Returns the name of the file that was written.
    """
    output_filename = "%s_introns.gff3" % (output_gff3_filename)
    print("Adding introns to GFF...")
    print(" - Input: %s" % (gff_filename))
    print(" - Output: %s" % (output_filename))
    t1 = time.time()
    # Load the genes before creating the output file, so that a failed
    # load does not leave an empty output file behind
    genes = gene_utils.load_genes_from_gff(gff_filename)
    output_file = open(output_filename, "w")
    try:
        gff_out = gff_utils.Writer(output_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            gene_rec = gene_tree[gene_id]["gene"]
            # Write the GFF record
            gff_out.write(gene_rec)
            # Write out the mRNAs and their exons
            mRNAs = gene_tree[gene_id]["mRNAs"]
            for mRNA_id in mRNAs:
                curr_mRNA = mRNAs[mRNA_id]
                gff_out.write(curr_mRNA["record"])
                # Write out the exons
                curr_exons = curr_mRNA["exons"]
                for exon in curr_exons:
                    gff_out.write(curr_exons[exon]["record"])
            # Now output the introns
            for isoform in gene_obj.isoforms:
                parts = isoform.parts
                for first_exon, second_exon in zip(parts, parts[1:]):
                    # Intron start coordinate is the coordinate right after
                    # the end of the first exon, intron end coordinate is the
                    # coordinate just before the beginning of the second exon
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1
                    # Skip pairs with no gap between them; note that a gap
                    # of exactly one base (start == end) is skipped as well
                    if intron_start >= intron_end:
                        continue
                    # Create record for this intron
                    intron_id = "%s:%s:%d-%d:%s" % (isoform.label,
                                                    gene_obj.chrom,
                                                    intron_start,
                                                    intron_end,
                                                    gene_obj.strand)
                    intron_rec = \
                        gff_utils.GFF(gene_obj.chrom, gene_rec.source,
                                      "intron",
                                      intron_start, intron_end, ".",
                                      gene_obj.strand, ".",
                                      attributes={"ID": [intron_id],
                                                  "Parent": [isoform.label]})
                    gff_out.write(intron_rec)
    finally:
        # Always release the handle so the records are flushed to disk
        output_file.close()
    t2 = time.time()
    print("Addition took %.2f minutes." % ((t2 - t1) / 60.))
    return output_filename
def shorten_gff(input_gff, output_gff, max_id_len=75):
    """
    Write a copy of a GFF file in which every record has been
    passed through shorten_rec().

    Arguments:

    - input_gff: input GFF filename
    - output_gff: output GFF filename

    Optional arguments:

    - max_id_len: passed unchanged to shorten_rec() (presumably the
      ID length at which an ID gets shortened -- confirm against
      shorten_rec)
    """
    # Load input GFF
    t1 = time.time()
    gff_in = GFF.GFFDatabase(from_filename=input_gff,
                             reverse_recs=True)
    # Mapping from old to new IDs; the same dictionary is handed to
    # every shorten_rec() call
    old_to_new_ids = {}
    new_recs = [shorten_rec(rec, old_to_new_ids, max_id_len)
                for rec in gff_in]
    t2 = time.time()
    print("Loading of input GFF took %.2f seconds" %(t2 - t1))
    print("Writing revised gff to: %s" %(output_gff))
    output_file = open(output_gff, 'w')
    try:
        gff_writer = GFF.Writer(output_file)
        # Write new GFF file
        gff_writer.write_recs(new_recs)
    finally:
        # Close the handle even if writing fails partway through
        output_file.close()
def get_events_in_region(gff_filename, region, record_types=["gene"]):
    """
    Return the records of a GFF file that intersect a query region.

    Arguments:

    - gff_filename: GFF input filename
    - region: query region, parsed by parse_query_region() into
      (chrom, start, end, strand)

    Optional arguments:

    - record_types: GFF record types to collect (e.g. gene, mRNA, ...);
      records of any other type are ignored

    A record is kept when its chromosome equals the query chromosome,
    its coordinates intersect the query coordinates and, if the query
    has a strand, its strand equals the query strand. Each kept record
    is printed. Returns the list of kept records.
    """
    database = gff_utils.GFFDatabase(from_filename=gff_filename,
                                     reverse_recs=True)
    # Parse the query region
    query_chrom, query_start, query_end, query_strand = \
        parse_query_region(region)
    hits = []
    # Number of records whose type is one of record_types
    num_considered = 0
    for rec in database:
        rec_chrom = rec.seqid
        rec_type = rec.type
        rec_start = int(rec.start)
        rec_end = int(rec.end)
        rec_strand = rec.strand
        # Only look at the requested record types
        if rec_type not in record_types:
            continue
        num_considered += 1
        # Chromosomes must match and the coordinates must intersect
        overlaps = (query_chrom == rec_chrom) and \
            utils.intersect_coords(query_start, query_end,
                                   rec_start, rec_end)
        if not overlaps:
            continue
        # A strand in the query region must match the record's strand
        if query_strand is not None:
            if rec_strand != query_strand:
                continue
        # Record matches the query
        rec_id = rec.get_id()
        print("%s" % (rec_id))
        print(" ".join((" - ", str(rec))))
        hits.append(rec)
    print("Looked through %d records." % (num_considered))
    return hits
def add_introns_to_gff(gff_filename, output_dir):
    """
    Add 'intron' entries to GFF.

    For each gene loaded from gff_filename, writes the gene record,
    then every isoform's mRNA record followed by its exon records,
    and finally one 'intron' record per gap between consecutive parts
    of each isoform.

    Arguments:

    - gff_filename: input GFF filename (must have an extension)
    - output_dir: directory in which the output file
      "<basename>.with_introns.<ext>" is written

    Returns the output filename, both when the file is written and
    when it already exists and the work is skipped.
    """
    input_basename = os.path.basename(gff_filename)
    output_basename = utils.trim_gff_ext(input_basename)
    ext_to_use = input_basename.rsplit(".", 1)[1]
    output_filename = \
        os.path.join(output_dir,
                     "%s.with_introns.%s" %(output_basename,
                                            ext_to_use))
    print("Adding introns to GFF...")
    print(" - Input: %s" %(gff_filename))
    print(" - Output: %s" %(output_filename))
    if os.path.isfile(output_filename):
        print("Found file %s, skipping.." %(output_filename))
        return output_filename
    t1 = time.time()
    # Load the genes before creating the output file: an existing
    # output file makes later calls skip the work, so a failed load
    # must not leave an empty one behind
    genes = gene_utils.load_genes_from_gff(gff_filename)
    output_file = open(output_filename, "w")
    try:
        gff_out = miso_gff_utils.Writer(output_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            gene_rec = gene_tree[gene_id]["gene"]
            # Write the GFF record
            gff_out.write(gene_rec)
            # Write out the mRNAs and their exons
            for mRNA in gene_obj.isoforms:
                curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA.label]
                gff_out.write(curr_mRNA["record"])
                # Write out the exons
                curr_exons = curr_mRNA["exons"]
                for exon in curr_exons:
                    gff_out.write(curr_exons[exon]["record"])
            # Now output the introns
            for isoform in gene_obj.isoforms:
                parts = isoform.parts
                for first_exon, second_exon in zip(parts, parts[1:]):
                    # Intron start coordinate is the coordinate right after
                    # the end of the first exon, intron end coordinate is the
                    # coordinate just before the beginning of the second exon
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1
                    # Skip pairs with no gap between them; note that a gap
                    # of exactly one base (start == end) is skipped as well
                    if intron_start >= intron_end:
                        continue
                    # Create record for this intron
                    intron_id = "%s:%d-%d:%s.intron" \
                        %(gene_obj.chrom,
                          intron_start,
                          intron_end,
                          gene_obj.strand)
                    intron_rec = \
                        miso_gff_utils.GFF(gene_obj.chrom, gene_rec.source,
                                           "intron",
                                           intron_start, intron_end,
                                           strand=gene_obj.strand,
                                           attributes={"ID": [intron_id],
                                                       "Parent": [isoform.label]})
                    gff_out.write(intron_rec)
    finally:
        # Always release the handle so the records are flushed to disk
        output_file.close()
    t2 = time.time()
    print("Addition took %.2f minutes." %((t2 - t1)/60.))
    return output_filename
def fetch_seq_from_gff(gff_fname, fasta_fname, output_dir,
                       with_flanking_introns=False,
                       flanking_introns_coords=None,
                       overwrite=True,
                       entries_to_include=["gene", "mRNA", "exon"]):
    """
    Fetch sequence from GFF file.

    Outputs:

    (1) GFF file containing an annotation of the sequences.

    (2) FASTA file with the actual sequences.

    If asked, fetch the flanking intronic sequences.

    Flanking regions are marked below:

      U: region of upstream intron
      D: region of downstream intron

                  U           D

        [ U P ]-----[ S E ]-----[ D N ]
                 a,b         c,d

    a,b,c,d correspond to optional flanking intron coordinates
    that determine the regions of the upstream/downstream
    introns that should be fetched:

       a, b: negative ints, position relative to 5' splice site of SE
             (added to the start of the SE record)
             a < b

       c, d: positive ints, position relative to 3' splice site of SE
             (added to the end of the SE record)
             c < d

    Arguments:

    - gff_fname: input GFF filename
    - fasta_fname: FASTA filename handed to output_fasta_seqs_from_gff()
    - output_dir: output directory; the GFF of coordinates goes into
      its "gff_coords" subdirectory, the FASTA file into output_dir

    Optional arguments:

    - with_flanking_introns: also output up/downstream intron records
    - flanking_introns_coords: the (a, b, c, d) values described above
    - overwrite: if False and the output FASTA file exists, skip
    - entries_to_include: not read by this function

    Genes for which get_event_recs_from_gene() returns None are
    skipped. Returns the output FASTA filename.

    Raises Exception if flanking introns are requested and
    get_flanking_introns_coords() returns None for a gene.
    """
    file_basename = re.sub(r"\.gff3?", "",
                           os.path.basename(gff_fname))
    output_basename = "%s.event_seqs" %(file_basename)
    if flanking_introns_coords is not None:
        output_basename = "%s.flank_intronic_%s_%s_%s_%s" \
            %(output_basename,
              flanking_introns_coords[0],
              flanking_introns_coords[1],
              flanking_introns_coords[2],
              flanking_introns_coords[3])
    gff_outdir = os.path.join(output_dir, "gff_coords")
    utils.make_dir(gff_outdir)
    gff_output_fname = os.path.join(gff_outdir, "%s.gff" %(output_basename))
    fasta_output_fname = os.path.join(output_dir, "%s.fa" %(output_basename))
    if not overwrite:
        if os.path.isfile(fasta_output_fname):
            print("Output file %s exists. Skipping..." \
                  %(fasta_output_fname))
            return fasta_output_fname
    print("Outputting GFF coordinates to: %s" %(gff_output_fname))
    if os.path.isfile(gff_output_fname):
        print(" - Overwriting existing file")
    print("Outputting sequences to: %s" %(fasta_output_fname))
    if os.path.isfile(fasta_output_fname):
        print(" - Overwriting existing file")
    # Load GFF genes
    genes = gene_utils.load_genes_from_gff(gff_fname)
    gff_out_file = open(gff_output_fname, "w")
    try:
        gff_out = miso_gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            # GFF records to write for the current gene
            recs_to_write = []
            event_recs = get_event_recs_from_gene(gene_obj, gene_tree)
            # Check for None before touching event_recs: indexing
            # it first would raise TypeError instead of skipping
            if event_recs is None:
                continue
            long_mRNA_id = event_recs["long_mRNA"].get_id()
            # Write out up, se, and dn exons
            recs_to_write.extend([event_recs["up_exon"]["record"],
                                  event_recs["se_exon"]["record"],
                                  event_recs["dn_exon"]["record"]])
            # Extract the flanking introns of the alternative exon
            # if asked
            if with_flanking_introns:
                introns_coords = \
                    get_flanking_introns_coords(gene_obj)
                if introns_coords is None:
                    raise Exception("Cannot find flanking introns coordinates.")
                # Upstream intron coordinates
                up_intron_start, up_intron_end = \
                    introns_coords["up_intron"]
                up_intron_len = up_intron_end - up_intron_start + 1
                # Downstream intron coordinates
                dn_intron_start, dn_intron_end = \
                    introns_coords["dn_intron"]
                dn_intron_len = dn_intron_end - dn_intron_start + 1
                # If given custom coordinates, use them instead of entire
                # up/down flanking intronic coordinates.
                se_exon_rec = event_recs["se_exon"]["record"]
                if flanking_introns_coords is not None:
                    # (start,end) of upstream intron sequence
                    a, b = \
                        int(flanking_introns_coords[0]), \
                        int(flanking_introns_coords[1])
                    # (start,end) of downstream intron sequence
                    c, d = \
                        int(flanking_introns_coords[2]), \
                        int(flanking_introns_coords[3])
                    a, b, c, d = \
                        error_check_intronic_coords(a, b, c, d,
                                                    up_intron_len,
                                                    dn_intron_len)
                    # Upstream offsets are applied to the start of the
                    # SE record, downstream offsets to its end
                    up_intron_start = se_exon_rec.start + a
                    up_intron_end = se_exon_rec.start + b
                    dn_intron_start = se_exon_rec.end + c
                    dn_intron_end = se_exon_rec.end + d
                # Make GFF records for up/dn intronic sequences
                chrom = se_exon_rec.seqid
                source = se_exon_rec.source
                strand = se_exon_rec.strand
                up_intron_str = "%s.up_intron" %(long_mRNA_id)
                up_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, "intron",
                                       up_intron_start, up_intron_end,
                                       strand=strand,
                                       attributes={"ID": [up_intron_str],
                                                   "Parent": [gene_obj.label]})
                dn_intron_str = "%s.dn_intron" %(long_mRNA_id)
                dn_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, "intron",
                                       dn_intron_start, dn_intron_end,
                                       strand=strand,
                                       attributes={"ID": [dn_intron_str],
                                                   "Parent": [gene_obj.label]})
                recs_to_write.append(up_intron_rec)
                recs_to_write.append(dn_intron_rec)
            # Write out records to GFF
            for rec in recs_to_write:
                gff_out.write(rec)
    finally:
        # Close the handle even if a gene fails, so that what was
        # written so far reaches the disk
        gff_out_file.close()
    # Output FASTA sequences
    output_fasta_seqs_from_gff(gff_output_fname,
                               fasta_fname,
                               fasta_output_fname)
    return fasta_output_fname
def get_const_exons_by_gene(gff_filename, output_dir,
                            output_filename=None,
                            all_constitutive=False,
                            min_size=0,
                            output_format='gff'):
    """
    Collect the constitutive exons of every gene in a GFF file.

    Arguments:

    - gff_filename: GFF input filename
    - output_dir: output directory (created if it does not exist)

    Optional arguments:

    - output_filename: file to write the exons to; if None, defaults to
      "<basename>.min_<min_size>.const_exons.gff" inside output_dir
      (the default name ends in ".gff" whatever output_format is)
    - all_constitutive: treat all exons as constitutive; when set,
      no output file is written
    - min_size: minimum exon size
    - output_format: gff or BED

    Returns the pair (list of constitutive exons, output filename).
    """
    print("Getting constitutive exons...")
    print(" - Input GFF: %s" % (gff_filename))
    print(" - Output dir: %s" % (output_dir))
    print(" - Output format: %s" % (output_format))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    if min_size > 0:
        print(" - Including only exons greater than or " \
              "equal to %d-bp" \
              %(min_size))
    start_time = time.time()
    gff_in = gff_utils.GFFDatabase(from_filename=gff_filename)
    all_const_exons = []
    exon_count = 0
    # Gather the constitutive exons gene by gene from each gene's mRNAs
    for _gene, gene_mRNAs in gff_in.mRNAs_by_gene.iteritems():
        gene_const_exons = \
            get_const_exons_from_mRNA(gff_in, gene_mRNAs,
                                      all_constitutive=all_constitutive,
                                      min_size=min_size)
        all_const_exons.extend(gene_const_exons)
        exon_count = exon_count + len(gene_const_exons)
    end_time = time.time()
    gff_basename = os.path.basename(gff_filename)
    basename = re.sub("[.]gff3?", "", gff_basename)
    if output_filename is None:
        # No filename given as argument, so build the default one
        default_name = "%s.min_%d.const_exons.gff" \
            %(basename, min_size)
        output_filename = os.path.join(output_dir, default_name)
    if all_constitutive:
        print("Constitutive exons GFF was given, so not outputting " \
              "another one.")
    else:
        print("Constitutive exon retrieval took %.2f seconds (%d exons)." \
              %((end_time - start_time), exon_count))
        output_exons_to_file(all_const_exons, output_filename,
                             output_format=output_format)
    return all_const_exons, output_filename