def output_exons_as_bed(self):
    """
    Output the table's exons as BED.

    Only implemented for ensGene.txt; probably not necessary
    to work out for other tables since this is only used
    for aggregate statistics.

    Each row of ``self.raw_table`` contributes one set of exon intervals,
    named by the row's ``name2`` field (used here as the gene ID).

    Returns the path of the exons BED file (whether it was just written
    or already existed), or None when ``self.source`` is not "ensGene".
    """
    if self.source != "ensGene":
        return
    output_filename = os.path.join(self.exons_dir,
                                   "%s.exons.bed" % (self.source))
    print("Outputting exons...")
    if os.path.isfile(output_filename):
        # An existing file is treated as a finished previous run.
        print(" - Found %s. Skipping..." % (output_filename))
        return output_filename
    # The context manager guarantees the handle is closed even if a row
    # fails to parse or the BED writer raises.
    with open(output_filename, "w") as exons_file:
        for _, series in self.raw_table.iterrows():
            gene_info = series.to_dict()
            gene_id = gene_info["name2"]
            chrom = gene_info["chrom"]
            strand = gene_info["strand"]
            # exonStarts / exonEnds are comma-separated lists with a
            # trailing comma, hence the rstrip before splitting.
            # Keep 0-based start of ensGene table since
            # this will be outputted as a BED
            exon_starts = [int(start) for start
                           in gene_info["exonStarts"].rstrip(",").split(",")]
            exon_ends = [int(end) for end
                         in gene_info["exonEnds"].rstrip(",").split(",")]
            # Materialize the pairs so the helper always receives a list.
            exon_coords = list(zip(exon_starts, exon_ends))
            # Output as BED: encode gene ID
            bedtools_utils.output_intervals_as_bed(exons_file,
                                                   chrom,
                                                   exon_coords,
                                                   strand,
                                                   name=gene_id)
    return output_filename
def output_introns(self, min_intron_size=50):
    """
    Given the merged exons, compute the intronic coordinates.

    Only implemented for ensGene.txt; probably not necessary
    to work out for other tables since this is only used
    for aggregate statistics.

    Introns are taken as the gaps between consecutive entries of each
    gene's merged exon list (in the order returned by
    ``self.load_merged_exons_by_gene()``). Intronic content that is
    less than 'min_intron_size' is excluded.

    Writes ``ensGene.introns.bed`` into ``self.introns_dir`` unless that
    file already exists. Returns None.
    """
    if self.source != "ensGene":
        return
    output_filename = os.path.join(self.introns_dir,
                                   "ensGene.introns.bed")
    print("Outputting introns...")
    if os.path.isfile(output_filename):
        # An existing file is treated as a finished previous run.
        print(" - Found %s. Skipping..." % (output_filename))
        return
    print(" - Output file: %s" % (output_filename))
    # Load ensGene exons *before* creating the output file: if loading
    # fails, no empty introns file is left behind to be mistaken for a
    # completed run by the existence check above.
    merged_exons_by_gene = self.load_merged_exons_by_gene()
    # The context manager closes the file even if writing raises.
    with open(output_filename, "w") as introns_file:
        for gene_id, merged_exons in merged_exons_by_gene.items():
            chrom = merged_exons[0]["chrom"]
            strand = merged_exons[0]["strand"]
            # For each gene, get its list of introns and serialize them
            # (assumes merged exons are sorted by position -- TODO confirm
            # against load_merged_exons_by_gene)
            exon_coords = [(int(exon["start"]), int(exon["end"]))
                           for exon in merged_exons]
            intron_coords = []
            for first_exon, second_exon in zip(exon_coords,
                                               exon_coords[1:]):
                # Intron start coordinate is the coordinate right after
                # the end of the first exon, intron end coordinate is the
                # coordinate just before the beginning of the second exon
                # NOTE(review): whether +1/-1 is right depends on the
                # coordinate convention of the merged exons (closed vs.
                # BED half-open) -- confirm against the loader.
                intron_start = first_exon[1] + 1
                intron_end = second_exon[0] - 1
                if intron_start >= intron_end:
                    # Exons abut or overlap: no intron between them.
                    continue
                # Filter on intron size (in 0-based coordinates)
                intron_size = intron_end - intron_start
                if intron_size < min_intron_size:
                    continue
                intron_coords.append((intron_start, intron_end))
            bedtools_utils.output_intervals_as_bed(introns_file,
                                                   chrom,
                                                   intron_coords,
                                                   strand,
                                                   name=gene_id)