示例#1
0
 def output_exons_as_bed(self):
     """
     Output the table's exons as BED.

     Only implemented for ensGene.txt; probably not
     necessary to work out for other tables since this is
     only used for aggregate statistics.

     Iterates over the rows of self.raw_table and, for each row,
     writes the exon intervals parsed from its comma-separated
     'exonStarts'/'exonEnds' fields to
     <self.exons_dir>/<self.source>.exons.bed, using the row's
     'name2' field as the BED name. If that file already exists,
     nothing is written.

     Returns the path of the BED file, or None if the table
     source is not "ensGene".
     """
     if self.source != "ensGene":
         return
     output_filename = os.path.join(self.exons_dir,
                                    "%s.exons.bed" % (self.source))
     print("Outputting exons...")
     if os.path.isfile(output_filename):
         print("  - Found %s. Skipping..." % (output_filename))
         return output_filename
     # Use a context manager so the file handle is released even if
     # parsing or writing one of the rows raises
     with open(output_filename, "w") as exons_file:
         for _, series in self.raw_table.iterrows():
             gene_info = series.to_dict()
             gene_id = gene_info["name2"]
             chrom = gene_info["chrom"]
             strand = gene_info["strand"]
             # Keep 0-based start of ensGene table since
             # this will be outputted as a BED
             exon_starts = (int(start) for start in
                            gene_info["exonStarts"].rstrip(",").split(","))
             exon_ends = (int(end) for end in
                          gene_info["exonEnds"].rstrip(",").split(","))
             # Materialize the (start, end) pairs as a list so the
             # helper receives a concrete sequence
             exon_coords = list(zip(exon_starts, exon_ends))
             # Output as BED: encode gene ID
             bedtools_utils.output_intervals_as_bed(exons_file,
                                                    chrom, exon_coords, strand,
                                                    name=gene_id)
     return output_filename
示例#2
0
    def output_introns(self, min_intron_size=50):
        """
        Given the merged exons, compute the intronic coordinates.

        Only implemented for ensGene.txt; probably not
        necessary to work out for other tables since this is
        only used for aggregate statistics.

        Introns are taken as the gaps between consecutive merged
        exons of each gene, in the order the exons are returned by
        self.load_merged_exons_by_gene(). They are written to
        <self.introns_dir>/ensGene.introns.bed. If that file already
        exists, nothing is written.

        Exclude intronic content that is less than 'min_intron_size'.

        Returns the path of the introns BED file, or None if the
        table source is not "ensGene".
        """
        if self.source != "ensGene":
            return
        output_filename = os.path.join(self.introns_dir,
                                       "ensGene.introns.bed")
        print("Outputting introns...")
        if os.path.isfile(output_filename):
            print("  - Found %s. Skipping..." % (output_filename))
            return output_filename
        print(" - Output file: %s" % (output_filename))
        # Load ensGene exons
        merged_exons_by_gene = self.load_merged_exons_by_gene()
        # Use a context manager so the file handle is released even if
        # processing one of the genes raises
        with open(output_filename, "w") as introns_file:
            for gene_id, merged_exons in merged_exons_by_gene.iteritems():
                # A gene without exons has no introns (and no
                # chrom/strand to read), so skip it
                if not merged_exons:
                    continue
                chrom = merged_exons[0]["chrom"]
                strand = merged_exons[0]["strand"]
                # For each gene, get its list of introns and serialize them
                exon_coords = [(int(exon["start"]), int(exon["end"]))
                               for exon in merged_exons]
                intron_coords = []
                # Pair each exon with the one that follows it
                for first_exon, second_exon in zip(exon_coords,
                                                   exon_coords[1:]):
                    # Intron start coordinate is the coordinate right after
                    # the end of the first exon, intron end coordinate is the
                    # coordinate just before the beginning of the second exon
                    intron_start = first_exon[1] + 1
                    intron_end = second_exon[0] - 1
                    # Adjacent or overlapping exons leave no intron
                    if intron_start >= intron_end:
                        continue
                    # Filter on intron size (in 0-based coordinates)
                    intron_size = intron_end - intron_start
                    if intron_size < min_intron_size:
                        continue
                    intron_coords.append((intron_start, intron_end))
                bedtools_utils.output_intervals_as_bed(introns_file,
                                                       chrom,
                                                       intron_coords,
                                                       strand,
                                                       name=gene_id)
        return output_filename