def get_overlaping_clusters(self, region, overlap=1):
    """Return the reads from the BAM file that overlap `region`.

    The read position for `region` is looked up with `self.get_bam_tell`
    and handed to a `BamReader`; every line the reader yields is parsed as
    a SAM `Cluster`.

    Args:
        region: object exposing `name`, `start` and `end`, and accepted by
            `Cluster.overlap`.
        overlap: threshold; a read is kept when `c.overlap(region)` is
            greater than or equal to this value.

    Returns:
        A list of the `Cluster` objects that met the threshold, in the
        order the reader produced them. The list is empty when nothing
        matched or when the file was not read at all.
    """
    clusters = []
    bam_tell, read_start = self.get_bam_tell(region)
    # Diagnostics go through the logger instead of stdout.
    self.logger.debug('TELL %s %s'%(bam_tell, read_start))

    # The file is only read when a position was found, or when the region
    # starts below LINEAR_SIZE*4.
    # NOTE(review): presumably a falsy bam_tell is still usable for regions
    # near the start of the reference -- confirm against get_bam_tell.
    if bam_tell or region.start < LINEAR_SIZE*4:
        reader = BamReader(self.bam_path, self.logger, bam_tell, read_start,
                           self.chr_dict_inv)
        for line in reader:
            c = Cluster(read=SAM, cached=False,
                        read_half_open=self.read_half_open,
                        rounding=self.rounding)
            try:
                c.read_line(line)
            except InvalidLine:
                # User-facing message stays on stdout; the single-argument
                # call form prints the same text on Python 2 and Python 3.
                print("Invalid line, .bam or .bai corrupt")
                break

            if c.overlap(region) >= overlap:
                clusters.append(c)
            elif c.start > region.end or c.name != region.name:
                # Stop at the first read past the region or on another
                # sequence name.
                # NOTE(review): assumes reads arrive sorted -- confirm.
                break

    if clusters:
        self.logger.debug('Num clusters %s'%len(clusters))
        self.logger.debug('first: %s %s'%(clusters[0].start, clusters[0].end))
        if len(clusters) > 1:
            self.logger.debug('end: %s %s'%(clusters[-1].start, clusters[-1].end))
    else:
        self.logger.debug('No clusters found!')

    return clusters
def test_overlap(self):
    """Parse several BED/PK clusters and check `Cluster.overlap` on pairs.

    Some of the clusters (the chr5 ones) are only parsed and never
    compared; they still exercise `read_line` on those inputs.
    """
    def parse(line, **cluster_options):
        # Build a Cluster with the given options and load one text line.
        parsed = Cluster(**cluster_options)
        parsed.read_line(line)
        return parsed

    bed_chr1_a = parse('chr1 1 100 hola 666 +', read=BED)
    bed_chr1_b = parse('chr1 51 200 hola 666 +', read=BED)
    pk_chr3 = parse('chr3 1 100 100:1', read=PK)
    bed_chr5_wide = parse('chr5 1 1000 hola 666 +', read=BED)
    pk_chr5 = parse('chr5 1 300 300:1', read=PK)
    bed_chr5_inner = parse('chr5 100 900 hola 666 +', read=BED)
    half_open_read = parse('chrX 61836251 61836287 id:105282 1000 +',
                           read=BED, read_half_open=True)
    satellite = parse('chrX 61836270 61837703 Satellite', read=BED)

    # (left, right, expected value of left.overlap(right)), checked in order.
    expectations = (
        (half_open_read, satellite, 0.5),
        (bed_chr1_a, pk_chr3, 0),
        (bed_chr1_a, bed_chr1_a, 1),
        (bed_chr1_a, bed_chr1_b, 0.5),
    )
    for left, right, expected in expectations:
        self.assertEqual(left.overlap(right), expected)
def get_overlaping_clusters(self, region, overlap=1):
    """Return the reads from the BAM file that overlap `region`.

    Runs ``samtools view <bam_path> <name>:<start>-<end>`` and parses every
    non-empty output line as a SAM `Cluster`.

    Args:
        region: object exposing `name`, `start` and `end`, and accepted by
            `Cluster.overlap`.
        overlap: threshold; a read is kept when `c.overlap(region)` is
            greater than or equal to this value.

    Returns:
        A list of the `Cluster` objects that met the threshold, in samtools
        output order. The list is empty when samtools produced no output or
        could not be started.
    """
    clusters = []
    self.logger.debug('Launching Samtools for %s...'%region)

    # Pass an argument list and no shell: the BAM path and the region name
    # reach samtools verbatim, so spaces or shell metacharacters in them
    # cannot alter the command. The path is not shell-expanded either.
    command = [
        "samtools",
        "view",
        "%s"%self.bam_path,
        "%s:%s-%s"%(region.name, region.start, region.end),
    ]
    try:
        # universal_newlines=True yields text output on Python 2 and 3.
        proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                universal_newlines=True)
    except OSError as launch_error:
        # With a shell, a missing samtools binary gave empty output rather
        # than an exception; keep returning an empty result for callers.
        print("Could not launch samtools: %s" % launch_error)
        return clusters

    out, _ = proc.communicate()
    self.logger.debug('... done')
    if proc.returncode:
        self.logger.debug('samtools exited with status %s'%proc.returncode)

    # A real list (not a lazy filter object) so len() works everywhere.
    lines = [line for line in out.split("\n") if line]
    self.logger.debug('Numlines in %s: %s'%(region, len(lines)))

    for line in lines:
        c = Cluster(read=SAM, cached=False,
                    read_half_open=self.read_half_open,
                    rounding=self.rounding)
        try:
            c.read_line(line)
        except InvalidLine:
            # Single-argument call form prints the same text on Python 2
            # and Python 3.
            print("Invalid line, .bam or .bai corrupt")
            break

        if c.overlap(region) >= overlap:
            clusters.append(c)
        elif c.start > region.end or c.name != region.name:
            # Stop at the first read past the region or on another
            # sequence name.
            # NOTE(review): assumes samtools output is sorted -- confirm.
            break

    return clusters