Пример #1
0
    def update_observed_distribution(self, reads):
        """Add pairwise read distances to ``self.observed_distribution``.

        Args:
            reads: iterable of 3-tuples whose middle element is the read;
                the first and last elements are ignored here.

        Reads for which ``tk_lane.extract_read_position`` returns ``None``
        are dropped.  The remaining reads are grouped by flowcell, and in
        each group the first (up to) 200 reads are compared against every
        later read of the same group.  Each Euclidean distance, computed in
        lane coordinates, is passed to ``self.observed_distribution.increment``.
        """
        # Pair every read with its parsed position, skipping unparsable ones.
        located = []
        for _key, read, _idx in reads:
            position = tk_lane.extract_read_position(read)
            if position is None:
                continue
            located.append((position, read))

        def by_flowcell(entry):
            return "%s" % (entry[0].flowcell)

        # groupby only merges adjacent items, so sort on the same key first.
        located.sort(key=by_flowcell)

        for _flowcell, members in itertools.groupby(located, by_flowcell):
            coords = [
                self.lane_coordinate_system.convert_to_lane_coords(position)
                for position, _ in members
            ]

            # Cap the number of "anchor" reads to bound the quadratic work.
            for first, here in enumerate(coords[:200]):
                for there in coords[first + 1:]:
                    delta_x = here[0] - there[0]
                    delta_y = here[1] - there[1]
                    dist = math.sqrt(delta_x**2 + delta_y**2)
                    self.observed_distribution.increment(dist)
Пример #2
0
def estimate_flowcell_geometry(bam_in, lane_coordinate_system):
    """Estimate the x/y extent of the flowcell from the start of a BAM file.

    Looks at up to the first 100000 reads of ``bam_in``, converts each read
    position to lane coordinates and accumulates both an overall extent and a
    per flowcell/lane extent.  ``bam_in.reset()`` is called before and after
    the scan.

    Args:
        bam_in: iterable of reads that also provides ``reset()``.
        lane_coordinate_system: object providing ``convert_to_lane_coords``.

    Returns:
        ``{"x": (min, max), "y": (min, max)}`` taken from the overall extent,
        or ``None`` when the smallest pairwise overlap between lane extents
        is below 0.9.
    """

    bam_in.reset()

    lane_extents = {}
    flowcells = set()
    result = XYrange()

    for read in itertools.islice(bam_in, 100000):
        read_loc = tk_lane.extract_read_position(read)
        if read_loc is None:
            # No position could be extracted for this read; skip it instead
            # of failing on the attribute accesses below.
            continue
        loc_coords = lane_coordinate_system.convert_to_lane_coords(read_loc)
        flowcells.add(read_loc.flowcell)
        key = "%s_%s" % (read_loc.flowcell, read_loc.lane)
        if key not in lane_extents:
            lane_extents[key] = XYrange()
        lane_extents[key].update(loc_coords)
        result.update(loc_coords)
    bam_in.reset()

    # Smallest overlap over every unordered pair of lanes (1.0 if < 2 lanes).
    min_overlap = 1.0
    for extent_a, extent_b in itertools.combinations(lane_extents.values(), 2):
        min_overlap = min(extent_a.overlap(extent_b), min_overlap)

    # Single-argument print(...) produces the same text under Python 2 and 3.
    print("Flowcells :  %s" % (flowcells,))
    for k, v in lane_extents.items():
        print("%s %s" % (k, v))

    print("Flowcell overlap =  %s" % (min_overlap,))

    if len(flowcells) > 1:
        print("Multiple flowcells found")

    if min_overlap < 0.9:
        print("Flowcells do not have identical geometry.")
        return None

    return {"x": (result.xrange.min, result.xrange.max), "y": (result.yrange.min, result.yrange.max)}
Пример #3
0
    def count_dups_by_distance(self, namedreads):
        """Count nearby duplicates in a set of reads.

        Reads are grouped by flowcell and lane.  Within a group, each of the
        first (up to) 200 reads is compared against every later read; the
        later read of a close pair is the one recorded, so a pair counts as 1
        and a read is counted at most once per category.

        Args:
            namedreads: iterable of ``(footprint, barcode, read)`` tuples;
                only ``read`` is used.  Reads for which
                ``tk_lane.extract_read_position`` returns ``None`` are skipped.

        Returns:
            Tuple ``(opt_dups_found, diff_dups_found)``: the number of reads
            closer than ``OPTICAL_DUPLICATE_DISTANCE`` to an earlier read,
            and the number closer than ``MAX_DIFFUSION_DUP_DISTANCE``.  The
            latter is only counted for groups whose layout reports
            ``has_diffusion_duplicates(MAX_DIFFUSION_DUP_DISTANCE)``.
        """
        # Get (flowcell, lane, surface, swath, tile, x, y) tuples for each read
        read_locs = []
        for (_footprint, _barcode, read) in namedreads:
            read_loc = tk_lane.extract_read_position(read)
            if read_loc is not None:
                read_locs.append((read_loc, read))

        # Sort by flowcell_lane (groupby only merges adjacent items)
        def flowcell_lane(read_loc):
            return "%s_%s" % (read_loc[0].flowcell, read_loc[0].lane)

        read_locs.sort(key=flowcell_lane)
        lane_groups = itertools.groupby(read_locs, flowcell_lane)

        opt_dups_found = 0  # really close dupes
        diff_dups_found = 0  # somewhat close dupes

        # Measure distances between all pairs in a lane
        for (_lane, lane_reads) in lane_groups:
            lane_reads = list(lane_reads)

            layout = self.lane_coordinate_system.get_layout_for_read_loc(
                lane_reads[0][0])
            test_dups = layout.has_diffusion_duplicates(
                MAX_DIFFUSION_DUP_DISTANCE)

            if len(lane_reads) > 100:
                martian.log_info("Got dup cluster of size: %d" %
                                 len(lane_reads))
                first_read = lane_reads[0][1]
                martian.log_info(
                    "tid: %d, pos: %d, mapq: %d, seq: %s" %
                    (first_read.reference_id, first_read.reference_start,
                     first_read.mapping_quality, first_read.query_sequence))

            opt_dups = set()
            diff_dups = set()
            dump = []
            # Cap the number of "anchor" reads to bound the quadratic work.
            cmp_reads = min(200, len(lane_reads))
            lane_loc_coords = [
                self.lane_coordinate_system.convert_to_lane_coords(loc)
                for (loc, _) in lane_reads
            ]
            for i in range(cmp_reads):
                loc1 = lane_reads[i][0]
                lane_loc1 = lane_loc_coords[i]

                for j in range(i + 1, len(lane_reads)):
                    loc2 = lane_reads[j][0]
                    lane_loc2 = lane_loc_coords[j]

                    dist = math.sqrt((lane_loc1[0] - lane_loc2[0])**2 +
                                     (lane_loc1[1] - lane_loc2[1])**2)
                    if test_dups and dist < MAX_DIFFUSION_DUP_DISTANCE:
                        # Test membership BEFORE adding j: the diagnostic row
                        # is recorded only the first time read j is flagged.
                        if self.write_to_stdout and j not in diff_dups:
                            dump.append(
                                ("%d\t" + ("%d\t" * 14)) %
                                (dist, loc1.surface, loc1.swath, loc1.tile,
                                 loc1.x, loc1.y, lane_loc1[0], lane_loc1[1],
                                 loc2.surface, loc2.swath, loc2.tile, loc2.x,
                                 loc2.y, lane_loc2[0], lane_loc2[1]))
                        diff_dups.add(j)

                    if dist < OPTICAL_DUPLICATE_DISTANCE:
                        opt_dups.add(j)

            if self.write_to_stdout and len(diff_dups) >= 2:
                for x in dump:
                    print("%d\t%s" % (len(diff_dups), x))

            diff_dups_found += len(diff_dups)
            opt_dups_found += len(opt_dups)

        return opt_dups_found, diff_dups_found