def call_debruijn_graph(self, windows, reads):
    """Build a de Bruijn graph per window and gather candidate haplotypes.

    A window is skipped when it is longer than
    `self.config.ws_config.max_window_size` or when
    `self.ref_reader.is_valid_interval` rejects it. For every remaining
    window the graph is built from the reference bases and the reads that
    overlap the window, and the graph metrics are sent to
    `self.diagnostic_logger`.

    Args:
      windows: iterable of ranges (objects with `start` and `end`) to process.
      reads: reads to choose from; iterated once per processed window.

    Returns:
      A list of realigner_pb2.CandidateHaplotypes, one for each window whose
      candidate haplotypes are non-empty and differ from the reference alone.
    """
    haplotypes_per_window = []
    for window in windows:
        window_length = window.end - window.start
        too_large = window_length > self.config.ws_config.max_window_size
        # Short-circuit: the interval is only validated for small-enough windows.
        if too_large or not self.ref_reader.is_valid_interval(window):
            continue

        ref = self.ref_reader.bases(window)
        # redacted
        overlapping_reads = []
        for read in reads:
            if ranges.ranges_overlap(window, utils.read_range(read)):
                overlapping_reads.append(read)

        # Only the graph construction itself is timed.
        with timer.Timer() as graph_timer:
            graph = debruijn_graph.build(ref, overlapping_reads,
                                         self.config.dbg_config)
        build_duration = graph_timer.GetDuration()

        # A falsy graph means the reference is the only candidate.
        candidate_haplotypes = graph.candidate_haplotypes() if graph else [ref]

        if candidate_haplotypes and candidate_haplotypes != [ref]:
            haplotypes_per_window.append(
                realigner_pb2.CandidateHaplotypes(
                    span=window, haplotypes=candidate_haplotypes))

        # Metrics are logged for every processed window, kept or not.
        self.diagnostic_logger.log_graph_metrics(window, graph,
                                                 candidate_haplotypes,
                                                 build_duration)
    return haplotypes_per_window
def read_span(self):
    """Return the range spanning all reads, computing it on first use.

    The span is built from the reference name of the first read's range, the
    smallest start and the largest end across all read ranges, and is cached
    in `self._read_span`. While there are no reads (and nothing is cached)
    the cached value, None, is returned unchanged.
    """
    # Nothing to do if the span is already cached or there are no reads yet.
    if self._read_span is not None or not self.reads:
        return self._read_span

    read_ranges = [utils.read_range(read) for read in self.reads]
    chrom = read_ranges[0].reference_name
    first_start = min(rng.start for rng in read_ranges)
    last_end = max(rng.end for rng in read_ranges)
    self._read_span = ranges.make_range(chrom, first_start, last_end)
    return self._read_span
def test_read_range(self):
    """Tests reads have their ranges calculated correctly."""
    start = 10000001

    # Insertion case: the expected range covers only the 2M + 3M bases; the
    # inserted base does not extend it.
    read = test_utils.make_read(
        'AAACAG',
        chrom='chrX',
        start=start,
        cigar='2M1I3M',
        quals=range(10, 16),
        name='read1')
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(
        ranges.make_range('chrX', start, start + 5), utils.read_range(read))

    # Deletion case: the 16 deleted bases are added to the expected range on
    # top of the 2M + 3M bases.
    read = test_utils.make_read(
        'AAACAG',
        chrom='chrX',
        start=start,
        cigar='2M16D3M',
        quals=range(10, 16),
        name='read1')
    self.assertEqual(
        ranges.make_range('chrX', start, start + 5 + 16),
        utils.read_range(read))
def assign_reads_to_assembled_regions(assembled_regions, reads):
    """Assign each read to the assembled region it overlaps the most.

    Each read is attached, via `add_read`, to the AssemblyRegion whose
    `region` is picked by `ranges.find_max_overlapping` for the read's range.
    The AssemblyRegion objects are therefore modified in place.

    Args:
      assembled_regions: list[AssemblyRegion], regions to assign reads to.
        Does not assume the AssemblyRegions are sorted.
      reads: iterable[learning.genomics.genomics.Read], reads to process.
        Does not assume the reads are sorted.

    Returns:
      list[learning.genomics.genomics.Read], the reads for which
      `ranges.find_max_overlapping` returned None, i.e. the unassigned reads.
    """
    candidate_regions = [assembled.region for assembled in assembled_regions]
    leftovers = []
    for read in reads:
        best_index = ranges.find_max_overlapping(
            utils.read_range(read), candidate_regions)
        if best_index is None:
            leftovers.append(read)
            continue
        assembled_regions[best_index].add_read(read)
    return leftovers