def to_sorted_list(self) -> List[pysam.AlignedSegment]:
    """Returns the accumulated records in coordinate order.

    The records are written with :meth:`to_path` (without an index) to a temporary ``.bam``
    file, read back through ``sam.reader`` and collected into a list.  The temporary file is
    removed when the method returns.

    Returns:
        The records read back from the temporary BAM file.
    """
    with NamedTemporaryFile(suffix=".bam", delete=True) as fp:
        path = self.to_path(path=Path(fp.name), index=False)
        # Use the reader as a context manager so its handle is closed before the temporary
        # file is deleted, rather than leaking it until garbage collection.
        with sam.reader(path) as bam:
            return list(bam)
def assert_actual_vs_expected(actual_path: str,
                              expected_records: List[pysam.AlignedSegment]) -> None:
    """Asserts that the SAM/BAM at ``actual_path`` holds exactly the expected records.

    Args:
        actual_path: the path to the SAM/BAM file to read
        expected_records: the records the file should contain, in order
    """
    with sam.reader(actual_path) as sam_reader:
        observed_records = list(sam_reader)

    # Compare record-by-record first so a content mismatch is reported before a length mismatch.
    for observed, wanted in zip(observed_records, expected_records):
        assert observed == wanted

    # zip() stops at the shorter input, so the lengths must be checked separately.
    assert len(observed_records) == len(expected_records)
def valid_bam(valid_sam: Path) -> Generator[Path, None, None]:
    """Yields the path to a BAM copy of ``valid_sam`` and removes the copy afterwards.

    Every record in ``valid_sam`` is written, with the input header, to ``data/valid.bam`` next
    to this file.  The BAM file is deleted when the generator is resumed or closed after the
    yield, and also when building it fails part-way (e.g. the record count check fails).

    Args:
        valid_sam: the path to the SAM file to convert; it is expected to hold eight records

    Yields:
        The path to the written BAM file.
    """
    bam: Path = Path(__file__).parent / 'data' / 'valid.bam'
    try:
        num_read = 0
        with sam.reader(valid_sam) as fh_in:
            with sam.writer(bam, fh_in.header, file_type=SamFileType.BAM) as fh_out:
                for rec in fh_in:
                    num_read += 1
                    fh_out.write(rec)
        assert num_read == 8
        yield bam
    finally:
        # Always clean up so a failed conversion or check does not leave a stale file in the
        # data directory; the file may not exist if the writer never opened it.
        if bam.exists():
            bam.unlink()
def align(reads: Iterable[FastqRecord],
          idxbase: Path,
          executable_path: Path = Path('bwa'),
          algo_opts: Optional[AlgorithmOptions] = None,
          scoring_opts: Optional[ScoringOptions] = None,
          io_opts: Optional[InputOutputOptions] = None,
          suppress_secondaries: bool = False,
          stderr_out: Any = sys.stderr) -> Iterable[AlignmentResult]:
    """Aligns the given reads with BWA mem.

    See :py:mod:`~samwell.bwa_mem` for a detailed explanation for the implementation approach.

    Args:
        reads: the reads to align
        idxbase: the path prefix for all the BWA-specific index files
        executable_path: the path to the BWA executable
        algo_opts: the algorithm options
        scoring_opts: the scoring options
        io_opts: the input and output options
        suppress_secondaries: true to discard all secondary alignments, false otherwise
        stderr_out: the handle whose ``write`` method receives everything BWA mem writes to
            its stderr

    Returns:
        An iterable over the alignment results.  An alignment result is a tuple consisting of
        the original :class:`~samwell.bwa_mem.FastqRecord` and an iterator over the alignments
        (see :class:`~pysam.AlignedSegment`)

    Raises:
        Exception: any exception recorded by the worker that feeds reads to BWA mem is re-raised
            during shutdown
        RuntimeError: if that worker recorded no exception but is still alive after being joined
        ValueError: if the number of reads given to BWA mem differs from the number of alignment
            results yielded
    """
    # Build the command line used to run BWA MEM
    command_line = _build_command_line(idxbase=idxbase,
                                       executable_path=executable_path,
                                       algo_opts=algo_opts,
                                       scoring_opts=scoring_opts,
                                       io_opts=io_opts)

    # Create a sub-process in which to run BWA mem.  This process will read FASTQ records from
    # stdin, write SAM records to stdout, and write any error/logging information to stderr.
    bwa_mem_process = subprocess.Popen(args=command_line,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       universal_newlines=True)

    # Start a background worker that reads the stderr of the BWA mem sub-process and writes it
    # to the given stderr_out handle.
    bwa_mem_stderr_process = _SourceToSinkThread(
        source=iter(bwa_mem_process.stderr),
        sink_add_func=stderr_out.write,
        sink_close_func=None)
    bwa_mem_stderr_process.start()

    # Create a queue of FASTQ records that the worker who will write to BWA mem's stdin will
    # also write.  This is so we can collate/join the input FASTQ records with the output SAM
    # (or alignment) records.  A sentinel value (None) will be written to indicate no more reads
    # will be placed in the queue.
    reads_queue: queue.Queue = queue.Queue()

    # Create a worker to consume the input FASTQ records and write them to BWA mem's stdin.  We
    # write in a separate thread to avoid any deadlock with waiting for output from BWA mem's
    # stdout.  This can happen in a synchronous implementation where BWA mem is buffering reads
    # and we are waiting for some results from BWA mem's stdout, but really BWA mem is waiting
    # for either more reads from stdin or for stdin to be closed.
    interleaved_pairs = io_opts.interleaved_pairs if io_opts is not None else None
    bwa_input_process = _build_bwa_input_process(
        reads=reads,
        to_bwa_handle=bwa_mem_process.stdin,
        to_output_queue=reads_queue,
        interleaved_pairs=interleaved_pairs)

    # Go through the output
    num_aligned = 0
    try:
        # Wait for some reads to be written.  pysam will block opening the input file until some
        # data is available, or the stream is closed.  If no data is added, don't even try
        # opening the stream.
        while bwa_input_process.num_added == 0 and not bwa_input_process.done:
            # the input process is still running but no reads have been added
            time.sleep(.1)
        if bwa_input_process.num_added == 0 and bwa_input_process.done:
            # The input process is done (error or success) and no reads have been added, so skip
            # opening pysam.  A plain return ends the generator (the finally block still runs);
            # raising StopIteration here would be converted into a RuntimeError (PEP 479).
            return

        # Read through the output of BWA mem, and collate that with the queue of reads given to
        # BWA mem
        with sam.reader(path=bwa_mem_process.stdout, file_type=SamFileType.SAM) as reader:
            alignment_results = _collate_alignments(
                reads_queue=reads_queue,
                alignments_reader=reader,
                suppress_secondaries=suppress_secondaries)
            # A simple loop with its only purpose to count the number of alignment results
            for result in alignment_results:
                num_aligned += 1
                yield result
    finally:
        # Close the stdin of the BWA mem process.  This should signal BWA mem to shut down, and
        # for the input thread to stop.
        bwa_mem_process.stdin.close()

        # Join the input thread as now stdin of the BWA mem process is closed.
        bwa_input_process.join(timeout=1.0)

        # Check if the inputting reads to BWA had an exception
        if bwa_input_process.exception is not None:
            raise bwa_input_process.exception
        elif bwa_input_process.is_alive():
            raise RuntimeError(
                "BWA process encountered no errors but did not terminate.")

        # Check that the number of reads given to BWA mem was the same returned by BWA mem
        num_left = bwa_input_process.num_added - num_aligned
        if num_left != 0:
            raise ValueError(
                f"Still had {num_left:,d} remaining reads from BWA")

        # Shut down the BWA mem process.  If it fails to shutdown, log a warning and continue on
        try:
            bwa_mem_process.wait(timeout=5.0)
        except subprocess.TimeoutExpired as ex:
            logger = logging.getLogger(__name__)
            logger.warning("Could not shutdown BWA, ignoring error: %s", str(ex))

        # Shut down the stderr thread
        bwa_mem_stderr_process.join(timeout=1.0)
def test_sam_file_open_reading_with_reader(valid_sam: Path) -> None:
    """Checks that iterating ``valid_sam`` through ``sam.reader`` yields eight records."""
    num_records = 0
    with sam.reader(path=valid_sam, file_type=None) as samfile:
        for _ in samfile:
            num_records += 1
    assert num_records == 8
def header_text(valid_sam: Path) -> str:
    """Returns the raw header text of the valid_sam.

    Args:
        valid_sam: the path to the SAM file whose header is read

    Returns:
        The value of the reader's ``text`` attribute, i.e. the header as text rather than as
        a dictionary.
    """
    with sam.reader(valid_sam) as fh:
        return fh.text
def header_dict(valid_sam: Path) -> Dict[str, Any]:
    """Returns the multi-level header dictionary of the valid_sam.

    Args:
        valid_sam: the path to the SAM file whose header is read

    Returns:
        The value of the reader's ``header`` attribute.
    """
    with sam.reader(valid_sam) as reader:
        header = reader.header
    return header
def expected_records(valid_sam: Path) -> List[pysam.AlignedSegment]:
    """Returns every record found in the valid_sam, in file order.

    Args:
        valid_sam: the path to the SAM file to read

    Returns:
        A list with all records yielded by the reader.
    """
    with sam.reader(valid_sam) as reader:
        records = list(reader)
    return records