Exemplo n.º 1
0
 def to_sorted_list(self) -> List[pysam.AlignedSegment]:
     """Returns the accumulated records in coordinate order.

     The records are written with ``self.to_path`` to a temporary file that has a ``.bam``
     suffix, and are then read back into a list.  The temporary file is deleted when the
     enclosing ``NamedTemporaryFile`` context exits.

     Returns:
         the list of records read back from the temporary file
     """
     with NamedTemporaryFile(suffix=".bam", delete=True) as fp:
         path = self.to_path(path=Path(fp.name), index=False)
         # Open the reader as a context manager so its handle is closed before the temporary
         # file is removed, rather than being left open until garbage collection.
         with sam.reader(path) as bam:
             return list(bam)
Exemplo n.º 2
0
def assert_actual_vs_expected(actual_path: str,
                              expected_records: List[pysam.AlignedSegment]) -> None:
    """Asserts that the SAM/BAM at the actual path contains exactly the expected records.

    The records are compared pairwise in file order first, and the two record counts are
    compared afterwards.

    Args:
        actual_path: the path to the SAM/BAM file to read
        expected_records: the records that file should contain, in order
    """
    with sam.reader(actual_path) as reader:
        observed_records = list(reader)

    # zip() stops at the shorter of the two sequences, hence the explicit length check below.
    for observed, wanted in zip(observed_records, expected_records):
        assert observed == wanted

    assert len(observed_records) == len(expected_records)
Exemplo n.º 3
0
def valid_bam(valid_sam: Path) -> Generator[Path, None, None]:
    """Yields the path to a BAM copy of ``valid_sam`` and removes that BAM afterwards.

    Every record in ``valid_sam`` is copied to ``data/valid.bam`` next to this file, re-using
    the input header.  Exactly eight records are expected to be copied.

    Args:
        valid_sam: the path to the SAM file to convert

    Yields:
        the path to the newly written BAM file
    """
    bam: Path = Path(__file__).parent / 'data' / 'valid.bam'
    try:
        num_read = 0
        with sam.reader(valid_sam) as fh_in:
            with sam.writer(bam, fh_in.header, file_type=SamFileType.BAM) as fh_out:
                for rec in fh_in:
                    num_read += 1
                    fh_out.write(rec)
        assert num_read == 8
        yield bam
    finally:
        # Clean up even when the copy or the record-count check fails, so that a partially
        # written BAM is not left behind in the data directory.  The existence check avoids
        # masking the original error when the file was never created.
        if bam.exists():
            bam.unlink()
Exemplo n.º 4
0
def align(reads: Iterable[FastqRecord],
          idxbase: Path,
          executable_path: Path = Path('bwa'),
          algo_opts: Optional[AlgorithmOptions] = None,
          scoring_opts: Optional[ScoringOptions] = None,
          io_opts: Optional[InputOutputOptions] = None,
          suppress_secondaries: bool = False,
          stderr_out: Any = sys.stderr) -> Iterable[AlignmentResult]:
    """Aligns the given reads with BWA mem.

    See :py:mod:`~samwell.bwa_mem` for a detailed explanation for the implementation approach.

    This function is a generator: BWA mem is only started, and the clean-up checks are only
    run, once the returned iterable is consumed (or closed).

    Args:
        reads: the reads to align
        idxbase: the path prefix for all the BWA-specific index files
        executable_path: the path to the BWA executable
        algo_opts: the algorithm options
        scoring_opts: the scoring options
        io_opts: the input and output options
        suppress_secondaries: true to discard all secondary alignments, false otherwise
        stderr_out: the handle whose ``write`` method receives everything read from the
            standard error of BWA mem

    Returns:
        An iterable over the alignment results.  An alignment result is a tuple consisting of the
        original :class:`~samwell.bwa_mem.FastqRecord` and an iterator over the alignments (see
        :class:`~pysam.AlignedSegment`)

    Raises:
        RuntimeError: if the thread feeding reads to BWA mem is still alive after being joined
        ValueError: if the number of reads given to BWA mem differs from the number of
            alignment results yielded
        Exception: any exception recorded by the thread feeding reads to BWA mem is re-raised
    """

    # Build the command line used to run BWA MEM
    command_line = _build_command_line(idxbase=idxbase,
                                       executable_path=executable_path,
                                       algo_opts=algo_opts,
                                       scoring_opts=scoring_opts,
                                       io_opts=io_opts)

    # Create a sub-process in which to run BWA mem.  This process will read FASTQ records from
    # stdin, write SAM records to stdout, and write any error/logging information to stderr.
    bwa_mem_process = subprocess.Popen(args=command_line,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       universal_newlines=True)

    # Start a worker that reads the stderr of the BWA mem sub-process and writes it to the given
    # stderr_out handle.
    bwa_mem_stderr_process = _SourceToSinkThread(
        source=iter(bwa_mem_process.stderr),
        sink_add_func=stderr_out.write,
        sink_close_func=None)
    bwa_mem_stderr_process.start()

    # Create a queue of FASTQ records that the worker who will write to BWA mem's stdin
    # will also write.  This is so we can collate/join the input FASTQ records with the output SAM
    # (or alignment) records.  A sentinel value (None) will be written to indicate no more reads
    # will be placed in the queue.
    reads_queue: queue.Queue = queue.Queue()

    # Create a worker to consume the input FASTQ records and write them to BWA mem's stdin. We
    # write in a separate thread to avoid any deadlock with waiting for output from BWA mem's
    # stdout.  This can happen in a synchronous implementation where BWA mem is buffering reads and
    # we are waiting for some results from BWA mem's stdout, but really BWA mem is waiting for
    # either more reads from stdin or for stdin to be closed.
    interleaved_pairs = io_opts.interleaved_pairs if io_opts is not None else None
    bwa_input_process = _build_bwa_input_process(
        reads=reads,
        to_bwa_handle=bwa_mem_process.stdin,
        to_output_queue=reads_queue,
        interleaved_pairs=interleaved_pairs)

    # Go through the output
    num_aligned = 0
    try:
        # Wait for some reads to be written.  pysam will block opening the input file until some
        # data is available, or the stream is closed.  If no data is added, don't even try opening
        # the stream.
        while bwa_input_process.num_added == 0 and not bwa_input_process.done:
            # the input process is still running but no reads have been added
            time.sleep(.1)
        if bwa_input_process.num_added == 0 and bwa_input_process.done:
            # the input process is done (error or success) and no reads have been added, so skip
            # opening pysam.  A plain ``return`` ends the generator; raising StopIteration inside
            # a generator body is converted into a RuntimeError (PEP 479).  The ``finally`` clause
            # below still runs.
            return
        # Read through the output of BWA mem, and collate that with the queue of reads given to
        # BWA mem
        with sam.reader(path=bwa_mem_process.stdout,
                        file_type=SamFileType.SAM) as reader:
            alignment_results = _collate_alignments(
                reads_queue=reads_queue,
                alignments_reader=reader,
                suppress_secondaries=suppress_secondaries)
            # A simple loop with its only purpose to count the number of alignment results
            for result in alignment_results:
                num_aligned += 1
                yield result
    finally:
        # Close the stdin of the BWA mem process.  This should signal BWA mem to shut down, and
        # for the input thread to stop.
        bwa_mem_process.stdin.close()

        # Join the input thread as now stdin of the BWA mem process is closed.
        bwa_input_process.join(timeout=1.0)

        # Check if the inputting reads to BWA had an exception
        if bwa_input_process.exception is not None:
            raise bwa_input_process.exception
        elif bwa_input_process.is_alive():
            raise RuntimeError(
                "BWA process encountered no errors but did not terminate.")

        # Check that the number of reads given to BWA mem was the same # returned by BWA mem
        num_left = bwa_input_process.num_added - num_aligned
        if num_left != 0:
            raise ValueError(
                f"Still had {num_left:,d} remaining reads from BWA")

        # Shut down the BWA mem process.  If it fails to shutdown, log a warning and continue on
        try:
            bwa_mem_process.wait(timeout=5.0)
        except subprocess.TimeoutExpired as ex:
            logger = logging.getLogger(__name__)
            logger.warning("Could not shutdown BWA, ignoring error: %s",
                           str(ex))

        # Shut down the stderr thread
        bwa_mem_stderr_process.join(timeout=1.0)
Exemplo n.º 5
0
def test_sam_file_open_reading_with_reader(valid_sam: Path) -> None:
    """Checks that a reader opened with no explicit file type iterates over eight records."""
    with sam.reader(path=valid_sam, file_type=None) as samfile:
        num_records = 0
        for _ in samfile:
            num_records += 1
        assert num_records == 8
Exemplo n.º 6
0
def header_text(valid_sam: Path) -> Dict[str, Any]:
    """Returns the ``text`` attribute of a reader opened on the valid_sam.

    Args:
        valid_sam: the path to the SAM file to open
    """
    # NOTE(review): the return annotation says Dict[str, Any], but an attribute named ``text``
    # is presumably a string -- confirm against the reader's API before relying on it.
    with sam.reader(valid_sam) as reader:
        text = reader.text
    return text
Exemplo n.º 7
0
def header_dict(valid_sam: Path) -> Dict[str, Any]:
    """Returns the ``header`` attribute of a reader opened on the valid_sam.

    Args:
        valid_sam: the path to the SAM file to open
    """
    with sam.reader(valid_sam) as reader:
        header = reader.header
    return header
Exemplo n.º 8
0
def expected_records(valid_sam: Path) -> List[pysam.AlignedSegment]:
    """Returns every record read from the valid_sam, in file order.

    Args:
        valid_sam: the path to the SAM file to read
    """
    with sam.reader(valid_sam) as reader:
        records = list(reader)
    return records