示例#1
0
文件: bam.py 项目: rtcz/varlock
 def __init__(self, bam_filename: str, start_index: int, end_index: int):
     """
     Open an indexed BAM file and resolve the genomic range to iterate over.

     Iterator includes mapped reads within the range and all unmapped reads.
     BAM index ensures that BAM is sorted and is needed to resolve range.
     Iterator assumes that unplaced alignment can be anywhere in BAM file.

     :param bam_filename: path of the BAM file to read
     :param start_index: iterate from 0-based index inclusive
     :param end_index: iterate to 0-based index inclusive
     :raises IndexError: when the BAM file has no index
     """
     assert start_index <= end_index

     self._bam_file = open_bam(bam_filename, 'rb')

     if not self._bam_file.has_index():
         # The half-built object never reaches the caller, so nobody else
         # could close this handle; release it before reporting the error.
         self._bam_file.close()
         raise IndexError('BAM has no index')

     # Translate the absolute indices into (reference name, position) pairs.
     self._fai = FastaIndex.from_bam(self._bam_file)
     self._start_ref_name, self._start_ref_pos = self._fai.resolve_start_pos(start_index)
     self._end_ref_name, self._end_ref_pos = self._fai.resolve_end_pos(end_index)

     self._start_ref_id = self._fai.ref_id(self._start_ref_name)
     self._end_ref_id = self._fai.ref_id(self._end_ref_name)

     # The whole file is read, not just the range; restricting the reads to
     # the resolved range is presumably done during iteration (not shown here).
     self._iterator = self._bam_file.fetch(until_eof=True)
示例#2
0
    def create_vac(self, bam_filename: str, vcf_filename: str,
                   out_vac_filename: str, ref_fasta_filename: str,
                   skip_indels: bool):
        """
        Convert a VCF file into a VAC file.

        BAM and VCF should use same reference genome.
        VCF must contain INFO column with sub-fields AC and AN.

        :param bam_filename: filename of the SAM/BAM file, from which the header is extracted
        :param vcf_filename: filename of the input VCF file
        :param out_vac_filename: filename of the output VAC file
        :param ref_fasta_filename: filename of the reference FASTA file;
         when None no reference is loaded and None is passed to the converter
        :param skip_indels: whether to skip indels and keep only SNPs
        """
        # TODO use fasta index / vcf header instead of BAM header

        # load the reference FASTA
        ref_fasta = None
        if ref_fasta_filename is not None:
            if self._verbose:
                print('--- Loading Reference Fasta ---')
            ref_fasta = pyfaidx.Fasta(ref_fasta_filename)

        try:
            # open all files and create the VAC file
            if self._verbose:
                print('--- Processing VCF %s ---' % vcf_filename)
            with pysam.VariantFile(vcf_filename) as vcf_file, \
                    open_bam(bam_filename, 'rb') as sam_file, \
                    open(out_vac_filename, 'wb') as out_vac_file:
                vac = Vac(FastaIndex.from_bam(sam_file), self._verbose)
                vac.vcf2vac(vcf_file, out_vac_file, ref_fasta, skip_indels)
        finally:
            # The reference is only needed during the conversion; close it
            # whether the conversion succeeded or failed.
            if ref_fasta is not None:
                ref_fasta.close()
示例#3
0
文件: bam.py 项目: rtcz/varlock
 def __init__(self, bam_filename: str):
     """
     Open a BAM file for iteration over all reads it contains.

     The BAM index (BAI) is not used by this iterator.

     :param bam_filename: path of the BAM file to read
     """
     bam_file = open_bam(bam_filename, 'rb')
     self._bam_file = bam_file
     # Read the file from start to end instead of querying an index.
     self._iterator = bam_file.fetch(until_eof=True)
示例#4
0
文件: bam.py 项目: rtcz/varlock
 def __init__(self, bam_filename: str):
     """
     Open a BAM file for iteration over unmapped reads only,
     both placed and unplaced.

     The BAM index (BAI) is not used by this iterator.

     :param bam_filename: path of the BAM file to read
     """
     self._bam_file = reader = open_bam(bam_filename, 'rb')
     # Every read of the file is fetched here; selecting the unmapped ones
     # is presumably done while iterating (not visible in this method).
     self._iterator = reader.fetch(until_eof=True)
示例#5
0
    def mutate(self, vac_filename: str, mut_bam_filename: str, secret: bytes,
               mut_p: float, rng: VeryRandom):
        """
        Write a mutated copy of the BAM file and return the matching DIFF.

        Counters collected by the mutator are stored in self._stats.

        :param vac_filename: VAC file supplying the variants
        :param mut_bam_filename: filename of the mutated BAM to create
        :param secret: Secret key written into DIFF used for unmapped alignment encryption.
        :param mut_p: random variant (mutation) probability per genome base
        :param rng: random number generator
        :return: DIFF file produced by the mutation, with range, checksum
         and secret stored in its header
        """
        self._stats = {}

        # Header of the mutated BAM is derived from the original header and
        # the checksums of the source BAM and the VAC file.
        bam_checksum = self.checksum
        vac_checksum = cmn.checksum(vac_filename)
        header = bam.mut_header(self._bam_header, bam_checksum, vac_checksum)

        with bam.open_bam(mut_bam_filename, 'wb', header=header) as out_bam, \
                iters.VariantIterator(vac_filename, self._fai, mut_p, rng) as variants, \
                iters.FullBamIterator(self._bam_filename) as alignments:
            mutator = Mutator(fai=self._fai, verbose=self._verbose)
            diff_io = mutator.mutate(
                mut_bam_file=out_bam,
                variant_iter=variants,
                bam_iter=alignments,
                secret=secret,
                rng=rng
            )

        stats = {}
        stats[self.STAT_ALIGNMENT_COUNT] = mutator.alignment_counter
        stats[self.STAT_COVERING_COUNT] = mutator.covering_counter
        stats[self.STAT_VAC_COUNT] = mutator.vac_counter
        stats[self.STAT_MUT_COUNT] = mutator.mut_counter
        stats[self.STAT_DIFF_COUNT] = mutator.diff_counter
        stats[self.STAT_ALIGNMENT_MUT_COUNT] = mutator.alignment_mut_counter
        self._stats = stats

        # TODO resolve difference between mutated and converted (bam->sam->bam) and bam

        # The checksum of the mutated BAM is taken only now, after the
        # with-block above has closed the output file.
        diff_header = {
            BdiffIO.FROM_INDEX: self._fai.first_index(),
            BdiffIO.TO_INDEX: self._fai.last_index(),
            self.BDIFF_CHECKSUM_TAG: cmn.checksum(mut_bam_filename),
            self.BDIFF_SECRET_TAG: cmn.bytes2hex(secret)
        }
        return diff_io.file(header=diff_header)
示例#6
0
    def __init__(self, filename: str, verbose: bool = False):
        """
        Remember the BAM file and read its header and reference index.

        :param filename: BAM filename
        :param verbose: stored as self._verbose for use by other methods
        """
        self._bam_filename = filename
        self._verbose = verbose
        self._stats = {}
        # Not computed here; starts out empty.
        self._checksum = None

        # The file is opened only long enough to read its metadata.
        with bam.open_bam(filename, 'rb') as bam_file:
            self._bam_header = bam_file.header
            self._fai = FastaIndex.from_bam(bam_file)
示例#7
0
文件: bam.py 项目: rtcz/varlock
    def __init__(self, bam_filename: str, start_index: int = None, end_index: int = None):
        """
        Open an indexed BAM file and prepare iteration over mapped reads only.

        When both indices are None the whole file is fetched. Otherwise the
        indices are resolved to reference positions; a range within a single
        reference is fetched directly, while a range spanning several
        references only records the reference ids here.

        :param bam_filename: path of the BAM file to read
        :param start_index: iterate from 0-based index inclusive
        :param end_index: iterate to 0-based index inclusive
        :raises IndexError: when the BAM file has no index
        :raises ValueError: when the start reference comes after the end reference
        """
        if start_index is not None and end_index is not None:
            assert start_index <= end_index

        self._bam_file = open_bam(bam_filename, 'rb')

        if not self._bam_file.has_index():
            # The half-built object never reaches the caller, so nobody else
            # could close this handle; release it before reporting the error.
            self._bam_file.close()
            raise IndexError('BAM has no index')

        self._fai = FastaIndex.from_bam(self._bam_file)
        # empty iterator
        self._iterator = iter(())

        self.start_ref_id = None
        self.curr_ref_id = None
        self.end_ref_id = None
        self.counter = 0

        if start_index is None and end_index is None:
            # fetch all
            self._iterator = self._bam_file.fetch()
        else:
            self.start_ref_name, self.start_ref_pos = self._fai.resolve_start_pos(start_index)
            self.end_ref_name, self.end_ref_pos = self._fai.resolve_end_pos(end_index)

            if self.start_ref_name == self.end_ref_name:
                # single iterator
                self._iterator = self._bam_file.fetch(
                    reference=self.start_ref_name,
                    start=self.start_ref_pos,
                    end=self.end_ref_pos
                )
            else:
                # multiple iterators
                # Only the reference ids are recorded; the iterator stays
                # empty here and per-reference fetching is presumably done
                # during iteration (not visible in this method).
                self.start_ref_id = self._fai.ref_id(self.start_ref_name)
                self.curr_ref_id = self.start_ref_id
                self.end_ref_id = self._fai.ref_id(self.end_ref_name)

                if self.curr_ref_id > self.end_ref_id:
                    # Same reasoning as above: do not leak the open handle.
                    self._bam_file.close()
                    raise ValueError("Start reference has position after end reference.")
示例#8
0
 def vac(cls):
     """
     Build a Vac from the reference index of the 'input.sam' resource.

     :return: Vac created from the index read out of the resource file
     """
     sam_path = cls.RESOURCE_PATH + 'input.sam'
     with open_bam(sam_path, 'rb') as sam_file:
         fai = FastaIndex.from_bam(sam_file)
         return Vac(fai)
示例#9
0
    def unmutate(self,
                 bdiff_file: io.BytesIO,
                 out_bam_filename: str,
                 start_ref_name: str = None,
                 start_ref_pos: int = None,
                 end_ref_name: str = None,
                 end_ref_pos: int = None,
                 include_unmapped: bool = False,
                 unmapped_only: bool = False):
        """
        Unmutate BAM file in range specified by DIFF file or by parameters.

        Counters collected by the mutator are stored in self._stats.

        :param bdiff_file: DIFF produced when the BAM was mutated
        :param out_bam_filename: filename of the unmutated BAM to create
        :param start_ref_name: inclusive
        :param start_ref_pos: 0-based, inclusive
        :param end_ref_name: inclusive
        :param end_ref_pos: 0-based, inclusive
        :param include_unmapped: Include all unplaced unmapped reads.
        :param unmapped_only: Only unmapped reads - both placed and unplaced.
         Overrides other parameters.
        :raises ValueError: when unmapped reads are requested but the DIFF
         holds no secret, or when the DIFF checksum does not match this BAM

        When range is supplied partially covered reads are also included,
        but only variants within range are unmutated.
        """
        self._stats = {}

        with bam.open_bam(self._bam_filename, 'rb') as mut_bam:  # type: pysam.AlignmentFile
            out_header = bam.unmut_header(mut_bam.header)
            mutator = Mutator(fai=self._fai, verbose=self._verbose)

            # The output file is opened before validation, exactly as in
            # the original flow.
            with bam.open_bam(out_bam_filename, 'wb', header=out_header) as out_bam:
                diff_io = BdiffIO(bdiff_file)
                if self._verbose:
                    print('SNV diff count %d' % diff_io.snv_count)
                    print('INDEL diff count %d' % diff_io.indel_count)

                # Unmapped reads can only be decrypted with the secret.
                secret = self.extract_secret_bytes(diff_io)
                needs_secret = include_unmapped or unmapped_only
                if needs_secret and secret is None:
                    raise ValueError(
                        'BDIFF must contain secret to decrypt unmapped reads.')

                # validate checksum
                if self.checksum != diff_io.header[self.BDIFF_CHECKSUM_TAG]:
                    print(self.checksum)
                    print(diff_io.header[self.BDIFF_CHECKSUM_TAG])

                    raise ValueError('BDIFF does not refer to this BAM')

                # TODO user friendly exception on missing bdiff_io header value
                start_index, end_index = self.resolve_range(
                    bdiff_from_index=diff_io.header[BdiffIO.FROM_INDEX],
                    bdiff_to_index=diff_io.header[BdiffIO.TO_INDEX],
                    start_ref_name=start_ref_name,
                    start_ref_pos=start_ref_pos,
                    end_ref_name=end_ref_name,
                    end_ref_pos=end_ref_pos)

                # TODO move iterators to with statement
                alignments = iters.bam_iterator(self._bam_filename,
                                                start_index, end_index,
                                                unmapped_only,
                                                include_unmapped)
                diffs = iters.BdiffIterator(bdiff_io=diff_io,
                                            fai=self._fai,
                                            start_index=start_index,
                                            end_index=end_index)
                mutator.unmutate(bam_iter=alignments,
                                 bdiff_iter=diffs,
                                 out_bam_file=out_bam,
                                 secret=secret)

            stats = {}
            stats[self.STAT_ALIGNMENT_COUNT] = mutator.alignment_counter
            stats[self.STAT_COVERING_COUNT] = mutator.covering_counter
            stats[self.STAT_MUT_COUNT] = mutator.mut_counter
            stats[self.STAT_DIFF_COUNT] = mutator.diff_counter
            stats[self.STAT_ALIGNMENT_MUT_COUNT] = mutator.alignment_mut_counter
            self._stats = stats