示例#1
0
 def _to_virtual_offset(self, offset):
     """
     Convert an offset in uncompressed bytes to a virtual offset that the
     bgzf reader can use.
     """
     # Every block whose decompressed start is at or before ``offset``;
     # the last such block is the one that contains the offset.
     containing = np.where(self._data_start <= offset)[0]
     start_offset, _raw_len, data_start, _data_len = self._blocks[containing[-1]]
     # Virtual offset = (compressed block start, offset within the block).
     return make_virtual_offset(start_offset, offset - data_start)
示例#2
0
    def check_random(self, filename):
        """Check BGZF random access by reading blocks in forward & reverse order."""
        # Reference data: the fully decompressed file contents.
        with gzip.open(filename, "rb") as h:
            old = h.read()

        # Raw block table: (start, raw_len, data_start, data_len) per block.
        with open(filename, "rb") as h:
            blocks = list(bgzf.BgzfBlocks(h))

        # Forward: read each block in order and reassemble the data.
        new = _empty_bytes_string
        h = bgzf.BgzfReader(filename, "rb")
        self.assertTrue(h.seekable())
        self.assertFalse(h.isatty())
        self.assertEqual(h.fileno(), h._handle.fileno())
        for start, raw_len, data_start, data_len in blocks:
            h.seek(bgzf.make_virtual_offset(start, 0))
            data = h.read(data_len)
            self.assertEqual(len(data), data_len)
            self.assertEqual(len(new), data_start)
            new += data
        h.close()
        self.assertEqual(len(old), len(new))
        self.assertEqual(old, new)

        # Reverse: read blocks in reverse order, prepending each.
        new = _empty_bytes_string
        h = bgzf.BgzfReader(filename, "rb")
        for start, raw_len, data_start, data_len in blocks[::-1]:
            h.seek(bgzf.make_virtual_offset(start, 0))
            data = h.read(data_len)
            self.assertEqual(len(data), data_len)
            new = data + new
        h.close()
        self.assertEqual(len(old), len(new))
        self.assertEqual(old, new)

        # Jump back - non-sequential seeking with a single-block cache.
        if len(blocks) >= 3:
            h = bgzf.BgzfReader(filename, "rb", max_cache=1)
            # Seek to a late block in the file,
            # half way into the third last block.
            start, raw_len, data_start, data_len = blocks[-3]
            voffset = bgzf.make_virtual_offset(start, data_len // 2)
            h.seek(voffset)
            self.assertEqual(voffset, h.tell())
            data = h.read(1000)
            self.assertTrue(data in old)
            self.assertEqual(old.find(data), data_start + data_len // 2)
            # Now seek to an early block in the file,
            # half way into the second block.  (The original code seeked to
            # this same virtual offset twice; once is enough.)
            start, raw_len, data_start, data_len = blocks[1]
            voffset = bgzf.make_virtual_offset(start, data_len // 2)
            h.seek(voffset)
            self.assertEqual(voffset, h.tell())
            # Now read all rest of this block and start of next block.
            data = h.read(data_len + 1000)
            self.assertTrue(data in old)
            self.assertEqual(old.find(data), data_start + data_len // 2)
            h.close()

        # Check seek/tell at block boundaries (offsets 0, 1, middle, last).
        v_offsets = []
        for start, raw_len, data_start, data_len in blocks:
            for within_offset in [0, 1, data_len // 2, data_len - 1]:
                if within_offset < 0 or data_len <= within_offset:
                    continue
                voffset = bgzf.make_virtual_offset(start, within_offset)
                real_offset = data_start + within_offset
                v_offsets.append((voffset, real_offset))
        shuffle(v_offsets)
        h = bgzf.BgzfReader(filename, "rb", max_cache=1)
        for voffset, real_offset in v_offsets:
            # Reading real_offset bytes from the start must land tell() on
            # the matching virtual offset.
            h.seek(0)
            assert voffset >= 0 and real_offset >= 0
            self.assertEqual(h.read(real_offset), old[:real_offset])
            self.assertEqual(h.tell(), voffset)
        for voffset, real_offset in v_offsets:
            # seek/tell round-trip.
            h.seek(voffset)
            self.assertEqual(h.tell(), voffset)
        h.close()
示例#3
0
def calculate_chunks(filename, num_chunks):
    """
    Calculate the boundaries in the BAM file and partition into chunks.

    :param str filename: name of the BAM file
    :param int num_chunks: number of chunks to partition the boundaries into
    :return: a list of tuples containing the start and end boundaries
    :raises ValueError: if ``num_chunks`` is not a positive integer
    """
    if num_chunks <= 0:
        raise ValueError("The number of chunks to calculate should be >= 1")

    if num_chunks == 1:
        # Single chunk: one record spanning the whole file
        # (file_bytes == -1 means "read to EOF" downstream).
        pr = ParseRecord(0, 0, 0, 0, -1, 0, 0)
        return [pr]

    try:
        # Collect every bgzf block start offset and its decompressed length.
        # BGZF is a binary format, so open in binary mode; use ``with`` so
        # the handle is closed even if block parsing fails.
        block_offsets = []
        decompressed_lengths = []
        with open(filename, 'rb') as f:
            for values in FastBgzfBlocks(f):
                block_offsets.append(values[0])
                decompressed_lengths.append(values[3])

        # Partition the block starts into num_chunks roughly equal groups.
        div, mod = divmod(len(block_offsets), num_chunks)

        header_size = 0
        partitioned_offsets = [(header_size, 0)]

        fastq_fh = bgzf.BgzfReader(filename, 'r')
        try:
            for i in range(1, num_chunks):
                index = div * i + min(i, mod)

                # Seek to the start of the chosen block, then advance to a
                # record boundary: skip to the '+' separator line and read
                # past the quality line that follows it.
                # NOTE(review): this loop never checks for EOF — presumably
                # a '+' line always exists past each partition point; verify.
                virtual_offset = bgzf.make_virtual_offset(block_offsets[index], 0)
                fastq_fh.seek(virtual_offset)
                line = fastq_fh.readline().strip()
                while line != '+':
                    line = fastq_fh.readline().strip()
                fastq_fh.readline()  # skip the quality line
                virtual_offset = fastq_fh.tell()

                # Record (block start, within-block offset).
                partitioned_offsets.append(
                    bgzf.split_virtual_offset(virtual_offset))
        finally:
            fastq_fh.close()

        # Now calculate beginning and end boundaries for each chunk.
        params = []

        for i, offset in enumerate(partitioned_offsets):
            index = block_offsets.index(partitioned_offsets[i][0])
            begin_read_offset = 0
            begin_read_size = 0
            file_offset = 0
            file_bytes = 0
            end_read_offset = 0
            end_read_size = 0

            if i == 0:
                # First chunk: starts at the header, no partial record to
                # prepend.
                file_offset = block_offsets[index]
                file_bytes = partitioned_offsets[i + 1][0] - file_offset
                end_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i + 1][0], 0)
                end_read_size = partitioned_offsets[i + 1][1]
            elif i == num_chunks - 1:
                # Last chunk: reads to EOF (file_bytes == -1).
                begin_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i][0], partitioned_offsets[i][1])
                begin_read_size = (decompressed_lengths[index]
                                   - partitioned_offsets[i][1])
                file_offset = block_offsets[index + 1]
                file_bytes = -1
            else:
                # Middle chunks.
                if offset[1] == 0:
                    # A partition landing exactly on a bgzf block boundary
                    # is unexpected here; bail out (returns None).
                    print('****************HUH')
                    return

                begin_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i][0], partitioned_offsets[i][1])
                begin_read_size = (decompressed_lengths[index]
                                   - partitioned_offsets[i][1])
                file_offset = block_offsets[index + 1]
                file_bytes = partitioned_offsets[i + 1][0] - file_offset

                end_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i + 1][0], 0)
                end_read_size = partitioned_offsets[i + 1][1]

            pr = ParseRecord(header_size, begin_read_offset, begin_read_size,
                             file_offset, file_bytes, end_read_offset,
                             end_read_size)
            params.append(pr)

        return params

    except Exception as e:
        # NOTE(review): errors are printed and None is returned; callers
        # apparently rely on this best-effort contract, so it is preserved.
        print('calculate_chunks error: {}'.format(str(e)))
示例#4
0
    def get_sequence(self, contig, start, end, strand=1, all_upper=False):
        """
        Return the genomic DNA sequence spanning [start, end) on contig.

        :param contig: The name of the contig on which the start and end coordinates are located
        :param start: The start location of the sequence to be returned (this endpoint is included in the interval)
        :param end: The end location of the sequence to be returned (this endpoint is not included in the interval)
        :param strand: The DNA strand of the sequence to be returned (-1 for negative strand, 1 for positive strand)
        :param all_upper: If true, return the sequence in all uppercase letters. Otherwise return lowercase letters
            for positions that are "soft-masked" (see https://genomevolution.org/wiki/index.php/Masked).
        :return: A string of DNA nucleotides of length end-start
        :raises ContigNotFound: if ``contig`` is not in the index
        :raises CoordinateOutOfBounds: if ``start`` or ``end`` fall outside the contig
        :raises InvalidCoordinates: if ``start`` >= ``end``
        """
        # Validate the requested interval before touching the file.
        if contig not in self._index:
            raise ContigNotFound(message='Contig {} not found'.format(contig),
                                 requested_contig=contig,
                                 valid_contigs=list(self._index.keys()))
        if start < 0:
            raise CoordinateOutOfBounds(
                message='Start coordinate below 0',
                problematic_coordinate=start,
                problem_with_start=True,
                coordinate_too_small=True,
                valid_coordinate_range=(0, self.contig_lengths[contig]),
                current_contig=contig)
        if start > self.contig_lengths[contig]:
            raise CoordinateOutOfBounds(
                message='Start coordinate past end of contig',
                problematic_coordinate=start,
                problem_with_start=True,
                coordinate_too_small=False,
                valid_coordinate_range=(0, self.contig_lengths[contig]),
                current_contig=contig)
        if end > self.contig_lengths[contig]:
            raise CoordinateOutOfBounds(
                message='End coordinate past end of contig',
                problematic_coordinate=end,
                problem_with_start=False,
                coordinate_too_small=False,
                valid_coordinate_range=(0, self.contig_lengths[contig]),
                current_contig=contig)
        if end < 0:
            raise CoordinateOutOfBounds(
                message='End coordinate below 0',
                problematic_coordinate=end,
                problem_with_start=False,
                coordinate_too_small=True,
                valid_coordinate_range=(0, self.contig_lengths[contig]),
                current_contig=contig)
        if start >= end:
            raise InvalidCoordinates(start=start, end=end)

        query_length = end - start
        # Translate the text coordinate into a file distance, then locate
        # the bgzf block containing it via the per-contig interval index.
        start_pos_file_distance = self._text_distance_to_file_distance(start)

        start_block = sorted(
            self._index[contig].search(start_pos_file_distance))[0]
        start_block_offset = start_block.data
        verbose_print('Retrieving sequence for {} [{},{}) ...'.format(
            contig, start, end))

        # Offset of the query start within its containing block.
        sequence_start_offset = start_pos_file_distance - start_block.begin

        retrieved_sequence = ''
        with bgzf.BgzfReader(self.bgzipped_fasta_filename, 'rt') as fasta_file:
            fasta_file.seek(
                bgzf.make_virtual_offset(start_block_offset,
                                         sequence_start_offset))
            while len(retrieved_sequence) < query_length:
                line = fasta_file.readline()
                if not line:
                    # EOF: without this guard, readline() returning '' forever
                    # would spin this loop indefinitely if the query ran past
                    # the end of the file.
                    break
                retrieved_sequence += line.rstrip()
        trimmed_sequence = retrieved_sequence[:query_length]

        if all_upper:
            trimmed_sequence = trimmed_sequence.upper()

        if strand == -1:
            return reverse_complement(trimmed_sequence)
        else:
            return trimmed_sequence