Пример #1
0
def _next_primary_read(bam_in):
    """Return the next record from bam_in that is not a secondary alignment.

    Uses the built-in next() rather than the Python 2-only ``.next()``
    method so the same code runs on Python 2.6+ and Python 3.

    Raises:
        StopIteration: when bam_in is exhausted.
    """
    while True:
        read = next(bam_in)
        if not read.is_secondary:
            return read


def read_pair_info_iter(bam_in, target_regions, references):
    """Yield one summary dict per read pair found in bam_in.

    Secondary alignments are skipped. The remaining records are consumed two
    at a time and each consecutive couple is treated as a pair; the first
    whitespace-delimited token of their query names must match.

    Args:
        bam_in: iterator of alignment records (the attributes used here are
            qname, tid, pos, aend, seq, qual, mapq and the is_* flags).
        target_regions: mapping from reference name to the regions object
            handed to get_read_regions_dist, or None to skip the target
            distance computation.
        references: sequence of reference names indexed by a record's tid.

    Yields:
        dict: per-pair fields -- name, read indices, duplicate/mapped flags,
        mapq, per-read chrom/pos/strand/quality/sequence stats, barcode and
        sample-index strings with their quality stats (taken from read 1),
        distance to targets, mapped/soft-clipped base counts, insert length
        and Q20/Q30 base counts.

    Raises:
        AssertionError: if the two records of a pair have different names.
    """
    while True:
        try:
            r1 = _next_primary_read(bam_in)
            r2 = _next_primary_read(bam_in)
        except StopIteration:
            # Input exhausted. A trailing primary record without a partner
            # is dropped, as before.
            return

        name1 = r1.qname.split()[0]
        name2 = r2.qname.split()[0]
        assert name1 == name2, (
            "consecutive records are not a pair: %r vs %r" % (name1, name2))

        # Orient the pair by the read-1 flag of the first record seen.
        if r1.is_read1:
            read1, read2 = r1, r2
        else:
            read1, read2 = r2, r1

        # Order the pair by position when both records share a reference.
        if r1.tid == r2.tid and r2.pos < r1.pos:
            leftmost, rightmost = r2, r1
        else:
            leftmost, rightmost = r1, r2

        read_pair_info = {}
        read_pair_info['name'] = name1

        # If neither record carries the read-1 flag the indices come out
        # swapped (2, 1).
        if read1.is_read1:
            read1_index, read2_index = 1, 2
        else:
            read1_index, read2_index = 2, 1

        read_pair_info['r1_index'] = read1_index
        read_pair_info['r2_index'] = read2_index

        read_pair_info['is_duplicate'] = (read1.is_duplicate
                                          or read2.is_duplicate)

        read_pair_info['r1_mapped'] = not read1.is_unmapped
        read_pair_info['r2_mapped'] = not read2.is_unmapped

        read_pair_info['r1_mapq'] = read1.mapq
        read_pair_info['r2_mapq'] = read2.mapq

        # Per-read fields: identical logic for both reads, keyed by prefix.
        chroms = []
        for prefix, read in (('r1', read1), ('r2', read2)):
            # tid == -1 means the record has no reference assigned.
            chrom = '' if read.tid == -1 else references[read.tid]
            chroms.append(chrom)
            seq = read.seq

            read_pair_info[prefix + '_chrom'] = chrom
            read_pair_info[prefix + '_pos'] = read.pos
            read_pair_info[prefix + '_direct'] = not read.is_reverse
            read_pair_info[prefix + '_min_qual'] = tk_fasta.get_min_qual(
                read.qual)
            read_pair_info[prefix + '_seq_len'] = (len(seq)
                                                   if seq is not None else 0)
            read_pair_info[prefix + '_contains_N'] = (seq is not None
                                                      and 'N' in seq)
        ref1, ref2 = chroms

        #
        # TODO: it would be preferable to add override ability in the tk_io
        # calls, but jedna doesn't have its own tenkit branch
        #
        # Barcode and sample-index information is read from read 1 only.
        read_raw_bc = crdna_io.get_read_raw_barcode(read1)
        if read_raw_bc is None:
            read_raw_bc = ''
        read_bc = crdna_io.get_read_barcode(read1)
        if read_bc is None:
            read_bc = ''
        read_bc_qual = crdna_io.get_read_barcode_qual(read1)
        read_sample_index = tk_io.get_read_sample_index(read1)
        if read_sample_index is None:
            read_sample_index = ''
        read_sample_index_qual = tk_io.get_read_sample_index_qual(read1)

        read_pair_info['10X_raw_bc_len'] = len(read_raw_bc)
        read_pair_info['sample_index_len'] = len(read_sample_index)

        read_pair_info['10X_raw_bc'] = read_raw_bc
        read_pair_info['10X_called_bc'] = read_bc
        read_pair_info['sample_index'] = read_sample_index

        read_pair_info['10X_bc_min_qual'] = tk_fasta.get_min_qual(read_bc_qual)
        read_pair_info['10X_bc_mean_qual'] = tk_fasta.get_mean_qual(
            read_bc_qual)

        if read_sample_index:
            read_pair_info['sample_index_min_qual'] = tk_fasta.get_min_qual(
                read_sample_index_qual)
            read_pair_info['sample_index_mean_qual'] = tk_fasta.get_mean_qual(
                read_sample_index_qual)
        else:
            read_pair_info['sample_index_min_qual'] = None
            read_pair_info['sample_index_mean_qual'] = None

        # Distance to targets is only defined when both reads are on the
        # same reference and a target set was supplied.
        if ref1 != ref2 or target_regions is None:
            read_pair_targ_dist = None
        else:
            left_pos = leftmost.pos
            right_seq = rightmost.seq
            right_pos = rightmost.pos + (len(right_seq)
                                         if right_seq is not None else 0)
            regions = target_regions.get(ref1, None)

            if regions is None:
                # No targets on this chrom -- distance is large positive number
                read_pair_targ_dist = 100000000
            else:
                read_pair_targ_dist = get_read_regions_dist(
                    left_pos, right_pos, regions)

        read_pair_info['read_pair_targ_dist'] = read_pair_targ_dist

        r1_mapped, r1_soft_clipped = cigar_count_bases(read1)
        r2_mapped, r2_soft_clipped = cigar_count_bases(read2)

        read_pair_info['mapped_bases'] = r1_mapped + r2_mapped
        read_pair_info['soft_clipped_bases'] = (r1_soft_clipped
                                                + r2_soft_clipped)

        both_mapped = not read1.is_unmapped and not read2.is_unmapped
        if both_mapped and read1.tid == read2.tid:
            read_pair_info['insert_length'] = rightmost.aend - leftmost.pos
        else:
            read_pair_info['insert_length'] = None

        # Q20/Q30 base counts for each read, the sample index and the barcode.
        qual_sources = (
            ('r1', read1.qual),
            ('r2', read2.qual),
            ('si', read_sample_index_qual),
            ('bc', read_bc_qual),
        )
        for prefix, qual in qual_sources:
            for threshold in (20, 30):
                key = '%s_q%d_bases' % (prefix, threshold)
                read_pair_info[key] = tk_fasta.get_bases_qual(qual, threshold)

        yield read_pair_info
def min_qual_below(qual, threshold):
    """Tell whether a read's minimum quality falls under a threshold.

    qual: a read's qual string
    threshold: cutoff to compare against; None disables the check, in
        which case the result is always False.
    """
    if threshold is None:
        return False
    lowest = tk_fasta.get_min_qual(qual)
    return lowest < threshold