示例#1
0
def deconcat(sequence, prev):
    """
    Look for the next primer junction in `sequence`, given the previous primer.

    Two candidate junction sequences are selected from `prev` and each is
    aligned against `sequence` with parasail.sg_qx_trace.

    :param sequence: the query sequence to search
    :param prev: label of the previous primer, either 'R3' or 'F5'
    :return: (index of the first '|' in the traceback comparison string,
              end_query, score, label of the primer that was found), or None
              when no candidate reaches MIN_SCORE
    :raises ValueError: if `prev` is neither 'R3' nor 'F5'
    """
    # Pick the (target sequence, resulting label) pairs; the first pair is the
    # preferred one and only wins when it scores strictly higher.
    if prev == 'R3':
        preferred = (SEQ_R5_F5, 'F5')
        fallback = (SEQ_R5_R3, 'R3')
    elif prev == 'F5':
        preferred = (SEQ_F3_R3, 'R3')
        fallback = (SEQ_F3_F5, 'F5')
    else:
        raise ValueError(
            "Expected previous primer to be F5 or R3. Saw {0} instead. Abort!".
            format(prev))

    aln_pref = parasail.sg_qx_trace(sequence, preferred[0], 3, 1, SCOREMAT)
    aln_fall = parasail.sg_qx_trace(sequence, fallback[0], 3, 1, SCOREMAT)

    if aln_pref.score >= MIN_SCORE and aln_pref.score > aln_fall.score:
        winner, label = aln_pref, preferred[1]
    elif aln_fall.score >= MIN_SCORE:
        winner, label = aln_fall, fallback[1]
    else:
        return None

    first_match = winner.get_traceback().comp.find('|')
    return first_match, winner.end_query, winner.score, label
示例#2
0
def find_matching_probe_pair(m1, m2, readlen, readseq, proberegions):
    """
    Pair a 5' ext probe hit with a 3' lig probe hit for one read.

    :param m1: (shift, list of matching ext), each list item being
               (probe name, type, full sequence)
    :param m2: (shift, list of matching lig), same item layout as m1
    :param readlen: read length, compared against the expected amplicon size
    :param readseq: read sequence, used for the alignment fallbacks
    :param proberegions: dict of probe name -> {'ext': (start0, end1, seq),
                                                 'lig': (start0, end1, seq)}
    :return: matched pairs if possible, otherwise whatever matches, as
             (readlen - expected size, name1, name2, shift1, shift2,
              seq1, seq2); None when nothing matches
    """
    # ex: (0, [('ML_COV_675f_22PPB_00080', 'ext', 'TTTGTCACGCACTCAAAGGGA')])
    ext_shift, ext_hits = m1
    lig_shift, lig_hits = m2
    candidates = []

    for ext_name, ext_kind, ext_full in ext_hits:
        # we want the 5' to always be ext, 3' always to be lig
        if ext_kind != 'ext':
            continue
        ext_start0, _ext_end1, ext_seq = proberegions[ext_name]['ext']

        for lig_name, lig_kind, lig_full in lig_hits:
            if lig_kind != 'lig':
                continue
            _lig_start0, lig_end1, _lig_seq = proberegions[lig_name]['lig']
            size_diff = readlen - abs(lig_end1 - ext_start0)
            hit = (size_diff, ext_name, lig_name, ext_shift, lig_shift,
                   ext_full, lig_full)
            if ext_name == lig_name:
                # same probe on both ends: take it immediately
                return hit
            candidates.append(hit)

        # We have a 5' ext but no same-named 3' lig from the hashing, so try
        # aligning every probe's lig sequence to the tail of the read.
        for info in proberegions.values():
            _lig_start0, lig_end1, lig_seq = info['lig']
            tail = readseq[-(len(lig_seq) + MAX_SW_PAD):]
            aln = parasail.sg_qx_trace(tail, lig_seq, 3, 1, SW_SCORE_MATRIX)
            if aln.score >= MIN_SW_SCORE:
                size_diff = readlen - abs(lig_end1 - ext_start0)
                # the ext probe name is reported for both ends
                return (size_diff, ext_name, ext_name, ext_shift,
                        len(tail) - aln.end_query - 1, ext_seq, lig_seq)

    # go through lig and find matching ext by aligning to the head of the read
    for lig_name, lig_kind, _lig_full in lig_hits:
        if lig_kind != 'lig':
            continue
        _lig_start0, lig_end1, lig_seq = proberegions[lig_name]['lig']
        for info in proberegions.values():
            ext_start0, _ext_end1, ext_seq = info['ext']
            head = readseq[:(len(ext_seq) + MAX_SW_PAD)]
            aln = parasail.sg_qx_trace(head, ext_seq, 3, 1, SW_SCORE_MATRIX)
            if aln.score >= MIN_SW_SCORE:
                size_diff = readlen - abs(lig_end1 - ext_start0)
                # the lig probe name is reported for both ends
                return (size_diff, lig_name, lig_name,
                        len(head) - aln.end_query - 1, lig_shift, ext_seq,
                        lig_seq)

    if not candidates:
        return None
    # return match with minimal diff in expected size (first one wins on ties)
    return min(candidates, key=lambda cand: abs(cand[0]))
示例#3
0
def trim5p3p_helper(r: SeqRecord) -> ScoreTuple:
    """
    Search for 5' and 3' in the first and last 100 bp window

    The 5' search aligns SEQ_5P against the first 100 bases of the record;
    the 3' search aligns SEQ_3P_REV against the first 100 bases of the
    reverse complement. When the 3' score reaches MINSCORE_3P, trimA is run
    on the bases following the 3' hit (presumably to measure the polyA
    stretch; see trimA).

    :param r: the record to search
    :return: ScoreTuple with both alignment scores, the 5' end position, the
             3' position converted to forward-strand coordinates, and endA.
             endA equals end3 unless MIN_A_LEN is non-zero and trimA
             returned a value.
    """
    head = str(r.seq[:100])
    tail_rc = str(r.reverse_complement().seq[:100])

    aln5 = parasail.sg_qx_trace(head, SEQ_5P, 3, 1, SCOREMAT)
    aln3 = parasail.sg_qe_db_trace(tail_rc, SEQ_3P_REV, 3, 1, SCOREMAT)

    a_len = None
    if aln3.score >= MINSCORE_3P:
        a_len = trimA(tail_rc[aln3.end_query + 1:])

    # end_query is relative to the reverse complement; flip it back
    end3 = len(r.seq) - aln3.end_query - 1

    if MIN_A_LEN != 0 and a_len is not None:
        end_a = end3 - a_len + 1
    else:
        end_a = end3

    return ScoreTuple(score5=aln5.score,
                      end5=aln5.end_query,
                      score3=aln3.score,
                      end3=end3,
                      endA=end_a)
示例#4
0
def find_probe_match_by_mapping(r, seq2, proberegions, ext_regions_by_start,
                                lig_regions_by_start):
    """
    Find an (ext, lig) probe pair for a mapped read by aligning probe
    sequences near the read's mapped position.

    :param r: aligned read; reference_start, reference_end and seq are used
    :param seq2: sequence that the probe sequences are aligned against
    :param proberegions: dict of probe name -> {'ext': (start0, end1, seq),
                                                 'lig': (start0, end1, seq)}
    :param ext_regions_by_start: sorted list whose items carry the probe
                                 name at index 1
    :param lig_regions_by_start: sorted list with the same item layout
    :return: (size_diff, ext probe name, lig probe name, leading insertion
              length, trailing insertion length, ext_seq, lig_seq) for the
              first pair where both alignments reach MIN_SW_SCORE, else None
    """
    first_ext = bisect.bisect_left(ext_regions_by_start,
                                   (r.reference_start - SW_SEARCH_PAD, '*'))
    lig_last_index = min(
        len(lig_regions_by_start) - 1,
        bisect.bisect_right(lig_regions_by_start,
                            (r.reference_end + SW_SEARCH_PAD, '*'),
                            lo=first_ext))

    for ext_idx in range(first_ext, len(ext_regions_by_start)):
        ext_name = ext_regions_by_start[ext_idx][1]
        ext_start0, _ext_end1, ext_seq = proberegions[ext_name]['ext']
        ext_aln = parasail.sg_qx_trace(seq2, ext_seq, 3, 1, SW_SCORE_MATRIX)
        if ext_aln.score < MIN_SW_SCORE:
            continue

        # now we need to know how long the UMI is: it shows up as a leading
        # insertion in the CIGAR
        lead_len, lead_op = next(
            iter_cigar_string(str(ext_aln.cigar.decode, 'utf-8')))
        if lead_op != 'I':  # don't see the UMI! ugh!
            lead_len = 0

        for lig_idx in range(max(0, ext_idx - MAX_SW_PAD), lig_last_index + 1):
            lig_name = lig_regions_by_start[lig_idx][1]
            _lig_start0, lig_end1, lig_seq = proberegions[lig_name]['lig']
            lig_aln = parasail.sg_qx_trace(seq2[-len(lig_seq) - MAX_SW_PAD:],
                                           lig_seq, 3, 1, SW_SCORE_MATRIX)
            if lig_aln.score < MIN_SW_SCORE:
                continue

            # it's a hit of (ext_name, lig_name); a trailing insertion is
            # handled the same way as the leading one
            trail_len, trail_op = next(
                iter_cigar_string_backwards(
                    str(lig_aln.cigar.decode, 'utf-8')))
            if trail_op != 'I':
                trail_len = 0

            expected_size = abs(lig_end1 - ext_start0)
            size_diff = len(r.seq) - lead_len - trail_len - expected_size
            return (size_diff, ext_name, lig_name,
                    lead_len, trail_len, ext_seq, lig_seq)

    return None
示例#5
0
def deconcat(sequence, prev):
    """
    Look for the next primer junction in `sequence`, given the previous primer.

    :param sequence: the query sequence to search
    :param prev: label of the previous primer, either "R3" or "F5"
    :return: (index of the first "|" in the traceback comparison string,
              end_query, score, label of the primer that was found), or None
              when neither candidate alignment reaches MIN_SCORE
    :raises ValueError: if `prev` is neither "R3" nor "F5"
    """
    if prev not in ("R3", "F5"):
        raise ValueError(
            f"Expected previous primer to be F5 or R3. Saw {prev} instead. Abort!"
        )

    # (junction sequence, label reported on a hit); the first entry only wins
    # when it scores strictly higher than the second.
    if prev == "R3":
        junctions = [(SEQ_R5_F5, "F5"), (SEQ_R5_R3, "R3")]
    else:
        junctions = [(SEQ_F3_R3, "R3"), (SEQ_F3_F5, "F5")]

    scored = [
        (parasail.sg_qx_trace(sequence, target, 3, 1, SCOREMAT), label)
        for target, label in junctions
    ]
    (primary, primary_label), (secondary, secondary_label) = scored

    if primary.score >= MIN_SCORE and primary.score > secondary.score:
        chosen, chosen_label = primary, primary_label
    elif secondary.score >= MIN_SCORE:
        chosen, chosen_label = secondary, secondary_label
    else:
        return None

    return (
        chosen.get_traceback().comp.find("|"),
        chosen.end_query,
        chosen.score,
        chosen_label,
    )
示例#6
0
def validate_reconstructed_seq(seq, orig):
    """
    seq --- the sequence that is reconstructed
    orig --- the original sequence

    because the reconstructed seq can be longer, we don't care about deletions
      (deletions w.r.t could just be exon skipping or minor base errors)
    we only care that there is NOT a lot of insertions (which would indicate error in my bubble solution)

    Returns a (is_valid, cigar) tuple, where cigar is the decoded CIGAR of
    the alignment of `seq` (query) against `orig` (reference).
    """
    o1 = parasail.sg_qx_trace(seq, orig, 3, 1, parasail.matrix_create("ACGT", 2, -5))
    cigar = o1.cigar.decode

    # The original compared against an undefined name `l2`, which raised
    # NameError on every call. With a match score of 2, a perfect alignment
    # of the whole reference scores len(orig)*2; require 90% of that.
    # NOTE(review): len(orig) is assumed to be the intended length -- confirm.
    min_score = len(orig) * 2 * .90
    if o1.score < min_score:
        return False, cigar

    # any single insertion longer than 5 bases is treated as a failure
    for num, op in iter_cigar_string(cigar):
        if op == 'I' and num > 5:
            return False, cigar
    return True, cigar
示例#7
0
def sars_cov_2_s_genome(genome):
    '''Returns translated SARS-CoV-2 spike sequence contained in nucleotide string

    The packaged spike reference (MN908947.3.S.fa) is aligned against
    `genome`; the reference-side columns of the alignment that sit opposite
    the last ungapped occurrence of the spike reference are extracted,
    stripped of gap characters and translated.

    :param genome: nucleotide string to search
    :return: protein string with leading/trailing '*' removed
    :raises RuntimeError: if any step fails; the original error is chained
                          as the cause
    '''
    try:
        ref_gene = str(
            SeqIO.read(
                f'{PACKAGE_PATH}/schemes/sars-cov-2-s/assets/MN908947.3.S.fa',
                'fasta').seq)
        result = parasail.sg_qx_trace(ref_gene, genome, 10, 1,
                                      parasail.dnafull)
        # fetch the traceback once instead of once per attribute access
        aln = result.traceback
        aln_start = aln.query.rfind(ref_gene)
        aln_end = aln_start + len(ref_gene)
        # Strip alignment gaps on the plain string rather than via
        # Seq.ungap(), which is deprecated in recent Biopython releases.
        gene = Seq(aln.ref[aln_start:aln_end].replace('-', ''))
        protein = str(gene.translate()).strip('*')
    except Exception as e:
        raise RuntimeError('Problem extracting spike sequence') from e
    return protein
示例#8
0
def node_is_similar(seq1, seq2):
    """
    Decide whether two node sequences are similar enough.

    An empty sequence is never similar; two sequences of at most 2 bases
    each are always similar. Otherwise the longer sequence is aligned as the
    query against the shorter one, and the alignment score is compared with
    a threshold chosen by EXPECTED_ERR_RATE.

    :return: True if the sequences are considered similar, else False
    :raises Exception: if EXPECTED_ERR_RATE is 2 or above (not implemented)
    """
    l1 = len(seq1)
    l2 = len(seq2)
    if l1 == 0 or l2 == 0: return False
    if l1 <= 2 and l2 <= 2: return True
    if l1 < l2:
        l1, l2 = l2, l1
        seq1, seq2 = seq2, seq1
    # always make seq1 the longer one
    o1 = parasail.sg_qx_trace(seq1, seq2, 3, 1, parasail.matrix_create("ACGT", 2, -5))
    # require that the whole (shorter) seq2 must be aligned
    # and set min score according to the expected error rate

    if EXPECTED_ERR_RATE == 0:
        # NOTE(review): with a match score of 2 the best possible score looks
        # like l2*2, so this strict '>' may never be true -- confirm whether
        # '>=' was intended.
        return o1.score > l2*2*1.0
    elif EXPECTED_ERR_RATE < 2:
        return o1.score > l1*2*0.8
    else:
        raise Exception("Expected error rate not implemented for {0}% and above".format(EXPECTED_ERR_RATE))
    # (an unreachable trailing `return res is not None`, which referenced an
    # undefined name, was removed here)
示例#9
0
def _clip_per_base_tags(tags, start=None, end=None):
    """Drop the defunct 'zs:B' tag and slice every dq/iq/sq tag payload to [start:end]."""
    clipped = []
    for tag in tags:
        if tag.startswith('zs:B'):  # defunct CCS tag, don't use
            continue
        if tag.startswith(('dq:i:', 'iq:i:', 'sq:i:')):
            # keep the 5-character tag prefix, slice only the payload
            tag = tag[:5] + tag[5:][start:end]
        clipped.append(tag)
    return clipped


def _write_clipped_read(out_bam, d, header, start=None, end=None):
    """Slice seq/qual/per-base tags of the read dict `d` to [start:end] and write it to `out_bam`."""
    d['seq'] = d['seq'][start:end]
    d['qual'] = d['qual'][start:end]
    assert len(d['seq']) == len(d['qual'])
    d['tags'] = _clip_per_base_tags(d['tags'], start, end)
    out_bam.write(pysam.AlignedSegment.from_dict(d, header))


def clip_out(bam_filename, umi_len, bc_len, output_prefix, UMI_type, shortread_bc=None, tso_len=0, g5_clip_seq=None):
    """
    Clip UMI/BC out of each read and write <output_prefix>.trimmed.csv
    (one row of clipped info per read) and <output_prefix>.trimmed.bam
    (the remaining sequence). Reads where the expected anchor is not found
    are written to neither output.

    :param bam_filename: BAM of post-LIMA (primer-trimmed) CCS sequences
    :param umi_len: expected UMI length
    :param bc_len: expected barcode length
    :param output_prefix: prefix for the two output files
    :param UMI_type: one of 'A3', 'G5', 'G5-10X', 'G5-clip'
    :param shortread_bc: a dict of barcode -> "Y|N" for top-ranked. If given, came from short read data.
    :param tso_len: TSO length; when > 0 a 'TSO' column is added to the CSV
    :param g5_clip_seq: sequence expected right after the UMI (G5-clip mode only)

    --------
    G5-10X
    --------
    5' primer -- BC --- UMI -- TSO --- GGG --- transcript --- polyA

    --------
    G5-clip
    assumes input is like below, where the 5'/3' primer already removed by lima
    Here, we will only clip out the UMI, and write out the rest of the sequence, keeping the RT + transcript
    There is no assumption about the polyA tail existing or not
    --------
    5' primer -- UMI -- [RT primer] --- transcript --- 3' primer
    """
    assert UMI_type in ('A3', 'G5', 'G5-10X', 'G5-clip'), \
        "UMI_type must be one of 'A3', 'G5', 'G5-10X', 'G5-clip'"
    if shortread_bc is None:  # avoid a shared mutable default argument
        shortread_bc = {}
    umi_bc_len = umi_len + bc_len

    if UMI_type == 'G5-clip':
        try:
            import parasail
        except ImportError:
            print("need parasail library for G5-clip mode! Abort!", file=sys.stderr)
            sys.exit(-1)
        para_mat = parasail.matrix_create("ACGT", 2, -5)
        para_search_len = umi_len + len(g5_clip_seq) + 10

    FIELDS = ['id', 'clip_len', 'extra', 'UMI', 'BC', 'BC_rev', 'BC_match', 'BC_top_rank']
    if tso_len > 0: FIELDS += ['TSO']

    # Context managers guarantee that the CSV and both BAM handles are closed
    # even when a read raises part-way through.
    with open(output_prefix + '.trimmed.csv', 'w') as f1:
        writer1 = DictWriter(f1, FIELDS, delimiter='\t', dialect='unix')
        writer1.writeheader()

        with pysam.AlignmentFile(bam_filename, 'rb', check_sq=False) as reader, \
                pysam.AlignmentFile(output_prefix+'.trimmed.bam', 'wb', header=reader.header) as f2:
            for r in reader:
                d = r.to_dict()

                # bit 0x10 of the flag marks a reverse-strand read
                if (r.flag >> 4) & 1:
                    d['seq'] = str(Seq(r.seq).reverse_complement())
                    d['qual'] = r.qual[::-1]
                    new_tags = []
                    for tag in d['tags']:
                        if tag.startswith(('dq:i:', 'iq:i:', 'sq:i:')):
                            # reverse the payload, keep the 5-char prefix
                            tag = tag[:5] + tag[::-1][:-5]
                        new_tags.append(tag)
                    d['tags'] = new_tags
                    d['flag'] = '4'   # convert it back to not being rev complemented

                if UMI_type == 'A3':
                    _, A_end = find_Aend(d['seq'])
                    if A_end > 0:
                        seq2 = d['seq'][A_end:]  # should be just UMI + BC, unless UMI started with 'A's

                        diff = len(seq2) - umi_bc_len
                        if diff < 0: # UMI may have started with 'A's
                            seq2 = d['seq'][A_end+diff:]

                        seq_extra = 'NA'
                        if diff > 0: seq_extra = seq2[:diff]

                        # a zero length must not be used as a negative-slice bound
                        seq_bc = '' if bc_len == 0 else seq2[-bc_len:]
                        if umi_len == 0:
                            seq_umi = ''
                        elif bc_len == 0:
                            seq_umi = seq2[-umi_len:]
                        else:
                            seq_umi = seq2[-(bc_len+umi_len):-bc_len]

                        # reverse complement BC because it's always listed in rev comp in short read data
                        seq_bc_rev = str(Seq(seq_bc).reverse_complement())
                        match = 'Y' if seq_bc_rev in shortread_bc else 'N'
                        match_top = 'Y' if (match=='Y' and shortread_bc[seq_bc_rev]=='Y') else 'N'

                        writer1.writerow({'id': r.qname,
                                          'clip_len': len(seq2),
                                          'extra': seq_extra,
                                          'UMI': seq_umi,
                                          'BC': seq_bc,
                                          'BC_rev': seq_bc_rev,
                                          'BC_match': match,
                                          'BC_top_rank': match_top})

                        # subset the sequence to include only the polyAs
                        _write_clipped_read(f2, d, r.header, end=A_end)

                elif UMI_type == 'G5':
                    G_start, G_end = find_Gstart(d['seq'])
                    if G_start > 0:
                        seq2 = d['seq'][:G_start]  # should be just UMI

                        diff = len(seq2) - umi_len
                        if diff < 0:  # UMI may have ended with Gs
                            seq2 = d['seq'][:G_start-diff]

                        seq_extra = 'NA'
                        if diff > 0:
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]

                        writer1.writerow({'id': r.qname,
                                          'clip_len': len(seq2),
                                          'extra': seq_extra,
                                          'UMI': seq2,
                                          'BC': 'NA',  # Brendan's current design has only UMI, no BC
                                          'BC_rev': 'NA',
                                          'BC_match': 'NA',
                                          'BC_top_rank': 'NA'})

                        # subset the sequence to remove the UMIs and "G"s
                        _write_clipped_read(f2, d, r.header, start=G_end)

                elif UMI_type == 'G5-clip':
                    o1 = parasail.sg_qx_trace(d['seq'][:para_search_len],
                                              g5_clip_seq,
                                              10,
                                              3,
                                              para_mat)

                    #  'tags': ['bx:B:i,22,20',
                    #   ...
                    #   'qe:i:2835',
                    #   'bc:B:S,0,1',
                    #   'bl:Z:CCCGCGTGGCCTCCTGAATTAT',
                    #   'bt:Z:CATTGCCACTGTCTTCTGCT',
                    #   'RG:Z:70de1488/0--1']}
                    c_num, c_type = next(iter_cigar_string(str(o1.cigar.decode, 'utf-8')))
                    if c_type == 'I': # this is the (extra) + UMI
                        seq2 = d['seq'][:c_num]
                        seq_extra = 'NA'
                        diff = len(seq2) - umi_len
                        if diff < 0: # we need to get a few more bases from the primers
                            tag_dict = dict(x.split(':', 1) for x in d['tags'])
                            # Reset per read: previously an unexpected 'bc' value
                            # either crashed or silently reused the primer from an
                            # earlier read.
                            Fseq = None
                            try:
                                if tag_dict['bc'] == 'B:S,0,1': # + strand
                                    assert tag_dict['bl'].startswith('Z:')
                                    Fseq = tag_dict['bl'][2:]  # trimming away the Z:
                                elif tag_dict['bc'] == 'B:S,1,0': # - strand
                                    assert tag_dict['bt'].startswith('Z:')
                                    Fseq = str(Seq(tag_dict['bt'][2:]).reverse_complement())
                            except KeyError:
                                # older lima output lacks these tags: deliberately
                                # fall through and output the shorter UMI
                                pass
                            if Fseq is not None:
                                seq2 = Fseq[diff:] + seq2 # rescue bases from the trimmed F primer
                        elif diff > 0:  # there's extras
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]

                        writer1.writerow({'id': r.qname,
                                          'clip_len': len(seq2),
                                          'extra': seq_extra,
                                          'UMI': seq2,
                                          'BC': 'NA',  # Brendan's current design has only UMI, no BC
                                          'BC_rev': 'NA',
                                          'BC_match': 'NA',
                                          'BC_top_rank': 'NA'})

                        # subset the sequence to remove the UMI (but keep the G5 clip seq)
                        _write_clipped_read(f2, d, r.header, start=c_num)

                elif UMI_type == 'G5-10X':
                    # need to first invert the sequence so polyA is at the end
                    d['seq'] = str(Seq(d['seq']).reverse_complement())
                    d['qual'] = d['qual'][::-1]
                    # now it is BC -- UMI -- TSO -- GGG -- transcript -- polyA
                    umi_bc_tso_len = bc_len + umi_len + tso_len
                    G_start, G_end = find_Gstart(d['seq'][umi_bc_tso_len:umi_bc_tso_len+10])

                    if G_start >= 0:
                        # positions were relative to the 10-base search window
                        G_start += umi_bc_tso_len
                        G_end   += umi_bc_tso_len

                        seq2 = d['seq'][:G_start]  # this is BC - UMI - TSO
                        full_len = len(seq2)  # length before any leading extras are removed

                        diff = len(seq2) - umi_bc_tso_len
                        if diff > 0: # beginning may have included untrimmed primers
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]
                            seq_bc = seq2[:bc_len]
                            seq_umi = seq2[bc_len:umi_bc_len]
                        elif diff == 0:
                            seq_extra = 'NA'
                            seq_bc = seq2[:bc_len]
                            seq_umi = seq2[bc_len:umi_bc_len]
                        else: # we may have accidentally trimmed away some bases for BC, can't do anything
                            seq_extra = 'NA'
                            seq_bc = seq2[:bc_len+diff]
                            seq_umi = seq2[bc_len+diff:umi_bc_len+diff]

                        # reverse complement BC because it's always listed in rev comp in short read data
                        seq_bc_rev = str(Seq(seq_bc).reverse_complement())
                        match = 'Y' if seq_bc_rev in shortread_bc else 'N'
                        match_top = 'Y' if (match=='Y' and shortread_bc[seq_bc_rev]=='Y') else 'N'

                        rec = {'id': r.qname,
                               'clip_len': len(seq2)+(G_end-G_start),
                               'extra': seq_extra,
                               'UMI': seq_umi,
                               'BC': seq_bc,
                               'BC_rev': seq_bc_rev,
                               'BC_match': match,
                               'BC_top_rank': match_top}
                        if tso_len > 0:
                            # 'TSO' is a CSV column only when tso_len > 0; writing
                            # the key otherwise makes DictWriter raise ValueError.
                            # The TSO is taken from the untrimmed prefix, as before.
                            rec['TSO'] = d['seq'][full_len-tso_len:full_len] + d['seq'][G_start:G_end] \
                                if tso_len <= full_len else d['seq'][:G_end]
                        writer1.writerow(rec)

                        # subset the sequence to remove the UMIs and "G"s
                        _write_clipped_read(f2, d, r.header, start=G_end)
def clip_out(
    bam_filename: str,
    umi_len: int,
    bc_len: int,
    output_prefix: str,
    UMI_type: umi_types,
    shortread_bc: Optional[Dict[str, str]] = None,
    tso_len: int = 0,
    g5_clip_seq: Optional[str] = None,
) -> None:
    """
    :param bam_filename: BAM of post-LIMA (primer-trimmed) CCS sequences
    :param UMI_type: either 'A3' or 'G5' or 'G5-10X'
    :param shortread_bc: a dict of barcode -> "Y|N" for top-ranked. If given, came from short read data.

    --------
    G5-10X
    --------
    5' primer -- BC --- UMI -- TSO --- GGG --- transcript --- polyA

    --------
    G5-clip
    assumes input is like below, where the 5'/3' primer already removed by lima
    Here, we will only clip out the UMI, and write out the rest of the sequence, keeping the RT + transcript
    There is no assumption about the polyA tail existing or not
    --------
    5' primer -- UMI -- [RT primer] --- transcript --- 3' primer
    """
    if shortread_bc is None:
        shortread_bc = dict()

    if UMI_type not in ("A3", "G5", "G5-10X", "G5-clip"):
        raise ValueError(
            f"UMI is of the wrong type.  Got {UMI_type} Must be one of 'A3', 'G5', 'G5-10X', 'G5-clip'"
        )

    umi_bc_len = umi_len + bc_len

    if UMI_type == "G5-clip":
        try:
            import parasail
        except ImportError:
            logger.error("need parasail library for G5-clip mode! Abort!")
            sys.exit(-1)
        para_mat = parasail.matrix_create("ACGT", 2, -5)
        para_search_len = umi_len + len(g5_clip_seq) + 10

    FIELDS = [
        "id",
        "clip_len",
        "extra",
        "UMI",
        "BC",
        "BC_rev",
        "BC_match",
        "BC_top_rank",
    ]
    if tso_len > 0:
        FIELDS += ["TSO"]

    with pysam.AlignmentFile(bam_filename, "rb", check_sq=False) as reader:
        with open(f"{output_prefix}.trimmed.csv",
                  "w") as f1, pysam.AlignmentFile(
                      f"{output_prefix}.trimmed.bam",
                      "wb",
                      header=reader.header) as f2:
            writer1 = DictWriter(
                f1,
                FIELDS,
                delimiter="\t",
                dialect="unix",
            )
            writer1.writeheader()

            for r in reader:
                d = r.to_dict()

                # is_rev_strand = r.flag >> 4 & 1
                if r.flag >> 4 & 1:
                    d["seq"] = str(Seq(r.seq).reverse_complement())
                    d["qual"] = r.qual[::-1]
                    new_tags = []
                    for tag in d["tags"]:
                        if (tag.startswith("dq:i:") or tag.startswith("iq:i:")
                                or tag.startswith("sq:i:")):
                            tag = tag[:5] + tag[::-1][:-5]
                        new_tags.append(tag)
                    d["tags"] = new_tags
                    d["flag"] = "4"  # convert it back to not being rev complemented

                if UMI_type == "A3":
                    A_start, A_end = find_Aend(d["seq"])
                    if A_end > 0:
                        seq2 = d["seq"][
                            A_end:]  # should be just UMI + BC, unless UMI started with 'A's

                        diff = len(seq2) - umi_bc_len
                        if diff < 0:  # UMI may have started with 'A's
                            seq2 = d["seq"][A_end + diff:]

                        seq_extra = "NA"
                        if diff > 0:
                            seq_extra = seq2[:diff]

                        if bc_len == 0:
                            seq_bc = ""
                        else:
                            seq_bc = seq2[-bc_len:]

                        if umi_len == 0:
                            seq_umi = ""
                        else:
                            if bc_len == 0:
                                seq_umi = seq2[-umi_len:]
                            else:
                                seq_umi = seq2[-(bc_len + umi_len):-bc_len]

                        # reverse complement BC because it's always listed in rev comp in short read data
                        seq_bc_rev = str(Seq(seq_bc).reverse_complement())

                        match = "Y" if seq_bc_rev in shortread_bc else "N"
                        match_top = ("Y" if
                                     (match == "Y"
                                      and shortread_bc[seq_bc_rev] == "Y") else
                                     "N")

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2),
                            "extra": seq_extra,
                            "UMI": seq_umi,
                            "BC": seq_bc,
                            "BC_rev": seq_bc_rev,
                            "BC_match": match,
                            "BC_top_rank": match_top,
                        }
                        writer1.writerow(rec)

                        # subset the sequence to include only the polyAs
                        d["seq"] = d["seq"][:A_end]
                        d["qual"] = d["qual"][:A_end]
                        assert len(d["seq"]) == len(d["qual"])
                        new_tags = []
                        for tag in d["tags"]:
                            if tag.startswith(
                                    "zs:B"):  # defunct CCS tag, don't use
                                pass
                            elif (tag.startswith("dq:i:")
                                  or tag.startswith("iq:i:")
                                  or tag.startswith("sq:i:")):
                                tag = tag[:A_end + 5]
                                new_tags.append(tag)
                            else:
                                new_tags.append(tag)
                        d["tags"] = new_tags
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)
                elif UMI_type == "G5":
                    G_start, G_end = find_Gstart(d["seq"])
                    if G_start > 0:
                        seq2 = d["seq"][:G_start]  # should be just UMI

                        diff = len(seq2) - umi_len
                        if diff < 0:  # UMI may have ended with Gs
                            seq2 = d["seq"][:G_start - diff]

                        seq_extra = "NA"
                        if diff > 0:
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2),
                            "extra": seq_extra,
                            "UMI": seq2,
                            "BC":
                            "NA",  # Brendan's current design has only UMI, no BC
                            "BC_rev": "NA",
                            "BC_match": "NA",
                            "BC_top_rank": "NA",
                        }
                        writer1.writerow(rec)

                        # subset the sequence to remove the UMIs and "G"s
                        d["seq"] = d["seq"][G_end:]
                        d["qual"] = d["qual"][G_end:]
                        assert len(d["seq"]) == len(d["qual"])
                        new_tags = []
                        for tag in d["tags"]:
                            if tag.startswith(
                                    "zs:B"):  # defunct CCS tag, don't use
                                pass
                            elif (tag.startswith("dq:i:")
                                  or tag.startswith("iq:i:")
                                  or tag.startswith("sq:i:")):
                                tag = tag[:5] + tag[5 + G_end:]
                                new_tags.append(tag)
                            else:
                                new_tags.append(tag)
                        d["tags"] = new_tags
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)
                elif UMI_type == "G5-clip":
                    o1 = parasail.sg_qx_trace(d["seq"][:para_search_len],
                                              g5_clip_seq, 10, 3, para_mat)

                    #  'tags': ['bx:B:i,22,20',
                    #   ...
                    #   'qe:i:2835',
                    #   'bc:B:S,0,1',
                    #   'bl:Z:CCCGCGTGGCCTCCTGAATTAT',
                    #   'bt:Z:CATTGCCACTGTCTTCTGCT',
                    #   'RG:Z:70de1488/0--1']}
                    c_num, c_type = next(
                        iter_cigar_string(str(o1.cigar.decode, "utf-8")))
                    if c_type == "I":  # this is the (extra) + UMI
                        seq2 = d["seq"][:c_num]
                        seq_extra = "NA"
                        diff = len(seq2) - umi_len
                        if diff < 0:  # we need to get a few more bases from the primers
                            tag_dict = dict(x.split(":", 1) for x in d["tags"])
                            try:
                                if tag_dict["bc"] == "B:S,0,1":  # + strand
                                    assert tag_dict["bl"].startswith("Z:")
                                    Fseq = tag_dict["bl"][
                                        2:]  # trimming away the Z:
                                elif tag_dict["bc"] == "B:S,1,0":  # - strand
                                    assert tag_dict["bt"].startswith("Z:")
                                    Fseq = str(
                                        Seq(tag_dict["bt"]
                                            [2:]).reverse_complement())
                                seq2 = (
                                    Fseq[diff:] + seq2
                                )  # rescue bases from the trimmed F primer
                            except KeyError:
                                pass  # just silently not do anything and output the shorter UMI
                                # print("WARNING: older version of lima output, lacking 'bc' tag. Ignoring read {0}...".format(r.qname))
                        elif diff > 0:  # there's extras
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2),
                            "extra": seq_extra,
                            "UMI": seq2,
                            "BC":
                            "NA",  # Brendan's current design has only UMI, no BC
                            "BC_rev": "NA",
                            "BC_match": "NA",
                            "BC_top_rank": "NA",
                        }
                        writer1.writerow(rec)

                        # subset the sequence to remove the UMI (but keep the G5 clip seq)
                        d["seq"] = d["seq"][c_num:]
                        d["qual"] = d["qual"][c_num:]
                        assert len(d["seq"]) == len(d["qual"])
                        new_tags = []
                        for tag in d["tags"]:
                            if tag.startswith(
                                    "zs:B"):  # defunct CCS tag, don't use
                                pass
                            elif (tag.startswith("dq:i:")
                                  or tag.startswith("iq:i:")
                                  or tag.startswith("sq:i:")):
                                tag = tag[:5] + tag[5 + c_num:]
                                new_tags.append(tag)
                            else:
                                new_tags.append(tag)
                        d["tags"] = new_tags
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)
                elif UMI_type == "G5-10X":
                    # first reverse-complement the read (and reverse the qualities) so polyA is at the end
                    d["seq"] = str(Seq(d["seq"]).reverse_complement())
                    d["qual"] = d["qual"][::-1]
                    # now it is BC -- UMI -- TSO -- GGG -- transcript -- polyA
                    umi_bc_tso_len = bc_len + umi_len + tso_len
                    G_start, G_end = find_Gstart(
                        d["seq"][umi_bc_tso_len:umi_bc_tso_len + 10])

                    # pdb.set_trace()

                    if G_start >= 0:
                        G_start += umi_bc_tso_len
                        G_end += umi_bc_tso_len

                        seq2 = d["seq"][:G_start]  # this is BC - UMI - TSO
                        seq_tso = seq2[-tso_len:] + d["seq"][G_start:G_end]

                        diff = len(seq2) - umi_bc_tso_len
                        if diff > 0:  # beginning may have included untrimmed primers
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]
                            seq_bc = seq2[:bc_len]
                            seq_umi = seq2[bc_len:umi_bc_len]
                        elif diff == 0:
                            seq_extra = "NA"
                            seq_bc = seq2[:bc_len]
                            seq_umi = seq2[bc_len:umi_bc_len]
                        elif (
                                diff < 0
                        ):  # we may have accidentally trimmed away some bases for BC, can't do anything
                            seq_extra = "NA"
                            seq_bc = seq2[:bc_len + diff]
                            seq_umi = seq2[bc_len + diff:umi_bc_len + diff]

                        # reverse complement BC because it's always listed in rev comp in short read data
                        seq_bc_rev = str(Seq(seq_bc).reverse_complement())
                        match = "Y" if seq_bc_rev in shortread_bc else "N"
                        match_top = ("Y" if
                                     (match == "Y"
                                      and shortread_bc[seq_bc_rev] == "Y") else
                                     "N")

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2) + (G_end - G_start),
                            "extra": seq_extra,
                            "UMI": seq_umi,
                            "BC": seq_bc,
                            "TSO": seq_tso,
                            "BC_rev": seq_bc_rev,
                            "BC_match": match,
                            "BC_top_rank": match_top,
                        }
                        writer1.writerow(rec)

                        # subset the sequence to remove the BC, UMI, TSO and "G"s
                        d["seq"] = d["seq"][G_end:]
                        d["qual"] = d["qual"][G_end:]
                        assert len(d["seq"]) == len(d["qual"])
                        new_tags = []
                        for tag in d["tags"]:
                            if tag.startswith(
                                    "zs:B"):  # defunct CCS tag, don't use
                                pass
                            elif (tag.startswith("dq:i:")
                                  or tag.startswith("iq:i:")
                                  or tag.startswith("sq:i:")):
                                tag = tag[:5] + tag[5 + G_end:]
                                new_tags.append(tag)
                            else:
                                new_tags.append(tag)
                        d["tags"] = new_tags
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)