def scan_alignment(bamfile):
    """Tally aligned tags, bases and edit operations in a BAM file.

    The alignments are streamed from ``samtools view`` and every record is
    weighted by the integer found after the first ``-`` of its query name
    (e.g. ``6-3799`` counts as 3799; presumably the number of identical
    reads collapsed into that tag -- confirm against the upstream step).

    Args:
        bamfile: path of the BAM file handed to ``samtools view``.

    Returns:
        A dict with the weighted totals under the keys ``'tags'``,
        ``'bases'`` (reference length spanned), ``'substitutions'``,
        ``'deletions'`` and ``'insertions'``.

    Raises:
        OSError: if the ``samtools`` executable cannot be started.
        IndexError: if a record carries no ``NM:i:`` tag.
    """
    # Imported locally so the block does not rely on a file-level import.
    import subprocess

    # An argument list (no shell) keeps file names containing quotes, spaces
    # or shell metacharacters from being interpreted by a shell.
    proc = subprocess.Popen(['samtools', 'view', bamfile],
                            stdout=subprocess.PIPE, universal_newlines=True)

    totaltags = totalbases = 0
    cnt_subst = cnt_del = cnt_ins = 0

    try:
        for record in sam.parse_sam_simple(proc.stdout):
            # <ParsedLine qname='6-3799' flag=0 rname='chr9' pos=120960330 mapq=40 cigar='36M' rnext='*' pnext='0' tlen=0 seq='AAAAACTTTATCGGGGATACATGCGGTAGGGTAAAT' qual='*'>
            nreads = int(record.qname.split('-')[1])
            totaltags += nreads

            # Only the reference-side length is needed here.
            reflen = sam.calculate_cigar_length(record.cigar)[0]
            totalbases += nreads * reflen

            # Optional SAM fields start at the 12th tab-separated column.
            nmtags = [int(tok.split(':')[2])
                      for tok in str(record).split('\t')[11:]
                      if tok.startswith('NM:i:')]
            if not nmtags:
                raise IndexError(
                    'no NM:i: tag in record %r' % (record.qname,))
            nmtag = nmtags[0]

            rec_inserts = rec_deletions = 0
            for size, cigarcmd in sam.cigar_pattern.findall(record.cigar):
                if cigarcmd == 'D':
                    rec_deletions += int(size)
                elif cigarcmd == 'I':
                    rec_inserts += int(size)

            # NM counts mismatches plus inserted and deleted bases, so the
            # remainder after removing indels is the substitution count.
            rec_substs = nmtag - rec_inserts - rec_deletions
            assert rec_substs >= 0, (
                'NM smaller than indel total in record %r' % (record.qname,))

            cnt_subst += rec_substs * nreads
            cnt_del += rec_deletions * nreads
            cnt_ins += rec_inserts * nreads
    finally:
        # Always release the pipe and reap the child, even on a parse error.
        proc.stdout.close()
        proc.wait()

    return {
        'tags': totaltags,
        'bases': totalbases,
        'substitutions': cnt_subst,
        'deletions': cnt_del,
        'insertions': cnt_ins,
    }
# Example no. 2
def process(inpfile, output, refseqfile, delpos='5'):
    if delpos == '5':
        is_leftbind = lambda line: not line.flag & F_REVERSE_STRAND
    elif delpos == '3':
        is_leftbind = lambda line: line.flag & F_REVERSE_STRAND
        raise ValueError('Unknown binding oriention %s' % repr(delpos))

    for line in parse_sam_simple(inpfile):
        if str(line)[0] == '@' or 'D' not in line.cigar:

        leftpos = line.pos - 1 # to zero-based coord
        reflen, seqlen = calculate_cigar_length(line.cigar)
        refseq = refseqfile.get(line.rname, leftpos, leftpos + reflen).upper()
        assert len(refseq) == reflen

        cigar_tokens = map(list, cigar_pattern.findall(line.cigar))
        leftclip = int(cigar_tokens[0][0]) if cigar_tokens[0][1] == 'S' else 0
        rightclip = int(cigar_tokens[-1][0]) if cigar_tokens[-1][1] == 'S' else 0

        refleft = 0
        tainted = False
        for toki, (width, cmd) in enumerate(cigar_tokens):
            width = int(width)
            if cmd != 'D':
                if cmd in 'MPN':
                    refleft += width

            deleted = refseq[refleft:refleft+width]

            if len(set(deleted)) > 1:
                # do not break deletions with two or more heteronucleotide, e.g.:
                #     ^^ 2D
                refleft += width

            lefttok = cigar_tokens[toki-1]
            righttok = cigar_tokens[toki+1]

            if is_leftbind(line): # push to the left as possible
                spanleft = refleft
                if lefttok[1] == 'M':
                    lefttokwidth = int(lefttok[0])

                    for i in xrange(spanleft - 1, spanleft - lefttokwidth - 1, -1):
                        if refseq[i] == deleted[0]:
                            spanleft = i

                    leftexpand = refleft - spanleft

                    if leftexpand > 0:
                        lefttok[0] = str(int(lefttok[0]) - leftexpand)
                        righttok[0] = str(int(righttok[0]) + leftexpand)
                        tainted = True

            else: # push to the right as possible
                spanright = refleft + width - 1
                if righttok[1] == 'M':
                    righttokwidth = int(righttok[0])

                    for i in xrange(refleft + width, refleft + width + righttokwidth):
                        if refseq[i] == deleted[0]:
                            spanright = i

                    rightexpand = spanright - refleft - width + 1

                    if rightexpand > 0:
                        lefttok[0] = str(int(lefttok[0]) + rightexpand)
                        righttok[0] = str(int(righttok[0]) - rightexpand)
                        tainted = True

            refleft += width

        if not tainted:
            fields = str(line).split('\t')
            fields[5] = ''.join(w+cmd for w, cmd in cigar_tokens)