示例#1
0
def export_bam(outbam, read1, read2, quiet=False):
    """Write reads from one or two read sources to ``outbam`` as BAM records.

    ``read1`` (and ``read2`` when given) must offer ``fetch(quiet=...)``
    yielding objects with ``name``, ``seq`` and ``qual`` attributes.  When
    ``read2`` is truthy the two sources are consumed in lockstep and each
    pair is written as two records flagged paired/read1 and paired/read2;
    otherwise every read from ``read1`` is written as a single record.

    Only qname, seq, qual and the pairing flags are set on the records.
    """
    if read2:
        # Progress reporting (``quiet``) is only forwarded to the first source.
        pairs = itertools.izip(read1.fetch(quiet=quiet),
                               read2.fetch(quiet=True))
    else:
        pairs = ((r1, None) for r1 in read1.fetch(quiet=quiet))

    for r1, r2 in pairs:
        record1 = pysam.AlignedRead()
        record1.qname = r1.name
        record1.seq = r1.seq
        record1.qual = r1.qual

        if not r2:
            outbam.write(record1)
            continue

        record1.is_paired = True
        record1.is_read1 = True

        record2 = pysam.AlignedRead()
        # Both mates keep the first read's name so they share one query name.
        record2.qname = r1.name
        # The mate carries its OWN bases and qualities (previously r1's were
        # copied here, so read 2 was written as a duplicate of read 1).
        record2.seq = r2.seq
        record2.qual = r2.qual
        record2.is_paired = True
        record2.is_read2 = True

        outbam.write(record1)
        outbam.write(record2)
示例#2
0
def read_to_unmapped(read, ref=None):
    '''
    Converts a read from mapped to unmapped.

    Returns a new pysam.AlignedRead carrying the name, paired flag, sequence
    and qualities of `read`, with all mapping fields cleared.  If the
    original read was mapped to the reverse strand, the sequence is
    reverse-complemented and the qualities reversed.

    Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is
    passed).  No other tags are set on the new read.
    '''

    newread = pysam.AlignedRead()

    # Always bind `tags`; it used to be defined only when `ref` was given,
    # which raised UnboundLocalError for the default ref=None.
    tags = []
    if ref:
        tags.append(('ZR', '%s:%s:%s' % (ref, read.pos,
                                         cigar_tostr(read.cigar))))

    newread.is_unmapped = True
    newread.mapq = 0
    newread.tlen = 0
    newread.pos = -1
    newread.pnext = -1
    newread.rnext = -1
    newread.tid = -1

    newread.qname = read.qname

    if read.is_paired:
        newread.is_paired = True

    if not read.is_unmapped and read.is_reverse:
        newread.seq = ngsutils.support.revcomp(read.seq)
        newread.qual = read.qual[::-1]
    else:
        newread.seq = read.seq
        newread.qual = read.qual

    if tags:
        newread.tags = tags

    return newread
示例#3
0
def makeNewRead(trim,loff,newCigar,read,trim2=None,MIPid=None):
  """Return a trimmed, single-end copy of `read`.

  The new record starts at ``read.pos + loff`` and uses `newCigar`.
  Sequence and qualities are sliced ``[trim:trim2]`` when `trim` is None or
  positive, and ``[trim2:trim]`` otherwise.  The flag is copied from `read`
  and the paired/read1/read2 bits are then cleared.

  Tags are copied except NM and MD, which are dropped.  An existing ZM tag
  is kept only when `MIPid` is None; otherwise it is replaced by
  ``("ZM", MIPid)``.
  """
  a = pysam.AlignedRead()
  a.qname = read.qname

  a.pos = read.pos+loff
  # Test for None first so that None is never compared against an integer.
  if trim is None or trim > 0:
    start, stop = trim, trim2
  else:
    start, stop = trim2, trim
  a.seq = read.seq[start:stop]
  a.qual = read.qual[start:stop]

  a.flag = read.flag
  # Convert PE to SR
  a.is_paired = False
  a.is_read1 = False
  a.is_read2 = False
  a.rname = read.rname
  a.mapq = read.mapq
  a.cigar = newCigar
  # Mate reference, mate position and insert size are deliberately left
  # unset.

  tags = []
  for key,value in read.tags:
    if key in ("NM", "MD"):
      continue
    if key == "ZM" and MIPid is not None:
      # Superseded by the caller-supplied identifier appended below.
      continue
    tags.append((key,value))
  if MIPid is not None:
    tags.append(("ZM",MIPid))
  a.tags = tags
  return a
示例#4
0
def read_gen(seq, qual, MD, cigar=None):
    """Build a pysam.AlignedRead holding only seq, qual, an MD tag and cigar."""
    record = pysam.AlignedRead()
    # Fields are assigned in this fixed order: seq, qual, tags, cigar.
    assignments = (
        ('seq', seq),
        ('qual', qual),
        ('tags', [('MD', MD)]),
        ('cigar', cigar),
    )
    for field, value in assignments:
        setattr(record, field, value)
    return record
示例#5
0
文件: output.py 项目: zzygyx9119/bs3
    def store(self, qname, N_mismatch, FR, refname, strand, pos, cigar, original_BS, methy, STEVE, rnext = -1, pnext = -1, qual = None, output_genome = None,
              rrbs = False, my_region_serial = -1, my_region_start = 0, my_region_end = 0):
        """Write one alignment to ``self.f`` in the configured output format.

        For ``BS_SEEKER1`` a tab-separated text line is written: soft-clipped
        bases are removed from `original_BS` first, the position is emitted
        1-based and zero-padded to 10 digits, and the three region fields
        are appended when `rrbs` is true.

        For ``BAM`` / ``SAM`` a ``pysam.AlignedRead`` is written.  On the
        '-' strand the flag is 0x10, the sequence is reverse-complemented and
        the cigar list is reversed.  MAPQ is fixed at 255.  Tags XO, XS, NM,
        XM and XG are always set; YR, YS and YE are added when `rrbs` is
        true.

        Any other ``self.format`` value writes nothing.
        """
        if self.format == BS_SEEKER1:

            # remove the soft clipped bases from the read
            # this is done for backwards compatibility with the old format
            r_start, r_end, _ = get_read_start_end_and_genome_length(cigar)
            original_BS = original_BS[r_start : r_end]

            if not rrbs:
                self.f.write('%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\n' % (qname, N_mismatch, FR, refname, strand, str(pos+1).zfill(10), output_genome, original_BS, methy, STEVE))
            else:
                self.f.write('%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n' % (qname, N_mismatch, FR, refname, strand, str(pos+1).zfill(10), output_genome, original_BS, methy, STEVE, my_region_serial, my_region_start, my_region_end))

        elif self.format == BAM or self.format == SAM:
            # NOTE: a leftover pdb.set_trace() used to sit here and halted
            # every BAM/SAM write at an interactive prompt; it was removed.
            forward = (strand == '+')

            a = pysam.AlignedRead()
            a.qname = qname
            a.seq = original_BS if forward else reverse_compl_seq(original_BS)
            a.flag = 0x10 if strand == '-' else 0
            a.tid = self.chrom_ids[refname]
            a.pos = pos
            a.mapq = 255
            a.cigar = cigar if forward else list(reversed(cigar))
            # -1 means "no mate reference"; anything else is a reference name
            # that must be translated to its numeric id.
            a.rnext = rnext if rnext == -1 else self.chrom_ids[rnext]
            a.pnext = pnext
            a.qual = qual

            tags = [('XO', FR),
                    ('XS', STEVE),
                    ('NM', N_mismatch),
                    ('XM', methy),
                    ('XG', output_genome)]
            if rrbs:
                tags.extend([('YR', my_region_serial),
                             ('YS', my_region_start),
                             ('YE', my_region_end)])
            a.tags = tuple(tags)

            self.f.write(a)
示例#6
0
文件: bam.py 项目: umccr/longranger
def write_read(bam_file,
               name,
               seq,
               qual,
               tid,
               pos,
               mapq=0,
               cigar=None,
               reverse=False,
               paired=False,
               read1=True,
               mate_tid=None,
               mate_pos=None,
               mate_reverse=False):
    """Create an AlignedRead object and write it to ``bam_file``.

    If ``cigar`` is unspecified (or empty), the alignment defaults to a
    single match operation spanning the whole sequence.  A ``pos`` of -1
    marks the read as unmapped.  The read1/read2 flags are always set from
    ``read1``, even for unpaired reads.

    The mate fields are only touched when ``paired`` is true:
    ``mate_tid`` / ``mate_pos`` are written whenever they are not None, a
    ``mate_pos`` of -1 marks the mate as unmapped, and ``mate_reverse``
    sets the mate strand flag.
    """
    r = pysam.AlignedRead()
    r.qname = name
    r.seq = seq
    r.qual = qual
    r.tid = tid
    r.pos = pos
    r.mapq = mapq

    if pos == -1:
        r.is_unmapped = True

    r.cigar = cigar if cigar else [(0, len(seq))]

    r.is_reverse = reverse
    r.is_read1 = read1
    r.is_read2 = not read1

    if paired:
        r.is_paired = True

        # Compare against None rather than truthiness: reference id 0 and
        # position 0 are valid values and used to be silently dropped.
        if mate_tid is not None:
            r.mrnm = mate_tid

        if mate_pos is not None:
            r.mpos = mate_pos

        if mate_pos == -1:
            r.mate_is_unmapped = True

        r.mate_is_reverse = bool(mate_reverse)

    bam_file.write(r)
示例#7
0
    def generateSamInfo(self, sinfo, seq, cig, startpos, refname, qual):
        """Build an unpaired AlignedRead from `sinfo` and the given fields.

        The flag is 0, mate fields are cleared, spaces are removed from the
        quality string, and the RG, X0, AS, XS, YS and YT tags are carried
        over from `sinfo` (in that order) whenever ``self.__getTag__``
        returns a truthy value for them.
        """
        record = pysam.AlignedRead()
        record.rname = refname
        record.qname = sinfo.qname
        record.seq = seq
        record.flag = 0
        record.pos = startpos
        record.mapq = sinfo.mapq
        record.cigarstring = cig
        record.rnext = -1
        record.pnext = -1
        record.tlen = 0
        record.qual = qual.strip().replace(" ", "")

        carried = []
        for tag_name in ("RG", "X0", "AS", "XS", "YS", "YT"):
            tag_value = self.__getTag__(sinfo, tag_name)
            if tag_value:
                carried.append((tag_name, tag_value))
        record.tags = carried
        return record
示例#8
0
def convert_read(r, transcript_tid_map, library_type):
    """Translate a transcript-aligned read into genome coordinates.

    Unmapped reads are returned as a plain copy.  For mapped reads the
    position and cigar are converted through the exon structure found in
    ``transcript_tid_map[r.tid]``.  For negative-strand transcripts the
    strand flag is flipped, sequence/qualities are reversed and the MD tag
    is rewritten.  Existing XS and NH tags are discarded and a fresh XS
    (strand) tag is appended.
    """
    if r.is_unmapped:
        return copy_read(r)

    # Work on an ordered copy of the tags, dropping the ones that no
    # longer apply after the conversion.
    tagdict = collections.OrderedDict(r.tags)
    for stale in ('XS', 'NH'):
        tagdict.pop(stale, None)

    # Transcript reference -> genome reference.
    genome_tid, negstrand, exons = transcript_tid_map[r.tid]
    newpos, eindex, testart, toffset = convert_pos(r.pos, negstrand, exons)
    newcigar, alen, spliced = convert_cigar(r.cigar, negstrand, exons,
                                            eindex, testart, toffset)

    if not negstrand:
        is_reverse = r.is_reverse
        seq = r.seq
        qual = r.qual
    else:
        # Position becomes the left end of the alignment on the genome.
        newpos = newpos - alen + 1
        is_reverse = (not r.is_reverse)
        seq = DNA_reverse_complement(r.seq)
        qual = r.qual[::-1] if r.qual is not None else None
        if 'MD' in tagdict:
            tagdict['MD'] = reverse_complement_MD_tag(tagdict['MD'])

    tagdict['XS'] = get_read_strand(r.is_read2, is_reverse, negstrand,
                                    library_type)

    converted = pysam.AlignedRead()
    converted.qname = r.qname
    converted.flag = r.flag
    converted.seq = seq
    converted.qual = qual
    converted.is_reverse = is_reverse
    converted.tid = genome_tid
    converted.pos = newpos
    converted.cigar = newcigar
    converted.mapq = r.mapq
    converted.rnext = r.rnext
    converted.pnext = r.pnext
    converted.tlen = r.tlen
    converted.tags = tuple(tagdict.items())
    return converted
示例#9
0
def writeBAMEntry(outfile, chrom, outputDict, readlength):
    """Write the alignment described by `outputDict` and return the count.

    The query name is the part of ``readID`` before the first "/".  Entries
    with a ``startL`` key are written as a spliced alignment (match, skip,
    match) with an XS tag; all others as one match of `readlength`.  The
    same record is written repeatedly, once per unit of ``1 / weight``.
    """
    record = pysam.AlignedRead()
    record.qname = outputDict["readID"].split("/")[0]
    if outputDict["sense"] == "-":
        record.is_reverse = True

    record.rname = outfile.references.index(chrom)

    tags = []
    if "startL" in outputDict:
        left_start = outputDict["startL"]
        left_stop = outputDict["stopL"]
        right_start = outputDict["startR"]
        right_stop = outputDict["stopR"]
        record.pos = left_start
        record.cigar = [
            (0, left_stop - left_start + 1),
            (3, right_start - left_stop - 1),
            (0, right_stop - right_start + 1),
        ]
        tags.append(("XS", outputDict["sense"]))
    else:
        record.pos = outputDict["start"]
        record.cigar = [(0, readlength)]

    if "pairID" in outputDict:
        pair_id = outputDict["pairID"]
        if pair_id == "1":
            record.is_read1 = True
        elif pair_id == "2":
            record.is_read2 = True
        if pair_id in ("1", "2"):
            record.is_proper_pair = True

    if "mismatch" in outputDict:
        md_value = getMismatches(outputDict["mismatch"])
        if md_value:
            tags.append(("MD", md_value))

    if tags:
        record.tags = tags

    written = 0
    remaining = 1.0 / outputDict.get("weight", 1.0)
    while remaining > 0:
        outfile.write(record)
        remaining -= 1.0
        written += 1

    return written
示例#10
0
def unmapped_aligned_read(qname):
    """Return an unmapped (flag 0x4) AlignedRead named `qname`.

    Reference, position and mate fields are -1, MAPQ and template length
    are 0, the cigar is None, and sequence and qualities are both '*'.
    """
    record = pysam.AlignedRead()
    # Assigned in this exact order.
    field_values = (
        ('qname', qname),
        ('flag', 0x4),
        ('rname', -1),
        ('pos', -1),
        ('mapq', 0),
        ('cigar', None),
        ('rnext', -1),
        ('pnext', -1),
        ('tlen', 0),
        ('seq', '*'),
        ('qual', '*'),
    )
    for field, value in field_values:
        setattr(record, field, value)
    return record
示例#11
0
 def MakeBam(self, ref_name, aln):
     """Build an AlignedRead for `aln` from this object's read.

     Qualities are the ``aln.begin:aln.end`` slice of ``self.sam_quality()``
     joined into one string; the position is ``aln.begin + 1``; flag is 0
     and MAPQ 255.  The tags are fixed: NM=1 and RG="L1".  `ref_name` is
     accepted but not used.
     """
     # Qualities are computed before the record is created.
     aligned_quals = ''.join(self.sam_quality()[aln.begin:aln.end])

     record = pysam.AlignedRead()
     record.qname = self.read.name
     record.seq = self.read.seq.tostring()
     record.flag = 0
     record.pos = aln.begin + 1
     record.mapq = 255
     record.cigar = aln.bam_cigar()
     record.qual = aligned_quals
     record.tags = (("NM", 1), ("RG", "L1"))
     return record
示例#12
0
def copy_read(r):
    """Return a new AlignedRead with the core fields and tags of `r`."""
    duplicate = pysam.AlignedRead()
    # Copied one by one in this order; qual comes after seq.
    copied_fields = ('qname', 'seq', 'flag', 'tid', 'pos', 'mapq', 'cigar',
                     'rnext', 'pnext', 'isize', 'qual')
    for field in copied_fields:
        setattr(duplicate, field, getattr(r, field))
    duplicate.tags = list(r.tags)
    return duplicate
示例#13
0
def mapping_from_line(line):
    """Build an AlignedRead from one text line parsed by ``sam.parse_line``.

    Strand, SEQ, QUAL and CIGAR are taken from the parsed fields; the tags
    Xs, Xq and Xw are appended when present.
    """
    mapping = pysam.AlignedRead()
    fields = sam.parse_line(line)

    if fields['strand'] == '-':
        mapping.is_reverse = True
    mapping.seq = fields['SEQ']
    mapping.qual = fields['QUAL']
    mapping.cigarstring = fields['CIGAR']

    # Only this fixed set of tags is carried over.
    for tag_name in ['Xs', 'Xq', 'Xw']:
        if tag_name not in fields:
            continue
        mapping.tags = mapping.tags + [(tag_name, fields[tag_name])]

    return mapping
示例#14
0
def doWork(args):
    """Convert the BLAST XML file ``args.blast`` into the SAM file ``args.samfile``.

    The SAM header is built from ``parseReferences(args.ref)``.  One record
    is written per HSP: query gaps are removed from the sequence, MAPQ is
    255 and the mate fields are cleared.  When the two frame values of the
    HSP XOR to a non-zero value, the record gets flag 0x10, the
    reverse-complemented sequence, the reversed cigar and a position taken
    from ``sbjct_end``; otherwise the position comes from ``sbjct_start``.

    Both files are closed even if conversion fails part-way.
    """
    # make sam header
    header = {'HD': {'VN': '1.0'}}
    headers, header_lookup = parseReferences(args.ref)
    header['SQ'] = headers
    # open outfile
    outfile = pysam.Samfile(args.samfile, "wh", header=header)

    try:
        # parse in the blast file; the handle was previously never closed
        with open(args.blast) as blast_handle:
            for blast_record in blastxml.parse(blast_handle):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        read = pysam.AlignedRead()
                        read.qname = blast_record.query
                        read.flag = 0
                        dna = hsp.query.replace('-', '')
                        # represented as tuple of 2-tuples
                        cigar = makeCigar(hsp, blast_record.query_letters)
                        if hsp.frame[1] ^ hsp.frame[0]:
                            read.seq = str(Seq(dna).reverse_complement())
                            read.flag |= 0x10
                            read.pos = hsp.sbjct_end - 1
                            read.cigar = cigar[::-1]
                        else:
                            read.seq = dna
                            read.pos = hsp.sbjct_start - 1
                            read.cigar = cigar
                        # index to list of headers
                        read.rname = header_lookup[alignment.hit_def]
                        read.mapq = 255  # phred scaled probability score
                        read.mrnm = -1  # index of the mate
                        read.mpos = -1  # position of the mate
                        read.tlen = 0  # insert size of the mates
                        outfile.write(read)
    finally:
        outfile.close()
示例#15
0
def create_new_read(new_seq, new_qual, read1, read2):
    """Build the unmapped, merged record for a read pair.

    The name is read1's name prefixed with "M_" (unless already prefixed).
    The record is QC-failed when both inputs are, or when `new_seq` is
    shorter than the module-level ``min_length`` (which also appends "L" to
    the ZQ tag).

    Tags are merged as follows: NM and MD from read2 are dropped; a tag
    whose value differs between the reads is dropped with a one-time
    warning, except ZQ, whose characters are unioned and sorted; all other
    read2 tags are kept, followed by read1 tags that read2 lacks.
    """
    global notified
    merged = pysam.AlignedRead()
    if read1.qname.startswith("M_"):
        merged.qname = read1.qname
    else:
        merged.qname = "M_" + read1.qname
    merged.seq = new_seq
    merged.qual = new_qual
    merged.is_unmapped = True
    merged.pos = -1
    merged.mpos = -1

    merged.is_qcfail = read1.is_qcfail and read2.is_qcfail
    htags = dict(read1.tags) if read1.tags is not None else {}
    if len(new_seq) < min_length:
        merged.is_qcfail = True
        htags["ZQ"] = htags.get("ZQ", "") + "L"

    seen_in_read2 = set()
    combined = []
    if read2.tags is not None:
        for tag, value in read2.tags:
            seen_in_read2.add(tag)
            if tag in ("NM", "MD"):
                continue
            if tag in htags and value != htags[tag]:
                # Same tag, conflicting values.
                if tag == "ZQ":
                    combined.append(
                        (tag, "".join(sorted(set(value + htags[tag])))))
                elif tag not in notified:
                    sys.stderr.write(
                        "Do not know how to combine %s BAM tags. Information of one of the reads will get lost during merging.\n"
                        % tag)
                    notified.add(tag)
            else:
                # Tag is new, or both reads agree on its value.
                combined.append((tag, value))

    combined.extend((tag, value) for tag, value in htags.iteritems()
                    if tag not in seen_in_read2)
    merged.tags = combined
    return merged
示例#16
0
def _cleanup_record(record):
    """Marks a BAM record as unmapped, clearing relevant fields and/or setting
    fields to match those of the mate (if mapped). An updated (possibly new)
    record is returned.

    Records with a CIGAR are rebuilt as a fresh AlignedRead (name, flag,
    sequence, qualities and tags are copied); records without one are
    modified in place and returned.

    NOTE(review): the flag is copied / left unchanged here, so the unmapped
    bit itself is presumably set by the caller -- confirm.
    """
    if record.cigar:
        # Build a new read; this is necessary, as it is not possible
        # to clean the CIGAR string on an existing record in current
        # versions of Pysam.
        unmapped_read = pysam.AlignedRead()
        unmapped_read.qname = record.qname
        unmapped_read.flag = record.flag
        unmapped_read.seq = record.seq
        unmapped_read.qual = record.qual
        unmapped_read.tags = record.tags

        # Mate coordinates are kept only when the mate is mapped.
        if not record.mate_is_unmapped:
            unmapped_read.rnext = record.rnext
            unmapped_read.pnext = record.pnext
        else:
            unmapped_read.rnext = -1
            unmapped_read.pnext = -1
        # The read takes over the mate's reference (or -1 if there is none).
        unmapped_read.tid = unmapped_read.rnext

        # Set .pos TWICE; this is a workaround for a bug in current versions
        # of pysam, in which the bin in the record is re-calculated BEFORE
        # the new position value is set, using the old pos value.
        unmapped_read.pos = unmapped_read.pnext  # Update 1 of 2
        unmapped_read.pos = unmapped_read.pnext  # Update 2 of 2

        return unmapped_read
    else:
        # No CIGAR to clear, so the existing record can be edited in place.
        record.mapq = 0
        if record.mate_is_unmapped:
            record.rnext = -1
            record.pnext = -1
        # Place the read at the mate's reference/position (-1/-1 if the mate
        # is unmapped).
        record.tid = record.rnext
        record.pos = record.pnext
        record.tlen = 0

        return record
示例#17
0
def makeSAMrec(pos, seqinfo, cig, orient, refname, single = False, grpident = None):
    """Build an unpaired AlignedRead on reference 0 for `seqinfo`.

    ``cleanupCigar`` supplies the final cigar string, the position and the
    number of bases to drop from the front and back.  With orient '-' the
    sequence is reverse-complemented and the qualities reversed before
    trimming.  Qualities are only set when `seqinfo` has letter
    annotations.  When `single` is true an RG tag "GROUP-<id>" is added,
    using `grpident` or, if that is None, the part of ``seqinfo.id``
    before the first "_".  `refname` is accepted but not used.
    """
    newcig, pos, rmfront, rmback = cleanupCigar(pos, cig, len(seqinfo.seq))
    reverse = (orient == '-')

    rec = pysam.AlignedRead()
    rec.tid = 0
    rec.rname = 0
    rec.qname = seqinfo.id
    rec.flag = 0x00
    rec.seq = str(seqinfo.seq.reverse_complement()) if reverse else str(seqinfo.seq)
    # Trim through the record itself: back first, then front.
    if rmback:
        rec.seq = rec.seq[:-rmback]
    if rmfront:
        rec.seq = rec.seq[rmfront:]

    if seqinfo.letter_annotations:
        # The quality string is the last line of the FASTQ rendering.
        quality = seqinfo.format("fastq").strip().split("\n")[-1]
        if reverse:
            quality = quality[::-1]
        if rmback:
            quality = quality[:-rmback]
        if rmfront:
            quality = quality[rmfront:]
        rec.qual = quality

    rec.pos = pos
    rec.cigarstring = newcig
    rec.rnext = -1
    rec.pnext = -1
    rec.tlen = 0

    if single:
        if grpident is None:
            group_tag = ("RG", "GROUP-%s"%(seqinfo.id.split("_")[0]),)
        else:
            group_tag = ("RG", "GROUP-%s"%(grpident),)
        rec.tags = [group_tag]
    return rec
示例#18
0
 def generateSamInfo(self,  refnamestr, sinfo, seq, cig, startpos, refname, qual, readgroupID):
     """Build an unpaired AlignedRead named "<qname>_<refnamestr>".

     The flag is 0, mate fields are cleared and spaces are removed from the
     quality string.  The RG tag is always `readgroupID` (any RG on `sinfo`
     is ignored); X0, AS, XS, YS and YT are carried over from `sinfo`
     whenever ``self.__getTag__`` returns a truthy value for them.
     """
     record = pysam.AlignedRead()
     record.rname = refname
     record.qname = "%s_%s"%(sinfo.qname, refnamestr)
     record.seq = seq
     record.flag = 0x00
     record.pos = startpos
     record.mapq = sinfo.mapq
     record.cigarstring = cig
     record.rnext = -1
     record.pnext = -1
     record.tlen = 0
     record.qual = qual.strip().replace(" ","")

     # The caller's read group always wins over whatever sinfo carried.
     carried = [("RG", readgroupID,)]
     for tag_name in ("X0", "AS", "XS", "YS", "YT"):
         tag_value = self.__getTag__(sinfo, tag_name)
         if tag_value:
             carried.append((tag_name, tag_value,))
     record.tags = carried
     return record
示例#19
0
def main():
    """Pair consensus reads by swapped barcode and write them as BAM + FASTQ.

    The input BAM is streamed record by record.  Consecutive mapped records
    sharing an alignment start are grouped and keyed by the part of their
    query name before the first ":".  For every key, the partner key is the
    same tag with its two parts swapped at ``--barcode_length``; when both
    are present ``dcs_maker`` is called on the two sequences.  Results whose
    fraction of "N" exceeds ``--Ncutoff`` are counted and dropped.

    A kept consensus is held until a second consensus with the same tag
    shows up; the two are then written as a pair (read 1 to the r1 FASTQ,
    read 2 to the r2 FASTQ, both to the output BAM).  Whatever is still
    unpaired at end of input is written next to a placeholder mate made of
    "." characters with flag 5 (paired + unmapped).

    Summary counters are written to stderr.
    """
    # Parameters to be input.
    parser = ArgumentParser()
    parser.add_argument("--infile",
                        action="store",
                        dest="infile",
                        help="input BAM file",
                        required=True)
    parser.add_argument("--outfile",
                        action="store",
                        dest="outfile",
                        help="output BAM file",
                        required=True)
    parser.add_argument(
        '--Ncutoff',
        type=float,
        default=1.0,
        dest='Ncutoff',
        help="Maximum percentage of Ns allowed in a consensus [1.0]")
    parser.add_argument(
        '--readlength',
        type=int,
        default=84,
        dest='read_length',
        help="Length of the input read that is being used. [84]")
    parser.add_argument(
        '--barcode_length',
        type=int,
        default=12,
        dest='blength',
        help=
        'Length of the duplex tag sequence. Should match the value in tag_to_header.[12]'
    )
    parser.add_argument(
        '--read_out',
        type=int,
        default=1000000,
        dest='rOut',
        help=
        'How often you want to be told what the program is doing. [1000000]')
    parser.add_argument('--gzip-fqs',
                        action="store_true",
                        default=False,
                        dest='gzip_fastqs',
                        help='Output gzipped fastqs [False]')
    o = parser.parse_args()

    def read_fields(read):
        # The per-read values kept while a position group is collected;
        # index 6 (the sequence) is what gets handed to dcs_maker.
        return [
            read.flag, read.rname, read.pos, read.mrnm, read.mpos,
            read.isize, read.seq
        ]

    def write_fastq(handle, record):
        # One four-line FASTQ entry; the header is "@:" plus the query name.
        handle.write('@:%s\n%s\n+\n%s\n' %
                     (record.qname, record.seq, record.qual))

    def write_pair(first, second):
        # Read 1 goes to the r1 FASTQ, read 2 to the r2 FASTQ, both to BAM.
        write_fastq(fastq_file1, first)
        out_bam.write(first)
        write_fastq(fastq_file2, second)
        out_bam.write(second)

    # Main input/output files.
    in_bam = pysam.Samfile(o.infile, "rb")  # Open the input BAM file
    out_bam = pysam.Samfile(o.outfile, "wb",
                            template=in_bam)  # Open the output BAM file
    fastq_file1 = fastq_open(o.outfile, o.gzip_fastqs, 'r1')
    fastq_file2 = fastq_open(o.outfile, o.gzip_fastqs, 'r2')

    read_num = 0
    duplexes_made = 0
    uP = 0
    nC = 0

    file_done = False  # Set once the iterator is exhausted
    read_one = True

    bam_entry = in_bam.fetch(until_eof=True)  # Initialize the iterator
    first_read = next(bam_entry)  # Get the first read
    read_dict = {}  # Reads of the current position group, keyed by tag
    first_tag = first_read.qname.split(":")[0]
    # Every output record reuses the first read's qualities and cigar as
    # dummy values.
    qual_score = first_read.qual
    cig_dum = first_read.cigar
    consensus_dict = {}  # Consensus reads still waiting for their mate

    # Start going through the input BAM file, one position at a time.
    for line in bam_entry:
        read_num += 1
        if read_one is True and first_read.is_unmapped is False:
            read_dict[first_tag] = read_fields(first_read)
            read_one = False

        # Collect every following read that starts at the same position.
        while line.pos == first_read.pos and file_done is False:
            tag = line.qname.split(":")[0]  # Extract the barcode
            if line.is_unmapped is False:
                read_dict[tag] = read_fields(line)
            try:
                line = next(bam_entry)
            except StopIteration:
                # End of input: finish the current group and stop looping.
                file_done = True
            read_num += 1

            if read_num % o.rOut == 0:
                sys.stderr.write("%s reads processed\n" % read_num)

        # The group is complete; `line` opens the next one.
        first_read = line
        first_tag = first_read.qname.split(":")[0]
        read_one = True

        # Iterate over a snapshot because matched entries are deleted below.
        for dict_tag in list(read_dict):
            switch_tag = dict_tag[o.blength:] + dict_tag[:o.blength]

            try:
                consensus = dcs_maker(
                    [read_dict[dict_tag][6], read_dict[switch_tag][6]],
                    o.read_length)
                duplexes_made += 1
                if not consensus:
                    # Nothing to filter or write; leave the entries alone.
                    continue
                # True division: under Python 2 the integer quotient was 0
                # for every consensus that was not entirely N, so the
                # cutoff effectively never applied.
                n_fraction = float(consensus.count("N")) / len(consensus)
                if n_fraction > o.Ncutoff:
                    nC += 1
                else:
                    a = pysam.AlignedRead()
                    a.qname = dict_tag
                    a.flag = read_dict[dict_tag][0]

                    if a.is_reverse is True:
                        tmp_seq = Seq(consensus, IUPAC.unambiguous_dna)
                        a.seq = str(tmp_seq.reverse_complement())
                    else:
                        a.seq = consensus

                    a.rname = read_dict[dict_tag][1]
                    a.pos = read_dict[dict_tag][2]
                    a.mapq = 255
                    a.cigar = cig_dum
                    a.mrnm = read_dict[dict_tag][3]
                    a.mpos = read_dict[dict_tag][4]
                    a.isize = read_dict[dict_tag][5]
                    a.qual = qual_score

                    # Write DCSs to output BAM file in read pairs.
                    if dict_tag in consensus_dict:
                        mate = consensus_dict.pop(dict_tag)
                        if a.is_read1 is True:
                            write_pair(a, mate)
                        else:
                            write_pair(mate, a)
                    else:
                        consensus_dict[dict_tag] = a

                del read_dict[dict_tag]
                del read_dict[switch_tag]

            except KeyError:
                # Expected: the tag has no partner in this group, or it was
                # already consumed as the partner of an earlier tag.
                pass

        read_dict = {}  # Reset the read dictionary

    # Close BAM files
    in_bam.close()

    # Write unpaired DCSs
    for consTag in list(consensus_dict.keys()):
        partner = consensus_dict.pop(consTag)

        # Placeholder mate: flag 5 = paired (0x1) + unmapped (0x4).
        a = pysam.AlignedRead()
        a.qname = consTag
        a.flag = 5
        a.seq = '.' * o.read_length
        a.rname = partner.rname
        a.pos = partner.pos
        a.mapq = 255
        a.cigar = cig_dum
        a.mrnm = partner.mrnm
        a.mpos = partner.pos
        a.isize = partner.isize
        a.qual = qual_score

        if partner.is_read1 is False:
            write_pair(a, partner)
        else:
            write_pair(partner, a)

        uP += 1

    fastq_file1.close()
    fastq_file2.close()
    out_bam.close()

    # Write summary statistics.  Duplexes made includes unpaired duplexes
    sys.stderr.write("Summary Statistics: \n")
    sys.stderr.write("Reads Processed: %s\n" % read_num)
    sys.stderr.write("Duplexes Made: %s\n" % duplexes_made)
    sys.stderr.write("Unpaired Duplexes: %s\n" % uP)
    sys.stderr.write("N-clipped Duplexes: %s\n" % nC)
示例#20
0
def main():
    """Collapse tagged reads from a BAM file into consensus reads.

    Command-line entry point.  Reads ``--infile`` with pysam, groups
    consecutive reads that share the same start position, and within each
    group buckets reads by tag (taken from the query name, between the first
    '|' and the following '/', suffixed with ':1', ':2' or ':se') and by
    CIGAR.  For every tag whose most common CIGAR family has at least
    ``--minmem`` members, ``consensusMaker`` is called on that family (randomly
    down-sampled to ``--maxmem`` sequences when larger).

    Outputs:
      * ``--outfile``: consensus reads, written in read1/read2 order once both
        mates of a tag have been seen (leftover unpaired consensuses are also
        written here unless ``--read_type`` is exactly 'd').
      * ``<outfile>_LCC.bam``: reads belonging to less common CIGAR families.
      * ``<outfile>_NM.bam``: reads rejected by the flag, overlap, soft-clip
        or repetitive-tag filters.
      * ``<outfile>_UP.bam``: unpaired consensuses (only when ``--read_type``
        is exactly 'd').
      * ``--tagfile``: tab-separated tag counts, most frequent first; the
        file is then handed to ``tagStats``.

    Summary statistics are written to stderr.

    NOTE(review): the window logic below appears never to tally the very
    first read of the file (it is only used for the position comparison),
    and to skip the final read when it is alone at its position -- confirm
    before relying on exact read counts.
    """
    # Parameters to be input.
    parser = ArgumentParser()
    parser.add_argument("--infile",
                        action="store",
                        dest="infile",
                        help="input BAM file",
                        required=True)
    # required=True means the default below is never actually used.
    parser.add_argument("--tagfile",
                        action="store",
                        dest="tagfile",
                        help="output tagcounts file",
                        default='sys.stdout',
                        required=True)
    parser.add_argument("--outfile",
                        action="store",
                        dest="outfile",
                        help="output BAM file",
                        required=True)
    parser.add_argument(
        "--rep_filt",
        action="store",
        type=int,
        dest='rep_filt',
        help="Remove tags with homomeric runs of nucleotides of length x. [9]",
        default=9)
    parser.add_argument(
        '--minmem',
        type=int,
        default=3,
        dest='minmem',
        help="Minimum number of reads allowed to comprise a consensus. [3] ")
    parser.add_argument(
        '--maxmem',
        type=int,
        default=1000,
        dest='maxmem',
        help="Maximum number of reads allowed to comprise a consensus. [1000]")
    parser.add_argument(
        '--cutoff',
        type=float,
        default=.7,
        dest='cutoff',
        help=
        "Percentage of nucleotides at a given position in a read that must be identical in order for a consensus to be called at that position. [0.7]"
    )
    parser.add_argument(
        '--Ncutoff',
        type=float,
        default=1,
        dest='Ncutoff',
        help=
        "With --filt 'n', maximum fraction of Ns allowed in a consensus [1.0]")
    parser.add_argument(
        '--readlength',
        type=int,
        default=84,
        dest='read_length',
        help="Length of the input read that is being used. [84]")
    parser.add_argument(
        '--read_type',
        type=str,
        action="store",
        dest='read_type',
        default="dpm",
        help=
        "A string specifying which types of read to consider.  Read types: n: Neither read 1 or read 2 mapped.  m: Either read 1 or read 2 mapped, but not both.  p: Both read 1 and read 2 mapped, not a propper pair.  d: Both read 1 and read 2 mapped, propper pair.  s: Single ended reads\n\t\t['dpm']"
    )
    parser.add_argument('--isize',
                        type=int,
                        default=-1,
                        dest='isize',
                        help="maximum distance between read pairs")
    parser.add_argument(
        '--read_out',
        type=int,
        default=1000000,
        dest='rOut',
        help=
        'How often you want to be told what the program is doing. [1000000]')
    parser.add_argument(
        '--filt',
        type=str,
        default='osn',
        dest='filt',
        help=
        "A string indicating which filters should be implemented.  Filters: s: Softclipping filter.  o: Overlap filter.  n: N filter.  ['osn']"
    )
    o = parser.parse_args()

    # SAM flag values accepted for each requested read type.  A set is used
    # because it is only ever queried for membership, once or twice per read.
    # ('u' is accepted here although the --read_type help does not list it.)
    goodFlag = set()
    if 'd' in o.read_type:
        goodFlag.update((99, 83, 163, 147))
    if 'm' in o.read_type:
        goodFlag.update((181, 117, 137, 133, 73, 89, 69, 153))
    if 'p' in o.read_type:
        goodFlag.update((97, 81, 161, 145, 129, 65, 177, 113))
    if 'n' in o.read_type:
        goodFlag.update((141, 77, 4))
    if 's' in o.read_type:
        goodFlag.update((0, 16))
    if 'u' in o.read_type:
        goodFlag.update((103, 167))

    # Unpaired consensuses go to a dedicated file only when read_type is
    # exactly 'd'.
    unpairedOut = o.read_type == 'd'

    inBam = pysam.Samfile(o.infile, "rb")  # Open the input BAM file
    outBam = pysam.Samfile(o.outfile, "wb",
                           template=inBam)  # Open the output BAM file
    outNC1 = pysam.Samfile(o.outfile.replace(".bam", "_LCC.bam"),
                           "wb",
                           template=inBam)
    nonMap = pysam.Samfile(o.outfile.replace(".bam", "_NM.bam"),
                           "wb",
                           template=inBam)  # File for reads with strange flags
    if unpairedOut:
        extraBam = pysam.Samfile(o.outfile.replace(".bam", "_UP.bam"),
                                 "wb",
                                 template=inBam)

    # Counters reported in the summary statistics.
    readNum = 0
    nM = 0  # all rejected reads
    bF = 0  # rejected: flag not in goodFlag
    oL = 0  # rejected: overlap filter
    sC = 0  # rejected: soft-clip filter
    rT = 0  # rejected: repetitive tag
    nC = 0  # consensuses rejected by the N filter

    LCC = 0
    ConMade = 0
    if unpairedOut:
        UP = 0

    fileDone = False  # Set once the input iterator is exhausted
    readOne = False  # True when the current read still has to be tallied

    qualScore = 'J' * o.read_length  # Set a dummy quality score

    def makeRead(qname, seq, cigar, info):
        """Build an AlignedRead from a readDict entry's shared fields.

        ``info`` is a readDict value: [flag, rname, pos, mrnm, mpos, isize,
        ...].  Attributes are assigned in the same order as the original
        inline code.
        """
        a = pysam.AlignedRead()
        a.qname = qname
        a.flag = info[0]
        a.seq = seq
        a.rname = info[1]
        a.pos = info[2]
        a.mapq = 255
        a.cigar = cigar
        a.mrnm = info[3]
        a.mpos = info[4]
        a.isize = info[5]
        a.qual = qualScore
        return a

    bamEntry = inBam.fetch(until_eof=True)  # Initialize the iterator
    # Two-slot window: slot winPos % 2 is the current read, the other slot is
    # the previous one.
    readWin = [next(bamEntry), '']  # Get the first read
    winPos = 0

    # tag -> [flag, rname, pos, mrnm, mpos, isize,
    #         {str(cigar): [count, cigar, seq, seq, ...]}]
    readDict = {}
    tagDict = defaultdict(lambda: 0)  # Initialize the tag dictionary

    # tag -> consensus read still waiting for its mate
    consensusDict = {}

    # Start going through the input BAM file, one position at a time.
    for line in bamEntry:
        winPos += 1
        readWin[winPos % 2] = line
        # A read from the previous pass is still pending: step back so it is
        # the current read again.
        if readOne:
            winPos -= 1
        while readOne or (not fileDone and readWin[winPos % 2].pos
                          == readWin[(winPos - 1) % 2].pos):
            read = readWin[winPos % 2]
            if readNum % o.rOut == 0:
                sys.stderr.write("Reads processed:" + str(readNum) + "\n")

            try:
                tag = read.qname.split('|')[1].split('/')[0] + (
                    ":1" if read.is_read1 else
                    (":2" if read.is_read2 else ":se"))
                tagDict[tag] += 1
            except Exception:
                # Report which read had an unparsable name, then fail.
                print(readNum)
                raise

            # Overlap filter: filters out overlapping reads (with --filt o).
            # Only applied to the proper-pair flags.
            overlap = False
            if 'o' in o.filt and int(read.flag) in (83, 99, 147, 163):
                overlap = (
                    read.pos == read.mpos
                    or read.pos < read.mpos < read.pos + o.read_length
                    or read.mpos < read.pos < read.mpos + o.read_length)
            readNum += 1

            # Softclip filter: filters out softclipped reads (with --filt s).
            # CIGAR operation code 4 is tested on every (op, length) tuple.
            softClip = False
            if 's' in o.filt and read.cigar is not None:
                softClip = any(op[0] == 4 for op in read.cigar)

            # Check if the given read is good data
            if int(read.flag) in goodFlag and not overlap and not softClip:
                if any(base * o.rep_filt in tag for base in 'ACGT'):
                    # Check for bad barcodes
                    nM += 1
                    nonMap.write(read)
                    rT += 1
                else:
                    # Add the sequence to the read dictionary
                    if tag not in readDict:
                        readDict[tag] = [
                            read.flag, read.rname, read.pos, read.mrnm,
                            read.mpos, read.isize, {}
                        ]
                    cigFam = readDict[tag][6].setdefault(
                        str(read.cigar), [0, read.cigar])
                    cigFam.append(read.seq)
                    cigFam[0] += 1
            else:
                nM += 1
                nonMap.write(read)
                if int(read.flag) not in goodFlag:
                    bF += 1
                elif overlap:
                    oL += 1
                elif softClip:
                    sC += 1

            winPos += 1
            if readOne:
                readOne = False
            else:
                # Only end-of-input is expected here; any other error from
                # the iterator is allowed to propagate.
                try:
                    readWin[winPos % 2] = next(bamEntry)  # Iterate the line
                except StopIteration:
                    fileDone = True  # Reached the end of the file
        else:
            # The position changed (or the file ended): send the collected
            # reads to consensusMaker.  The read now in the window has not
            # been tallied yet, hence readOne.
            readOne = True
            for dictTag in readDict.keys():
                info = readDict[dictTag]
                cigFams = info[6]

                # Cigar string filtering: pick the family with the most
                # members (not the largest key).
                maxCig = max(cigFams, key=lambda c: cigFams[c][0])
                famCount = cigFams[maxCig][0]
                if famCount >= o.minmem:
                    seqs = cigFams[maxCig][2:]
                    if famCount > o.maxmem:
                        seqs = random.sample(seqs, o.maxmem)
                    ConMade += 1
                    consensus, fam_size = consensusMaker(
                        seqs, o.cutoff, o.read_length)

                    # Every read of every less common CIGAR family is
                    # written out individually.
                    for cigStr in cigFams.keys():
                        if cigStr != maxCig:
                            for seq in cigFams[cigStr][2:]:
                                outNC1.write(
                                    makeRead(dictTag + ':' + str(fam_size),
                                             seq, cigFams[cigStr][1], info))
                                LCC += 1

                    # Filter out consensuses with too many Ns in them.
                    # float() keeps the fraction from truncating to 0 under
                    # Python 2 integer division.
                    if ('n' not in o.filt
                            or float(consensus.count("N")) / len(consensus)
                            <= o.Ncutoff):
                        a = makeRead(dictTag + ":" + str(fam_size), consensus,
                                     cigFams[maxCig][1], info)

                        # Write SSCSs to output BAM file in read pairs.
                        altTag = dictTag.replace(
                            ("1" if "1" in dictTag else "2"),
                            ("2" if "1" in dictTag else "1"))

                        if altTag in consensusDict:
                            if a.is_read1:
                                outBam.write(a)
                                outBam.write(consensusDict.pop(altTag))
                            else:
                                outBam.write(consensusDict.pop(altTag))
                                outBam.write(a)
                        else:
                            consensusDict[dictTag] = a
                    else:
                        nC += 1
        readDict = {}  # Reset the read dictionary
        if unpairedOut and o.isize != -1:
            # Flush consensuses whose mate can no longer turn up within
            # --isize of the current position.  Iterate over a copy of the
            # keys because entries are popped inside the loop.
            for consTag in list(consensusDict.keys()):
                if consensusDict[consTag].pos + o.isize < readWin[winPos %
                                                                  2].pos:
                    extraBam.write(consensusDict.pop(consTag))
                    UP += 1

    # Write unpaired SSCSs
    for consTag in list(consensusDict.keys()):
        if unpairedOut:
            extraBam.write(consensusDict.pop(consTag))
            UP += 1
        else:
            outBam.write(consensusDict.pop(consTag))

    # Close BAM files
    inBam.close()
    outBam.close()
    nonMap.close()
    outNC1.close()

    if unpairedOut:
        extraBam.close()

    # Write summary statistics
    sys.stderr.write("Summary Statistics: \n")
    sys.stderr.write("Reads processed:" + str(readNum) + "\n")
    sys.stderr.write("Bad reads: %s\n" % nM)
    sys.stderr.write("\tReads with Bad Flags: %s\n" % bF)
    sys.stderr.write("\tOverlapping Reads: %s\n" % oL)
    sys.stderr.write("\tSoftclipped Reads: %s\n" % sC)
    sys.stderr.write("\tRepetitive Duplex Tag: %s\n" % rT)
    sys.stderr.write("Reads with Less Common Cigar Strings: %s\n" % LCC)
    sys.stderr.write("Consensuses Made: %s\n" % ConMade)
    #sys.stderr.write("Unpaired Consensuses: %s\n" % UP)
    sys.stderr.write("Consensuses with Too Many Ns: %s\n\n" % nC)

    # Write the tag counts file, most frequent tag first.
    with open(o.tagfile, "w") as tagFile:
        tagFile.write("\n".join(
            "%s\t%d" % (SMI, count) for SMI, count in sorted(
                tagDict.items(), key=lambda kv: kv[1], reverse=True)))
    tagStats(o.tagfile)
示例#21
0
def convert_bam_file(chain_file, file_in, file_out, reverse=False):
    """
    Convert genome coordinates (in BAM/SAM format) between assemblies.  These coordinates
    are stored in the :class:`.chain.ChainFile` object.

    :param chain_file: chain file used for conversion
    :type chain_file: :class:`.chain.ChainFile`
    :param str file_in: the input SAM or BAM file
    :type file_in: string
    :param file_out: the output SAM or file
    :type file_out: string
    :param reverse: reverse direction of original chain file
    :type reverse: boolean
    """
    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    if not isinstance(file_in, pysam.Samfile):
        file_in = g2g_fu.check_file(file_in)

    output_file_name = g2g_fu.check_file(file_out, 'w')
    unmapped_file_name = "{0}.unmapped".format(output_file_name)

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(file_in))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))
    LOG.info("UNMAPPED FILE: {0}".format(unmapped_file_name))

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file, reverse=reverse)
        LOG.info("Chain file parsed")

    if not isinstance(file_in, pysam.Samfile):
        try:
            sam_file = pysam.Samfile(file_in, 'rb')
            if len(sam_file.header) == 0:
                raise G2GBAMError("BAM File has no header information")
        except:
            sam_file = pysam.Samfile(file_in, 'r')
            if len(sam_file.header) == 0:
                raise G2GBAMError("SAM File has no header information")

    LOG.info("Converting BAM file")

    new_header = sam_file.header

    # replace 'HD'
    new_header['HD'] = {'VN': 1.0, 'SO': 'coordinate'}

    # replace SQ
    tmp = []
    name_to_id = {}
    id = 0
    for ref_name in sorted(chain_file.chrom_size_to):
        tmp.append({
            'LN': chain_file.chrom_size_from[ref_name],
            'SN': ref_name
        })
        name_to_id[ref_name] = id
        id += 1

    new_header['SQ'] = tmp

    if 'PG' not in new_header:
        new_header['PG'] = []

    new_header['PG'].append({'ID': 'gtgtools', 'VN': 1.0})

    if 'CO' not in new_header:
        new_header['CO'] = []

    new_header['CO'].append("Original file: {0}".format(file_in))
    new_header['CO'].append("Chain File: {0}".format(chain_file.file_name))

    dir, temp_file_name = os.path.split(file_out)
    parts = temp_file_name.split('.')
    ext = parts[-1]

    if ext.lower() == 'bam':
        new_file = pysam.Samfile(file_out, 'wb', header=new_header)
        new_file_unmapped = pysam.Samfile(unmapped_file_name,
                                          'wb',
                                          template=sam_file)
    elif ext.lower() == 'sam':
        new_file = pysam.Samfile(file_out, 'wh', header=new_header)
        new_file_unmapped = pysam.Samfile(unmapped_file_name,
                                          'wh',
                                          template=sam_file)
    else:
        raise G2GBAMError(
            "Unable to create new file based upon file extension")

    total = 0
    total_unmapped = 0
    total_fail_qc = 0

    map_statistics = {
        'total': 0,
        'fail_cannot_map': 0,
        'success_simple': 0,
        'success_complex': 0
    }

    map_statistics_pair = {
        'total': 0,
        'fail_cannot_map': 0,
        'success_1_fail_2_simple': 0,
        'success_1_fail_2_complex': 0,
        'success_1_simple_2_fail': 0,
        'success_1_simple_2_simple': 0,
        'success_1_simple_2_complex': 0,
        'success_1_complex_2_fail': 0,
        'success_1_complex_2_simple': 0,
        'success_1_complex_2_complex': 0
    }

    try:
        while True:
            if total and total % 10000 == 0:
                status_success = 0
                status_failed = 0

                for k, v in map_statistics_pair.iteritems():
                    if k.startswith('success'):
                        status_success += v
                    elif k.startswith('fail'):
                        status_failed += v

                LOG.info(
                    "Processed {0:,} reads, {1:,} successful, {2:,} failed".
                    format(total, status_success, status_failed))

            alignment = sam_file.next()
            alignment_new = pysam.AlignedRead()
            read_chr = sam_file.getrname(alignment.tid)

            # READ ONLY

            # aend                  aligned reference position of the read on the reference genome
            # alen                  aligned length of the read on the reference genome.
            # positions             a list of reference positions that this read aligns to
            # qend                  end index of the aligned query portion of the sequence (0-based, exclusive)
            # qlen                  Length of the aligned query sequence
            # qqual                 aligned query sequence quality values
            # qstart                start index of the aligned query portion of the sequence (0-based, inclusive)
            # query                 aligned portion of the read and excludes any flanking bases that were soft clipped
            # rlen                  length of the read

            # TRUE / FALSE (setting effects flag)

            # is_paired             true if read is paired in sequencing
            # is_proper_pair        true if read is mapped in a proper pair
            # is_qcfail             true if QC failure
            # is_read1              true if this is read1
            # is_read2              true if this is read2
            # is_reverse            true if read is mapped to reverse strand
            # is_secondary          true if not primary alignment
            # is_unmapped           true if read itself is unmapped
            # mate_is_reverse       true is read is mapped to reverse strand
            # mate_is_unmapped      true if the mate is unmapped

            # SET

            # cigar                 cigar as list of tuples
            # cigarstring           alignment as a string
            # flag                  properties flag
            # mapq                  mapping quality
            # pnext                 the position of the mate
            # pos                   0-based leftmost coordinate
            # pnext                 the position of the mate
            # qname                 the query name
            # rnext                 the reference id of the mate
            # seq                   read sequence bases, including soft clipped bases
            # tid                   target id, contains the index of the reference sequence in the sequence dictionary

            # DON'T NEED TO SET or SHOULD WE SET?

            # qual                  read sequence base qualities, including soft clipped bases
            # tags                  the tags in the AUX field
            # tlen                  insert size

            total += 1

            LOG.debug('~' * 80)
            LOG.debug("Converting {0} {1} {2} {3}".format(
                alignment.qname, read_chr, alignment.pos,
                alignment.cigarstring))

            if alignment.is_qcfail:
                LOG.debug("\tFail due to qc of old alignment")
                new_file_unmapped.write(alignment)
                total_fail_qc += 1
                continue

            if alignment.is_unmapped:
                LOG.debug("\tFail due to unmapped old alignment")
                new_file_unmapped.write(alignment)
                total_unmapped += 1
                continue

            if not alignment.is_paired:
                LOG.debug("SINGLE END ALIGNMENT")
                map_statistics['total'] += 1

                alignment_new.seq = alignment.seq
                alignment_new.flag = FLAG_NONE
                alignment_new.mapq = alignment.mapq
                alignment_new.qname = alignment.qname
                alignment_new.qual = alignment.qual
                alignment_new.tags = alignment.tags

                read_start = alignment.pos
                read_end = alignment.aend
                read_strand = '-' if alignment.is_reverse else '+'

                mappings = chain_file.find_mappings(read_chr, read_start,
                                                    read_end)

                # unmapped
                if mappings is None:
                    LOG.debug("\tFail due to no mappings")
                    new_file_unmapped.write(alignment)
                    map_statistics['fail_cannot_map'] += 1

                elif len(mappings) == 1:
                    if alignment.is_reverse:
                        alignment_new.flag |= FLAG_REVERSE

                    alignment_new.tid = name_to_id[mappings[0].to_chr]
                    alignment_new.pos = mappings[0].to_start
                    alignment_new.cigar = alignment.cigar
                    new_file.write(alignment_new)

                    LOG.debug("\tSuccess (simple): {0} {1}".format(
                        alignment_new.pos, alignment_new.cigarstring))
                    map_statistics['success_simple'] += 1

                else:
                    LOG.debug("MAPPINGS: {0}".format(len(mappings)))
                    for m in mappings:
                        LOG.debug("> {0}".format(m))

                    if alignment.is_reverse:
                        alignment_new.flag |= FLAG_REVERSE

                    alignment_new.tid = name_to_id[mappings[0].to_chr]
                    alignment_new.pos = mappings[0].to_start
                    alignment_new.cigar = convert_cigar(
                        alignment.cigar, read_chr, chain_file, alignment.seq,
                        read_strand, alignment.pos)
                    new_file.write(alignment_new)

                    LOG.debug("\tSuccess (complex): {0} {1}".format(
                        alignment_new.pos, alignment_new.cigarstring))
                    map_statistics['success_complex'] += 1

            else:
                LOG.debug("PAIRED END ALIGNMENT")
                map_statistics_pair['total'] += 1

                alignment_new.seq = alignment.seq
                alignment_new.flag = FLAG_PAIRED
                alignment_new.mapq = alignment.mapq
                alignment_new.qname = alignment.qname
                alignment_new.qual = alignment.qual
                alignment_new.tags = alignment.tags

                if alignment.is_read1:
                    alignment_new.flag |= FLAG_READ1
                if alignment.is_read2:
                    alignment_new.flag |= FLAG_READ2

                if alignment.is_reverse:
                    alignment_new.flag |= FLAG_REVERSE
                if alignment.mate_is_reverse:
                    alignment_new.flag |= FLAG_MREVERSE

                read1_chr = sam_file.getrname(alignment.tid)
                read1_start = alignment.pos
                read1_end = alignment.aend
                read1_strand = '-' if alignment.is_reverse else '+'
                read1_mappings = chain_file.find_mappings(
                    read1_chr, read1_start, read1_end)  #, read1_strand)

                read2_chr = None
                read2_start = None
                read2_end = None
                read2_strand = None
                read2_mappings = None

                if alignment.mate_is_unmapped:
                    alignment_new.flag |= FLAG_MUNMAP
                else:
                    read2_chr = sam_file.getrname(alignment.rnext)
                    read2_start = alignment.pnext
                    read2_end = read2_start + 1
                    read2_strand = '-' if alignment.mate_is_reverse else '+'
                    try:
                        read2_mappings = chain_file.find_mappings(
                            read2_chr, read2_start, read2_end)
                    except:
                        read2_mappings = None

                if read1_mappings is None and read2_mappings is None:

                    alignment_new.flag |= FLAG_UNMAP
                    alignment_new.flag |= FLAG_MUNMAP

                    LOG.debug("\tFail due to no mappings")
                    new_file_unmapped.write(alignment)
                    map_statistics_pair['fail_cannot_map'] += 1

                elif read1_mappings is None and read2_mappings and len(
                        read2_mappings) == 1:

                    alignment_new.flag |= FLAG_UNMAP

                    alignment_new.pos = 0
                    alignment_new.cigarstring = '0M'
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0

                    LOG.debug(
                        "\tPair Success (1:fail,2:simple): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_fail_2_simple'] += 1

                elif read1_mappings is None and read2_mappings and len(
                        read2_mappings) > 1:

                    alignment_new.flag |= FLAG_UNMAP

                    alignment_new.pos = 0
                    alignment_new.cigarstring = '0M'
                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0

                    LOG.debug(
                        "\tPair Success (1:fail,2:complex): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_fail_2_complex'] += 1

                elif read1_mappings and len(
                        read1_mappings) == 1 and read2_mappings is None:

                    alignment_new.flag |= FLAG_MUNMAP

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar

                    alignment_new.rnext = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pnext = 0
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:simple,2:fail): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_fail'] += 1

                elif read1_mappings and len(
                        read1_mappings) == 1 and read2_mappings and len(
                            read2_mappings) == 1:

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:simple,2:simple): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_simple'] += 1

                elif read1_mappings and len(
                        read1_mappings
                ) == 1 and read2_mappings and len(read2_mappings) > 1:

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = alignment.cigar

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:simple,2:complex): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_simple_2_complex'] += 1

                elif read1_mappings and len(
                        read1_mappings) > 1 and read2_mappings is None:

                    alignment_new.flag |= FLAG_MUNMAP

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(
                        alignment.cigar, read_chr, chain_file, alignment.seq,
                        read1_strand, alignment.pos)

                    alignment_new.rnext = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pnext = 0
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:complex,2:fail): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_fail'] += 1

                elif read1_mappings and len(
                        read1_mappings) > 1 and read2_mappings and len(
                            read2_mappings) == 1:

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(
                        alignment.cigar, read_chr, chain_file, alignment.seq,
                        read1_strand, alignment.pos)

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:complex,2:simple): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_simple'] += 1

                elif read1_mappings and len(
                        read1_mappings) > 1 and read2_mappings and len(
                            read2_mappings) > 1:

                    alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
                    alignment_new.pos = read1_mappings[0].to_start
                    alignment_new.cigar = convert_cigar(
                        alignment.cigar, read_chr, chain_file, alignment.seq,
                        read1_strand, alignment.pos)

                    alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
                    alignment_new.pnext = read2_mappings[0].to_start
                    alignment_new.tlen = 0  # CHECK

                    LOG.debug(
                        "\tPair Success (1:complex,2:complex): {0} {1}".format(
                            alignment_new.pos, alignment_new.cigarstring))
                    new_file.write(alignment_new)
                    map_statistics_pair['success_1_complex_2_complex'] += 1

                else:
                    raise G2GBAMError(
                        "Unknown BAM/SAM conversion/parse situation")

    except StopIteration:
        LOG.info("All reads processed")

    LOG.info("  {:>10} TOTAL ENTRIES".format(total))
    LOG.info("  {:>10} TOTAL UNMAPPED ".format(total_unmapped))
    LOG.info("  {:>10} TOTAL FAIL QC ".format(total_fail_qc))

    if map_statistics['total'] > 0:
        LOG.info("")
        LOG.info("Mapping Summary Single End")
        LOG.info("  {:>10} TOTAL ENTRIES".format(map_statistics['total']))
        LOG.info("")
        LOG.info(
            "  {:>10} TOTAL SUCCESS".format(map_statistics['success_simple'] +
                                            map_statistics['success_complex']))
        LOG.info("  {:>10} Simple".format(map_statistics['success_simple']))
        LOG.info("  {:>10} Complex".format(map_statistics['success_complex']))
        LOG.info("")
        LOG.info("  {:>10} TOTAL FAILURES".format(
            map_statistics['fail_cannot_map']))
        LOG.info("  {:>10} Cannot Map ".format(
            map_statistics['fail_cannot_map']))

    if map_statistics_pair['total'] > 0:
        total_success = 0
        for k, v in map_statistics_pair.iteritems():
            if k.startswith('success'):
                total_success += v

        LOG.info("")
        LOG.info("Mapping Summary Paired End")
        LOG.info("  {:>10} TOTAL ENTRIES".format(map_statistics_pair['total']))
        LOG.info("")
        LOG.info("  {:>10} TOTAL SUCCESS".format(total_success))
        LOG.info("  {:>10} Read 1 Failed, Read 2 Simple".format(
            map_statistics_pair['success_1_fail_2_simple']))
        LOG.info("  {:>10} Read 1 Failed, Read 2 Complex".format(
            map_statistics_pair['success_1_fail_2_complex']))
        LOG.info("  {:>10} Read 1 Simple, Read 2 Failed".format(
            map_statistics_pair['success_1_simple_2_fail']))
        LOG.info("  {:>10} Read 1 Simple, Read 2 Simple".format(
            map_statistics_pair['success_1_simple_2_simple']))
        LOG.info("  {:>10} Read 1 Simple, Read 2 Complex".format(
            map_statistics_pair['success_1_simple_2_complex']))
        LOG.info("  {:>10} Read 1 Complex, Read 2 Failed".format(
            map_statistics_pair['success_1_complex_2_fail']))
        LOG.info("  {:>10} Read 1 Complex, Read 2 Simple".format(
            map_statistics_pair['success_1_complex_2_simple']))
        LOG.info("  {:>10} Read 1 Complex, Read 2 Complex".format(
            map_statistics_pair['success_1_complex_2_complex']))
        LOG.info("")
        LOG.info("  {:>10} TOTAL FAILURES".format(
            map_statistics_pair['fail_cannot_map']))
        LOG.info("  {:>10} Cannot Map".format(
            map_statistics_pair['fail_cannot_map']))
        LOG.info("")

    LOG.info("BAM File Converted")
示例#22
0
def split_combined_mapping(combined_mapping, remove_soft_clipped=True):
    ''' Undo a combined mapping, producing separate R1 and R2 mappings.

    The cigar of combined_mapping is cut at the index reported by
    find_skip_index_in_combined: the operations before that index form the
    left mapping, the operations after it form the right mapping, and the
    operation at the index supplies the length of the gap between them.
    For a '+' strand combined mapping R1 receives the left part and R2 is
    marked reverse; for a '-' strand one R1 receives the right part and is
    itself marked reverse.

    If remove_soft_clipped is true, a soft-clip operation at the very start
    of the left cigar or the very end of the right cigar is dropped.

    Returns a (R1_mapping, R2_mapping) tuple of new pysam.AlignedRead objects.
    '''
    # Both mates share the reference and read name of the combined record.
    mates = []
    for mate_flag in ('is_read1', 'is_read2'):
        mate = pysam.AlignedRead()
        setattr(mate, mate_flag, True)
        mate.tid = combined_mapping.tid
        mate.qname = combined_mapping.qname
        mates.append(mate)
    R1_mapping, R2_mapping = mates

    skip_index = find_skip_index_in_combined(combined_mapping)
    whole_cigar = combined_mapping.cigar

    left_cigar = whole_cigar[:skip_index]
    right_cigar = whole_cigar[skip_index + 1:]

    if remove_soft_clipped:
        leading_op, _ = left_cigar[0]
        if leading_op == sam.BAM_CSOFT_CLIP:
            left_cigar = left_cigar[1:]

        trailing_op, _ = right_cigar[-1]
        if trailing_op == sam.BAM_CSOFT_CLIP:
            right_cigar = right_cigar[:-1]

    md_tag = dict(combined_mapping.tags)['MD']
    left_span = sam.total_reference_nucs(left_cigar)
    right_span = sam.total_reference_nucs(right_cigar)

    # The right part starts after the left part's reference span plus the gap.
    _, gap_length = whole_cigar[skip_index]
    left_pos = combined_mapping.pos
    right_pos = left_pos + left_span + gap_length

    left_md = sam.truncate_md_string_up_to(md_tag, left_span)
    right_md = sam.truncate_md_string_from_beginning(md_tag, right_span)

    left_part = (left_cigar, left_md, left_pos)
    right_part = (right_cigar, right_md, right_pos)

    strand = sam.get_strand(combined_mapping)
    if strand == '+':
        assignments = [(R1_mapping, left_part), (R2_mapping, right_part)]
        reversed_mate = R2_mapping
    elif strand == '-':
        assignments = [(R1_mapping, right_part), (R2_mapping, left_part)]
        reversed_mate = R1_mapping
    else:
        # Any other strand value leaves cigar, MD, pos and is_reverse unset.
        assignments = []
        reversed_mate = None

    for mate, (cigar, md, pos) in assignments:
        mate.cigar = cigar
        mate.setTag('MD', md)
        mate.pos = pos
    if reversed_mate is not None:
        reversed_mate.is_reverse = True

    R1_seq, R1_qual, R2_seq, R2_qual = extract_seqs_from_combined(
        combined_mapping,
        include_overlap=False,
        remove_soft_clipped=remove_soft_clipped,
        flip_if_reverse=False,
    )
    # Only mates with a non-empty extracted sequence get seq/qual assigned;
    # seq is always set before qual.
    extracted = (
        (R1_mapping, R1_seq, R1_qual),
        (R2_mapping, R2_seq, R2_qual),
    )
    for mate, seq, qual in extracted:
        if seq != '':
            mate.seq = seq
            mate.qual = qual

    return R1_mapping, R2_mapping
def main():
    """Split an alignment file into read-1, read-2 and unmapped BAM files.

    Reads the file given by -i/--input-file and writes three BAM files named
    from the -o/--out-prefix value:

    * "<prefix>.R1.bam"    - mapped records flagged as read 1
    * "<prefix>.R2.bam"    - all other mapped records
    * "<prefix>.unmap.bam" - unmapped records, written unchanged

    Mapped records are rewritten as single-end alignments: the mate fields
    (rnext, pnext, tlen) are not copied, and of the original flag only the
    reverse-strand, secondary, QC-fail and duplicate bits are kept.

    Prints the help text and exits with status 0 when either option is
    missing; exits with status 1 when the input file does not exist. Record
    counts are printed to stdout once the split is finished.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input-file",
        action="store",
        type="string",
        dest="input_file",
        help=
        "Alignment file in BAM or SAM format. BAM file should be sorted and indexed"
    )
    parser.add_option(
        "-o",
        "--out-prefix",
        action="store",
        type="string",
        dest="output_prefix",
        help=
        "Prefix of output BAM files. \"prefix.R1.bam\" file contains the 1st read, \"prefix.R2.bam\" file contains the 2nd read"
    )
    (options, args) = parser.parse_args()

    # Both options are required: without the prefix the output file names
    # cannot be built.
    if not (options.input_file and options.output_prefix):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file):
        sys.stderr.write(
            '\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(1)

    total_alignment = 0
    r1_alignment = 0
    r2_alignment = 0
    unmapped = 0

    samfile = pysam.Samfile(options.input_file, 'rb')
    outputs = []
    try:
        # Every output reuses the header of the input file.
        for suffix in ('.R1.bam', '.R2.bam', '.unmap.bam'):
            outputs.append(
                pysam.Samfile(
                    options.output_prefix + suffix, 'wb', template=samfile))
        # OUT1: read-1 records, OUT2: read-2 records, OUT3: unmapped records.
        OUT1, OUT2, OUT3 = outputs

        sys.stderr.write("spliting " + options.input_file + " ...")
        for old_alignment in samfile:
            total_alignment += 1

            # Unmapped records are passed through untouched.
            if old_alignment.is_unmapped:
                OUT3.write(old_alignment)
                unmapped += 1
                continue

            new_alignment = pysam.AlignedRead()
            new_alignment.qname = old_alignment.qname  # 1st column. read name.
            new_alignment.tid = old_alignment.tid  # 3rd column. samfile.getrname(tid) == chrom name
            new_alignment.pos = old_alignment.pos  # 4th column. 0-based reference start of the aligned part
            new_alignment.mapq = old_alignment.mapq  # 5th column. mapping quality
            new_alignment.cigar = old_alignment.cigar  # 6th column.
            # Columns 7-9 (rnext, pnext, tlen) describe the mate and are
            # deliberately left at their defaults.
            new_alignment.seq = old_alignment.seq  # 10th column. read sequence. all bases.
            new_alignment.qual = old_alignment.qual  # 11th column. read sequence quality. all bases.
            new_alignment.tags = old_alignment.tags  # 12 - columns

            # 2nd column. Rebuild the flag from scratch so that no
            # pairing-related bit is carried over.
            flag = 0x0000
            if old_alignment.is_reverse:
                flag |= 0x0010
            if old_alignment.is_secondary:
                flag |= 0x0100
            if old_alignment.is_qcfail:
                flag |= 0x0200
            if old_alignment.is_duplicate:
                flag |= 0x0400
            new_alignment.flag = flag

            if old_alignment.is_read1:
                OUT1.write(new_alignment)
                r1_alignment += 1
            else:
                OUT2.write(new_alignment)
                r2_alignment += 1

        sys.stderr.write(" Done\n")
    finally:
        # Close explicitly so the BAM writers flush their buffers even when
        # the split is interrupted by an error.
        for handle in outputs:
            handle.close()
        samfile.close()

    sys.stdout.write("%-55s%d\n" % ("Total records:", total_alignment))
    sys.stdout.write(
        "%-55s%d\n" % (options.output_prefix + 'Read 1:', r1_alignment))
    sys.stdout.write(
        "%-55s%d\n" % (options.output_prefix + 'Read 2:', r2_alignment))
    sys.stdout.write(
        "%-55s%d\n" % (options.output_prefix + 'Unmapped:', unmapped))
示例#24
0
        infile = open(filename)
        # ACTUAL INDEX IDENTIFICATION AND READ SORTING
    for seqid, seq, qual in read_fastq(infile):
        seqid = seqid.split()[0]
        seq2, qual2 = None, None
        if qual != None and options.qualityoffset != 33:
            qual = "".join(
                map(lambda x: chr(ord(x) - options.qualityoffset + 33), qual))
        if options.start != None:
            seq2 = seq[options.start:]
            seq = seq[:options.start]
            if qual != None:
                qual2 = qual[options.start:]
                qual = qual[:options.start]

        forward = pysam.AlignedRead()
        forward.qname = seqid
        forward.seq = seq
        if qual != None: forward.qual = qual
        else: forward.qual = "*"
        forward.is_unmapped = True
        forward.pos = -1
        forward.mpos = -1
        if seq2 != None:
            forward.is_read1 = True
            forward.is_paired = True
            reverse = pysam.AlignedRead()
            reverse.qname = seqid
            reverse.is_read2 = True
            reverse.is_paired = True
            reverse.seq = seq2
示例#25
0
def combine_paired_mappings(R1_mapping, R2_mapping, verbose=False):
    ''' Takes two pysam mappings representing opposite ends of a fragment and
    combines them into one mapping, (ab)using BAM_CREF_SKIP to bridge the gap
    (if any) between them.

    Works on deep copies of R1_mapping and R2_mapping. When R1 is on the '+'
    strand it is treated as the left mapping and R2 as the right one; when R1
    is on the '-' strand the roles are swapped. Where the two mappings
    overlap, the sequence of only one of them is kept in the combined record;
    the other mapping's overlapping sequence and qualities are stored in the
    'Xs' and 'Xq' tags, and 'Xw' records which side ('left' or 'right') was
    kept.

    If verbose is true, the mismatch counts used to resolve disagreeing
    overlap cigars are logged.

    Returns the combined pysam.AlignedRead, or False if the pair has
    inconsistent soft-clipping or an overlap disagreement that cannot be
    resolved.
    '''
    # Work on private copies; remove_soft_clipping below presumably edits its
    # argument in place (TODO confirm).
    R1_mapping = copy.deepcopy(R1_mapping)
    R2_mapping = copy.deepcopy(R2_mapping)

    R1_strand = sam.get_strand(R1_mapping)

    # NOTE(review): there is no fallback branch; for any other strand value
    # left_mapping/right_mapping stay unbound and the next use raises.
    if R1_strand == '+':
        left_mapping, right_mapping = R1_mapping, R2_mapping
    elif R1_strand == '-':
        left_mapping, right_mapping = R2_mapping, R1_mapping

    # Soft-clipping at the 3' end of a read should only happen if this is
    # read-through into soft-clipping at the 5' end of the other read.
    # If there is non-physical soft-clipping in this pair, give up now.
    # Specifically, check if any pairing of read position to ref position
    # isn't the same.

    if (left_mapping.cigar[-1][0] == sam.BAM_CSOFT_CLIP) or \
       (right_mapping.cigar[0][0] == sam.BAM_CSOFT_CLIP):

        left_pairs = left_mapping.get_aligned_pairs(matches_only=True)
        right_pairs = right_mapping.get_aligned_pairs(matches_only=True)

        if left_pairs != right_pairs:
            return False

    # Otherwise, remove all soft-clipping from the mappings, storing the 5'
    # soft-clipped seq and quals from both reads to add back at the end.

    left_clipped = remove_soft_clipping(left_mapping)
    right_clipped = remove_soft_clipping(right_mapping)

    left_md = dict(left_mapping.tags)['MD']
    right_md = dict(right_mapping.tags)['MD']

    # --- Split the right mapping into its overlapping and non-overlapping
    # --- parts.
    right_aligned_pairs = sam.cigar_to_aligned_pairs(
        right_mapping.cigar, right_mapping.reference_start)

    # Index of the first aligned pair of the right mapping whose reference
    # position is at or beyond the end of the left mapping. Defaults to the
    # number of pairs, i.e. the whole right mapping counts as overlap.
    right_after_overlap_pair_index = len(right_aligned_pairs)
    for i, (read, ref) in enumerate(right_aligned_pairs):
        if ref != None and ref >= left_mapping.aend:
            right_after_overlap_pair_index = i
            break

    right_overlap_pairs = right_aligned_pairs[:right_after_overlap_pair_index]
    right_after_overlap_pairs = right_aligned_pairs[
        right_after_overlap_pair_index:]

    # Read and reference positions that lie past the overlap.
    # 's' entries presumably mark soft-clipped positions in the output of
    # sam.cigar_to_aligned_pairs - TODO confirm.
    right_reads_after = [
        read for read, ref in right_after_overlap_pairs
        if read != None and read != 's'
    ]
    right_refs_after = [
        ref for read, ref in right_after_overlap_pairs if ref != None
    ]

    right_overlap_cigar = sam.aligned_pairs_to_cigar(right_overlap_pairs)
    right_after_overlap_cigar = sam.aligned_pairs_to_cigar(
        right_after_overlap_pairs)
    right_after_overlap_md = sam.truncate_md_string_from_beginning(
        right_md, len(right_refs_after))

    # Read offset at which the non-overlapping tail of the right read begins.
    right_after_overlap_read_start = len(
        right_mapping.seq) - len(right_reads_after)

    # Overlap qualities come from query_qualities (they are averaged with
    # np.mean and passed to fastq.encode_sanger below), whereas the
    # non-overlapping qualities use .qual so they can be concatenated with
    # other .qual values.
    right_overlap_seq = right_mapping.seq[:right_after_overlap_read_start]
    right_overlap_qual = right_mapping.query_qualities[:
                                                       right_after_overlap_read_start]

    right_after_overlap_seq = right_mapping.seq[
        right_after_overlap_read_start:]
    right_after_overlap_qual = right_mapping.qual[
        right_after_overlap_read_start:]

    # --- Split the left mapping the same way, scanning from its right end.
    left_aligned_pairs = sam.cigar_to_aligned_pairs(
        left_mapping.cigar, left_mapping.reference_start)

    # Index of the last aligned pair of the left mapping whose reference
    # position is before the start of the right mapping. Stays -1 when there
    # is none, i.e. the whole left mapping counts as overlap.
    left_before_overlap_pair_index = -1
    for i, (read, ref) in list(enumerate(left_aligned_pairs))[::-1]:
        if ref != None and ref < right_mapping.pos:
            left_before_overlap_pair_index = i
            break

    left_overlap_pairs = left_aligned_pairs[left_before_overlap_pair_index +
                                            1:]
    left_before_overlap_pairs = left_aligned_pairs[:
                                                   left_before_overlap_pair_index
                                                   + 1]

    left_reads_before = [
        read for read, ref in left_before_overlap_pairs
        if read != None and read != 's'
    ]
    left_refs_before = [
        ref for read, ref in left_before_overlap_pairs if ref != None
    ]

    left_overlap_cigar = sam.aligned_pairs_to_cigar(left_overlap_pairs)
    left_before_overlap_cigar = sam.aligned_pairs_to_cigar(
        left_before_overlap_pairs)
    left_before_overlap_md = sam.truncate_md_string_up_to(
        left_md, len(left_refs_before))

    # Read offset at which the overlapping part of the left read begins.
    left_overlap_read_start = len(left_reads_before)
    left_overlap_seq = left_mapping.seq[left_overlap_read_start:]
    left_overlap_qual = left_mapping.query_qualities[left_overlap_read_start:]

    left_before_overlap_seq = left_mapping.seq[:left_overlap_read_start]
    left_before_overlap_qual = left_mapping.qual[:left_overlap_read_start]

    # --- Decide which mapping supplies the overlapping region.
    if left_overlap_pairs or right_overlap_pairs:
        # The mappings overlap, so there is no gap to bridge.
        gap_length = 0

        left_has_splicing = sam.contains_splicing(left_mapping)
        right_has_splicing = sam.contains_splicing(right_mapping)

        if left_overlap_cigar == right_overlap_cigar:
            # If the two mappings agree about the location of indels in their overlap,
            # use the seq from the mapping with the higher average quality in the
            # overlap.
            left_mean_qual = np.mean(left_overlap_qual)
            right_mean_qual = np.mean(right_overlap_qual)

            # Ties go to the right mapping.
            if left_mean_qual > right_mean_qual:
                use_overlap_from = 'left'
            else:
                use_overlap_from = 'right'
        elif left_has_splicing != right_has_splicing:
            # A temporary(?) heuristic - if one read has splicing and the other
            # doesn't, use the overlap from the one with splicing under the
            # assumption that the other just has a few bases overhanging the
            # splice junction.
            if left_has_splicing:
                use_overlap_from = 'left'
            else:
                use_overlap_from = 'right'
        else:
            # If the two mappings disagree about the location of indels in their overlap,
            # we need a heuristic for picking which mapping we believe reflects the
            # true structure of the input fragment. The most innocuous explanation
            # is that a 'true' indel happened to lie close to the edge of one of the
            # mappings. A more problematic situation is a 'false' indel (that is,
            # produced during cluster generation or sequencing-by-synthesis, NOT
            # template production). Our strategy is: realign the overlapping part of
            # left mapping starting from the left edge of the overlap according to the
            # cigar of the right mapping and realign the overlapping part of the right
            # mapping starting from the right edge of the overlap according to the cigar
            # of the left mapping. Count the number of mismatches produced by each.
            # If the left overlap can accommodate the right cigar with fewer mismatches,
            # use the right cigar and seq. If the right overlap can accommodate the left
            # cigar with fewer mismatches, use the left cigar and seq.

            # The leftmost aligned_pair from the right mapping is guaranteed by the
            # mapping process to not involve a gap.
            _, overlap_ref_start = right_overlap_pairs[0]
            # Similarly, the rightmost aligned_pair from the left mapping can't be a
            # gap.
            _, overlap_ref_end = left_overlap_pairs[-1]

            realigned_left_cigar = sam.truncate_cigar_blocks_up_to(
                right_mapping.cigar, len(left_overlap_seq))
            realigned_right_cigar = sam.truncate_cigar_blocks_from_beginning(
                left_mapping.cigar, len(right_overlap_seq))

            ref_dict = sam.merge_ref_dicts(
                sam.ref_dict_from_mapping(left_mapping),
                sam.ref_dict_from_mapping(right_mapping),
            )

            # Dump both mappings for debugging before re-raising.
            try:
                left_using_right_mismatches = realigned_mismatches(
                    left_overlap_seq, overlap_ref_start, realigned_left_cigar,
                    ref_dict)
                right_using_left_mismatches = realigned_mismatches_backwards(
                    right_overlap_seq, overlap_ref_end, realigned_right_cigar,
                    ref_dict)
            except (ValueError, TypeError):
                print(left_mapping)
                print(right_mapping)
                raise

            if verbose:
                logging.info('disagreements in {0}'.format(left_mapping.qname))
                logging.info('left overlap cigar is  {0}'.format(
                    str(left_overlap_cigar)))
                logging.info('right overlap cigar is {0}'.format(
                    str(right_overlap_cigar)))
                logging.info('left_using_right_mismatches - {0}'.format(
                    len(left_using_right_mismatches)))
                logging.info('right_using_left_mismatches - {0}'.format(
                    len(right_using_left_mismatches)))

            # Prefer the cigar that the other read's overlap fits with fewer
            # mismatches; an exact tie is reported and the pair is rejected.
            if len(left_using_right_mismatches) < len(
                    right_using_left_mismatches):
                use_overlap_from = 'right'
            elif len(right_using_left_mismatches) < len(
                    left_using_right_mismatches):
                use_overlap_from = 'left'
            else:
                # Logged regardless of verbose.
                logging.info('disagreements in {0}'.format(left_mapping.qname))
                logging.info('left overlap cigar is  {0}'.format(
                    str(left_overlap_cigar)))
                logging.info('right overlap cigar is {0}'.format(
                    str(right_overlap_cigar)))
                logging.info('left_using_right_mismatches - {0}'.format(
                    len(left_using_right_mismatches)))
                logging.info('right_using_left_mismatches - {0}'.format(
                    len(right_using_left_mismatches)))
                logging.info('ambiguous disagreement')
                return False

    else:
        gap_length = right_mapping.pos - left_mapping.aend
        # It doesn't matter what use_overlap_from is set to; there is no overlap
        use_overlap_from = 'left'

    # --- Assemble the combined record.
    combined_mapping = pysam.AlignedRead()
    combined_mapping.qname = left_mapping.qname
    combined_mapping.tid = left_mapping.tid
    combined_mapping.mapq = min(left_mapping.mapq, right_mapping.mapq)
    combined_mapping.rnext = -1
    combined_mapping.pnext = -1
    combined_mapping.pos = left_mapping.pos

    if R1_strand == '-':
        combined_mapping.is_reverse = True

    # NOTE(review): when the mappings overlap, gap_length is 0 and a
    # zero-length skip operation is still inserted between the two halves.
    gap_cigar = [(sam.BAM_CREF_SKIP, gap_length)]

    if use_overlap_from == 'left':
        # Whole left mapping plus the part of the right mapping past the
        # overlap; the right mapping's overlap is kept only in the tags.
        combined_mapping.seq = left_mapping.seq + right_after_overlap_seq
        combined_mapping.qual = left_mapping.qual + right_after_overlap_qual
        combined_mapping.cigar = left_mapping.cigar + gap_cigar + right_after_overlap_cigar

        combined_md = sam.combine_md_strings(left_md, right_after_overlap_md)
        combined_mapping.setTag('MD', combined_md)

        overlap_seq_tag = right_overlap_seq
        overlap_qual_tag = fastq.encode_sanger(right_overlap_qual)

    elif use_overlap_from == 'right':
        # Part of the left mapping before the overlap plus the whole right
        # mapping; the left mapping's overlap is kept only in the tags.
        combined_mapping.seq = left_before_overlap_seq + right_mapping.seq
        combined_mapping.qual = left_before_overlap_qual + right_mapping.qual
        combined_mapping.cigar = left_before_overlap_cigar + gap_cigar + right_mapping.cigar

        combined_md = sam.combine_md_strings(left_before_overlap_md, right_md)
        combined_mapping.setTag('MD', combined_md)

        overlap_seq_tag = left_overlap_seq
        overlap_qual_tag = fastq.encode_sanger(left_overlap_qual)

    if len(overlap_seq_tag) > 0:
        # Having empty tags causes problems, so don't create them.
        combined_mapping.setTag('Xs', overlap_seq_tag)
        combined_mapping.setTag('Xq', overlap_qual_tag)
        combined_mapping.setTag('Xw', use_overlap_from)

    # Snapshot the assembled fields, then wrap them in the soft-clipped pieces
    # that were removed from the start of the left mapping and the end of the
    # right mapping.
    qual = combined_mapping.qual
    seq = combined_mapping.seq
    cigar = combined_mapping.cigar

    before = left_clipped['from_start']
    after = right_clipped['from_end']

    combined_mapping.cigar = before['cigar'] + cigar + after['cigar']
    combined_mapping.seq = before['seq'] + seq + after['seq']
    combined_mapping.qual = before['qual'] + qual + after['qual']

    return combined_mapping
示例#26
0
def write_align_pair(out, dup, r, q):
    """Write one alignment block as a pair of mated records to ``out``.

    Both records share a read name built from the target and query
    coordinates of ``dup`` ("tName:tStart+1-tEnd(tStrand)_qName:...").

    * The first record is placed at ``dup.tStart`` on ``dup.tName``, carries
      sequence ``q`` and ``dup.cigar``, and is flagged first-in-pair.
    * The second record is placed at ``dup.qStart`` on ``dup.qName``, carries
      sequence ``r`` (passed through ``rc`` when ``dup.qStrand`` is not '+')
      and the inverted cigar, and is flagged second-in-pair.

    Each record names the other as its mate and carries both its own and its
    mate's strand in the flag.

    Raises AssertionError if ``out`` does not know either reference name.
    """
    qname = '%s:%d-%d(%s)_%s:%d-%d(%s)' % (dup.tName, dup.tStart + 1, dup.tEnd,
                                           dup.tStrand, dup.qName, dup.qStart +
                                           1, dup.qEnd, dup.qStrand)

    tid1 = out.gettid(dup.tName)
    tid2 = out.gettid(dup.qName)

    assert tid1 != -1 and tid2 != -1

    # The guards below only matter when assertions are disabled (python -O).
    if tid1 != -1:
        a = pysam.AlignedRead()
        a.qname = qname
        a.tid = tid1
        a.pos = dup.tStart
        a.mapq = 255
        a.seq = q
        a.cigar = dup.cigar.to_pysam_list()
        a.tags = [('RG', 'hg38.chain')]  #[,('NM', nm)]
        a.tlen = 0
        a.flag = 0
        a.rnext = tid2

        if dup.tStrand == '-':
            a.flag |= 0x10  # read on reverse strand

        if tid2 != -1:
            a.pnext = dup.qStart
            # paired | proper pair | first in pair. OR the bits in: a plain
            # assignment here would wipe the reverse-strand bit set above.
            a.flag |= 0x1 | 0x2 | 0x40
            if dup.qStrand == '-':
                a.flag |= 0x20  # mate on reverse strand

        out.write(a)

    if tid2 != -1:
        # NOTE(review): reverse() is called on dup.cigar itself; if it works
        # in place the caller's object is modified - confirm that is intended.
        cigar2 = dup.cigar
        if dup.qStrand != '+':
            cigar2.reverse()
        cigar2 = cigar2.invert()[0]

        b = pysam.AlignedRead()
        b.qname = qname
        b.tid = tid2
        b.pos = dup.qStart
        b.mapq = 255
        b.seq = r if dup.qStrand == '+' else rc(r)
        b.cigar = cigar2.to_pysam_list()
        b.tags = [('RG', 'hg38.chain')]  # ,('NM', nm)]
        b.tlen = 0
        b.flag = 0
        b.rnext = tid1

        if dup.qStrand == '-':
            b.flag |= 0x10  # read on reverse strand

        if tid1 != -1:
            b.pnext = dup.tStart
            b.flag |= 0x1 | 0x2 | 0x80  # paired | proper pair | second in pair
            if dup.tStrand == '-':
                b.flag |= 0x20  # mate on reverse strand

        out.write(b)
示例#27
0
def crossmap_bam_file(mapping,
                      chainfile,
                      infile,
                      outfile_prefix,
                      chrom_size,
                      IS_size=200,
                      IS_std=30.0,
                      fold=3,
                      addtag=True):
    '''

	Description
	-----------
	Convert genome coordinates (in BAM/SAM format) between assemblies.
	BAM/SAM format: http://samtools.sourceforge.net/
	chrom_size is target chromosome size

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	chainfile : file
		Input chain format file.

	infile : file
		Input BAM, SAM or CRAM foramt file.

	outfile_prefix : str
		Output prefix.

	chrom_size : dict
		Chromosome size of the *target* assembly, used to build bam header.

	IS_size : int
		Average insert size of pair-end sequencing.

	IS_std : float
		Stanadard deviation of insert size.

	fold : float
		A mapped pair is considered as \"proper pair\" if both ends mapped to
		different strand and the distance between them is less then fold * stdev
		from the mean.

	addtag : bool
		if addtag is set to True, will add tags to each alignmnet:
			Q = QC (QC failed)
			N = unmapped (originally unmapped or originally mapped but failed
			    to liftover to new assembly)
			M = multiple mapped (alignment can be liftover to multiple places)
			U = unique mapped (alignment can be liftover to only 1 place)

		tags for pair-end sequencing include:
			QF: QC failed
			NN: both read1 and read2 unmapped
			NU: read1 unmapped, read2 unique mapped
			NM: read1 unmapped, multiple mapped
			UN: read1 uniquely mapped, read2 unmap
			UU: both read1 and read2 uniquely mapped
			UM: read1 uniquely mapped, read2 multiple mapped
			MN: read1 multiple mapped, read2 unmapped
			MU: read1 multiple mapped, read2 unique mapped
			MM: both read1 and read2 multiple mapped

		tags for single-end sequencing include:
			QF: QC failed
			SN: unmaped
			SM: multiple mapped
			SU: uniquely mapped
	'''

    # determine the input file format (BAM, CRAM or SAM)
    file_type = ''
    if infile.lower().endswith('.bam'):
        file_type = 'BAM'
        comments = ['ORIGINAL_BAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'rb')
        if len(samfile.header) == 0:
            print("BAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    elif infile.lower().endswith('.cram'):
        file_type = 'CRAM'
        comments = ['ORIGINAL_CRAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'rc')
        if len(samfile.header) == 0:
            print("CRAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    elif infile.lower().endswith('.sam'):
        file_type = 'SAM'
        comments = ['ORIGINAL_SAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'r')
        if len(samfile.header) == 0:
            print("SAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    else:
        print(
            "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
            file=sys.stderr)
        sys.exit(1)
    comments.append('CHAIN_FILE=' + chainfile)

    sam_ori_header = samfile.header.to_dict()

    # chromosome ID style of the original BAM file
    chrom_style = sam_ori_header['SQ'][0]['SN']  # either 'chr1' or '1'

    # update chrom_size of target genome
    target_chrom_sizes = {}
    for n, l in chrom_size.items():
        target_chrom_sizes[update_chromID(chrom_style, n)] = l

    (new_header, name_to_id) = sam_header.bam_header_generator(
        orig_header=sam_ori_header,
        chrom_size=target_chrom_sizes,
        prog_name="CrossMap",
        prog_ver=__version__,
        format_ver=1.0,
        sort_type='coordinate',
        co=comments)

    # write to file
    if outfile_prefix is not None:
        if file_type == 'BAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.bam',
                                     "wb",
                                     header=new_header)
            printlog(
                ["Liftover BAM file:", infile, '==>', outfile_prefix + '.bam'])
        elif file_type == 'CRAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.bam',
                                     "wb",
                                     header=new_header)
            printlog([
                "Liftover CRAM file:", infile, '==>', outfile_prefix + '.bam'
            ])
        elif file_type == 'SAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.sam',
                                     "wh",
                                     header=new_header)
            printlog(
                ["Liftover SAM file:", infile, '==>', outfile_prefix + '.sam'])
        else:
            print(
                "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
                file=sys.stderr)
            sys.exit(1)
    # write to screen
    else:
        if file_type == 'BAM':
            OUT_FILE = pysam.Samfile('-', "wb", header=new_header)
            printlog(["Liftover BAM file:", infile])
        elif file_type == 'CRAM':
            OUT_FILE = pysam.Samfile('-', "wb", header=new_header)
            printlog(["Liftover CRAM file:", infile])
        elif file_type == 'SAM':
            OUT_FILE = pysam.Samfile('-', "w", header=new_header)
            printlog(["Liftover SAM file:", infile])
        else:
            print(
                "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
                file=sys.stderr)
            sys.exit(1)
    QF = 0
    NN = 0
    NU = 0
    NM = 0
    UN = 0
    UU = 0
    UM = 0
    MN = 0
    MU = 0
    MM = 0
    SN = 0
    SM = 0
    SU = 0
    total_item = 0
    try:
        while (1):
            total_item += 1
            old_alignment = next(samfile)
            new_alignment = pysam.AlignedRead()  # create AlignedRead object

            new_alignment.query_name = old_alignment.query_name  # 1st column. read name.
            new_alignment.query_sequence = old_alignment.query_sequence  # 10th column. read sequence. all bases.
            new_alignment.query_qualities = old_alignment.query_qualities  # 11th column. read sequence quality. all bases.
            new_alignment.set_tags(old_alignment.get_tags())  # 12 - columns

            # by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes
            # Thanks Wolfgang Resch <*****@*****.**> identified this bug and provided solution.
            try:
                rg, rgt = old_alignment.get_tag("RG", with_value_type=True)
            except KeyError:
                pass
            else:
                new_alignment.set_tag("RG", str(rg), rgt)

            ## Pair-end sequencing
            if old_alignment.is_paired:
                new_alignment.flag = 0x1  #pair-end in sequencing
                if old_alignment.is_read1:
                    new_alignment.flag = new_alignment.flag | 0x40
                elif old_alignment.is_read2:
                    new_alignment.flag = new_alignment.flag | 0x80

                if old_alignment.is_qcfail:
                    new_alignment.flag = new_alignment.flag | 0x200
                    new_alignment.reference_id = -1  #3
                    new_alignment.reference_start = 0  #4
                    new_alignment.mapping_quality = 255  #5
                    new_alignment.cigartuples = old_alignment.cigartuples  #6
                    new_alignment.next_reference_id = -1  #7
                    new_alignment.next_reference_start = 0  #8
                    new_alignment.template_length = 0  #9

                    QF += 1
                    if addtag: new_alignment.set_tag(tag="QF", value=0)
                    OUT_FILE.write(new_alignment)
                    continue
                #==================================
                # R1 originally unmapped
                #==================================
                elif old_alignment.is_unmapped:
                    new_alignment.flag = new_alignment.flag | 0x4  #2
                    new_alignment.reference_id = -1  #3
                    new_alignment.reference_start = 0  #4
                    new_alignment.mapping_quality = 255  #5
                    new_alignment.cigartuples = old_alignment.cigartuples  #6

                    # R1 & R2 originally unmapped
                    if old_alignment.mate_is_unmapped:
                        new_alignment.next_reference_id = -1  #7
                        new_alignment.next_reference_start = 0  #8
                        new_alignment.template_length = 0  #9

                        NN += 1
                        if addtag: new_alignment.set_tag(tag="NN", value=0)
                        OUT_FILE.write(new_alignment)
                        continue
                    # R1 unmap, R2 is mapped
                    else:
                        try:
                            read2_chr = samfile.get_reference_name(
                                old_alignment.next_reference_id)
                            read2_strand = '-' if old_alignment.mate_is_reverse else '+'
                            read2_start = old_alignment.next_reference_start
                            read2_end = read2_start + 1
                            read2_maps = map_coordinates(
                                mapping, read2_chr, read2_start, read2_end,
                                read2_strand)
                        except:
                            read2_maps = None

                        #------------------------------------
                        # R1 unmapped, R2 failed to liftover
                        #------------------------------------
                        if read2_maps is None:
                            new_alignment.next_reference_id = -1  #7
                            new_alignment.next_reference_start = 0  #8
                            new_alignment.template_length = 0  #9

                            NN += 1
                            if addtag: new_alignment.set_tag(tag="NN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        #------------------------------------
                        # R1 unmapped, R2 unique
                        #------------------------------------
                        elif len(read2_maps) == 2:
                            # 2-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1]
                                [0]]  #recommend to set the RNAME of unmapped read to its mate's
                            new_alignment.reference_start = read2_maps[1][
                                1]  #recommend to set the POS of unmapped read to its mate's
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            NU += 1
                            if addtag: new_alignment.set_tag(tag="NU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        #------------------------------------
                        # R1 unmapped, R2 multiple
                        #------------------------------------
                        else:
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2-9
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.reference_start = read2_maps[1][1]
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            NM += 1
                            if addtag: new_alignment.set_tag(tag="NM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue
                #==================================
                # R1 is originally mapped
                #==================================
                else:
                    try:
                        read1_chr = samfile.get_reference_name(
                            old_alignment.reference_id)
                        read1_strand = '-' if old_alignment.is_reverse else '+'
                        read1_start = old_alignment.reference_start
                        read1_end = old_alignment.reference_end
                        read1_maps = map_coordinates(mapping, read1_chr,
                                                     read1_start, read1_end,
                                                     read1_strand)
                    except:
                        read1_maps = None

                    if not old_alignment.mate_is_unmapped:
                        try:
                            read2_chr = samfile.get_reference_name(
                                old_alignment.next_reference_id)
                            read2_strand = '-' if old_alignment.mate_is_reverse else '+'
                            read2_start = old_alignment.next_reference_start
                            read2_end = read2_start + 1
                            read2_maps = map_coordinates(
                                mapping, read2_chr, read2_start, read2_end,
                                read2_strand)
                        except:
                            read2_maps = None
                    #------------------------------------
                    # R1 failed to liftover
                    #------------------------------------
                    if read1_maps is None:
                        # read2 is unmapped or failed to convertion
                        if old_alignment.mate_is_unmapped or (read2_maps is
                                                              None):
                            # col2 - col9
                            new_alignment.flag = new_alignment.flag | 0x4  #2
                            new_alignment.reference_id = -1  #3
                            new_alignment.reference_start = 0  #4
                            new_alignment.mapping_quality = 255  #5
                            new_alignment.cigartuples = old_alignment.cigartuples  #6
                            new_alignment.next_reference_id = -1  #7
                            new_alignment.next_reference_start = 0  #8
                            new_alignment.template_length = 0  #9

                            if addtag: new_alignment.set_tag(tag="NN", value=0)
                            NN += 1
                            OUT_FILE.write(new_alignment)
                            continue

                        # read2 is unique mapped
                        elif len(read2_maps) == 2:
                            # col2 - col9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1]
                                [0]]  #recommend to set the RNAME of unmapped read to its mate's
                            new_alignment.reference_start = read2_maps[1][
                                1]  #recommend to set the POS of unmapped read to its mate's
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]  #start
                            new_alignment.template_length = 0

                            NU += 1
                            if addtag: new_alignment.set_tag(tag="NU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # read2 is multiple mapped
                        else:
                            # col2 - col9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.reference_start = read2_maps[1][1]
                            new_alignment.mapping_quality = 255  # mapq not available
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]  #start
                            new_alignment.template_length = 0

                            NM += 1
                            if addtag: new_alignment.set_tag(tag="NM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                    #------------------------------------
                    # R1 uniquely mapped
                    #------------------------------------
                    elif len(read1_maps) == 2:
                        # col2 - col5
                        if read1_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        new_alignment.reference_id = name_to_id[read1_maps[1]
                                                                [0]]
                        new_alignment.reference_start = read1_maps[1][1]
                        new_alignment.mapping_quality = old_alignment.mapping_quality

                        if read1_maps[0][3] != read1_maps[1][
                                3]:  # opposite strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        elif read1_maps[0][3] == read1_maps[1][
                                3]:  #  same strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # R2 unmapped before or after conversion
                        if (old_alignment.mate_is_unmapped) or (read2_maps is
                                                                None):
                            #2,7-9
                            new_alignment.flag = new_alignment.flag | 0x8
                            new_alignment.next_reference_id = name_to_id[
                                read1_maps[1][0]]
                            new_alignment.next_reference_start = read1_maps[1][
                                1]
                            new_alignment.template_length = 0

                            UN += 1
                            if addtag: new_alignment.set_tag(tag="UN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is unique mapped
                        elif len(read2_maps) == 2:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = abs(
                                new_alignment.reference_start -
                                new_alignment.next_reference_start
                            ) + old_alignment.reference_length
                            # 2
                            if (read2_maps[1][3] != read1_maps[1][3]) and (
                                    new_alignment.template_length <=
                                    IS_size + fold * IS_std) and (
                                        new_alignment.template_length >=
                                        IS_size - fold * IS_std):
                                new_alignment.flag = new_alignment.flag | 0x2

                            UU += 1
                            if addtag: new_alignment.set_tag(tag="UU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is multiple mapped
                        else:
                            # 2 (strand)
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2 (secondary alignment)
                            new_alignment.flag = new_alignment.flag | 0x100

                            #7-9
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            UM += 1
                            if addtag: new_alignment.set_tag(tag="UM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue
                    #------------------------------------
                    # R1 multiple mapped
                    #-----------------------------------
                    elif len(read1_maps) > 2 and len(read1_maps) % 2 == 0:
                        # 2
                        new_alignment.flag = new_alignment.flag | 0x100
                        if read1_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        # 3-5
                        new_alignment.tid = name_to_id[read1_maps[1]
                                                       [0]]  #chrom
                        new_alignment.pos = read1_maps[1][1]  #start
                        new_alignment.mapq = 255

                        if read1_maps[0][3] != read1_maps[1][
                                3]:  # opposite strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        elif read1_maps[0][3] == read1_maps[1][
                                3]:  #  same strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # (1) R2 is unmapped
                        if (old_alignment.mate_is_unmapped) or (read2_maps is
                                                                None):
                            #2,7-9
                            new_alignment.flag = new_alignment.flag | 0x8
                            new_alignment.next_reference_id = name_to_id[
                                read1_maps[1][0]]
                            new_alignment.next_reference_start = read1_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MN += 1
                            if addtag: new_alignment.set_tag(tag="MN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # (2) read2 is unique mapped
                        elif len(read2_maps) == 2:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MU += 1
                            if addtag: new_alignment.set_tag(tag="MU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # (3) R2 is multiple mapped
                        else:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2 (secondary alignment)
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MM += 1
                            if addtag: new_alignment.set_tag(tag="MM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

            # Singel end sequencing
            else:
                # 7-9
                new_alignment.next_reference_id = -1
                new_alignment.next_reference_start = 0
                new_alignment.template_length = 0

                # (1) originally unmapped
                if old_alignment.is_unmapped:
                    # 2-6
                    new_alignment.flag = new_alignment.flag | 0x4
                    new_alignment.reference_id = -1
                    new_alignment.reference_start = 0
                    new_alignment.mapping_quality = 255
                    new_alignment.cigartuples = old_alignment.cigartuples

                    SN += 1
                    if addtag: new_alignment.set_tag(tag="SN", value=0)
                    OUT_FILE.write(new_alignment)
                    continue
                else:
                    new_alignment.flag = 0x0
                    read_chr = samfile.get_reference_name(
                        old_alignment.reference_id)
                    read_strand = '-' if old_alignment.is_reverse else '+'
                    read_start = old_alignment.reference_start
                    read_end = old_alignment.reference_end
                    read_maps = map_coordinates(mapping, read_chr, read_start,
                                                read_end, read_strand)

                    # (2) unmapped afte liftover
                    if read_maps is None:
                        new_alignment.flag = new_alignment.flag | 0x4
                        new_alignment.reference_id = -1
                        new_alignment.reference_start = 0
                        new_alignment.mapping_quality = 255

                        SN += 1
                        if addtag: new_alignment.set_tag(tag="SN", value=0)
                        OUT_FILE.write(new_alignment)
                        continue

                    # (3) unique mapped
                    if len(read_maps) == 2:
                        if read_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        if read_maps[0][3] != read_maps[1][3]:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            try:
                                new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                              -1]  #reverse quality string
                            except:
                                new_alignment.query_qualities = []
                        else:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # 3-5
                        new_alignment.reference_id = name_to_id[read_maps[1]
                                                                [0]]
                        new_alignment.reference_start = read_maps[1][1]
                        new_alignment.mapping_quality = old_alignment.mapping_quality

                        SU += 1
                        if addtag: new_alignment.set_tag(tag="SU", value=0)
                        OUT_FILE.write(new_alignment)
                        continue

                    # (4) multiple mapped
                    if len(read_maps) > 2 and len(read_maps) % 2 == 0:
                        new_alignment.flag = new_alignment.flag | 0x100
                        if read_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        if read_maps[0][3] != read_maps[1][3]:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        else:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # 3-5
                        new_alignment.tid = name_to_id[read_maps[1][0]]
                        new_alignment.pos = read_maps[1][1]
                        new_alignment.mapq = old_alignment.mapq

                        SM += 1
                        if addtag: new_alignment.set_tag(tag="SM", value=0)
                        OUT_FILE.write(new_alignment)
                        continue
    except StopIteration:
        printlog(["Done!"])
    OUT_FILE.close()

    if outfile_prefix is not None:
        if file_type == "BAM" or file_type == "CRAM":
            try:
                printlog([
                    'Sort "%s" and save as "%s"' %
                    (outfile_prefix + '.bam', outfile_prefix + '.sorted.bam')
                ])
                pysam.sort("-o", outfile_prefix + '.sorted.bam',
                           outfile_prefix + '.bam')
            except:
                printlog(["Warning: ", "output BAM file was NOT sorted"])
            try:
                printlog(['Index "%s" ...' % (outfile_prefix + '.sorted.bam')])
                pysam.index(outfile_prefix + '.sorted.bam',
                            outfile_prefix + '.sorted.bam.bai')
            except:
                printlog(["Warning: ", "output BAM file was NOT indexed."])

    print("Total alignments:" + str(total_item - 1))
    print("	 QC failed: " + str(QF))
    if max(NN, NU, NM, UN, UU, UM, MN, MU, MM) > 0:
        print("	 Paired-end reads:")
        print("\tR1 unique, R2 unique (UU): " + str(UU))
        print("\tR1 unique, R2 unmapp (UN): " + str(UN))
        print("\tR1 unique, R2 multiple (UM): " + str(UM))

        print("\tR1 multiple, R2 multiple (MM): " + str(MM))
        print("\tR1 multiple, R2 unique (MU): " + str(MU))
        print("\tR1 multiple, R2 unmapped (MN): " + str(MN))

        print("\tR1 unmap, R2 unmap (NN): " + str(NN))
        print("\tR1 unmap, R2 unique (NU): " + str(NU))
        print("\tR1 unmap, R2 multiple (NM): " + str(NM))
    if max(SN, SU, SM) > 0:
        print("	 Single-end reads:")
        print("\tUniquley mapped (SU): " + str(SU))
        print("\tMultiple mapped (SM): " + str(SM))
        print("\tUnmapped (SN): " + str(SN))
示例#28
0
    def store(self,
              qname,
              N_mismatch,
              FR,
              refname,
              strand,
              pos,
              cigar,
              original_BS,
              methy,
              STEVE,
              rnext=-1,
              pnext=-1,
              qual=None,
              output_genome=None,
              rrbs=False,
              my_region_serial=None,
              my_region_start=None,
              my_region_end=None):
        """Write one alignment to ``self.f`` in the configured output format.

        When ``self.format`` is ``BS_SEEKER1`` a single tab-separated text
        line is written. When it is ``BAM`` or ``SAM`` a
        ``pysam.AlignedRead`` is built and written. Any other format value
        writes nothing.

        Text line columns, in order: ``qname``, ``N_mismatch``, ``FR``,
        ``refname`` + ``strand`` + ``pos + 1`` zero-padded to 10 digits,
        ``output_genome``, the read with soft-clipped bases removed,
        ``methy``, then either ``STEVE`` alone (``rrbs`` true) or the three
        ``my_region_*`` values followed by ``STEVE`` (``rrbs`` false).

        BAM/SAM records store ``pos`` unchanged, use a fixed mapping quality
        of 255, and carry the tags XO (``FR``), XS (``STEVE``), NM
        (``N_mismatch``), XM (``methy``) and XG (``output_genome``); with
        ``rrbs`` true the tags YR/YS/YE hold the ``my_region_*`` values.
        ``rnext`` is looked up in ``self.chrom_ids`` unless it is -1.
        """
        if self.format == BS_SEEKER1:
            # Drop the soft-clipped bases from the read; the old text format
            # never contained them, so this keeps the output compatible.
            clip_start, clip_end, _ = get_read_start_end_and_genome_length(
                cigar)
            clipped_read = original_BS[clip_start:clip_end]

            # Columns shared by both text layouts.
            leading = (qname, N_mismatch, FR, refname, strand,
                       str(pos + 1).zfill(10), output_genome, clipped_read,
                       methy)

            if rrbs:
                line = ('%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\n' %
                        (leading + (STEVE, )))
            else:
                # NOTE(review): the region values are formatted with %d but
                # default to None, so this path appears to need all three
                # my_region_* arguments supplied -- confirm against callers.
                line = (
                    '%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n' %
                    (leading + (my_region_serial, my_region_start,
                                my_region_end, STEVE)))

            self.f.write(line)
            return

        if self.format != BAM and self.format != SAM:
            return

        record = pysam.AlignedRead()
        record.qname = qname

        # Anything other than '+' gets the reverse-complemented sequence.
        if strand == '+':
            record.seq = original_BS
        else:
            record.seq = reverse_compl_seq(original_BS)

        # Only an explicit '-' sets the reverse-strand flag bit.
        if strand == '-':
            record.flag = 0x10
        else:
            record.flag = 0

        record.tid = self.chrom_ids[refname]
        record.pos = pos
        record.mapq = 255

        if strand == '+':
            record.cigar = cigar
        else:
            record.cigar = list(reversed(cigar))

        # -1 is passed through as-is; any other value is treated as a
        # reference name and translated to its id.
        if rnext == -1:
            record.rnext = rnext
        else:
            record.rnext = self.chrom_ids[rnext]
        record.pnext = pnext
        record.qual = qual

        tag_pairs = (('XO', FR), ('XS', STEVE), ('NM', N_mismatch),
                     ('XM', methy), ('XG', output_genome))
        if rrbs:
            tag_pairs += (('YR', my_region_serial), ('YS', my_region_start),
                          ('YE', my_region_end))
        record.tags = tag_pairs

        self.f.write(record)
示例#29
0
def mergeChainedAlignedReads(chainedAlignedReads, refSequence, readSequence):
    """Merge a chain of alignments into one global alignment record.

    Returns a new pysam.AlignedRead whose cigar spans all of refSequence and
    all of readSequence. The match/insertion/deletion operations of every
    chained alignment are kept (soft and hard clipping is dropped); the gaps
    before, between and after them are filled with deletions (unaligned
    reference positions) and insertions (unaligned read positions).

    The name, strand and reference id are copied from the first chained
    alignment; pos is 0 and rnext is -1. The sequence is readSequence,
    reverse complemented when the chain is on the reverse strand. Flag bits
    other than the strand, mapq, qual, tags and the remaining mate fields are
    left at the defaults of a fresh AlignedRead.

    Assertions check that all chained alignments share one strand, that they
    are ordered and non-overlapping in both reference and read coordinates,
    that their cigars only contain operations 0, 1, 2, 4 and 5, and that the
    merged cigar consumes exactly len(refSequence) reference bases and
    len(readSequence) read bases.
    """
    merged = pysam.AlignedRead()
    firstRead = chainedAlignedReads[0]
    merged.qname = firstRead.qname
    merged.rnext = -1
    merged.pos = 0
    merged.is_reverse = firstRead.is_reverse
    isReverse = merged.is_reverse
    merged.seq = reverseComplement(readSequence) if isReverse else readSequence
    merged.rname = firstRead.rname

    readLength = len(readSequence)
    refLength = len(refSequence)

    mergedOps = []
    refCursor = 0
    #On the reverse strand the read cursor runs from -(readLength - 1) up to 1,
    #otherwise from 0 up to readLength
    if isReverse:
        readCursor = -(readLength - 1)
        readEnd = 1
    else:
        readCursor = 0
        readEnd = readLength

    for chained in chainedAlignedReads:
        assert isReverse == chained.is_reverse

        #Unaligned reference positions before this alignment become a deletion
        assert chained.pos >= refCursor
        refGap = chained.pos - refCursor
        if refGap > 0:
            mergedOps.append((2, refGap))
            refCursor = chained.pos

        #Unaligned read positions before this alignment become an insertion
        readOffset = getAbsoluteReadOffset(chained, refSequence, readSequence)
        assert readOffset >= readCursor
        readGap = readOffset - readCursor
        if readGap > 0:
            mergedOps.append((1, readGap))
            readCursor = readOffset

        #Copy the alignment's own operations, skipping soft/hard clipping
        for op, length in chained.cigar:
            assert op in (0, 1, 2, 4, 5)
            if op not in (0, 1, 2):
                continue
            mergedOps.append((op, length))
            if op != 1:  #Match or deletion consumes reference
                refCursor += length
            if op != 2:  #Match or insertion consumes read
                readCursor += length

    #Pad with a trailing deletion/insertion up to the end of each sequence
    assert refCursor <= refLength
    if refCursor < refLength:
        mergedOps.append((2, refLength - refCursor))

    assert readCursor <= readEnd
    if readCursor < readEnd:
        mergedOps.append((1, readEnd - readCursor))

    #Check that the merged cigar covers both sequences exactly
    refConsumed = sum(length for op, length in mergedOps if op in (0, 2))
    assert refConsumed == refLength
    readConsumed = sum(length for op, length in mergedOps if op in (0, 1))
    assert readConsumed == readLength

    merged.cigar = tuple(mergedOps)

    return merged
示例#30
0
samwrite=sys.argv[2]
samfile=pysam.Samfile(samuse,'rb')
reads=pysam.Samfile(samwrite, "wb", template=samfile)

def get_bit(byteval,idx):
    return ((byteval&(1<<idx))!=0);

for alignedread in samfile.fetch():
    reads.write(alignedread)
    originalflag=alignedread.flag
    if(alignedread.tags[0][0] == 'X0'):
        try:   
            for newrec in alignedread.opt('XA').split(';'):
                if(newrec != ''):
                    s=newrec.split(',')
                    a = pysam.AlignedRead()
                    a = alignedread
                    a.flag=originalflag
                    if(get_bit(alignedread.flag,4) and int(s[1]) < 0 and get_bit(alignedread.flag,7)):
                        a.flag = int(0x80)
                    elif(get_bit(alignedread.flag,4) and int(s[1]) > 0 and get_bit(alignedread.flag,7)):
                        a.flag= int(0x10) + int(0x80)
                    elif(get_bit(alignedread.flag,4) and int(s[1]) < 0 and get_bit(alignedread.flag,6)):
                        a.flag= int(0x40)
                    elif(get_bit(alignedread.flag,4) and int(s[1]) > 0 and get_bit(alignedread.flag,6)):
                        a.flag=int(0x10) + int(0x40)
                    elif(int(s[1]) > 0 and get_bit(alignedread.flag,7)):
                        a.flag= int(0x80)     
                    elif(int(s[1]) < 0 and get_bit(alignedread.flag,7)):
                        a.flag= int(0x10) + int(0x80)
                    elif(int(s[1]) > 0 and get_bit(alignedread.flag,6)):