示例#1
0
def main(args):
    """Print aligned reads from a SAM/BAM file in batches.

    Opens ``args.bamfile`` (as text SAM when ``args.samformat`` is truthy,
    otherwise as BAM), keeps every alignment for which
    ``pair_is_aligned(aln, ref_ids)`` is true and that matches the mate
    selection, and passes the kept alignments to ``print_reads`` in batches
    of 10000.

    ``args.read_pair`` selects the mates to keep: 1 keeps only read 1,
    2 keeps only read 2 and 0 keeps both.  Any other value keeps nothing.
    """
    # Honour args.samformat: the mode must actually be passed to Samfile,
    # otherwise SAM input would be opened as BAM.
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    ref_ids = [samfile.gettid(r) for r in samfile.references]
    wanted = args.read_pair

    # Iterate over every read in file order instead of contig by contig.
    reads_to_print = []
    for aln in samfile.fetch(until_eof=True):
        if pair_is_aligned(aln, ref_ids) and (
            wanted == 0
            or (wanted == 1 and aln.is_read1)
            or (wanted == 2 and aln.is_read2)
        ):
            reads_to_print.append(aln)
            if len(reads_to_print) >= 10000:
                # Flush the batch so memory use stays bounded.
                print_reads(reads_to_print)
                reads_to_print = []

    # Emit whatever is left in the final, partially filled batch.
    print_reads(reads_to_print)
def main(args):
    """Split aligned reads into read-1, read-2 and unpaired outputs.

    Opens ``args.bamfile`` (as text SAM when ``args.samformat`` is truthy,
    otherwise as BAM) and walks over every read:

    - reads whose ``tid`` is not one of the file's references are skipped;
    - reads whose mate reference (``rnext``) is also one of the file's
      references are buffered for ``args.R1`` (read 1) or ``args.R2``
      (read 2); reads flagged as neither are dropped;
    - all other aligned reads are buffered for ``args.u``.

    After every append the buffer is handed to ``flush_reads`` and replaced
    by its return value; the remaining buffers are written with
    ``print_reads`` at the end.
    """
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    # A set keeps the two membership tests per read O(1); with a list they
    # are linear in the number of reference sequences.
    ref_ids = set(samfile.gettid(r) for r in samfile.references)

    # Iterate over every read in file order instead of contig by contig.
    reads_to_print_1 = []
    reads_to_print_2 = []
    reads_to_print_u = []
    for aln in samfile.fetch(until_eof=True):
        if aln.tid not in ref_ids:
            # This read is not aligned to any reference.
            continue
        if aln.rnext not in ref_ids:
            # The read is aligned but its mate is not.
            reads_to_print_u.append(aln)
            reads_to_print_u = flush_reads(reads_to_print_u, args.u)
        elif aln.is_read1:
            reads_to_print_1.append(aln)
            reads_to_print_1 = flush_reads(reads_to_print_1, args.R1)
        elif aln.is_read2:
            reads_to_print_2.append(aln)
            reads_to_print_2 = flush_reads(reads_to_print_2, args.R2)

    print_reads(reads_to_print_1, args.R1)
    print_reads(reads_to_print_2, args.R2)
    print_reads(reads_to_print_u, args.u)
示例#3
0
def bam_surject_msa(args):
    """
    Write the records of ``args.bam`` to ``args.output`` against a new set
    of references built from the MSA fasta files in ``args.msa_fastas``.

    Attributes read from ``args``: ``bam``, ``msa_fastas``, ``refnames``
    (``None`` means ``consensus0``, ``consensus1``, ...), ``skip_flag``,
    ``keep_rest``, ``output`` and ``check``.

    Caveats:
    - flags keep their original values
    - the original values of the MD, NM and AS tags are kept
    - mates are reported as unmapped
    - the same records are emitted

    NOTE(review): the surjection branch below uses ``rec``, ``mc`` and
    ``q_aln`` without binding them in this function; the per-record loop
    and the coordinate converter set-up appear to be missing and must be
    restored from the original source before that branch can run.

    Raises:
        Exception: if an MSA fasta has no contigs, or if a converted CIGAR
            has a different query length than the original one.
    """
    skip_flag = args.skip_flag
    sam = Samfile(args.bam)
    mapped_ref_set = set(sam.references)

    # setup output
    if args.refnames is None:
        refnames = [
            'consensus{0}'.format(i) for i in range(len(args.msa_fastas))
        ]
    else:
        # Copy: the list is extended below and args.refnames must not be
        # mutated as a side effect.
        refnames = list(args.refnames)
    assert len(refnames) == len(
        args.msa_fastas
    ), 'The number of refnames should be the same as that of msa_fastas.'

    logging.info('Loading MSA fastas')
    logging.info('Skip flag: %s', args.skip_flag)
    fastas = []
    ref_lens = []
    target_ref_set = set()
    for fn in args.msa_fastas:
        with open(fn) as fp:
            fasta = Fasta(fp)
            fastas.append(fasta)
            if len(fasta.contigs) == 0:
                logging.error('Fasta file %s has no contigs', fn)
                raise Exception('No contigs')
            # The length of the first contig is used as the length of the
            # output reference for this fasta.
            ref_lens.append(len(fasta.contigs[0]))
            target_ref_set.update(fasta.names)

    rest_refs = [r for r in sam.references if r not in target_ref_set]
    logging.info('%s are included in surjection targets.', len(target_ref_set))
    logging.info('%s are not included in surjection targets.', len(rest_refs))
    if args.keep_rest:
        logging.info('Rest of reference will be kept in surjected BAM file')
        org_ref_len_map = dict(zip(sam.references, sam.lengths))
        refnames.extend(rest_refs)
        ref_lens.extend(org_ref_len_map[r] for r in rest_refs)
        # A None fasta marks a reference that is copied through unchanged.
        fastas.extend(None for _ in rest_refs)

    logging.info('Setting up output BAMs')
    # 'wb' writes BAM; 'wh' writes SAM text including the header.
    mode = 'wb' if args.output.endswith('.bam') else 'wh'
    # The header must list every output reference, in the same order as
    # the names are later resolved with out.gettid().
    out = pysam.Samfile(args.output,
                        mode=mode,
                        reference_names=refnames,
                        reference_lengths=ref_lens)

    # iteration
    try:
        for refname, fasta in zip(refnames, fastas):
            out_tid = out.gettid(refname)
            if fasta is None:
                logging.info('Transfering %s', refname)
                src_tid = sam.gettid(refname)
                for rec in sam.fetch(reference=refname):
                    if rec.flag & skip_flag:
                        continue
                    a = rec.__copy__()
                    a.reference_id = out_tid
                    if a.next_reference_id == src_tid:
                        # Mate is on the same reference: follow it to the
                        # new reference id.
                        a.next_reference_id = out_tid
                    else:
                        # Mate is elsewhere; its id in the output header is
                        # not tracked here, so drop the pairing.
                        a.next_reference_id = -1  # unpair
                        a.next_reference_start = -1
                    out.write(a)
                continue
            logging.info('Surjecing to %s', refname)
            query_refs = fasta.names
            cc = _CigarChecker() if args.check else None
            for qref in query_refs:
                if qref not in mapped_ref_set:
                    logging.warning('%s is not found in original BAM file',
                                    qref)
                    continue
                # NOTE(review): `rec` is only bound by the transfer loop
                # above, and `mc` / `q_aln` are not assigned anywhere in
                # this function.  A loop over the records of `qref` and the
                # construction of the converter are presumably missing
                # here -- TODO restore from the original source.
                a = rec.__copy__()
                if not rec.is_unmapped:
                    org_cigar = Cigar(rec.cigartuples)
                    pos, cigar = mc.convert(rec.pos, org_cigar)
                    if org_cigar.query_length != cigar.query_length:
                        # The converted CIGAR must consume exactly as many
                        # read bases as the original one.
                        logging.error('Invalid cigar conversion for %s',
                                      rec.qname)
                        logging.error('org %s %s %s', rec.pos, org_cigar,
                                      org_cigar.query_length)
                        logging.error('new %s %s %s', pos, cigar,
                                      cigar.query_length)
                        s1 = pos
                        e1 = mc.get_pos(rec.pos + cigar.ref_length)
                        logging.error('ref %s-%s %s', s1, e1,
                                      mc.get_ref_cigar(s1, e1))
                        logging.error('read %s', rec.seq)
                        logging.error('qref %s', q_aln.seq[s1:e1])
                        raise Exception('Incompatible Cigar')
                    if cc:
                        cc.check(rec, pos, cigar, org_cigar, mc, q_aln)
                    a.cigar = cigar.values
                    a.reference_start = pos
                a.reference_id = out_tid
                a.next_reference_id = -1  # this is required
                a.next_reference_start = -1  # this is required
                out.write(a)
    finally:
        # Close the output even on error so buffered records are flushed
        # and the file is properly terminated.
        out.close()
        sam.close()