def main(bam_fname, sidecar_fname, max_xd=200, max_MQ=70, strict_scoring=False, max_vlen=200, processes=2):
  """Bin simulated reads from a BAM into a three dimensional histogram.

  Histogram axes:
    [0] Xd   - alignment error: -max_xd ... 0 ... +max_xd, wrong_chrom, unmapped  (2 * max_xd + 3 bins)
    [1] MQ   - mapping quality: 0 ... max_MQ                                      (max_MQ + 1 bins)
    [2] vlen - length of variant carried by read:
               Ref, < -max_vlen, -max_vlen ... 0 ... +max_vlen, > +max_vlen       (2 * max_vlen + 4 bins)

  :param bam_fname: BAM of simulated reads
  :param sidecar_fname: long qname overflow (sidecar) file
  :param max_xd: largest alignment error binned individually
  :param max_MQ: largest mapping quality binned
  :param strict_scoring: forwarded to the worker's alignment scoring
  :param max_vlen: largest variant length binned individually
  :param processes: number of worker processes
  :return: the histogram matrix summed over all workers
  """
  # Fill the work queue with one item per contig, then one stop code per worker
  # so every worker terminates cleanly after the contigs are exhausted
  work_q, result_q = Queue(), Queue()
  for ref in pysam.AlignmentFile(bam_fname).references:
    work_q.put(ref)
  for _ in range(processes):
    work_q.put(__process_stop_code__)

  long_qname_table = load_qname_sidecar(sidecar_fname)
  workers = []
  for worker_id in range(processes):
    w = Process(target=worker,
                args=(worker_id, bam_fname, long_qname_table, max_xd, max_MQ,
                      max_vlen, strict_scoring, work_q, result_q))
    w.start()
    workers.append(w)

  # Each worker posts (shard, read_count); fold the shards into a single matrix
  t0 = time.time()
  xmv_mat, tot_cnt = None, 0
  for _ in range(processes):
    shard, shard_cnt = result_q.get()
    tot_cnt += shard_cnt
    xmv_mat = shard if xmv_mat is None else xmv_mat + shard
  t1 = time.time()
  logger.debug('Processed {} reads in {:.2f}s ({:.2f} r/s)'.format(
    tot_cnt, t1 - t0, tot_cnt / (t1 - t0)))

  # Orderly exit
  for w in workers:
    w.join()
  return xmv_mat
def main(fastq_fname, qname_overflow_fname, max_expected_qname_length=500):
  """Histogram qname lengths in a FASTQ whose long qnames live in a sidecar file.

  Converted files terminate full qnames with '*'; a truncated qname no longer ends
  in '*' and its full version must be looked up in the overflow (sidecar) file.

  :param fastq_fname: FASTQ file to scan
  :param qname_overflow_fname: long qname overflow (sidecar) file
  :param max_expected_qname_length: lengths above this are clamped into the last bin
  :return: list of counts indexed by (clamped) qname length
  """
  long_qname_table = load_qname_sidecar(qname_overflow_fname)
  qname_count = [0] * (max_expected_qname_length + 1)
  with pysam.FastxFile(fastq_fname) as fh:
    for r in fh:
      if r.name[-1] != '*':
        # Truncated qname: recover the full version keyed by its prefix (up to the
        # first '|'). Default to the read's own name if the sidecar lacks the key
        # — the unguarded .get() crashed with len(None) on a mismatched sidecar;
        # this matches the .get(key, qname) fallback used by the converter.
        qlen = len(long_qname_table.get(r.name.split('|', 1)[0], r.name))
      else:
        qlen = len(r.name)
      qname_count[min(qlen, max_expected_qname_length)] += 1
  return qname_count
def main(mainfile_in, sidecar_in, mainfile_out, sidecar_out, truncate_to=240, file_type=None):
  """Rewrite a BAM/FASTQ so long qnames are truncated and stored in a sidecar file.

  :param mainfile_in: input BAM or FASTQ
  :param sidecar_in: existing long qname sidecar for the input
  :param mainfile_out: output BAM or FASTQ
  :param sidecar_out: sidecar file receiving any qname longer than truncate_to
  :param truncate_to: qnames longer than this are truncated in the main output
  :param file_type: If supplied ("BAM" or "FASTQ") then we will not auto detect
  :return:
  """
  ft = {'BAM': 1, 'FASTQ': 0}.get(file_type or auto_detect(mainfile_in))
  fp_in = pysam.AlignmentFile(mainfile_in, mode='rb') if ft else pysam.FastxFile(mainfile_in)
  fp_out = pysam.AlignmentFile(mainfile_out, mode='wb', header=fp_in.header) if ft else open(mainfile_out, 'w')
  side_car_fp = open(sidecar_out, 'w')
  try:
    logger.debug('Starting conversion ...')
    long_qname_table = load_qname_sidecar(sidecar_in)
    cnt, t0 = 0, time.time()
    for cnt, r in enumerate(fp_in):
      qname = r.qname if ft else r.name  # Thanks pysam for the inconsistent naming. What's a few CPU cycles between friends?
      qname = long_qname_table.get(
        qname.split('|', 1)[0], qname
      )  # Don't pass the wrong side car file, you won't know what hit you
      qname = qname[:-1] + '*'  # Older qnames had "|" instead of "*". "*" is more unambiguous as a termination character
      if len(qname) > truncate_to:
        side_car_fp.write('@' + qname + '\n')
        qname = qname[:truncate_to]
      if ft:
        r.qname = qname
        fp_out.write(r)
      else:
        fp_out.write('@{}\n{}\n+\n{}\n'.format(qname, r.sequence, r.quality))
      if cnt % 100000 == 99999:
        t1 = time.time()
        logger.debug('Processed {} reads in {:0.2f}s ({:0.2f} r/s)'.format(
          cnt + 1, t1 - t0, (cnt + 1) / (t1 - t0)))
    t1 = time.time()
    logger.debug('Processed {} reads in {:0.2f}s ({:0.2f} r/s)'.format(
      cnt + 1, t1 - t0, (cnt + 1) / (t1 - t0)))
  finally:
    # Close all handles so outputs are flushed even on error — the original
    # version leaked fp_in, fp_out and side_car_fp
    side_car_fp.close()
    fp_out.close()
    fp_in.close()
def main(bam_fname, sidecar_fname, max_xd=200, max_MQ=70, strict_scoring=False, max_vlen=200, processes=2): """This function rips through a BAM from simulated reads and bins reads into a three dimensional histogram. The dimensions are: Xd - alignment error [0] -max_xd, ... 0, ... +max_xd, wrong_chrom, unmapped (2 * max_xd + 3) MQ - mapping quality [1] 0, ... max_MQ (max_MQ + 1) vlen - length of variant carried by read [2] Ref, < -max_vlen , -max_vlen, ... 0, ... +max_vlen, > +max_vlen ( 2 * max_vlen + 1 + 2 + 1) :param bam_fname: :param sidecar_fname: :param max_xd: :param max_MQ: :param strict_scoring: :param max_vlen: :param processes: :return: """ # Set up the I/O queues and place all BAM contigs on the work queue work_q, result_q = Queue(), Queue() for ref in pysam.AlignmentFile(bam_fname).references: work_q.put(ref) for _ in range(processes): work_q.put(__process_stop_code__) # Start workers long_qname_table = load_qname_sidecar(sidecar_fname) p_list = [ Process(target=worker, args=(i, bam_fname, long_qname_table, max_xd, max_MQ, max_vlen, strict_scoring, work_q, result_q)) for i in range(processes) ] for p in p_list: p.start() # Sum the results from each worker together t0 = time.time() xmv_mat, tot_cnt = None, 0 for _ in range(processes): xmv_mat_shard, cnt = result_q.get() tot_cnt += cnt xmv_mat = (xmv_mat + xmv_mat_shard) if xmv_mat is not None else xmv_mat_shard t1 = time.time() logger.debug('Processed {} reads in {:.2f}s ({:.2f} r/s)'.format(tot_cnt, t1 - t0, tot_cnt/(t1 - t0))) # Orderly exit for p in p_list: p.join() return xmv_mat
def parse_read_qnames(sidecar_fname, titer):
  """Annotate each template with parsed read info.

  Yields, for each template, a tuple of {'read': ..., 'read_info': ...} dicts,
  one per mate. When no sidecar file is given, 'read_info' is None.

  :param sidecar_fname: long qname overflow (sidecar) file, or None
  :param titer: iterator over templates (sequences of mates)
  :return: generator of tuples of dicts
  """
  if sidecar_fname is None:
    table = None
  else:
    table = load_qname_sidecar(sidecar_fname)
  for template in titer:
    if table is None:
      info = [None, None]
    else:
      info = parse_qname(template[0].qname, long_qname_table=table)
    yield tuple(
      {'read': mate, 'read_info': info[1] if mate.is_read2 else info[0]}
      for mate in template
    )
def parse_read_qnames(sidecar_fname, titer):
  """Pair each read in a template with its parsed read info.

  For every template this yields a tuple of dicts with keys 'read' (the mate)
  and 'read_info' (parsed qname data, or None when sidecar_fname is None).

  :param sidecar_fname: long qname overflow (sidecar) file, or None
  :param titer: iterator over templates (sequences of mates)
  :return: generator of tuples of dicts
  """
  lookup = None if sidecar_fname is None else load_qname_sidecar(sidecar_fname)
  for template in titer:
    ri = [None, None]
    if lookup is not None:
      ri = parse_qname(template[0].qname, long_qname_table=lookup)
    annotated = [{'read': m, 'read_info': ri[int(m.is_read2)]} for m in template]
    yield tuple(annotated)
def main(mainfile_in, sidecar_in, mainfile_out, sidecar_out, truncate_to=240, file_type=None):
  """Truncate long qnames in a BAM/FASTQ, spilling the originals into a sidecar file.

  :param mainfile_in: input BAM or FASTQ
  :param sidecar_in: existing long qname sidecar for the input
  :param mainfile_out: output BAM or FASTQ
  :param sidecar_out: sidecar file receiving any qname longer than truncate_to
  :param truncate_to: maximum qname length kept in the main output
  :param file_type: If supplied ("BAM" or "FASTQ") then we will not auto detect
  :return:
  """
  ft = {'BAM': 1, 'FASTQ': 0}.get(file_type or auto_detect(mainfile_in))
  fp_in = pysam.AlignmentFile(mainfile_in, mode='rb') if ft else pysam.FastxFile(mainfile_in)
  fp_out = pysam.AlignmentFile(mainfile_out, mode='wb', header=fp_in.header) if ft else open(mainfile_out, 'w')
  side_car_fp = open(sidecar_out, 'w')
  try:
    logger.debug('Starting conversion ...')
    long_qname_table = load_qname_sidecar(sidecar_in)
    cnt, t0 = 0, time.time()
    for cnt, r in enumerate(fp_in):
      qname = r.qname if ft else r.name  # Thanks pysam for the inconsistent naming. What's a few CPU cycles between friends?
      qname = long_qname_table.get(qname.split('|', 1)[0], qname)  # Don't pass the wrong side car file, you won't know what hit you
      qname = qname[:-1] + '*'  # Older qnames had "|" instead of "*". "*" is more unambiguous as a termination character
      if len(qname) > truncate_to:
        side_car_fp.write('@' + qname + '\n')
        qname = qname[:truncate_to]
      if ft:
        r.qname = qname
        fp_out.write(r)
      else:
        fp_out.write('@{}\n{}\n+\n{}\n'.format(qname, r.sequence, r.quality))
      if cnt % 100000 == 99999:
        t1 = time.time()
        logger.debug('Processed {} reads in {:0.2f}s ({:0.2f} r/s)'.format(
          cnt + 1, t1 - t0, (cnt + 1) / (t1 - t0)))
    t1 = time.time()
    logger.debug('Processed {} reads in {:0.2f}s ({:0.2f} r/s)'.format(
      cnt + 1, t1 - t0, (cnt + 1) / (t1 - t0)))
  finally:
    # Fix for a resource leak: the input, output and sidecar handles were never
    # closed, so sidecar/output data could stay unflushed on early exit
    for handle in (fp_in, fp_out, side_car_fp):
      handle.close()
def main(bam_in_l, out_prefix, criterion, threshold, sidecar_fname=None):
  """Partition reads from several BAMs of the same read set by a scoring criterion.

  Reads are matched across the input BAMs by mate number + qname; once a read has
  been seen in every BAM it is scored and routed to the appropriate partition files.

  :param bam_in_l: list of input BAM file names
  :param out_prefix: prefix for partition output files and the summary file
  :param criterion: {'d_err', 'MQ', 'mapped', 'p_diff'}
  :param threshold: threshold for the chosen criterion, passed to the scoring function
  :param sidecar_fname: optional long qname overflow (sidecar) file
  :return:
  """
  assert len(bam_in_l) <= MAX_ORIGINS, "Can't do more than {} sets".format(MAX_ORIGINS)
  bam_fp_l = [pysam.AlignmentFile(bam_in) for bam_in in bam_in_l]
  long_qname_table = load_qname_sidecar(sidecar_fname) if sidecar_fname else None
  part_d = get_partition_description(out_prefix, len(bam_in_l))
  # One unsorted output BAM per partition per input BAM
  for p in part_d:
    p['filehandles'] = [
      pysam.AlignmentFile(p['filenames'][k] + '.unsorted.bam', 'wb', header=bam_fp_l[k].header)
      for k in range(len(bam_fp_l))
    ]
  scoring_fn = scoring_fn_dict.get(criterion)[0]

  incomplete_reads = {}
  cnt = -1
  t0 = time.time()
  for cnt, (n, r) in enumerate(iterate_over_bams(bam_fp_l)):
    if (cnt + 1) % 1000000 == 0:
      t1 = time.time()
      logger.debug('Processed {} reads ({} incomplete ({:0.2}%)) in {:.2f}s ({:.2f} t/s)'.format(
        cnt + 1, len(incomplete_reads), 100 * len(incomplete_reads) / (cnt + 1),
        t1 - t0, cnt / (t1 - t0)))
    # Key on mate number + qname so read1 and read2 are tracked separately
    ky = ('1' if r.is_read1 else '2') + r.qname
    if ky in incomplete_reads:
      ir = incomplete_reads[ky]
      ir[n] = r
      if all(ir):  # This read has now arrived from every input BAM
        process_these_reads(part_d, incomplete_reads.pop(ky), scoring_fn, threshold, long_qname_table)
    else:
      ir = [None] * len(bam_fp_l)
      ir[n] = r
      incomplete_reads[ky] = ir

  t1 = time.time()
  # Guard against empty inputs: cnt stays -1 if no reads were iterated, and the
  # original unguarded division by (cnt + 1) raised ZeroDivisionError
  n_reads = max(cnt + 1, 1)
  logger.debug('Processed {} reads ({} incomplete ({:0.2}%)) in {:.2f}s ({:.2f} t/s)'.format(
    cnt + 1, len(incomplete_reads), 100 * len(incomplete_reads) / n_reads,
    t1 - t0, max(cnt, 0) / (t1 - t0)))

  logger.debug('Closing output files')
  for p in part_d:
    for fp in p['filehandles']:
      fp.close()

  # Nice to get this written out before the time consuming sort and index stages
  with open('{}_summary.txt'.format(out_prefix), 'w') as fp:
    for p in part_d:
      fp.write('{}\t{}\n'.format(p['partition_label'], p['total']))

  logger.debug('Sorting and indexing output BAMs')
  for p in part_d:
    for fn in p['filenames']:
      logger.debug('Sort and index {}'.format(fn))
      pysam.sort('-m', '1G', '-o', fn + '.bam', fn + '.unsorted.bam')
      os.remove(fn + '.unsorted.bam')
      pysam.index(fn + '.bam')
def main(bam_fname, sidecar_fname, out_fname,
         d_range=(-200, 200), reject_d_range=False,
         v_range=(-200, 200), reject_v_range=False,
         reject_reads_with_variants=False,
         reject_reference_reads=False,
         strict_scoring=False, do_not_index=True, processes=2):
  """This function extracts reads from a simulation BAM that match the filter criteria

  :param bam_fname: simulation BAM to filter
  :param sidecar_fname: long qname overflow (sidecar) file
  :param out_fname: filtered, sorted output BAM
  :param d_range: (lo, hi) range of alignment error d_err
  :param reject_d_range: if True, reject reads whose d_err is inside d_range
  :param v_range: (lo, hi) range of variant sizes
  :param reject_v_range: if True, reject variants inside v_range
  :param reject_reads_with_variants: drop reads carrying variants
  :param reject_reference_reads: drop reads carrying no variants
  :param strict_scoring: forwarded to score_alignment_error
  :param do_not_index: when False, also build a .bai index for the output
  :param processes: unused — kept for interface compatibility
  :return:
  """
  def _filter_pass(_r):
    """Score one read against the filter settings.

    :param _r: aligned segment
    :return: (keep, d_err)
    """
    ri = parse_qname(_r.qname, long_qname_table=long_qname_table)[1 if _r.is_read2 else 0]
    is_ref_read = len(ri.v_list) == 0
    if is_ref_read and reject_reference_reads:
      return False, 0
    if not is_ref_read and reject_reads_with_variants:
      return False, 0
    _d_err = score_alignment_error(_r, ri=ri, max_d=max_d, strict=strict_scoring)
    in_d_err_range = d_range[0] <= _d_err <= d_range[1]
    if in_d_err_range == reject_d_range:
      return False, 0
    if not is_ref_read:
      # All variants are inside/outside v_range and we want to/do not want to reject the range
      if all((v_range[0] <= v <= v_range[1]) == reject_v_range for v in ri.v_list):
        return False, 0
    return True, _d_err

  se_bam = is_single_end_bam(bam_fname)
  bam_fp = pysam.AlignmentFile(bam_fname)
  long_qname_table = load_qname_sidecar(sidecar_fname)
  unsorted_out_fname = out_fname + '.unsorted'
  out_fp = pysam.AlignmentFile(unsorted_out_fname, 'wb', header=bam_fp.header)
  in_cnt = 0
  max_d = d_range[1] + 10000
  read_dict = {}
  t0 = time.time()
  for rd in bam_fp.fetch(until_eof=True):
    if rd.flag & 0b100100000000:  # Skip supplementary or secondary alignments
      continue
    in_cnt += 1
    if in_cnt % 1000000 == 0:
      t1 = time.time()
      # Format fix: '{:2f}' (min width 2, six decimals) was clearly meant to be '{:.2f}'
      logger.debug('Processed {} reads in {:.2f}s ({:.2f} r/s) {}'.format(
        in_cnt, t1 - t0, in_cnt / (t1 - t0),
        '' if se_bam else '(dict size {})'.format(len(read_dict))))
    if se_bam:
      keep, d_err = _filter_pass(rd)
      if keep:
        rd.set_tag('XD', d_err)
        out_fp.write(rd)
    else:
      # Pair up mates by qname prefix; process the pair once both mates are in
      if rd.qname[:20] not in read_dict:
        read_dict[rd.qname[:20]] = [None, None]
      rl = read_dict[rd.qname[:20]]
      rl[0 if rd.is_read1 else 1] = rd
      if all(rl):
        keep1, d_err1 = _filter_pass(rl[0])
        keep2, d_err2 = _filter_pass(rl[1])
        if keep1 or keep2:  # Keep the pair if either mate passes
          rl[0].set_tag('XD', d_err1)
          rl[1].set_tag('XD', d_err2)
          out_fp.write(rl[0])
          out_fp.write(rl[1])
        del read_dict[rd.qname[:20]]
  out_fp.close()
  t1 = time.time()
  logger.debug('Processed {} reads in {:.2f}s ({:.2f} r/s) {}'.format(
    in_cnt, t1 - t0, in_cnt / (t1 - t0),
    '' if se_bam else '(dict size {})'.format(len(read_dict))))

  logger.debug('Sorting {} -> {}'.format(unsorted_out_fname, out_fname))
  t0 = time.time()
  pysam.sort('-m', '1G', '-o', out_fname, unsorted_out_fname)
  os.remove(unsorted_out_fname)
  t1 = time.time()
  logger.debug('... {:0.2f}s'.format(t1 - t0))

  if not do_not_index:
    # Log fix: the message reported bam_fname but it is out_fname that gets indexed
    logger.debug('BAM index {} ...'.format(out_fname))
    t0 = time.time()
    pysam.index(out_fname, out_fname + '.bai')
    t1 = time.time()
    logger.debug('... {:0.2f}s'.format(t1 - t0))
def main(bam_in_l, out_prefix, criterion, threshold, sidecar_fname=None):
  """Partition reads from several BAMs of the same read set by a scoring criterion.

  Reads are matched across the input BAMs by mate number + qname; once a read has
  been seen in every BAM it is scored and routed to the appropriate partition files.

  :param bam_in_l: list of input BAM file names
  :param out_prefix: prefix for partition output files and the summary file
  :param criterion: {'d_err', 'MQ', 'mapped', 'p_diff'}
  :param threshold: threshold for the chosen criterion, passed to the scoring function
  :param sidecar_fname: optional long qname overflow (sidecar) file
  :return:
  """
  assert len(bam_in_l) <= MAX_ORIGINS, "Can't do more than {} sets".format(
    MAX_ORIGINS)
  bam_fp_l = [pysam.AlignmentFile(bam_in) for bam_in in bam_in_l]
  long_qname_table = load_qname_sidecar(
    sidecar_fname) if sidecar_fname else None
  part_d = get_partition_description(out_prefix, len(bam_in_l))
  # One unsorted output BAM per partition per input BAM
  for p in part_d:
    p['filehandles'] = [
      pysam.AlignmentFile(p['filenames'][k] + '.unsorted.bam', 'wb', header=bam_fp_l[k].header)
      for k in range(len(bam_fp_l))
    ]
  scoring_fn = scoring_fn_dict.get(criterion)[0]

  # Hold reads until the same (mate, qname) has arrived from every input BAM
  incomplete_reads = {}
  cnt = -1
  t0 = time.time()
  for cnt, (n, r) in enumerate(iterate_over_bams(bam_fp_l)):
    if (cnt + 1) % 1000000 == 0:
      t1 = time.time()
      logger.debug(
        'Processed {} reads ({} incomplete ({:0.2}%)) in {:.2f}s ({:.2f} t/s)'
        .format(cnt + 1, len(incomplete_reads),
                100 * len(incomplete_reads) / (cnt + 1), t1 - t0,
                cnt / (t1 - t0)))
    # Key on mate number + qname so read1 and read2 are tracked separately
    ky = ('1' if r.is_read1 else '2') + r.qname
    if ky in incomplete_reads:
      ir = incomplete_reads[ky]
      ir[n] = r
      if all(ir):  # This read has now arrived from every input BAM
        process_these_reads(part_d, incomplete_reads.pop(ky), scoring_fn,
                            threshold, long_qname_table)
    else:
      ir = [None] * len(bam_fp_l)
      ir[n] = r
      incomplete_reads[ky] = ir

  t1 = time.time()
  # NOTE(review): if the inputs contain no reads, cnt stays -1 and the division
  # by (cnt + 1) below raises ZeroDivisionError — confirm empty inputs can't occur
  logger.debug(
    'Processed {} reads ({} incomplete ({:0.2}%)) in {:.2f}s ({:.2f} t/s)'.
    format(cnt + 1, len(incomplete_reads),
           100 * len(incomplete_reads) / (cnt + 1), t1 - t0, cnt / (t1 - t0)))

  logger.debug('Closing output files')
  for p in part_d:
    for fp in p['filehandles']:
      fp.close()

  # Nice to get this written out before the time consuming sort and index stages
  with open('{}_summary.txt'.format(out_prefix), 'w') as fp:
    for p in part_d:
      fp.write('{}\t{}\n'.format(p['partition_label'], p['total']))

  logger.debug('Sorting and indexing output BAMs')
  for p in part_d:
    for fn in p['filenames']:
      logger.debug('Sort and index {}'.format(fn))
      pysam.sort('-m', '1G', '-o', fn + '.bam', fn + '.unsorted.bam')
      os.remove(fn + '.unsorted.bam')
      pysam.index(fn + '.bam')
def main(bam_fname, sidecar_fname, out_fname,
         d_range=(-200, 200), reject_d_range=False,
         v_range=(-200, 200), reject_v_range=False,
         reject_reads_with_variants=False,
         reject_reference_reads=False,
         strict_scoring=False, do_not_index=True, processes=2):
  """This function extracts reads from a simulation BAM that match the filter criteria

  :param bam_fname: simulation BAM to filter
  :param sidecar_fname: long qname overflow (sidecar) file
  :param out_fname: filtered, sorted output BAM
  :param d_range: (lo, hi) range of alignment error d_err
  :param reject_d_range: if True, reject reads whose d_err is inside d_range
  :param v_range: (lo, hi) range of variant sizes
  :param reject_v_range: if True, reject variants inside v_range
  :param reject_reads_with_variants: drop reads carrying variants
  :param reject_reference_reads: drop reads carrying no variants
  :param strict_scoring: forwarded to score_alignment_error
  :param do_not_index: when False, also build a .bai index for the output
  :param processes: unused in this function body — presumably reserved; verify against callers
  :return:
  """
  def _filter_pass(_r):
    """Score one read against the filter settings.

    :param _r: aligned segment
    :return: (keep, d_err) — (True/False, alignment error)
    """
    ri = parse_qname(
      _r.qname, long_qname_table=long_qname_table)[1 if _r.is_read2 else 0]
    is_ref_read = len(ri.v_list) == 0
    if is_ref_read and reject_reference_reads:
      return False, 0
    if not is_ref_read and reject_reads_with_variants:
      return False, 0
    _d_err = score_alignment_error(_r, ri=ri, max_d=max_d, strict=strict_scoring)
    in_d_err_range = d_range[0] <= _d_err <= d_range[1]
    if in_d_err_range == reject_d_range:
      return False, 0
    if not is_ref_read:
      # All variants are inside/outside v_range and we want to/do not want to reject the range
      if all((v_range[0] <= v <= v_range[1]) == reject_v_range for v in ri.v_list):
        return False, 0
    return True, _d_err

  se_bam = is_single_end_bam(bam_fname)
  bam_fp = pysam.AlignmentFile(bam_fname)
  long_qname_table = load_qname_sidecar(sidecar_fname)
  unsorted_out_fname = out_fname + '.unsorted'
  out_fp = pysam.AlignmentFile(unsorted_out_fname, 'wb', header=bam_fp.header)
  in_cnt = 0
  max_d = d_range[1] + 10000  # Generous cap on d_err passed to the scorer
  read_dict = {}
  t0 = time.time()
  for rd in bam_fp.fetch(until_eof=True):
    if rd.flag & 0b100100000000:
      continue  # Skip supplementary or secondary alignments
    in_cnt += 1
    if in_cnt % 1000000 == 0:
      t1 = time.time()
      # NOTE(review): '{:2f}' is min-width 2 with six decimals — likely meant '{:.2f}'
      logger.debug(
        'Processed {} reads in {:2f}s ({:2f} r/s) {}'.format(
          in_cnt, t1 - t0, in_cnt / (t1 - t0),
          '' if se_bam else '(dict size {})'.format(len(read_dict))))
    if se_bam:
      keep, d_err = _filter_pass(rd)
      if keep:
        rd.set_tag('XD', d_err)
        out_fp.write(rd)
    else:
      # Pair up mates by qname prefix; process the pair once both mates are in
      if rd.qname[:20] not in read_dict:
        read_dict[rd.qname[:20]] = [None, None]
      rl = read_dict[rd.qname[:20]]
      rl[0 if rd.is_read1 else 1] = rd
      if all(rl):
        keep1, d_err1 = _filter_pass(rl[0])
        keep2, d_err2 = _filter_pass(rl[1])
        if keep1 or keep2:  # Keep the pair if either mate passes
          rl[0].set_tag('XD', d_err1)
          rl[1].set_tag('XD', d_err2)
          out_fp.write(rl[0])
          out_fp.write(rl[1])
        del read_dict[rd.qname[:20]]
  out_fp.close()
  t1 = time.time()
  logger.debug(
    'Processed {} reads in {:2f}s ({:2f} r/s) {}'.format(
      in_cnt, t1 - t0, in_cnt / (t1 - t0),
      '' if se_bam else '(dict size {})'.format(len(read_dict))))

  logger.debug('Sorting {} -> {}'.format(unsorted_out_fname, out_fname))
  t0 = time.time()
  pysam.sort('-m', '1G', '-o', out_fname, unsorted_out_fname)
  os.remove(unsorted_out_fname)
  t1 = time.time()
  logger.debug('... {:0.2f}s'.format(t1 - t0))

  if not do_not_index:
    # NOTE(review): message reports bam_fname but out_fname is what gets indexed
    logger.debug('BAM index {} ...'.format(bam_fname))
    t0 = time.time()
    pysam.index(out_fname, out_fname + '.bai')
    t1 = time.time()
    logger.debug('... {:0.2f}s'.format(t1 - t0))