Пример #1
0
def main(bam_fname,
         sidecar_fname,
         max_xd=200,
         max_MQ=70,
         strict_scoring=False,
         max_vlen=200,
         processes=2):
    """Bin the reads of a simulated-read BAM into a three dimensional histogram.

    Every contig name in the BAM is placed on a work queue, ``processes``
    worker processes are started, and the result each worker puts on the
    result queue is summed into one matrix.

    The dimensions are:
      Xd - alignment error  [0]  -max_xd, ... 0, ... +max_xd, wrong_chrom, unmapped (2 * max_xd + 3)
      MQ - mapping quality  [1]  0, ... max_MQ (max_MQ + 1)
      vlen - length of variant carried by read [2]  Ref, < -max_vlen , -max_vlen, ... 0, ... +max_vlen, > +max_vlen
                                                    ( 2 * max_vlen + 1 + 2 + 1)

    :param bam_fname: path of the BAM file to process
    :param sidecar_fname: path handed to ``load_qname_sidecar``
    :param max_xd: forwarded to the workers (see Xd above)
    :param max_MQ: forwarded to the workers (see MQ above)
    :param strict_scoring: forwarded to the workers
    :param max_vlen: forwarded to the workers (see vlen above)
    :param processes: number of worker processes to start
    :return: sum of the per-worker matrices (None if ``processes`` is 0)
    """
    # Set up the I/O queues and place all BAM contigs on the work queue.
    work_q, result_q = Queue(), Queue()
    # The BAM is opened here only to list its contigs, so close it right away
    # instead of leaving the handle to the garbage collector.
    with pysam.AlignmentFile(bam_fname) as bam_fp:
        for ref in bam_fp.references:
            work_q.put(ref)
    # One stop code per worker (presumably a worker quits on reading one).
    for _ in range(processes):
        work_q.put(__process_stop_code__)

    # Start workers
    long_qname_table = load_qname_sidecar(sidecar_fname)
    p_list = [
        Process(target=worker,
                args=(i, bam_fname, long_qname_table, max_xd, max_MQ, max_vlen,
                      strict_scoring, work_q, result_q))
        for i in range(processes)
    ]
    for p in p_list:
        p.start()

    # Sum the results from each worker together: exactly one
    # (matrix, read count) pair is expected per worker.
    t0 = time.time()
    xmv_mat, tot_cnt = None, 0
    for _ in range(processes):
        xmv_mat_shard, cnt = result_q.get()
        tot_cnt += cnt
        xmv_mat = xmv_mat_shard if xmv_mat is None else xmv_mat + xmv_mat_shard
    elapsed = time.time() - t0
    # Guard against a zero elapsed time (e.g. processes == 0 on a coarse clock).
    rate = tot_cnt / elapsed if elapsed > 0 else float('inf')
    logger.debug('Processed {} reads in {:.2f}s ({:.2f} r/s)'.format(
        tot_cnt, elapsed, rate))

    # Orderly exit
    for p in p_list:
        p.join()

    return xmv_mat
Пример #2
0
def main(fastq_fname, qname_overflow_fname, max_expected_qname_length=500):
  """Build a histogram of full qname lengths for the reads in a FASTQ.

  A read name that does not end in '*' is looked up in the sidecar table
  (keyed on the text before the first '|') and the length of the table entry
  is used; otherwise the length of the name itself is used. Lengths above
  ``max_expected_qname_length`` are counted in the last bin.

  :param fastq_fname: FASTQ file to scan
  :param qname_overflow_fname: path handed to ``load_qname_sidecar``
  :param max_expected_qname_length: index of the last (overflow) bin
  :return: list of ``max_expected_qname_length + 1`` counts, indexed by length
  """
  sidecar = load_qname_sidecar(qname_overflow_fname)
  last_bin = max_expected_qname_length
  histogram = [0] * (last_bin + 1)

  with pysam.FastxFile(fastq_fname) as fastq:
    for read in fastq:
      name = read.name
      if name[-1] != '*':
        # Name does not carry the terminator: measure the sidecar entry instead
        full_len = len(sidecar.get(name.split('|', 1)[0]))
      else:
        full_len = len(name)
      histogram[min(full_len, last_bin)] += 1

  return histogram
Пример #3
0
def main(mainfile_in,
         sidecar_in,
         mainfile_out,
         sidecar_out,
         truncate_to=240,
         file_type=None):
    """Rewrite a BAM or FASTQ with its qnames re-truncated to ``truncate_to``.

    Each qname is replaced by its entry in the table loaded from
    ``sidecar_in`` (keyed on the text before the first '|'; the qname itself
    is used when there is no entry) and its last character is replaced with
    '*'. A qname longer than ``truncate_to`` is written in full to
    ``sidecar_out`` (one '@'-prefixed line each) and truncated in the main
    output.

    :param mainfile_in: input BAM or FASTQ
    :param sidecar_in: path handed to ``load_qname_sidecar``
    :param mainfile_out: output BAM or FASTQ (same kind as the input)
    :param sidecar_out: output file receiving the over-long qnames
    :param truncate_to: maximum qname length kept in the main output
    :param file_type: If supplied ("BAM" or "FASTQ") then we will not auto detect
    :return: None
    """
    def _log_progress(n, t_start):
        # A zero elapsed time (coarse clock, empty input) must not crash logging
        elapsed = time.time() - t_start
        rate = n / elapsed if elapsed > 0 else float('inf')
        logger.debug('Processed {} reads in {:0.2f}s ({:0.2f} r/s)'.format(
            n, elapsed, rate))

    # NOTE(review): any type other than 'BAM' maps to a falsy value and is
    # therefore handled on the FASTQ path - confirm this is intended.
    ft = {'BAM': 1, 'FASTQ': 0}.get(file_type or auto_detect(mainfile_in))
    fp_in = (pysam.AlignmentFile(mainfile_in, mode='rb')
             if ft else pysam.FastxFile(mainfile_in))

    # All three handles are context managed so that the outputs are flushed
    # and closed (also on error) rather than left to the garbage collector.
    with fp_in:
        fp_out = (pysam.AlignmentFile(mainfile_out, mode='wb',
                                      header=fp_in.header)
                  if ft else open(mainfile_out, 'w'))
        with fp_out, open(sidecar_out, 'w') as side_car_fp:
            logger.debug('Starting conversion ...')
            long_qname_table = load_qname_sidecar(sidecar_in)
            # Count from 1 so that an empty input is reported as 0 reads
            n_reads, t0 = 0, time.time()
            for n_reads, r in enumerate(fp_in, start=1):
                qname = r.qname if ft else r.name  # Thanks pysam for the inconsistent naming. What's a few CPU cycles between friends?
                qname = long_qname_table.get(
                    qname.split('|', 1)[0], qname
                )  # Don't pass the wrong side car file, you won't know what hit you
                qname = qname[:-1] + '*'  # Older qnames had "|" instead of "*". "*" is more unambiguous as a termination character
                if len(qname) > truncate_to:
                    side_car_fp.write('@' + qname + '\n')
                    qname = qname[:truncate_to]
                if ft:
                    r.qname = qname
                    fp_out.write(r)
                else:
                    fp_out.write('@{}\n{}\n+\n{}\n'.format(
                        qname, r.sequence, r.quality))

                if n_reads % 100000 == 0:
                    _log_progress(n_reads, t0)

            _log_progress(n_reads, t0)
Пример #4
0
Файл: xmv.py Проект: sbg/Mitty
def main(bam_fname, sidecar_fname, max_xd=200, max_MQ=70, strict_scoring=False,  max_vlen=200, processes=2):
  """Bin the reads of a simulated-read BAM into a three dimensional histogram.

  Every contig name in the BAM is placed on a work queue, ``processes`` worker
  processes are started, and the result each worker puts on the result queue
  is summed into one matrix.

  The dimensions are:
    Xd - alignment error  [0]  -max_xd, ... 0, ... +max_xd, wrong_chrom, unmapped (2 * max_xd + 3)
    MQ - mapping quality  [1]  0, ... max_MQ (max_MQ + 1)
    vlen - length of variant carried by read [2]  Ref, < -max_vlen , -max_vlen, ... 0, ... +max_vlen, > +max_vlen
                                                  ( 2 * max_vlen + 1 + 2 + 1)

  :param bam_fname: path of the BAM file to process
  :param sidecar_fname: path handed to ``load_qname_sidecar``
  :param max_xd: forwarded to the workers (see Xd above)
  :param max_MQ: forwarded to the workers (see MQ above)
  :param strict_scoring: forwarded to the workers
  :param max_vlen: forwarded to the workers (see vlen above)
  :param processes: number of worker processes to start
  :return: sum of the per-worker matrices (None if ``processes`` is 0)
  """
  # Set up the I/O queues and place all BAM contigs on the work queue
  work_q, result_q = Queue(), Queue()
  # The BAM is only needed for its contig list: close the handle immediately
  # rather than leaking it for the lifetime of the run.
  with pysam.AlignmentFile(bam_fname) as bam_fp:
    for ref in bam_fp.references:
      work_q.put(ref)
  # One stop code per worker (presumably a worker quits on reading one)
  for _ in range(processes):
    work_q.put(__process_stop_code__)

  # Start workers
  long_qname_table = load_qname_sidecar(sidecar_fname)
  p_list = [
    Process(target=worker, args=(i, bam_fname, long_qname_table, max_xd, max_MQ, max_vlen, strict_scoring,
                                 work_q, result_q))
    for i in range(processes)
  ]
  for p in p_list:
    p.start()

  # Sum the results from each worker together: one (matrix, read count) pair
  # is expected from every worker
  t0 = time.time()
  xmv_mat, tot_cnt = None, 0
  for _ in range(processes):
    xmv_mat_shard, cnt = result_q.get()
    tot_cnt += cnt
    xmv_mat = xmv_mat_shard if xmv_mat is None else xmv_mat + xmv_mat_shard
  elapsed = time.time() - t0
  # A zero elapsed time must not turn the log line into a ZeroDivisionError
  rate = tot_cnt / elapsed if elapsed > 0 else float('inf')
  logger.debug('Processed {} reads in {:.2f}s ({:.2f} r/s)'.format(tot_cnt, elapsed, rate))

  # Orderly exit
  for p in p_list:
    p.join()

  return xmv_mat
Пример #5
0
def parse_read_qnames(sidecar_fname, titer):
    """Pair every read of every template with its parsed qname information.

    The qname of the first read of a template is parsed with ``parse_qname``
    and each mate receives the entry matching its mate number (index 1 for
    read 2, index 0 otherwise). When ``sidecar_fname`` is None nothing is
    parsed and 'read_info' is None for every read.

    :param sidecar_fname: path handed to ``load_qname_sidecar``, or None
    :param titer: iterable of templates, each an indexable sequence of reads
    :return: generator yielding, per template, a tuple of dicts with the keys
             'read' and 'read_info'
    """
    long_qname_table = None
    if sidecar_fname is not None:
        long_qname_table = load_qname_sidecar(sidecar_fname)

    for template in titer:
        if long_qname_table is None:
            per_mate_info = [None, None]
        else:
            per_mate_info = parse_qname(template[0].qname,
                                        long_qname_table=long_qname_table)
        yield tuple(
            {'read': mate, 'read_info': per_mate_info[1 if mate.is_read2 else 0]}
            for mate in template)
Пример #6
0
def parse_read_qnames(sidecar_fname, titer):
  """Attach parsed qname information to the reads of each template.

  For every template the qname of its first read is run through
  ``parse_qname``; each mate is then paired with element 1 of the result if it
  is read 2 and element 0 otherwise. With ``sidecar_fname`` set to None no
  parsing is done and every 'read_info' is None.

  :param sidecar_fname: path handed to ``load_qname_sidecar``, or None
  :param titer: iterable of templates, each an indexable sequence of reads
  :return: generator of tuples (one per template) of
           {'read': ..., 'read_info': ...} dicts
  """
  have_sidecar = sidecar_fname is not None
  long_qname_table = load_qname_sidecar(sidecar_fname) if have_sidecar else None

  for template in titer:
    info = [None, None]
    if long_qname_table is not None:
      info = parse_qname(template[0].qname, long_qname_table=long_qname_table)

    annotated = []
    for mate in template:
      mate_idx = 1 if mate.is_read2 else 0
      annotated.append({'read': mate, 'read_info': info[mate_idx]})
    yield tuple(annotated)
Пример #7
0
def main(mainfile_in, sidecar_in, mainfile_out, sidecar_out, truncate_to=240, file_type=None):
  """Rewrite a BAM or FASTQ with its qnames re-truncated to ``truncate_to``.

  Each qname is replaced by its entry in the table loaded from ``sidecar_in``
  (keyed on the text before the first '|'; the qname itself is used when there
  is no entry) and its last character is replaced with '*'. A qname longer
  than ``truncate_to`` is written in full to ``sidecar_out`` (one '@'-prefixed
  line each) and truncated in the main output.

  :param mainfile_in: input BAM or FASTQ
  :param sidecar_in: path handed to ``load_qname_sidecar``
  :param mainfile_out: output BAM or FASTQ (same kind as the input)
  :param sidecar_out: output file receiving the over-long qnames
  :param truncate_to: maximum qname length kept in the main output
  :param file_type: If supplied ("BAM" or "FASTQ") then we will not auto detect
  :return: None
  """
  def _log_progress(n, t_start):
    # A zero elapsed time (coarse clock, empty input) must not crash logging
    elapsed = time.time() - t_start
    rate = n / elapsed if elapsed > 0 else float('inf')
    logger.debug('Processed {} reads in {:0.2f}s ({:0.2f} r/s)'.format(n, elapsed, rate))

  # NOTE(review): any type other than 'BAM' maps to a falsy value and is
  # therefore handled on the FASTQ path - confirm this is intended.
  ft = {
    'BAM': 1,
    'FASTQ': 0
  }.get(file_type or auto_detect(mainfile_in))
  fp_in = pysam.AlignmentFile(mainfile_in, mode='rb') if ft else pysam.FastxFile(mainfile_in)

  # Context-manage all three handles so that the outputs are flushed and
  # closed (also on error) rather than left to the garbage collector.
  with fp_in:
    fp_out = pysam.AlignmentFile(mainfile_out, mode='wb', header=fp_in.header) if ft else open(mainfile_out, 'w')
    with fp_out, open(sidecar_out, 'w') as side_car_fp:
      logger.debug('Starting conversion ...')
      long_qname_table = load_qname_sidecar(sidecar_in)
      # Count from 1 so that an empty input is reported as 0 reads
      n_reads, t0 = 0, time.time()
      for n_reads, r in enumerate(fp_in, start=1):
        qname = r.qname if ft else r.name  # Thanks pysam for the inconsistent naming. What's a few CPU cycles between friends?
        qname = long_qname_table.get(qname.split('|', 1)[0], qname)  # Don't pass the wrong side car file, you won't know what hit you
        qname = qname[:-1] + '*'  # Older qnames had "|" instead of "*". "*" is more unambiguous as a termination character
        if len(qname) > truncate_to:
          side_car_fp.write('@' + qname + '\n')
          qname = qname[:truncate_to]
        if ft:
          r.qname = qname
          fp_out.write(r)
        else:
          fp_out.write('@{}\n{}\n+\n{}\n'.format(qname, r.sequence, r.quality))

        if n_reads % 100000 == 0:
          _log_progress(n_reads, t0)

      _log_progress(n_reads, t0)
Пример #8
0
def main(bam_in_l, out_prefix, criterion, threshold, sidecar_fname=None):
  """Partition the reads of several BAMs according to a scoring criterion.

  Reads are pulled from all the input BAMs via ``iterate_over_bams`` and
  grouped on mate number plus qname. As soon as a read has been seen in every
  input BAM the group is handed to ``process_these_reads``. Afterwards
  ``<out_prefix>_summary.txt`` is written and every output BAM is sorted and
  indexed. Reads that never show up in all inputs are only reported in the
  log as "incomplete".

  :param bam_in_l: list of input BAM file names (at most MAX_ORIGINS)
  :param out_prefix: prefix handed to ``get_partition_description`` and used
                     for the summary file name
  :param criterion: {'d_err', 'MQ', 'mapped', 'p_diff'}
  :param threshold: passed through to ``process_these_reads``
  :param sidecar_fname: optional path handed to ``load_qname_sidecar``
  :return: None
  """
  assert len(bam_in_l) <= MAX_ORIGINS, "Can't do more than {} sets".format(MAX_ORIGINS)

  def _log_progress(n_done):
    # Both divisions are guarded: empty inputs give n_done == 0, and a coarse
    # clock can give a zero elapsed time.
    elapsed = time.time() - t0
    pct_incomplete = 100 * len(incomplete_reads) / n_done if n_done else 0.0
    rate = n_done / elapsed if elapsed > 0 else float('inf')
    logger.debug('Processed {} reads ({} incomplete ({:0.2f}%)) in {:.2f}s ({:.2f} t/s)'.format(
      n_done, len(incomplete_reads), pct_incomplete, elapsed, rate))

  bam_fp_l = [pysam.AlignmentFile(bam_in) for bam_in in bam_in_l]
  long_qname_table = load_qname_sidecar(sidecar_fname) if sidecar_fname else None

  part_d = get_partition_description(out_prefix, len(bam_in_l))
  for p in part_d:
    # One output BAM per (partition, input BAM), reusing the input's header
    p['filehandles'] = [pysam.AlignmentFile(p['filenames'][k] + '.unsorted.bam', 'wb', header=bam_fp_l[k].header)
                        for k in range(len(bam_fp_l))]
  scoring_fn = scoring_fn_dict.get(criterion)[0]

  incomplete_reads = {}  # key -> list with one slot per input BAM
  n_reads = 0
  t0 = time.time()
  for n_reads, (n, r) in enumerate(iterate_over_bams(bam_fp_l), start=1):
    if n_reads % 1000000 == 0:
      _log_progress(n_reads)

    ky = ('1' if r.is_read1 else '2') + r.qname

    if ky in incomplete_reads:
      ir = incomplete_reads[ky]
      ir[n] = r
      if all(ir):  # seen in every input BAM
        process_these_reads(part_d, incomplete_reads.pop(ky), scoring_fn, threshold, long_qname_table)
    else:
      ir = [None] * len(bam_fp_l)
      ir[n] = r
      incomplete_reads[ky] = ir

  _log_progress(n_reads)

  logger.debug('Closing output files')
  for p in part_d:
    for fp in p['filehandles']:
      fp.close()
  # The inputs are fully consumed by now; release their handles as well
  for bam_fp in bam_fp_l:
    bam_fp.close()

  # Nice to get this written out before the time consuming sort and index stages
  with open('{}_summary.txt'.format(out_prefix), 'w') as fp:
    for p in part_d:
      fp.write('{}\t{}\n'.format(p['partition_label'], p['total']))

  logger.debug('Sorting and indexing output BAMs')
  for p in part_d:
    for fn in p['filenames']:
      logger.debug('Sort and index {}'.format(fn))
      pysam.sort('-m', '1G', '-o', fn + '.bam', fn + '.unsorted.bam')
      os.remove(fn + '.unsorted.bam')
      pysam.index(fn + '.bam')
Пример #9
0
def main(bam_fname,
         sidecar_fname,
         out_fname,
         d_range=(-200, 200),
         reject_d_range=False,
         v_range=(-200, 200),
         reject_v_range=False,
         reject_reads_with_variants=False,
         reject_reference_reads=False,
         strict_scoring=False,
         do_not_index=True,
         processes=2):
    """Extract the reads of a simulation BAM that match the filter criteria.

    Passing reads are tagged with 'XD' (their alignment error), written to
    ``out_fname + '.unsorted'``, and that file is then sorted into
    ``out_fname`` and removed. For a paired-end BAM both mates are written as
    soon as either one passes; a read whose mate never turns up is dropped.
    Secondary and supplementary alignments are skipped.

    :param bam_fname: input BAM
    :param sidecar_fname: path handed to ``load_qname_sidecar``
    :param out_fname: name of the sorted output BAM
    :param d_range: (low, high) inclusive range of alignment error
    :param reject_d_range: if True reads inside ``d_range`` are rejected,
                           otherwise reads outside it are
    :param v_range: (low, high) inclusive range tested against each entry of
                    the read's ``v_list``
    :param reject_v_range: if True a read is rejected when all of its variants
                           are inside ``v_range``, otherwise when all of them
                           are outside
    :param reject_reads_with_variants: reject reads with a non-empty ``v_list``
    :param reject_reference_reads: reject reads with an empty ``v_list``
    :param strict_scoring: forwarded to ``score_alignment_error``
    :param do_not_index: if False the output BAM is indexed after sorting
    :param processes: not used by this function
    :return: None
    """
    def _filter_pass(_r):
        """Apply the filters to one read.

        :param _r: read to test
        :return: (True, d_err) when the read passes, (False, 0) otherwise
        """
        ri = parse_qname(
            _r.qname,
            long_qname_table=long_qname_table)[1 if _r.is_read2 else 0]

        is_ref_read = len(ri.v_list) == 0
        if is_ref_read and reject_reference_reads:
            return False, 0

        if not is_ref_read and reject_reads_with_variants:
            return False, 0

        _d_err = score_alignment_error(_r,
                                       ri=ri,
                                       max_d=max_d,
                                       strict=strict_scoring)

        in_d_err_range = d_range[0] <= _d_err <= d_range[1]
        if in_d_err_range == reject_d_range:
            return False, 0

        if not is_ref_read:
            # All variants are inside/outside v_range and we want to/do not want to reject the range
            if all((v_range[0] <= v <= v_range[1]) == reject_v_range
                   for v in ri.v_list):
                return False, 0

        return True, _d_err

    def _log_progress():
        # Guard the rate against a zero elapsed time on a coarse clock
        elapsed = time.time() - t0
        rate = in_cnt / elapsed if elapsed > 0 else float('inf')
        logger.debug('Processed {} reads in {:.2f}s ({:.2f} r/s) {}'.format(
            in_cnt, elapsed, rate,
            '' if se_bam else '(dict size {})'.format(len(read_dict))))

    se_bam = is_single_end_bam(bam_fname)
    long_qname_table = load_qname_sidecar(sidecar_fname)
    unsorted_out_fname = out_fname + '.unsorted'

    in_cnt = 0
    max_d = d_range[1] + 10000
    read_dict = {}  # paired-end only: qname prefix -> [read 1, read 2]

    t0 = time.time()
    # Both BAM handles are closed on leaving the block, also on error
    with pysam.AlignmentFile(bam_fname) as bam_fp, \
            pysam.AlignmentFile(unsorted_out_fname, 'wb',
                                header=bam_fp.header) as out_fp:
        for rd in bam_fp.fetch(until_eof=True):
            if rd.flag & 0b100100000000:
                continue  # Skip supplementary or secondary alignments
            in_cnt += 1
            if in_cnt % 1000000 == 0:
                _log_progress()

            if se_bam:
                keep, d_err = _filter_pass(rd)
                if keep:
                    rd.set_tag('XD', d_err)
                    out_fp.write(rd)
                continue

            # Paired end: hold the read until its mate arrives. Mates are
            # matched on the first 20 characters of the qname.
            key = rd.qname[:20]
            if key not in read_dict:
                read_dict[key] = [None, None]

            rl = read_dict[key]
            rl[0 if rd.is_read1 else 1] = rd

            if all(rl):
                keep1, d_err1 = _filter_pass(rl[0])
                keep2, d_err2 = _filter_pass(rl[1])
                if keep1 or keep2:
                    rl[0].set_tag('XD', d_err1)
                    rl[1].set_tag('XD', d_err2)
                    out_fp.write(rl[0])
                    out_fp.write(rl[1])
                del read_dict[key]

    _log_progress()

    logger.debug('Sorting {} -> {}'.format(unsorted_out_fname, out_fname))
    t0 = time.time()
    pysam.sort('-m', '1G', '-o', out_fname, unsorted_out_fname)
    os.remove(unsorted_out_fname)
    t1 = time.time()
    logger.debug('... {:0.2f}s'.format(t1 - t0))

    if not do_not_index:
        # It is the freshly sorted output that gets indexed, so log its name
        logger.debug('BAM index {} ...'.format(out_fname))
        t0 = time.time()
        pysam.index(out_fname, out_fname + '.bai')
        t1 = time.time()
        logger.debug('... {:0.2f}s'.format(t1 - t0))
Пример #10
0
def main(bam_in_l, out_prefix, criterion, threshold, sidecar_fname=None):
    """Partition the reads of several BAMs according to a scoring criterion.

    Reads are pulled from all the input BAMs via ``iterate_over_bams`` and
    grouped on mate number plus qname. As soon as a read has been seen in
    every input BAM the group is handed to ``process_these_reads``. Afterwards
    ``<out_prefix>_summary.txt`` is written and every output BAM is sorted and
    indexed. Reads that never show up in all inputs are only reported in the
    log as "incomplete".

    :param bam_in_l: list of input BAM file names (at most MAX_ORIGINS)
    :param out_prefix: prefix handed to ``get_partition_description`` and used
                       for the summary file name
    :param criterion: {'d_err', 'MQ', 'mapped', 'p_diff'}
    :param threshold: passed through to ``process_these_reads``
    :param sidecar_fname: optional path handed to ``load_qname_sidecar``
    :return: None
    """
    assert len(bam_in_l) <= MAX_ORIGINS, "Can't do more than {} sets".format(
        MAX_ORIGINS)

    def _log_progress(n_done):
        # Both divisions are guarded: empty inputs give n_done == 0, and a
        # coarse clock can give a zero elapsed time.
        elapsed = time.time() - t0
        pct_incomplete = (100 * len(incomplete_reads) / n_done
                          if n_done else 0.0)
        rate = n_done / elapsed if elapsed > 0 else float('inf')
        logger.debug(
            'Processed {} reads ({} incomplete ({:0.2f}%)) in {:.2f}s ({:.2f} t/s)'
            .format(n_done, len(incomplete_reads), pct_incomplete, elapsed,
                    rate))

    bam_fp_l = [pysam.AlignmentFile(bam_in) for bam_in in bam_in_l]
    long_qname_table = load_qname_sidecar(
        sidecar_fname) if sidecar_fname else None

    part_d = get_partition_description(out_prefix, len(bam_in_l))
    for p in part_d:
        # One output BAM per (partition, input BAM), reusing the input's header
        p['filehandles'] = [
            pysam.AlignmentFile(p['filenames'][k] + '.unsorted.bam',
                                'wb',
                                header=bam_fp_l[k].header)
            for k in range(len(bam_fp_l))
        ]
    scoring_fn = scoring_fn_dict.get(criterion)[0]

    incomplete_reads = {}  # key -> list with one slot per input BAM
    n_reads = 0
    t0 = time.time()
    for n_reads, (n, r) in enumerate(iterate_over_bams(bam_fp_l), start=1):
        if n_reads % 1000000 == 0:
            _log_progress(n_reads)

        ky = ('1' if r.is_read1 else '2') + r.qname

        if ky in incomplete_reads:
            ir = incomplete_reads[ky]
            ir[n] = r
            if all(ir):  # seen in every input BAM
                process_these_reads(part_d, incomplete_reads.pop(ky),
                                    scoring_fn, threshold, long_qname_table)
        else:
            ir = [None] * len(bam_fp_l)
            ir[n] = r
            incomplete_reads[ky] = ir

    _log_progress(n_reads)

    logger.debug('Closing output files')
    for p in part_d:
        for fp in p['filehandles']:
            fp.close()
    # The inputs are fully consumed by now; release their handles as well
    for bam_fp in bam_fp_l:
        bam_fp.close()

    # Nice to get this written out before the time consuming sort and index stages
    with open('{}_summary.txt'.format(out_prefix), 'w') as fp:
        for p in part_d:
            fp.write('{}\t{}\n'.format(p['partition_label'], p['total']))

    logger.debug('Sorting and indexing output BAMs')
    for p in part_d:
        for fn in p['filenames']:
            logger.debug('Sort and index {}'.format(fn))
            pysam.sort('-m', '1G', '-o', fn + '.bam', fn + '.unsorted.bam')
            os.remove(fn + '.unsorted.bam')
            pysam.index(fn + '.bam')
Пример #11
0
def main(bam_fname, sidecar_fname, out_fname,
         d_range=(-200, 200), reject_d_range=False,
         v_range=(-200, 200), reject_v_range=False,
         reject_reads_with_variants=False,
         reject_reference_reads=False,
         strict_scoring=False, do_not_index=True, processes=2):
  """Extract the reads of a simulation BAM that match the filter criteria.

  Passing reads are tagged with 'XD' (their alignment error), written to
  ``out_fname + '.unsorted'``, and that file is then sorted into ``out_fname``
  and removed. For a paired-end BAM both mates are written as soon as either
  one passes; a read whose mate never turns up is dropped. Secondary and
  supplementary alignments are skipped.

  :param bam_fname: input BAM
  :param sidecar_fname: path handed to ``load_qname_sidecar``
  :param out_fname: name of the sorted output BAM
  :param d_range: (low, high) inclusive range of alignment error
  :param reject_d_range: if True reads inside ``d_range`` are rejected,
                         otherwise reads outside it are
  :param v_range: (low, high) inclusive range tested against each entry of the
                  read's ``v_list``
  :param reject_v_range: if True a read is rejected when all of its variants
                         are inside ``v_range``, otherwise when all of them
                         are outside
  :param reject_reads_with_variants: reject reads with a non-empty ``v_list``
  :param reject_reference_reads: reject reads with an empty ``v_list``
  :param strict_scoring: forwarded to ``score_alignment_error``
  :param do_not_index: if False the output BAM is indexed after sorting
  :param processes: not used by this function
  :return: None
  """
  def _filter_pass(_r):
    """Apply the filters to one read.

    :param _r: read to test
    :return: (True, d_err) when the read passes, (False, 0) otherwise
    """
    ri = parse_qname(_r.qname, long_qname_table=long_qname_table)[1 if _r.is_read2 else 0]

    is_ref_read = len(ri.v_list) == 0
    if is_ref_read and reject_reference_reads:
      return False, 0

    if not is_ref_read and reject_reads_with_variants:
      return False, 0

    _d_err = score_alignment_error(_r, ri=ri, max_d=max_d, strict=strict_scoring)

    in_d_err_range = d_range[0] <= _d_err <= d_range[1]
    if in_d_err_range == reject_d_range:
      return False, 0

    if not is_ref_read:
      # All variants are inside/outside v_range and we want to/do not want to reject the range
      if all((v_range[0] <= v <= v_range[1]) == reject_v_range for v in ri.v_list):
        return False, 0

    return True, _d_err

  def _log_progress():
    # Guard the rate against a zero elapsed time on a coarse clock
    elapsed = time.time() - t0
    rate = in_cnt / elapsed if elapsed > 0 else float('inf')
    logger.debug(
      'Processed {} reads in {:.2f}s ({:.2f} r/s) {}'.format(
        in_cnt, elapsed, rate, '' if se_bam else '(dict size {})'.format(len(read_dict))))

  se_bam = is_single_end_bam(bam_fname)
  long_qname_table = load_qname_sidecar(sidecar_fname)
  unsorted_out_fname = out_fname + '.unsorted'

  in_cnt = 0
  max_d = d_range[1] + 10000
  read_dict = {}  # paired-end only: qname prefix -> [read 1, read 2]

  t0 = time.time()
  # Both BAM handles are closed on leaving the block, also on error
  with pysam.AlignmentFile(bam_fname) as bam_fp, \
      pysam.AlignmentFile(unsorted_out_fname, 'wb', header=bam_fp.header) as out_fp:
    for rd in bam_fp.fetch(until_eof=True):
      if rd.flag & 0b100100000000:
        continue  # Skip supplementary or secondary alignments
      in_cnt += 1
      if in_cnt % 1000000 == 0:
        _log_progress()

      if se_bam:
        keep, d_err = _filter_pass(rd)
        if keep:
          rd.set_tag('XD', d_err)
          out_fp.write(rd)
        continue

      # Paired end: hold the read until its mate arrives. Mates are matched on
      # the first 20 characters of the qname.
      key = rd.qname[:20]
      if key not in read_dict:
        read_dict[key] = [None, None]

      rl = read_dict[key]
      rl[0 if rd.is_read1 else 1] = rd

      if all(rl):
        keep1, d_err1 = _filter_pass(rl[0])
        keep2, d_err2 = _filter_pass(rl[1])
        if keep1 or keep2:
          rl[0].set_tag('XD', d_err1)
          rl[1].set_tag('XD', d_err2)
          out_fp.write(rl[0])
          out_fp.write(rl[1])
        del read_dict[key]

  _log_progress()

  logger.debug('Sorting {} -> {}'.format(unsorted_out_fname, out_fname))
  t0 = time.time()
  pysam.sort('-m', '1G', '-o', out_fname, unsorted_out_fname)
  os.remove(unsorted_out_fname)
  t1 = time.time()
  logger.debug('... {:0.2f}s'.format(t1 - t0))

  if not do_not_index:
    # It is the freshly sorted output that gets indexed, so log its name
    logger.debug('BAM index {} ...'.format(out_fname))
    t0 = time.time()
    pysam.index(out_fname, out_fname + '.bai')
    t1 = time.time()
    logger.debug('... {:0.2f}s'.format(t1 - t0))