Example #1
def main(argv):

  parser = make_argparser()
  args = parser.parse_args(argv[1:])

  logging.basicConfig(stream=args.log_file, level=args.volume, format='%(message)s')
  tone_down_logger()

  start_time = time.time()
  # If the user requested, report back some data about the start of the run.
  if args.phone_home:
    call = phone.Call(__file__, version.get_version(), platform=args.platform, test=args.test,
                      fail='warn')
    call.send_data('start')
    data = {
      'stdin': args.infile is sys.stdin,
      'aligner': args.aligner,
      'processes': args.processes,
      'queue_size': args.queue_size,
    }
    if data['stdin']:
      data['input_size'] = None
    else:
      data['input_size'] = os.path.getsize(args.infile.name)
    call.send_data('prelim', run_data=data)
  else:
    call = None

  # Execute as much of the script as possible in a try/except to catch any exception that occurs
  # and report it via ET.phone.
  try:
    if args.queue_size is not None and args.queue_size <= 0:
      fail('Error: --queue-size must be greater than zero.')

    # If we're using mafft, check that we can execute it.
    if args.aligner == 'mafft' and not distutils.spawn.find_executable('mafft'):
      fail('Error: Could not find "mafft" command on $PATH.')

    # Open a pool of worker processes.
    stats = {'duplexes':0, 'time':0, 'pairs':0, 'runs':0, 'failures':0, 'aligned_pairs':0}
    pool = parallel_tools.SyncAsyncPool(
      process_duplex, processes=args.processes, static_kwargs={'aligner':args.aligner},
      queue_size=args.queue_size, callback=process_result, callback_args=[stats]
    )

    try:
      # The main loop.
      align_families(args.infile, pool, stats, check_ids=args.check_ids)
    finally:
      # If an exception occurs in the parent without stopping the child processes, this will hang.
      # Make sure to kill the children in all cases.
      pool.close()
      pool.join()
      # Close the input filehandle (unless it's stdin).
      if args.infile is not sys.stdin:
        args.infile.close()

    # Final stats on the run.
    run_time = int(time.time() - start_time)
    max_mem = get_max_mem()
    logging.error(
      'Processed {pairs} read pairs in {duplexes} duplexes, with {failures} alignment failures.'
      .format(**stats)
    )
    if stats['aligned_pairs'] > 0 and stats['runs'] > 0:
      per_pair = stats['time'] / stats['aligned_pairs']
      per_run = stats['time'] / stats['runs']
      logging.error(f'{per_pair:0.3f}s per pair, {per_run:0.3f}s per run.')
    logging.error(f'in {run_time}s total time and {max_mem:0.2f}MB RAM.')

  except (Exception, KeyboardInterrupt) as exception:
    if args.phone_home and call:
      try:
        exception_data = getattr(exception, 'child_context', parallel_tools.get_exception_data())
        logging.critical(parallel_tools.format_traceback(exception_data))
        exception_data = parallel_tools.scrub_tb_paths(exception_data, script_path=__file__)
      except Exception:
        exception_data = {}
      run_time = int(time.time() - start_time)
      try:
        run_data = get_run_data(stats, pool, args.aligner)
      except Exception:  # also covers UnboundLocalError, if stats or pool was never assigned
        run_data = {}
      try:
        run_data['mem'] = get_max_mem()
      except Exception:
        pass
      run_data['failed'] = True
      if exception_data:
        run_data['exception'] = exception_data
      call.send_data('end', run_time=run_time, run_data=run_data)
      raise exception
    else:
      raise

  if args.phone_home and call:
    run_data = get_run_data(stats, pool, args.aligner, max_mem)
    call.send_data('end', run_time=run_time, run_data=run_data)
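Note: `parallel_tools.SyncAsyncPool` is project-specific and its implementation isn't shown here. As a rough illustration of the lifecycle the example enforces (always close() and join() the pool in a finally block so child processes can't outlive a crashed parent), here is a minimal sketch using only the standard library; `process_item` and `handle_result` are hypothetical stand-ins for `process_duplex` and `process_result`, and the callback mechanics are simplified.

import multiprocessing

def process_item(item):
  # Hypothetical stand-in for the real worker function (process_duplex).
  return item * 2

def handle_result(result, stats):
  # Hypothetical stand-in for the real callback (process_result).
  stats['runs'] += 1

def run_all(items):
  stats = {'runs': 0}
  pool = multiprocessing.Pool(processes=4)
  try:
    for result in pool.imap(process_item, items):
      handle_result(result, stats)
  finally:
    # Same rule as in the example: if the parent hits an exception without
    # stopping the children, the program hangs. Close and join unconditionally.
    pool.close()
    pool.join()
  return stats

if __name__ == '__main__':
  print(run_all(range(10)))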
Example #2
def main(argv):

  parser = make_argparser()
  args = parser.parse_args(argv[1:])

  logging.basicConfig(stream=args.log_file, level=args.volume, format='%(message)s')
  tone_down_logger()

  start_time = time.time()
  # If the user requested, report back some data about the start of the run.
  if args.phone_home:
    call = phone.Call(__file__, version.get_version(), platform=args.platform, test=args.test,
                      fail='warn')
    call.send_data('start')
    data = {
      'stdin': args.infile is sys.stdin,
      'aligner': args.aligner,
      'processes': args.processes,
      'queue_size': args.queue_size,
    }
    if data['stdin']:
      data['input_size'] = None
    else:
      data['input_size'] = os.path.getsize(args.infile.name)
    call.send_data('prelim', run_data=data)
  else:
    call = None

  # Execute as much of the script as possible in a try/except to catch any exception that occurs
  # and report it via ET.phone.
  try:
    if args.queue_size is not None and args.queue_size <= 0:
      fail('Error: --queue-size must be greater than zero.')

    # If we're using mafft, check that we can execute it.
    if args.aligner == 'mafft' and not distutils.spawn.find_executable('mafft'):
      fail('Error: Could not find "mafft" command on $PATH.')

    # Open a pool of worker processes.
    stats = {'duplexes':0, 'time':0, 'pairs':0, 'runs':0, 'failures':0, 'aligned_pairs':0}
    pool = parallel_tools.SyncAsyncPool(process_duplex,
                                        processes=args.processes,
                                        static_kwargs={'aligner':args.aligner},
                                        queue_size=args.queue_size,
                                        callback=process_result,
                                        callback_args=[stats],
                                       )
    """Now the main loop.
    This processes whole duplexes (pairs of strands) at a time for a future option to align the
    whole duplex at a time.
    duplex data structure:
    duplex = {
      'ab': [
        {'name1': 'read_name1a',
         'seq1':  'GATT-ACA',
         'qual1': 'sc!0 /J*',
         'name2': 'read_name1b',
         'seq2':  'ACTGACTA',
         'qual2': '34I&SDF)'
        },
        {'name1': 'read_name2a',
         ...
        },
        ...
      ],
      'ba': [
        ...
      ]
    }
    e.g.:
    seq = duplex[order][pair_num]['seq1']"""

    try:
      duplex = collections.OrderedDict()
      family = []
      barcode = None
      order = None
      for line in args.infile:
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) != 8:
          continue
        (this_barcode, this_order, name1, seq1, qual1, name2, seq2, qual2) = fields
        # If the barcode or order has changed, we're in a new family.
        # Process the reads we've previously gathered as one family and start a new family.
        if this_barcode != barcode or this_order != order:
          duplex[order] = family
          # If the barcode is different, we're at the end of the whole duplex. Process it and
          # start a new one. If the barcode is the same, we're in the same duplex but have
          # switched strands.
          if this_barcode != barcode:
            # logging.debug('processing {}: {} orders ({})'.format(barcode, len(duplex),
            #               '/'.join([str(len(duplex[o])) for o in duplex])))
            if barcode is not None:
              pool.compute(duplex, barcode)
              stats['duplexes'] += 1
            duplex = collections.OrderedDict()
          barcode = this_barcode
          order = this_order
          family = []
        pair = {'name1': name1, 'seq1': seq1, 'qual1': qual1, 'name2': name2, 'seq2': seq2,
                'qual2': qual2}
        family.append(pair)
        stats['pairs'] += 1
      # Process the last family (unless the input was empty).
      if barcode is not None:
        duplex[order] = family
        # logging.debug('processing {}: {} orders ({}) [last]'.format(barcode, len(duplex),
        #               '/'.join([str(len(duplex[o])) for o in duplex])))
        pool.compute(duplex, barcode)
        stats['duplexes'] += 1

      # Retrieve the remaining results.
      logging.info('Flushing remaining results from worker processes...')
      pool.flush()

    finally:
      # If an exception occurs in the parent without stopping the child processes, this will hang.
      # Make sure to kill the children in all cases.
      pool.close()
      pool.join()
      # Close the input filehandle (unless it's stdin).
      if args.infile is not sys.stdin:
        args.infile.close()

    # Final stats on the run.
    run_time = int(time.time() - start_time)
    max_mem = get_max_mem()
    logging.error('Processed {pairs} read pairs in {duplexes} duplexes, with {failures} alignment '
                  'failures.'.format(**stats))
    if stats['aligned_pairs'] > 0 and stats['runs'] > 0:
      per_pair = stats['time'] / stats['aligned_pairs']
      per_run = stats['time'] / stats['runs']
      logging.error('{:0.3f}s per pair, {:0.3f}s per run.'.format(per_pair, per_run))
    logging.error('in {}s total time and {:0.2f}MB RAM.'.format(run_time, max_mem))

  except (Exception, KeyboardInterrupt) as exception:
    if args.phone_home and call:
      try:
        exception_data = getattr(exception, 'child_context', parallel_tools.get_exception_data())
        logging.critical(parallel_tools.format_traceback(exception_data))
        exception_data = parallel_tools.scrub_tb_paths(exception_data, script_path=__file__)
      except Exception:
        exception_data = {}
      run_time = int(time.time() - start_time)
      try:
        run_data = get_run_data(stats, pool, args.aligner)
      except Exception:  # also covers UnboundLocalError, if stats or pool was never assigned
        run_data = {}
      try:
        run_data['mem'] = get_max_mem()
      except Exception:
        pass
      run_data['failed'] = True
      if exception_data:
        run_data['exception'] = exception_data
      call.send_data('end', run_time=run_time, run_data=run_data)
      raise exception
    else:
      raise

  if args.phone_home and call:
    run_data = get_run_data(stats, pool, args.aligner, max_mem)
    call.send_data('end', run_time=run_time, run_data=run_data)
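The loop in Example #2 assumes its input is a tab-separated file with eight columns (barcode, order, and name/seq/qual for each mate), sorted so that lines belonging to one family are adjacent. As a minimal sketch of that grouping, with made-up input lines and ignoring the streaming-to-the-pool aspect (everything is accumulated in memory here), the duplex structure can be built like this:

import collections

# Made-up example lines: barcode, order, name1, seq1, qual1, name2, seq2, qual2.
LINES = [
  'AAGG\tab\tr1a\tGATTACA\tIIIIIII\tr1b\tACTGACT\tIIIIIII',
  'AAGG\tba\tr2a\tGATTACA\tIIIIIII\tr2b\tACTGACT\tIIIIIII',
  'CCTT\tab\tr3a\tTTTTTTT\tIIIIIII\tr3b\tAAAAAAA\tIIIIIII',
]

def group_duplexes(lines):
  duplexes = collections.OrderedDict()
  for line in lines:
    fields = line.rstrip('\r\n').split('\t')
    if len(fields) != 8:
      continue
    barcode, order = fields[0], fields[1]
    pair = dict(zip(('name1', 'seq1', 'qual1', 'name2', 'seq2', 'qual2'), fields[2:]))
    duplex = duplexes.setdefault(barcode, collections.OrderedDict())
    duplex.setdefault(order, []).append(pair)
  return duplexes

for barcode, duplex in group_duplexes(LINES).items():
  print(barcode, {order: len(family) for order, family in duplex.items()})
# AAGG {'ab': 1, 'ba': 1}
# CCTT {'ab': 1}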
Example #3
def main(argv):

    parser = make_argparser()
    args = parser.parse_args(argv[1:])
    if args.help:
        parser.print_help()
        return 0

    logging.basicConfig(stream=args.log,
                        level=args.volume,
                        format='%(message)s')
    tone_down_logger()

    start_time = time.time()
    # If the user requested, report back some data about the start of the run.
    if args.phone_home:
        call = phone.Call(__file__,
                          version.get_version(),
                          platform=args.platform,
                          test=args.test,
                          fail='warn')
        call.send_data('start')
        data = {
            'stdin': args.infile is sys.stdin,
            'processes': args.processes,
            'queue_size': args.queue_size,
        }
        if data['stdin']:
            data['input_size'] = None
        else:
            data['input_size'] = os.path.getsize(args.infile.name)
        call.send_data('prelim', run_data=data)
    else:
        call = None

    # Execute as much of the script as possible in a try/except to catch any exception that occurs
    # and report it via ET.phone.
    try:
        # Process and validate arguments.
        if args.queue_size is not None and args.queue_size <= 0:
            fail('Error: --queue-size must be greater than zero.')
        qual_start = QUAL_OFFSETS[args.qual_format]
        qual_thres = chr(args.qual + qual_start)
        if args.fastq_out is None:
            # Output FASTA.
            output_qual = None
        else:
            # Output FASTQ.
            if qual_start + args.fastq_out > 126:
                fail(
                    'Error: --fastq-out PHRED score ({}) is too large.'.format(
                        args.fastq_out))
            output_qual = chr(qual_start + args.fastq_out)
        if args.min_cons_reads > args.min_reads:
            fail(
                'Error: --min-reads must be at least --min-cons-reads (or you\'ll have a lot of '
                'consensus sequences with only N\'s!). If you want to exclude families with fewer '
                'than X reads, give --min-reads X instead of --min-cons-reads X.')
        if not any((args.dcs1, args.dcs2, args.sscs1, args.sscs2)):
            fail('Error: must specify an output file!')
        # A dict of output filehandles.
        # Indexed so we can do filehandles['dcs'][mate].
        filehandles = {
            'dcs': (args.dcs1, args.dcs2),
            'sscs': (args.sscs1, args.sscs2),
        }

        # Open a pool of worker processes.
        stats = {'time': 0, 'reads': 0, 'runs': 0, 'duplexes': 0}
        static_kwargs = {
            'min_reads': args.min_reads,
            'cons_thres': args.cons_thres,
            'min_cons_reads': args.min_cons_reads,
            'qual_thres': qual_thres,
            'output_qual': output_qual,
        }
        pool = parallel_tools.SyncAsyncPool(
            process_duplex,
            processes=args.processes,
            static_kwargs=static_kwargs,
            queue_size=args.queue_size,
            callback=process_result,
            callback_args=[filehandles, stats],
        )
        try:
            process_families(args.infile, pool, stats)
        finally:
            # If the root process encounters an exception and doesn't tell the workers to stop, it will
            # hang forever.
            pool.close()
            pool.join()
            # Close all open filehandles.
            if args.infile is not sys.stdin:
                args.infile.close()
            for fh_group in filehandles.values():
                for fh in fh_group:
                    if fh:
                        fh.close()

        # Final stats on the run.
        run_time = int(time.time() - start_time)
        max_mem = get_max_mem()
        logging.info(
            'Processed {} reads and {} duplexes in {} seconds.'.format(
                stats['reads'], stats['runs'], run_time))
        if stats['reads'] > 0 and stats['runs'] > 0:
            per_read = stats['time'] / stats['reads']
            per_run = stats['time'] / stats['runs']
            logging.info('{:0.3f}s per read, {:0.3f}s per run.'.format(
                per_read, per_run))
        logging.info('in {}s total time and {:0.2f}MB RAM.'.format(
            run_time, max_mem))

    except (Exception, KeyboardInterrupt) as exception:
        if args.phone_home and call:
            try:
                exception_data = getattr(exception, 'child_context',
                                         parallel_tools.get_exception_data())
                logging.critical(
                    parallel_tools.format_traceback(exception_data))
                exception_data = parallel_tools.scrub_tb_paths(
                    exception_data, script_path=__file__)
            except Exception:
                exception_data = {}
            run_time = int(time.time() - start_time)
            try:
                run_data = get_run_data(stats, pool)
            except Exception:  # also covers UnboundLocalError, if stats or pool was never assigned
                run_data = {}
            try:
                run_data['mem'] = get_max_mem()
            except Exception:
                pass
            run_data['failed'] = True
            if exception_data:
                run_data['exception'] = exception_data
            call.send_data('end', run_time=run_time, run_data=run_data)
            raise exception
        else:
            raise

    if args.phone_home and call:
        run_data = get_run_data(stats, pool, max_mem)
        call.send_data('end', run_time=run_time, run_data=run_data)
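Examples #3 and #4 encode a numeric PHRED threshold as a quality character with `chr(args.qual + qual_start)`. The `QUAL_OFFSETS` table isn't shown in the source; the sketch below assumes the standard FASTQ conventions (Sanger is PHRED+33, old Illumina/Solexa is PHRED+64) and mirrors the example's ceiling of 126 for printable ASCII:

# Assumed offsets: standard FASTQ conventions, not taken from the source.
QUAL_OFFSETS = {'sanger': 33, 'solexa': 64}

def qual_char(phred, qual_format='sanger'):
    """Encode a numeric PHRED score as a single FASTQ quality character."""
    offset = QUAL_OFFSETS[qual_format]
    code = phred + offset
    if not offset <= code <= 126:
        raise ValueError('PHRED score {} out of range for {}'.format(phred, qual_format))
    return chr(code)

print(qual_char(40))    # -> 'I' (PHRED 40, Sanger encoding)
print(ord('I') - 33)    # -> 40 (decoding is the reverse)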
Example #4
def main(argv):

    parser = make_argparser()
    args = parser.parse_args(argv[1:])
    if args.help:
        parser.print_help()
        return 0

    logging.basicConfig(stream=args.log,
                        level=args.volume,
                        format='%(message)s')
    tone_down_logger()

    start_time = time.time()
    # If the user requested, report back some data about the start of the run.
    if args.phone_home:
        call = phone.Call(__file__,
                          version.get_version(),
                          platform=args.platform,
                          test=args.test,
                          fail='warn')
        call.send_data('start')
        data = {
            'stdin': args.infile is sys.stdin,
            'processes': args.processes,
            'queue_size': args.queue_size,
        }
        if data['stdin']:
            data['input_size'] = None
        else:
            data['input_size'] = os.path.getsize(args.infile.name)
        call.send_data('prelim', run_data=data)
    else:
        call = None

    # Execute as much of the script as possible in a try/except to catch any exception that occurs
    # and report it via ET.phone.
    try:
        # Process and validate arguments.
        if args.queue_size is not None and args.queue_size <= 0:
            fail('Error: --queue-size must be greater than zero.')
        qual_start = QUAL_OFFSETS[args.qual_format]
        qual_thres = chr(args.qual + qual_start)
        if args.fastq_out is None:
            # Output FASTA.
            output_qual = None
        else:
            # Output FASTQ.
            if qual_start + args.fastq_out > 126:
                fail(
                    'Error: --fastq-out PHRED score ({}) is too large.'.format(
                        args.fastq_out))
            output_qual = chr(qual_start + args.fastq_out)
        if args.min_cons_reads > args.min_reads:
            fail(
                'Error: --min-reads must be at least --min-cons-reads (or you\'ll have a lot of '
                'consensus sequences with only N\'s!). If you want to exclude families with fewer '
                'than X reads, give --min-reads X instead of --min-cons-reads X.')
        if not any((args.dcs1, args.dcs2, args.sscs1, args.sscs2)):
            fail('Error: must specify an output file!')
        # A dict of output filehandles.
        # Indexed so we can do filehandles['dcs'][mate].
        filehandles = {
            'dcs': (args.dcs1, args.dcs2),
            'sscs': (args.sscs1, args.sscs2),
        }

        # Open a pool of worker processes.
        stats = {'time': 0, 'reads': 0, 'runs': 0, 'duplexes': 0}
        static_kwargs = {
            'min_reads': args.min_reads,
            'cons_thres': args.cons_thres,
            'min_cons_reads': args.min_cons_reads,
            'qual_thres': qual_thres,
            'output_qual': output_qual,
        }
        pool = parallel_tools.SyncAsyncPool(
            process_duplex,
            processes=args.processes,
            static_kwargs=static_kwargs,
            queue_size=args.queue_size,
            callback=process_result,
            callback_args=[filehandles, stats],
        )
        try:
            total_reads = 0
            duplex = collections.OrderedDict()
            family = []
            barcode = None
            order = None
            # Note: mate is a 0-indexed integer ("mate 1" from the input file is mate 0 here).
            mate = None
            for line in args.infile:
                # Allow comments (e.g. for test input files).
                if line.startswith('#'):
                    continue
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) != 6:
                    continue
                this_barcode, this_order, this_mate, name, seq, qual = fields
                this_mate = int(this_mate) - 1
                # If the barcode, order, and mate are all the same, we're just continuing to add
                # reads to the current family. Otherwise, store the current family, start a new
                # one, and process the duplex if we're at the end of one.
                new_barcode = this_barcode != barcode
                new_order = this_order != order
                new_mate = this_mate != mate
                if new_barcode or new_order or new_mate:
                    if order is not None and mate is not None:
                        duplex[(order, mate)] = family
                    # If the barcode changed, process the last duplex and start a new one.
                    if new_barcode and barcode is not None:
                        assert len(duplex) <= 4, duplex.keys()
                        pool.compute(duplex, barcode)
                        stats['duplexes'] += 1
                        duplex = collections.OrderedDict()
                    barcode = this_barcode
                    order = this_order
                    mate = this_mate
                    family = []
                read = {'name': name, 'seq': seq, 'qual': qual}
                family.append(read)
                total_reads += 1
            # Process the last family (unless the input was empty).
            if order is not None and mate is not None:
                duplex[(order, mate)] = family
            if barcode is not None:
                assert len(duplex) <= 4, duplex.keys()
                pool.compute(duplex, barcode)
                stats['duplexes'] += 1

            # Retrieve the remaining results.
            logging.info('Flushing remaining results from worker processes...')
            pool.flush()

        finally:
            # If the root process encounters an exception and doesn't tell the workers to stop, it will
            # hang forever.
            pool.close()
            pool.join()
            # Close all open filehandles.
            if args.infile is not sys.stdin:
                args.infile.close()
            for fh_group in filehandles.values():
                for fh in fh_group:
                    if fh:
                        fh.close()

        # Final stats on the run.
        run_time = int(time.time() - start_time)
        max_mem = get_max_mem()
        logging.info(
            'Processed {} reads and {} duplexes in {} seconds.'.format(
                total_reads, stats['runs'], run_time))
        if stats['reads'] > 0 and stats['runs'] > 0:
            per_read = stats['time'] / stats['reads']
            per_run = stats['time'] / stats['runs']
            logging.info('{:0.3f}s per read, {:0.3f}s per run.'.format(
                per_read, per_run))
        logging.info('in {}s total time and {:0.2f}MB RAM.'.format(
            run_time, max_mem))

    except (Exception, KeyboardInterrupt) as exception:
        if args.phone_home and call:
            try:
                exception_data = getattr(exception, 'child_context',
                                         parallel_tools.get_exception_data())
                logging.critical(
                    parallel_tools.format_traceback(exception_data))
                exception_data = parallel_tools.scrub_tb_paths(
                    exception_data, script_path=__file__)
            except Exception:
                exception_data = {}
            run_time = int(time.time() - start_time)
            try:
                run_data = get_run_data(stats, pool)
            except Exception:  # also covers UnboundLocalError, if stats or pool was never assigned
                run_data = {}
            try:
                run_data['mem'] = get_max_mem()
            except Exception:
                pass
            run_data['failed'] = True
            if exception_data:
                run_data['exception'] = exception_data
            call.send_data('end', run_time=run_time, run_data=run_data)
            raise exception
        else:
            raise

    if args.phone_home and call:
        run_data = get_run_data(stats, pool, max_mem)
        call.send_data('end', run_time=run_time, run_data=run_data)
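The change-detection loop in Example #4 is easiest to see in isolation. Below is a sketch of the same logic as a generator: rows are (barcode, order, mate, name, seq, qual) tuples sorted so one family's rows are adjacent, a family ends whenever any of the three keys changes, and a duplex (at most 2 orders x 2 mates = 4 families) is emitted whenever the barcode changes. The sample rows are invented for illustration; this is a simplification, not the project's actual API.

import collections

def read_duplexes(rows):
    duplex = collections.OrderedDict()
    key = None    # (barcode, order, mate) of the family being accumulated.
    family = []
    for barcode, order, mate, name, seq, qual in rows:
        if (barcode, order, mate) != key:
            if key is not None:
                duplex[key[1:]] = family    # Store the finished family under (order, mate).
                if barcode != key[0]:       # New barcode: the previous duplex is complete.
                    yield key[0], duplex
                    duplex = collections.OrderedDict()
            key = (barcode, order, mate)
            family = []
        family.append({'name': name, 'seq': seq, 'qual': qual})
    if key is not None:                     # Emit the final duplex.
        duplex[key[1:]] = family
        yield key[0], duplex

rows = [
    ('AAGG', 'ab', 0, 'r1', 'GATTACA', 'IIIIIII'),
    ('AAGG', 'ab', 1, 'r2', 'TGTAATC', 'IIIIIII'),
    ('CCTT', 'ba', 0, 'r3', 'TTTTTTT', 'IIIIIII'),
]
for barcode, duplex in read_duplexes(rows):
    print(barcode, list(duplex.keys()))
# AAGG [('ab', 0), ('ab', 1)]
# CCTT [('ba', 0)]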