def main(argv):
  parser = make_argparser()
  args = parser.parse_args(argv[1:])
  logging.basicConfig(stream=args.log_file, level=args.volume, format='%(message)s')
  tone_down_logger()
  start_time = time.time()
  # If the user requested, report back some data about the start of the run.
  if args.phone_home:
    call = phone.Call(
      __file__, version.get_version(), platform=args.platform, test=args.test, fail='warn'
    )
    call.send_data('start')
    data = {
      'stdin': args.infile is sys.stdin,
      'aligner': args.aligner,
      'processes': args.processes,
      'queue_size': args.queue_size,
    }
    if data['stdin']:
      data['input_size'] = None
    else:
      data['input_size'] = os.path.getsize(args.infile.name)
    call.send_data('prelim', run_data=data)
  # Execute as much of the script as possible in a try/except to catch any exception that occurs
  # and report it via ET.phone.
  try:
    if args.queue_size is not None and args.queue_size <= 0:
      fail('Error: --queue-size must be greater than zero.')
    # If we're using mafft, check that we can execute it.
    if args.aligner == 'mafft' and not distutils.spawn.find_executable('mafft'):
      fail('Error: Could not find "mafft" command on $PATH.')
    # Open a pool of worker processes.
    stats = {'duplexes':0, 'time':0, 'pairs':0, 'runs':0, 'failures':0, 'aligned_pairs':0}
    pool = parallel_tools.SyncAsyncPool(
      process_duplex,
      processes=args.processes,
      static_kwargs={'aligner':args.aligner},
      queue_size=args.queue_size,
      callback=process_result,
      callback_args=[stats],
    )
    try:
      # The main loop.
      align_families(args.infile, pool, stats, check_ids=args.check_ids)
    finally:
      # If an exception occurs in the parent without stopping the child processes, this will hang.
      # Make sure to kill the children in all cases.
      pool.close()
      pool.join()
    # Close input filehandle if it's open.
    if args.infile is not sys.stdin:
      args.infile.close()
    # Final stats on the run.
    run_time = int(time.time() - start_time)
    max_mem = get_max_mem()
    logging.error(
      'Processed {pairs} read pairs in {duplexes} duplexes, with {failures} alignment failures.'
      .format(**stats)
    )
    if stats['aligned_pairs'] > 0 and stats['runs'] > 0:
      per_pair = stats['time'] / stats['aligned_pairs']
      per_run = stats['time'] / stats['runs']
      logging.error(f'{per_pair:0.3f}s per pair, {per_run:0.3f}s per run.')
    logging.error(f'in {run_time}s total time and {max_mem:0.2f}MB RAM.')
  except (Exception, KeyboardInterrupt) as exception:
    if args.phone_home and call:
      try:
        exception_data = getattr(exception, 'child_context', parallel_tools.get_exception_data())
        logging.critical(parallel_tools.format_traceback(exception_data))
        exception_data = parallel_tools.scrub_tb_paths(exception_data, script_path=__file__)
      except Exception:
        exception_data = {}
      run_time = int(time.time() - start_time)
      try:
        run_data = get_run_data(stats, pool, args.aligner)
      except (Exception, UnboundLocalError):
        run_data = {}
      try:
        run_data['mem'] = get_max_mem()
      except Exception:
        pass
      run_data['failed'] = True
      if exception_data:
        run_data['exception'] = exception_data
      call.send_data('end', run_time=run_time, run_data=run_data)
      raise exception
    else:
      raise
  if args.phone_home and call:
    run_data = get_run_data(stats, pool, args.aligner, max_mem)
    call.send_data('end', run_time=run_time, run_data=run_data)
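# A hypothetical sketch of a stats-updating callback compatible with the
# callback=process_result, callback_args=[stats] wiring above, assuming SyncAsyncPool
# invokes it as callback(result, *callback_args). The real process_result is defined
# elsewhere in this codebase; the result fields used here are illustrative assumptions,
# chosen only to match the stats keys the run summary reads later.
def process_result_sketch(result, stats):
  stats['runs'] += 1
  stats['time'] += result['elapsed']            # assumed: wall time the worker spent
  stats['aligned_pairs'] += result['aligned']   # assumed: read pairs successfully aligned
  stats['failures'] += result['failures']       # assumed: alignment failures in this duplex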
def main(argv):
  parser = make_argparser()
  args = parser.parse_args(argv[1:])
  logging.basicConfig(stream=args.log_file, level=args.volume, format='%(message)s')
  tone_down_logger()
  start_time = time.time()
  # If the user requested, report back some data about the start of the run.
  if args.phone_home:
    call = phone.Call(
      __file__, version.get_version(), platform=args.platform, test=args.test, fail='warn'
    )
    call.send_data('start')
    data = {
      'stdin': args.infile is sys.stdin,
      'aligner': args.aligner,
      'processes': args.processes,
      'queue_size': args.queue_size,
    }
    if data['stdin']:
      data['input_size'] = None
    else:
      data['input_size'] = os.path.getsize(args.infile.name)
    call.send_data('prelim', run_data=data)
  # Execute as much of the script as possible in a try/except to catch any exception that occurs
  # and report it via ET.phone.
  try:
    if args.queue_size is not None and args.queue_size <= 0:
      fail('Error: --queue-size must be greater than zero.')
    # If we're using mafft, check that we can execute it.
    if args.aligner == 'mafft' and not distutils.spawn.find_executable('mafft'):
      fail('Error: Could not find "mafft" command on $PATH.')
    # Open a pool of worker processes.
    stats = {'duplexes':0, 'time':0, 'pairs':0, 'runs':0, 'failures':0, 'aligned_pairs':0}
    pool = parallel_tools.SyncAsyncPool(
      process_duplex,
      processes=args.processes,
      static_kwargs={'aligner':args.aligner},
      queue_size=args.queue_size,
      callback=process_result,
      callback_args=[stats],
    )
    # Now the main loop. This processes whole duplexes (pairs of strands) at a time, for a future
    # option to align the whole duplex at once.
    # duplex data structure:
    # duplex = {
    #   'ab': [
    #     {'name1': 'read_name1a', 'seq1': 'GATT-ACA', 'qual1': 'sc!0 /J*',
    #      'name2': 'read_name1b', 'seq2': 'ACTGACTA', 'qual2': '34I&SDF)'},
    #     {'name1': 'read_name2a', ...},
    #     ...
    #   ],
    #   'ba': [
    #     ...
    #   ]
    # }
    # e.g.: seq = duplex[order][pair_num]['seq1']
    try:
      duplex = collections.OrderedDict()
      family = []
      barcode = None
      order = None
      for line in args.infile:
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) != 8:
          continue
        this_barcode, this_order, name1, seq1, qual1, name2, seq2, qual2 = fields
        # If the barcode or order has changed, we're in a new family.
        # Process the reads we've previously gathered as one family and start a new family.
        if this_barcode != barcode or this_order != order:
          duplex[order] = family
          # If the barcode is different, we're at the end of the whole duplex. Process it and
          # start a new one. If the barcode is the same, we're in the same duplex, but we've
          # switched strands.
          if this_barcode != barcode:
            # logging.debug('processing {}: {} orders ({})'.format(barcode, len(duplex),
            #               '/'.join([str(len(duplex[o])) for o in duplex])))
            if barcode is not None:
              pool.compute(duplex, barcode)
              stats['duplexes'] += 1
            duplex = collections.OrderedDict()
          barcode = this_barcode
          order = this_order
          family = []
        pair = {'name1':name1, 'seq1':seq1, 'qual1':qual1, 'name2':name2, 'seq2':seq2,
                'qual2':qual2}
        family.append(pair)
        stats['pairs'] += 1
      # Process the last family.
      duplex[order] = family
      # logging.debug('processing {}: {} orders ({}) [last]'.format(barcode, len(duplex),
      #               '/'.join([str(len(duplex[o])) for o in duplex])))
      pool.compute(duplex, barcode)
      stats['duplexes'] += 1
      # Retrieve the remaining results.
      logging.info('Flushing remaining results from worker processes..')
      pool.flush()
    finally:
      # If an exception occurs in the parent without stopping the child processes, this will hang.
      # Make sure to kill the children in all cases.
      pool.close()
      pool.join()
    # Close input filehandle if it's open.
    if args.infile is not sys.stdin:
      args.infile.close()
    # Final stats on the run.
    run_time = int(time.time() - start_time)
    max_mem = get_max_mem()
    logging.error('Processed {pairs} read pairs in {duplexes} duplexes, with {failures} alignment '
                  'failures.'.format(**stats))
    if stats['aligned_pairs'] > 0 and stats['runs'] > 0:
      per_pair = stats['time'] / stats['aligned_pairs']
      per_run = stats['time'] / stats['runs']
      logging.error('{:0.3f}s per pair, {:0.3f}s per run.'.format(per_pair, per_run))
    logging.error('in {}s total time and {:0.2f}MB RAM.'.format(run_time, max_mem))
  except (Exception, KeyboardInterrupt) as exception:
    if args.phone_home and call:
      try:
        exception_data = getattr(exception, 'child_context', parallel_tools.get_exception_data())
        logging.critical(parallel_tools.format_traceback(exception_data))
        exception_data = parallel_tools.scrub_tb_paths(exception_data, script_path=__file__)
      except Exception:
        exception_data = {}
      run_time = int(time.time() - start_time)
      try:
        run_data = get_run_data(stats, pool, args.aligner)
      except (Exception, UnboundLocalError):
        run_data = {}
      try:
        run_data['mem'] = get_max_mem()
      except Exception:
        pass
      run_data['failed'] = True
      if exception_data:
        run_data['exception'] = exception_data
      call.send_data('end', run_time=run_time, run_data=run_data)
      raise exception
    else:
      raise
  if args.phone_home and call:
    run_data = get_run_data(stats, pool, args.aligner, max_mem)
    call.send_data('end', run_time=run_time, run_data=run_data)
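# For reference, a minimal sketch of the 8-column tab-separated input the loop above
# parses, and the duplex structure (documented above) that it hands to pool.compute().
# The barcode, read names, sequences, and quality strings are made up.
import collections

line = 'TTCAGCGAAGTA\tab\tread1/1\tGATTACA\tIIIIIII\tread1/2\tTGTAATC\tIIIIIII\n'
barcode, order, name1, seq1, qual1, name2, seq2, qual2 = line.rstrip('\r\n').split('\t')
duplex = collections.OrderedDict()
duplex[order] = [
  {'name1':name1, 'seq1':seq1, 'qual1':qual1, 'name2':name2, 'seq2':seq2, 'qual2':qual2}
]
# pool.compute(duplex, barcode)  # one task per whole duplex (here, only its 'ab' strand)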
def main(argv):
  parser = make_argparser()
  args = parser.parse_args(argv[1:])
  if args.help:
    parser.print_help()
    return 0
  logging.basicConfig(stream=args.log, level=args.volume, format='%(message)s')
  tone_down_logger()
  start_time = time.time()
  # If the user requested, report back some data about the start of the run.
  if args.phone_home:
    call = phone.Call(
      __file__, version.get_version(), platform=args.platform, test=args.test, fail='warn'
    )
    call.send_data('start')
    data = {
      'stdin': args.infile is sys.stdin,
      'processes': args.processes,
      'queue_size': args.queue_size,
    }
    if data['stdin']:
      data['input_size'] = None
    else:
      data['input_size'] = os.path.getsize(args.infile.name)
    call.send_data('prelim', run_data=data)
  else:
    call = None
  # Execute as much of the script as possible in a try/except to catch any exception that occurs
  # and report it via ET.phone.
  try:
    # Process and validate arguments.
    if args.queue_size is not None and args.queue_size <= 0:
      fail('Error: --queue-size must be greater than zero.')
    qual_start = QUAL_OFFSETS[args.qual_format]
    qual_thres = chr(args.qual + qual_start)
    if args.fastq_out is None:
      # Output FASTA.
      output_qual = None
    else:
      # Output FASTQ.
      if qual_start + args.fastq_out > 126:
        fail('Error: --fastq-out PHRED score ({}) is too large.'.format(args.fastq_out))
      output_qual = chr(qual_start + args.fastq_out)
    if args.min_cons_reads > args.min_reads:
      fail('Error: --min-reads must be at least --min-cons-reads (or you\'ll have a lot of '
           'consensus sequences with only N\'s!). If you want to exclude families with fewer than '
           'X reads, give --min-reads X instead of --min-cons-reads X.')
    if not any((args.dcs1, args.dcs2, args.sscs1, args.sscs2)):
      fail('Error: must specify an output file!')
    # A dict of output filehandles.
    # Indexed so we can do filehandles['dcs'][mate].
    filehandles = {
      'dcs': (args.dcs1, args.dcs2),
      'sscs': (args.sscs1, args.sscs2),
    }
    # Open a pool of worker processes.
    stats = {'time':0, 'reads':0, 'runs':0, 'duplexes':0}
    static_kwargs = {
      'min_reads': args.min_reads,
      'cons_thres': args.cons_thres,
      'min_cons_reads': args.min_cons_reads,
      'qual_thres': qual_thres,
      'output_qual': output_qual,
    }
    pool = parallel_tools.SyncAsyncPool(
      process_duplex,
      processes=args.processes,
      static_kwargs=static_kwargs,
      queue_size=args.queue_size,
      callback=process_result,
      callback_args=[filehandles, stats],
    )
    try:
      # Expected to record stats['total_reads'], which the summary below reads.
      process_families(args.infile, pool, stats)
    finally:
      # If the root process encounters an exception and doesn't tell the workers to stop, it will
      # hang forever.
      pool.close()
      pool.join()
    # Close all open filehandles.
    if args.infile is not sys.stdin:
      args.infile.close()
    for fh_group in filehandles.values():
      for fh in fh_group:
        if fh:
          fh.close()
    # Final stats on the run.
    run_time = int(time.time() - start_time)
    max_mem = get_max_mem()
    logging.info('Processed {} reads and {} duplexes in {} seconds.'
                 .format(stats['total_reads'], stats['runs'], run_time))
    if stats['reads'] > 0 and stats['runs'] > 0:
      per_read = stats['time'] / stats['reads']
      per_run = stats['time'] / stats['runs']
      logging.info('{:0.3f}s per read, {:0.3f}s per run.'.format(per_read, per_run))
    logging.info('in {}s total time and {:0.2f}MB RAM.'.format(run_time, max_mem))
  except (Exception, KeyboardInterrupt) as exception:
    if args.phone_home and call:
      try:
        exception_data = getattr(exception, 'child_context', parallel_tools.get_exception_data())
        logging.critical(parallel_tools.format_traceback(exception_data))
        exception_data = parallel_tools.scrub_tb_paths(exception_data, script_path=__file__)
      except Exception:
        exception_data = {}
      run_time = int(time.time() - start_time)
      try:
        run_data = get_run_data(stats, pool)
      except (Exception, UnboundLocalError):
        run_data = {}
      try:
        run_data['mem'] = get_max_mem()
      except Exception:
        pass
      run_data['failed'] = True
      if exception_data:
        run_data['exception'] = exception_data
      call.send_data('end', run_time=run_time, run_data=run_data)
      raise exception
    else:
      raise
  if args.phone_home and call:
    run_data = get_run_data(stats, pool, max_mem)
    call.send_data('end', run_time=run_time, run_data=run_data)
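# A quick illustration of the quality-threshold arithmetic above, assuming the
# conventional Sanger PHRED+33 encoding for QUAL_OFFSETS[args.qual_format] (the actual
# offset table is defined elsewhere in this codebase).
qual_start = 33                  # assumed: the PHRED+33 ("sanger") offset
qual_thres = chr(25 + qual_start)
assert qual_thres == ':'         # a quality threshold of 25 becomes the character chr(58)
# The --fastq-out guard: chr(126) is '~', the highest printable ASCII character, so any
# offset + score beyond 126 would yield an unprintable quality character.
assert chr(126) == '~'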
def main(argv):
  parser = make_argparser()
  args = parser.parse_args(argv[1:])
  if args.help:
    parser.print_help()
    return 0
  logging.basicConfig(stream=args.log, level=args.volume, format='%(message)s')
  tone_down_logger()
  start_time = time.time()
  # If the user requested, report back some data about the start of the run.
  if args.phone_home:
    call = phone.Call(
      __file__, version.get_version(), platform=args.platform, test=args.test, fail='warn'
    )
    call.send_data('start')
    data = {
      'stdin': args.infile is sys.stdin,
      'processes': args.processes,
      'queue_size': args.queue_size,
    }
    if data['stdin']:
      data['input_size'] = None
    else:
      data['input_size'] = os.path.getsize(args.infile.name)
    call.send_data('prelim', run_data=data)
  else:
    call = None
  # Execute as much of the script as possible in a try/except to catch any exception that occurs
  # and report it via ET.phone.
  try:
    # Process and validate arguments.
    if args.queue_size is not None and args.queue_size <= 0:
      fail('Error: --queue-size must be greater than zero.')
    qual_start = QUAL_OFFSETS[args.qual_format]
    qual_thres = chr(args.qual + qual_start)
    if args.fastq_out is None:
      # Output FASTA.
      output_qual = None
    else:
      # Output FASTQ.
      if qual_start + args.fastq_out > 126:
        fail('Error: --fastq-out PHRED score ({}) is too large.'.format(args.fastq_out))
      output_qual = chr(qual_start + args.fastq_out)
    if args.min_cons_reads > args.min_reads:
      fail('Error: --min-reads must be at least --min-cons-reads (or you\'ll have a lot of '
           'consensus sequences with only N\'s!). If you want to exclude families with fewer than '
           'X reads, give --min-reads X instead of --min-cons-reads X.')
    if not any((args.dcs1, args.dcs2, args.sscs1, args.sscs2)):
      fail('Error: must specify an output file!')
    # A dict of output filehandles.
    # Indexed so we can do filehandles['dcs'][mate].
    filehandles = {
      'dcs': (args.dcs1, args.dcs2),
      'sscs': (args.sscs1, args.sscs2),
    }
    # Open a pool of worker processes.
    stats = {'time':0, 'reads':0, 'runs':0, 'duplexes':0}
    static_kwargs = {
      'min_reads': args.min_reads,
      'cons_thres': args.cons_thres,
      'min_cons_reads': args.min_cons_reads,
      'qual_thres': qual_thres,
      'output_qual': output_qual,
    }
    pool = parallel_tools.SyncAsyncPool(
      process_duplex,
      processes=args.processes,
      static_kwargs=static_kwargs,
      queue_size=args.queue_size,
      callback=process_result,
      callback_args=[filehandles, stats],
    )
    try:
      total_reads = 0
      duplex = collections.OrderedDict()
      family = []
      barcode = None
      order = None
      # Note: mate is a 0-indexed integer ("mate 1" from the input file is mate 0 here).
      mate = None
      for line in args.infile:
        # Allow comments (e.g. for test input files).
        if line.startswith('#'):
          continue
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) != 6:
          continue
        this_barcode, this_order, this_mate, name, seq, qual = fields
        this_mate = int(this_mate) - 1
        # If the barcode, order, and mate are the same, we're just continuing to add reads to the
        # current family. Otherwise, store the current family, start a new one, and process the
        # duplex if we're at the end of one.
        new_barcode = this_barcode != barcode
        new_order = this_order != order
        new_mate = this_mate != mate
        if new_barcode or new_order or new_mate:
          if order is not None and mate is not None:
            duplex[(order, mate)] = family
          # If the barcode changed, process the last duplex and start a new one.
          if new_barcode and barcode is not None:
            assert len(duplex) <= 4, duplex.keys()
            pool.compute(duplex, barcode)
            stats['duplexes'] += 1
            duplex = collections.OrderedDict()
          barcode = this_barcode
          order = this_order
          mate = this_mate
          family = []
        read = {'name': name, 'seq': seq, 'qual': qual}
        family.append(read)
        total_reads += 1
      # Process the last family.
      if order is not None and mate is not None:
        duplex[(order, mate)] = family
      assert len(duplex) <= 4, duplex.keys()
      pool.compute(duplex, barcode)
      stats['duplexes'] += 1
      # Retrieve the remaining results.
      logging.info('Flushing remaining results from worker processes..')
      pool.flush()
    finally:
      # If the root process encounters an exception and doesn't tell the workers to stop, it will
      # hang forever.
      pool.close()
      pool.join()
    # Close all open filehandles.
    if args.infile is not sys.stdin:
      args.infile.close()
    for fh_group in filehandles.values():
      for fh in fh_group:
        if fh:
          fh.close()
    # Final stats on the run.
    run_time = int(time.time() - start_time)
    max_mem = get_max_mem()
    logging.info('Processed {} reads and {} duplexes in {} seconds.'
                 .format(total_reads, stats['runs'], run_time))
    if stats['reads'] > 0 and stats['runs'] > 0:
      per_read = stats['time'] / stats['reads']
      per_run = stats['time'] / stats['runs']
      logging.info('{:0.3f}s per read, {:0.3f}s per run.'.format(per_read, per_run))
    logging.info('in {}s total time and {:0.2f}MB RAM.'.format(run_time, max_mem))
  except (Exception, KeyboardInterrupt) as exception:
    if args.phone_home and call:
      try:
        exception_data = getattr(exception, 'child_context', parallel_tools.get_exception_data())
        logging.critical(parallel_tools.format_traceback(exception_data))
        exception_data = parallel_tools.scrub_tb_paths(exception_data, script_path=__file__)
      except Exception:
        exception_data = {}
      run_time = int(time.time() - start_time)
      try:
        run_data = get_run_data(stats, pool)
      except (Exception, UnboundLocalError):
        run_data = {}
      try:
        run_data['mem'] = get_max_mem()
      except Exception:
        pass
      run_data['failed'] = True
      if exception_data:
        run_data['exception'] = exception_data
      call.send_data('end', run_time=run_time, run_data=run_data)
      raise exception
    else:
      raise
  if args.phone_home and call:
    run_data = get_run_data(stats, pool, max_mem)
    call.send_data('end', run_time=run_time, run_data=run_data)
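# A minimal sketch of the 6-column tab-separated input the loop above parses, with
# made-up read data. The mate column is 1-based in the file and converted to a 0-based
# index, and families are keyed by (order, mate), so a duplex holds at most
# 2 orders x 2 mates = 4 families (hence the asserts above).
line = 'TTCAGCGAAGTA\tab\t1\tread1\tGATTACA\tIIIIIII\n'
barcode, order, mate, name, seq, qual = line.rstrip('\r\n').split('\t')
mate = int(mate) - 1   # "mate 1" in the file becomes index 0 here
key = (order, mate)    # e.g. ('ab', 0)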