def run(parser, args): # Catch exceptions and only print error line sys.excepthook = except_hook # Run load config to validate run_info, conditions, reference, caller_settings = get_run_info(args.toml) print("😻 Looking good!", file=sys.stdout) print("Generating experiment description - please be patient!", file=sys.stdout) mapper = Mapper(reference) for message, sev in describe_experiment(conditions, mapper): printer(textwrap.fill(message), sev, file=sys.stdout, end="\n\n")
def main(): # Catch exceptions and only print error line sys.excepthook = except_hook # Parse single positional argument parser = argparse.ArgumentParser( "Read Until TOML Validator ({})".format(__file__)) parser.add_argument("toml", help="TOML file to validate") args = parser.parse_args() # Run load config to validate run_info, conditions, reference, caller_settings = get_run_info(args.toml) print("😻 Looking good!", file=sys.stdout) print("Generating experiment description - please be patient!", file=sys.stdout) mapper = Mapper(reference) for message, sev in describe_experiment(conditions, mapper): printer(textwrap.fill(message), sev, file=sys.stdout, end="\n\n")
def simple_analysis( client, batch_size=512, throttle=0.1, unblock_duration=0.5, cl=None, pf=None, live_toml_path=None, flowcell_size=512, dry_run=False, run_info=None, conditions=None, mapper=None, caller_kwargs=None, ): """Analysis function Parameters ---------- client : read_until.ReadUntilClient An instance of the ReadUntilClient object batch_size : int The number of reads to be retrieved from the ReadUntilClient at a time throttle : int or float The number of seconds interval between requests to the ReadUntilClient unblock_duration : int or float Time, in seconds, to apply unblock voltage cl : logging.Logger Log file to log chunk data to pf : logging.Logger Log file to log alignments to live_toml_path : str Path to a `live` TOML configuration file for read until. If this exists when the run starts it will be deleted flowcell_size : int The number of channels on the flowcell, 512 for MinION and 3000 for PromethION dry_run : bool If True unblocks are replaced with `stop_receiving` commands run_info : dict Dictionary of {channel: index} where index corresponds to an index in `conditions` conditions : list Experimental conditions as List of namedtuples. mapper : mappy.Aligner caller_kwargs : dict Returns ------- None """ # Init logger for this function logger = logging.getLogger(__name__) # Delete live TOML file if it exists live_toml_path = Path(live_toml_path) if live_toml_path.is_file(): live_toml_path.unlink() # TODO: test this # Write channels.toml d = { "conditions": { str(v): { "channels": [], "name": conditions[v].name } for k, v in run_info.items() } } for k, v in run_info.items(): d["conditions"][str(v)]["channels"].append(k) channels_out = str(client.mk_run_dir / "channels.toml") with open(channels_out, "w") as fh: fh.write( "# This file is written as a record of the condition each channel is assigned.\n" ) fh.write( "# It may be changed or overwritten if you restart Read Until.\n") fh.write("# In the future this file may become a CSV file.\n") toml.dump(d, fh) caller = Caller(**caller_kwargs) # What if there is no reference or an empty MMI # DefaultDict[int: collections.deque[Tuple[str, ndarray]]] # tuple is (read_id, previous_signal) # TODO: tuple should use read_number instead previous_signal = defaultdict(functools.partial(deque, maxlen=1)) # count how often a read is seen tracker = defaultdict(Counter) # decided decided_reads = {} strand_converter = {1: "+", -1: "-"} read_id = "" # TODO: partial-ise / lambda unblock to take the unblock duration if dry_run: decision_dict = { "stop_receiving": client.stop_receiving_read, "proceed": None, "unblock": client.stop_receiving_read, } send_message(client.connection, "This is a test run. No unblocks will occur.", Severity.WARN) else: decision_dict = { "stop_receiving": client.stop_receiving_read, "proceed": None, "unblock": lambda c, n: client.unblock_read(c, n, unblock_duration, read_id), } send_message(client.connection, "This is a live run. Unblocks will occur.", Severity.WARN) decision_str = "" below_threshold = False exceeded_threshold = False l_string = ( "client_iteration", "read_in_loop", "read_id", "channel", "read_number", "seq_len", "counter", "mode", "decision", "condition", "min_threshold", "count_threshold", "start_analysis", "end_analysis", "timestamp", ) cl.debug("\t".join(l_string)) l_string = "\t".join(("{}" for _ in l_string)) loop_counter = 0 while client.is_running: if live_toml_path.is_file(): # Reload the TOML config from the *_live file run_info, conditions, new_reference, _ = get_run_info( live_toml_path, flowcell_size) # Check the reference path if different from the loaded mapper if new_reference != mapper.index: old_reference = mapper.index # Log to file and MinKNOW interface logger.info("Reloading mapper") send_message(client.connection, "Reloading mapper. Read Until paused.", Severity.INFO) # Update mapper client. mapper = CustomMapper(new_reference) # Log on success logger.info("Reloaded mapper") # If we've reloaded a reference, delete the previous one if old_reference: logger.info("Deleting old mmi {}".format(old_reference)) # We now delete the old mmi file. Path(old_reference).unlink() logger.info("Old mmi deleted.") # TODO: Fix the logging to just one of the two in use if not mapper.initialised: time.sleep(throttle) continue loop_counter += 1 t0 = timer() r = 0 for read_info, read_id, seq_len, results in mapper.map_reads_2( caller.basecall_minknow( reads=client.get_read_chunks(batch_size=batch_size, last=True), signal_dtype=client.signal_dtype, prev_signal=previous_signal, decided_reads=decided_reads, )): r += 1 read_start_time = timer() channel, read_number = read_info if read_number not in tracker[channel]: tracker[channel].clear() tracker[channel][read_number] += 1 mode = "" exceeded_threshold = False below_threshold = False log_decision = lambda: cl.debug( l_string.format( loop_counter, r, read_id, channel, read_number, seq_len, tracker[channel][read_number], mode, getattr(conditions[run_info[channel]], mode, mode), conditions[run_info[channel]].name, below_threshold, exceeded_threshold, read_start_time, timer(), time.time(), )) # Control channels if conditions[run_info[channel]].control: mode = "control" log_decision() client.stop_receiving_read(channel, read_number) continue # This is an analysis channel # Below minimum chunks if tracker[channel][read_number] <= conditions[ run_info[channel]].min_chunks: below_threshold = True # Greater than or equal to maximum chunks if tracker[channel][read_number] >= conditions[ run_info[channel]].max_chunks: exceeded_threshold = True # No mappings if not results: mode = "no_map" hits = set() for result in results: pf.debug("{}\t{}\t{}".format(read_id, seq_len, result)) hits.add(result.ctg) if hits & conditions[run_info[channel]].targets: # Mappings and targets overlap coord_match = any( between(r.r_st, c) for r in results for c in conditions[run_info[channel]].coords.get( strand_converter.get(r.strand), {}).get(r.ctg, [])) if len(hits) == 1: if coord_match: # Single match that is within coordinate range mode = "single_on" else: # Single match to a target outside coordinate range mode = "single_off" elif len(hits) > 1: if coord_match: # Multiple matches with at least one in the correct region mode = "multi_on" else: # Multiple matches to targets outside the coordinate range mode = "multi_off" else: # No matches in mappings if len(hits) > 1: # More than one, off-target, mapping mode = "multi_off" elif len(hits) == 1: # Single off-target mapping mode = "single_off" # This is where we make our decision: # Get the associated action for this condition decision_str = getattr(conditions[run_info[channel]], mode) # decision is an alias for the functions "unblock" or "stop_receiving" decision = decision_dict[decision_str] # If max_chunks has been exceeded AND we don't want to keep sequencing we unblock if exceeded_threshold and decision_str != "stop_receiving": mode = "exceeded_max_chunks_unblocked" client.unblock_read(channel, read_number, unblock_duration, read_id) # TODO: WHAT IS GOING ON?! # I think that this needs to change between enrichment and depletion # If under min_chunks AND any mapping mode seen we unblock # if below_threshold and mode in {"single_off", "multi_off"}: if below_threshold and mode in { "single_on", "single_off", "multi_on", "multi_off", }: mode = "below_min_chunks_unblocked" client.unblock_read(channel, read_number, unblock_duration, read_id) # proceed returns None, so we send no decision; otherwise unblock or stop_receiving elif decision is not None: decided_reads[channel] = read_id decision(channel, read_number) log_decision() t1 = timer() if r > 0: s1 = "{}R/{:.5f}s" logger.info(s1.format(r, t1 - t0)) # limit the rate at which we make requests if t0 + throttle > t1: time.sleep(throttle + t0 - t1) else: send_message(client.connection, "Read Until Client Stopped.", Severity.WARN) caller.disconnect() logger.info("Finished analysis of reads as client stopped.")
def main(): extra_args = ( ( "--toml", dict( metavar="TOML", required=True, help="TOML file specifying experimental parameters", ), ), ("--paf-log", dict( help="PAF log", default="paflog.log", )), ("--chunk-log", dict( help="Chunk log", default="chunk_log.log", )), ) parser, args = get_parser(extra_args=extra_args, file=__file__) # set up logging to file for DEBUG messages and above logging.basicConfig( level=logging.DEBUG, format="%(asctime)s %(name)s %(message)s", filename=args.log_file, filemode="w", ) # define a Handler that writes INFO messages or higher to the sys.stderr console = logging.StreamHandler() console.setLevel(logging.INFO) # set a format which is simpler for console use formatter = logging.Formatter(args.log_format) console.setFormatter(formatter) # add the handler to the root logger logging.getLogger("").addHandler(console) # Start by logging sys.argv and the parameters used logger = logging.getLogger("Manager") logger.info(" ".join(sys.argv)) print_args(args, logger=logger) # Setup chunk and paf logs chunk_logger = setup_logger("DEC", log_file=args.chunk_log) paf_logger = setup_logger("PAF", log_file=args.paf_log) # Parse configuration TOML # TODO: num_channels is not configurable here, should be inferred from client run_info, conditions, reference, caller_kwargs = get_run_info( args.toml, num_channels=512) live_toml = Path("{}_live".format(args.toml)) # Load Minimap2 index logger.info("Initialising minimap2 mapper") mapper = CustomMapper(reference) logger.info("Mapper initialised") read_until_client = read_until.ReadUntilClient( mk_host=args.host, mk_port=args.port, device=args.device, # one_chunk=args.one_chunk, filter_strands=True, # TODO: test cache_type by passing a function here cache_type=args.read_cache, cache_size=args.cache_size, ) send_message( read_until_client.connection, "Read Until is controlling sequencing on this device. You use it at your own risk.", Severity.WARN, ) for message, sev in describe_experiment(conditions, mapper): logger.info(message) send_message( read_until_client.connection, message, sev, ) """ This experiment has N regions on the flowcell. using reference: /path/to/ref.mmi Region i:NAME (control=bool) has X targets of which Y are found in the reference. reads will be unblocked when [u,v], sequenced when [w,x] and polled for more data when [y,z]. """ # FIXME: currently flowcell size is not included, this should be pulled from # the read_until_client analysis_worker = functools.partial( simple_analysis, read_until_client, unblock_duration=args.unblock_duration, throttle=args.throttle, batch_size=args.batch_size, cl=chunk_logger, pf=paf_logger, live_toml_path=live_toml, dry_run=args.dry_run, run_info=run_info, conditions=conditions, mapper=mapper, caller_kwargs=caller_kwargs, ) results = run_workflow( read_until_client, analysis_worker, args.workers, args.run_time, runner_kwargs={ # "min_chunk_size": args.min_chunk_size, "first_channel": min(args.channels), "last_channel": max(args.channels), }, ) # No results returned send_message( read_until_client.connection, "Read Until is disconnected from this device. Sequencing will proceed normally.", Severity.WARN, )
def simple_analysis( client, batch_size=512, throttle=0.1, unblock_duration=0.5, chunk_log=None, toml_path=None, flowcell_size=512, dry_run=False, ): """Analysis function Parameters ---------- client : read_until.ReadUntilClient An instance of the ReadUntilClient object batch_size : int The number of reads to be retrieved from the ReadUntilClient at a time throttle : int or float The number of seconds interval between requests to the ReadUntilClient unblock_duration : int or float Time, in seconds, to apply unblock voltage chunk_log : str Log file to log chunk data to toml_path : str Path to a TOML configuration file for read until flowcell_size : int The number of channels on the flowcell, 512 for MinION and 3000 for PromethION dry_run : bool If True unblocks are replaced with `stop_receiving` commands Returns ------- None """ logger = logging.getLogger(__name__) toml_dict = toml.load(toml_path) live_file = Path("{}_live".format(toml_path)) if live_file.is_file(): live_file.unlink() # There may or may not be a reference run_info, conditions, reference = get_run_info(toml_dict, num_channels=flowcell_size) # TODO: test this # Write channels.toml d = {"conditions": {str(v): {"channels": [], "name": conditions[v].name} for k, v in run_info.items()}} for k, v in run_info.items(): d["conditions"][str(v)]["channels"].append(k) channels_out = str(client.mk_run_dir / "channels.toml") with open(channels_out, "w") as fh: toml.dump(d, fh) guppy_kwargs = toml_dict.get( "guppy_connection", { "config": "dna_r9.4.1_450bps_fast", "host": "127.0.0.1", "port": 5555, "procs": 4, "inflight": 512, } ) caller = Caller(**guppy_kwargs) # What if there is no reference or an empty MMI mapper = CustomMapper(reference) # DefaultDict[int: collections.deque[Tuple[str, ndarray]]] # tuple is (read_id, previous_signal) # TODO: tuple should use read_number instead previous_signal = defaultdict(functools.partial(deque, maxlen=1)) # count how often a read is seen tracker = defaultdict(Counter) # decided decided_reads = {} strand_converter = {1: "+", -1: "-"} read_id = "" # TODO: partial-ise / lambda unblock to take the unblock duration if dry_run: decision_dict = { "stop_receiving": client.stop_receiving_read, "proceed": None, "unblock": client.stop_receiving_read, } else: decision_dict = { "stop_receiving": client.stop_receiving_read, "proceed": None, "unblock": lambda c, n: client.unblock_read(c, n, unblock_duration, read_id), } decision_str = "" below_threshold = False exceeded_threshold = False cl = setup_logger("DEC", log_file=chunk_log) pf = setup_logger("PAF", log_file="paflog.paf") l_string = ( "client_iteration", "read_in_loop", "read_id", "channel", "read_number", "seq_len", "counter", "mode", "decision", "condition", "min_threshold", "count_threshold", "start_analysis", "end_analysis", "timestamp", ) cl.debug("\t".join(l_string)) l_string = "\t".join(("{}" for _ in l_string)) loop_counter = 0 while client.is_running: if live_file.is_file(): # We may want to update the reference under certain conditions here. run_info, conditions, new_reference = get_run_info(live_file, flowcell_size) if new_reference != reference: logger.info("Reloading mapper") # We need to update our mapper client. mapper = CustomMapper(new_reference) logger.info("Reloaded mapper") if reference: logger.info("Deleting old mmi {}".format(reference)) # We now delete the old mmi file. Path(reference).unlink() logger.info("Old mmi deleted.") reference = new_reference # TODO: Fix the logging to just one of the two in use if not reference: time.sleep(throttle) continue loop_counter += 1 t0 = timer() r = 0 for read_info, read_id, seq_len, results in mapper.map_reads_2( caller.basecall_minknow( reads=client.get_read_chunks(batch_size=batch_size, last=True), signal_dtype=client.signal_dtype, prev_signal=previous_signal, decided_reads=decided_reads, ) ): r += 1 read_start_time = timer() channel, read_number = read_info if read_number not in tracker[channel]: tracker[channel].clear() tracker[channel][read_number] += 1 mode = "" exceeded_threshold = False below_threshold = False log_decision = lambda: cl.debug( l_string.format( loop_counter, r, read_id, channel, read_number, seq_len, tracker[channel][read_number], mode, getattr(conditions[run_info[channel]], mode, mode), conditions[run_info[channel]].name, below_threshold, exceeded_threshold, read_start_time, timer(), time.time(), ) ) # Control channels if conditions[run_info[channel]].control: mode = "control" log_decision() client.stop_receiving_read(channel, read_number) continue # This is an analysis channel # Below minimum chunks if tracker[channel][read_number] <= conditions[run_info[channel]].min_chunks: below_threshold = True # Greater than or equal to maximum chunks if tracker[channel][read_number] >= conditions[run_info[channel]].max_chunks: exceeded_threshold = True # No mappings if not results: mode = "no_map" hits = set() for result in results: pf.debug("{}\t{}\t{}".format(read_id, seq_len, result)) hits.add(result.ctg) if hits & conditions[run_info[channel]].targets: # Mappings and targets overlap coord_match = any( between(r.r_st, c) for r in results for c in conditions[run_info[channel]] .coords.get(strand_converter.get(r.strand), {}) .get(r.ctg, []) ) if len(hits) == 1: if coord_match: # Single match that is within coordinate range mode = "single_on" else: # Single match to a target outside coordinate range mode = "single_off" elif len(hits) > 1: if coord_match: # Multiple matches with at least one in the correct region mode = "multi_on" else: # Multiple matches to targets outside the coordinate range mode = "multi_off" else: # No matches in mappings if len(hits) > 1: # More than one, off-target, mapping mode = "multi_off" elif len(hits) == 1: # Single off-target mapping mode = "single_off" # This is where we make our decision: # Get the associated action for this condition decision_str = getattr(conditions[run_info[channel]], mode) # decision is an alias for the functions "unblock" or "stop_receiving" decision = decision_dict[decision_str] # If max_chunks has been exceeded AND we don't want to keep sequencing we unblock if exceeded_threshold and decision_str != "stop_receiving": mode = "exceeded_max_chunks_unblocked" client.unblock_read(channel, read_number, unblock_duration, read_id) # TODO: WHAT IS GOING ON?! # I think that this needs to change between enrichment and depletion # If under min_chunks AND any mapping mode seen we unblock # if below_threshold and mode in {"single_off", "multi_off"}: if below_threshold and mode in { "single_on", "single_off", "multi_on", "multi_off", }: mode = "below_min_chunks_unblocked" client.unblock_read(channel, read_number, unblock_duration, read_id) # proceed returns None, so we send no decision; otherwise unblock or stop_receiving elif decision is not None: decided_reads[channel] = read_id decision(channel, read_number) log_decision() t1 = timer() s1 = "Took {:.5f} to call and map {} reads" logger.info(s1.format(t1 - t0, r)) # limit the rate at which we make requests if t0 + throttle > t1: time.sleep(throttle + t0 - t1) else: caller.disconnect() logger.info("Finished analysis of reads as client stopped.")