    def test_adjust_workers(self):
        """adjust_workers stops clients"""
        workers, client_sockets = self._setup_server_and_clients()
        last_sock = client_sockets[-1]

        qiime_config = load_qiime_config()
        min_per_core = int(qiime_config['denoiser_min_per_core'])

        # no sockets get stopped
        self.assertEqual(
            adjust_workers(4 * min_per_core - 1, 4, client_sockets), 4)
        # if we can send something the socket is still alive
        self.assertEqual(last_sock.send("Hello"), 5)

        # now, kill one client
        self.assertEqual(
            adjust_workers(3 * min_per_core - 1, 4, client_sockets), 3)
        # socket should be closed
        self.assertRaises(error, last_sock.send, "Hello")
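    # For context, a minimal sketch of the scale-down policy the test above
    # encodes. This is an assumption inferred from the assertions, not the
    # actual library implementation: a worker is released while the remaining
    # flowgrams could not keep even (num_cpus - 1) workers busy at
    # denoiser_min_per_core flowgrams each.
    @staticmethod
    def _adjust_workers_sketch(num_flows, num_cpus, client_sockets,
                               log_fh=None):
        min_per_core = int(load_qiime_config()['denoiser_min_per_core'])
        # drop one worker at a time; stop as soon as every surviving
        # worker would again get at least its minimum share
        while num_cpus > 1 and num_flows < (num_cpus - 1) * min_per_core:
            worker_sock = client_sockets.pop()  # release the last client
            worker_sock.close()
            num_cpus -= 1
            if log_fh:
                log_fh.write("Stopped one worker, %d remaining\n" % num_cpus)
        return num_cpus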
def greedy_clustering(sff_fp, seqs, cluster_mapping, outdir, num_flows,
                      log_fh, num_cpus=1, on_cluster=False,
                      bail_out=1, pair_id_thresh=0.97, verbose=False,
                      threshold=3.75, fast_method=True,
                      error_profile=DENOISER_DATA_DIR +
                      'FLX_error_profile.dat',
                      max_num_rounds=None, checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file

    seqs: fasta seqs corresponding to sff_fp

    cluster_mapping: preliminary cluster mapping from phase I

    outdir: output directory

    num_flows: number of flowgrams in sff_fp (need to know before parsing
               sff_fp)

    log_fh: write verbose info to log_fh if set

    num_cpus: number of cpus to use if on_cluster == True

    on_cluster: run in parallel if True

    bail_out: stop clustering with the first cluster having bail_out members

    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies
                    a seq identity of pair_id_thresh or higher

    verbose: be verbose or not

    threshold: low clustering threshold for phase II

    fast_method: use more memory-intensive but faster method

    error_profile: path to error profile *.dat file

    max_num_rounds: if set, stop clustering after this many rounds
    """
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores,
         sorted_keys) = read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences;
        # we initialize it with the ids from the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)

        sorted_keys = sort_mapping_by_size(cluster_mapping)

        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is
    # spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if (key not in cluster_mapping):
            # this guy has already been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write("Max number of rounds reached. " +
                             "Aborting clustering phase II and continuing "
                             "with phase III.\n")
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if (prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break
        # the code
        if ('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before the expensive computation starts.
        # Currently, write a checkpoint every 50 rounds; this could easily
        # be changed here or exposed to the command line.
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids,
                             bestscores, sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams, newl) = filter_with_flowgram(
            key, ideal_flow, flowgrams, header, ids, l, bestscores,
            log_fh, outdir,
            on_cluster=on_cluster,
            num_cpus=num_cpus, fast_method=fast_method,
            mapping=cluster_mapping,
            verbose=verbose, threshold=threshold,
            pair_id_thresh=pair_id_thresh,
            client_sockets=client_sockets,
            error_profile=error_profile, spread=spread)

        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if (newl == 0):
            # all flowgrams clustered
            break

        # JR: I think this is too much info for the regular user; I leave it
        # in, so we can simply turn it on for debugging
        # if log_fh:
        #     log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into a file for the next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir, prefix="ff",
                                         suffix=".sff.txt")
    close(fd)
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if (f.Name in ids):
            non_clustered_fh.write(f.createFlowHeader() + "\n")

    return (non_clustered_filename, bestscores, cluster_mapping)