    def test_stop_workers(self):
        """stop_workers terminates all clients"""
        workers, client_sockets = self._setup_server_and_clients()

        stop_workers(client_sockets)
        # all sockets must be closed afterwards; sending on them should fail
        for client_socket in client_sockets:
            self.assertRaises(error, client_socket.send, "hello")
    def test_stop_workers_on_closed_socket(self):
        """stop_workers warns about, but survives, an already closed socket"""
        # Repeat the previous test, but close one of the sockets early.
        # Simulates a crashed client.
        workers, client_sockets = self._setup_server_and_clients()
        client_sockets[-1].close()

        fake_fh = StringIO()
        stop_workers(client_sockets, fake_fh)

        self.assertEqual(fake_fh.getvalue(),
                         "Worker 3 seems to be dead already. Check for runaways!\n")
        for client_socket in client_sockets:
            self.assertRaises(error, client_socket.send, "hello")
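    # For reference, a minimal sketch of the contract the two tests above pin
    # down (hypothetical — the real stop_workers lives in the cluster utils
    # module and may differ in detail): each worker socket gets a shutdown
    # message, a socket that is already dead triggers the warning checked
    # above, and every socket is closed so that later sends raise error.
    #
    #     def stop_workers(client_sockets, error_fh=sys.stderr):
    #         for i, sock in enumerate(client_sockets):
    #             try:
    #                 sock.send("Exit")   # ask the worker to shut down
    #             except error:           # socket.error: worker died early
    #                 error_fh.write("Worker %d seems to be dead already. "
    #                                "Check for runaways!\n" % i)
    #             sock.close()            # sending on it now raises error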
    def test_get_flowgram_distances_on_cluster(self):
        """get_flowgram_distances_on_cluster computes the correct alignment score."""
        self.tmp_dir = mkdtemp(dir="./", suffix="/")

        # set up server and workers
        self.socket = setup_server()
        workers, client_sockets = setup_workers(1, self.tmp_dir, self.socket,
                                                verbose=False)
        client_sockets = [a for a, b in client_sockets]

        scores, names, fc = get_flowgram_distances_on_cluster(
            "1", self.flowgram, self.flowgrams, FlowgramContainerArray(),
            {"FZTHQMS01CIW5N": ""}, 1, 1, [1], client_sockets)

        stop_workers(client_sockets)

        self.assertEqual(names, ["FZTHQMS01CIW5N"])
        assert_almost_equal(scores, [[4.95274923, 0.7815385]], decimal=4)
def greedy_clustering(sff_fp, seqs, cluster_mapping, outdir, num_flows,
                      log_fh, num_cpus=1, on_cluster=False,
                      bail_out=1, pair_id_thresh=0.97, verbose=False,
                      threshold=3.75, fast_method=True,
                      error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                      max_num_rounds=None, checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to know before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus: number of cpus to use if on_cluster == True
    on_cluster: run in parallel if True
    bail_out: stop clustering with first cluster having bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies
                    a seq identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    """
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores,
         sorted_keys) = read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)
        sorted_keys = sort_mapping_by_size(cluster_mapping)
        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if (key not in cluster_mapping):
            # this guy already has been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write("Max number of rounds reached. "
                             "Aborting clustering phase II and "
                             "continuing with phase III.\n")
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if (prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break
        # the code
        if ('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write checkpoint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams, newl) = filter_with_flowgram(
            key, ideal_flow, flowgrams, header, ids, l, bestscores, log_fh,
            outdir, on_cluster=on_cluster, num_cpus=num_cpus,
            fast_method=fast_method, mapping=cluster_mapping, verbose=verbose,
            threshold=threshold, pair_id_thresh=pair_id_thresh,
            client_sockets=client_sockets, error_profile=error_profile,
            spread=spread)

        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if (newl == 0):
            # all flowgrams clustered
            break

        # JR: I think this is too much info for the regular user, I leave it
        # in, so we can simply turn it on for debugging
        # if log_fh:
        #     log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir, prefix="ff",
                                         suffix=".sff.txt")
    close(fd)
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if (f.Name in ids):
            non_clustered_fh.write(f.createFlowHeader() + "\n")

    return (non_clustered_filename, bestscores, cluster_mapping)