def get_flowgram_distances_on_cluster(id, flowgram, flowgrams, fc, ids,
                                      num_cores, num_flows, spread,
                                      client_sockets=[]):
    """Computes distance scores of flowgram to all flowgrams in parser.

    id: The flowgram identifier, also used to name intermediate files

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterable filehandle of flowgram file

    fc: a sink of flowgrams, which serves as source in the next round

    ids: list of flowgram ids that should be used from flowgrams

    num_cores: number of cpus

    num_flows: Number of flows in parser

    client_sockets: A list of open sockets for client-server communication

    spread: historical distribution of processing runtimes

    Returns a tuple (scores, names, fc): the flat list of alignment scores
    gathered from all workers, the flowgram names that were sent, and the
    (mutated) flowgram sink fc.
    """
    epoch = time()

    check_flowgram_ali_exe()

    qiime_config = load_qiime_config()
    min_per_core = int(qiime_config['denoiser_min_per_core'])

    # Floor division (//) so this stays true integer division even under
    # "from __future__ import division" (the old comment flagged exactly
    # this hazard with plain /).
    # NOTE(review): per_core appears unused below — workload comes from
    # compute_workload() instead; kept for parity with the original.
    per_core = max(min_per_core, (num_flows // num_cores) + 1)

    names = []

    # Grab the iterator once, since we iterate over the same iterator
    # repeatedly across workers; otherwise ifilter would restart it by
    # implicitly calling __iter__ each time.
    flowgrams_iter = flowgrams.__iter__()

    # prepare input files and commands
    # synchronous client-server communication
    workload = compute_workload(num_cores, num_flows, spread)

    debug_count = 0
    for i in range(num_cores):
        # renamed from `socket` to avoid shadowing the stdlib module name
        sock = client_sockets[i]
        # send master flowgram to file first
        send_flowgram_to_socket(id, flowgram, sock)

        if workload[i] < 1:
            # no data left for this poor guy
            save_send(sock, "--END--")
            continue

        # Then add all others which are still valid, i.e. in ids.
        # izip bounds the draw at this worker's quota; the shared iterator
        # continues where the previous worker left off.
        for (k, f) in izip(range(workload[i]),
                           ifilter(lambda f: f.Name in ids, flowgrams_iter)):
            fc.add(f)
            send_flowgram_to_socket(k, f, sock, trim=False)
            names.append(f.Name)
            debug_count += 1
        # send the termination signal
        save_send(sock, "--END--")

    # asynchronous client-server communication
    # ClientHandlers write data in results
    results = [None] * num_cores
    timing = [0.0 for _ in xrange(num_cores)]
    for i in range(num_cores):
        ClientHandler(client_sockets[i], i, results, timing)
    loop()
    # end asynchronous loop

    spread = adjust_processing_time(num_cores, workload, timing, epoch)

    # flatten the per-core score lists (renamed the loop variable, which
    # previously shadowed the builtin `list`)
    scores = [item for sublist in results for item in sublist]

    if debug_count != len(scores):
        # The guard fires on any mismatch, not just a shortfall, so the
        # message reports both counts neutrally.
        raise RuntimeError(
            "Something bad has happened! I received %d alignment scores "
            "but sent %d flowgrams. Most likely this means that the "
            "alignment program is not setup correctly or corrupted. "
            "Please run the test scripts to figure out the cause of the "
            "error." % (len(scores), debug_count))

    return (scores, names, fc)
def get_flowgram_distances_on_cluster(id, flowgram, flowgrams, fc, ids,
                                      num_cores, num_flows, spread,
                                      client_sockets=[]):
    """Computes distance scores of flowgram to all flowgrams in parser.

    id: The flowgram identifier, also used to name intermediate files

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterable filehandle of flowgram file

    fc: a sink of flowgrams, which serves as source in the next round

    ids: list of flowgram ids that should be used from flowgrams

    num_cores: number of cpus

    num_flows: Number of flows in parser

    client_sockets: A list of open sockets for client-server communication

    spread: historical distribution of processing runtimes
    """
    # NOTE(review): this is a second definition of the same function seen
    # earlier in this file; at import time it overrides the first — confirm
    # whether one of the two copies should be removed.
    epoch = time()
    check_flowgram_ali_exe()

    qiime_config = load_qiime_config()
    min_per_core = int(qiime_config['denoiser_min_per_core'])
    # if using from future import division this has to be checked,
    # as we want true integer division here
    per_core = max(min_per_core, (num_flows / num_cores) + 1)

    names = []
    scores = []

    # Take the iterator once up front so successive workers continue from
    # where the previous one stopped; ifilter would otherwise restart it
    # via an implicit __iter__ call.
    flow_it = iter(flowgrams)

    # synchronous phase: distribute the work over the open sockets
    workload = compute_workload(num_cores, num_flows, spread)

    sent_count = 0
    for core_idx in range(num_cores):
        conn = client_sockets[core_idx]
        # the master flowgram always goes out first
        send_flowgram_to_socket(id, flowgram, conn)

        if workload[core_idx] < 1:
            # this worker got no share of the data
            save_send(conn, "--END--")
            continue

        # forward every still-valid flowgram (Name in ids), capped at this
        # worker's quota by izip
        valid = ifilter(lambda g: g.Name in ids, flow_it)
        for idx, flow in izip(range(workload[core_idx]), valid):
            fc.add(flow)
            send_flowgram_to_socket(idx, flow, conn, trim=False)
            names.append(flow.Name)
            sent_count += 1
        # tell the worker it has everything
        save_send(conn, "--END--")

    # asynchronous phase: ClientHandlers fill `results` per core
    results = [None] * num_cores
    timing = [0.0 for _ in xrange(num_cores)]
    for core_idx in range(num_cores):
        ClientHandler(client_sockets[core_idx], core_idx, results, timing)
    loop()
    # end asynchronous loop

    spread = adjust_processing_time(num_cores, workload, timing, epoch)

    # flatten the per-core score lists into one flat list
    scores = [item for core_scores in results for item in core_scores]

    if sent_count != len(scores):
        raise RuntimeError(
            "Something bad has happened! I received less " +
            "alignment scores %d than there are flowgrams %d. Most likely this " % (len(scores), sent_count) +
            "means that the alignment program is not setup correctly or corrupted. " +
            "Please run the test scripts to figure out the cause of the error."
        )

    return (scores, names, fc)