def get_flowgram_distances(id, flowgram, flowgrams, fc, ids, outdir,
                           error_profile=DENOISER_DATA_DIR +
                           'FLX_error_profile.dat'):
    """Computes distance scores of flowgram to all flowgrams in parser.

    id: The flowgram identifier, also used to name intermediate files

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterable filehandle of flowgram file

    fc: a sink for flowgrams, either a FlowgramContainerArray or
        FlowgramContainerFile object

    ids: dict of ids of flowgrams in flowgrams that should be aligned

    outdir: directory for intermediate files

    error_profile: path to error profile *.dat file
    """
    check_flowgram_ali_exe()

    # File that serves as input for the external alignment program
    (fh, tmpfile) = init_flowgram_file(prefix=outdir)
    append_to_flowgram_file(id, flowgram, fh)

    k = 0
    names = []
    for f in flowgrams:
        if f.Name in ids:
            fc.add(f)
            append_to_flowgram_file(f.Name, f, fh, trim=False)
            k += 1
            names.append(f.Name)
    fh.close()

    # TODO: capture stderr and warn user
    scores_fh = popen("%s -relscore_pairid %s %s " %
                      (get_flowgram_ali_exe(), error_profile, tmpfile), 'r')
    scores = [map(float, s.split()) for s in scores_fh if s != "\n"]

    if k != len(scores):
        raise RuntimeError(
            "Something bad has happened! I received fewer alignment scores "
            "than there are flowgrams. Most likely this means that the "
            "alignment program is not set up correctly or is corrupted. "
            "Please run the test scripts to figure out the cause of the error.")

    remove(tmpfile)
    return (scores, names, fc)
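# --- Hedged usage sketch (not part of the original module) -------------------
# Illustrates how get_flowgram_distances() is typically driven during one
# clustering round: one seed flowgram is aligned against every candidate whose
# name appears in `ids`. The names `seed`, `candidates` and `container` below
# are placeholders, not symbols from this module; in the real pipeline they
# come from the SFF flowgram parser and a FlowgramContainerArray or
# FlowgramContainerFile instance.
#
#   ids = dict((f.Name, 0) for f in candidates)   # restrict alignment to these
#   scores, names, container = get_flowgram_distances(
#       seed.Name, seed, candidates, container, ids, outdir="/tmp")
#   # scores[i] holds the relative pairwise alignment score(s) for names[i]
# ------------------------------------------------------------------------------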
def test_denoiser_supported_version(self):
    """denoiser aligner is ready to use"""
    pass_test = True
    try:
        check_flowgram_ali_exe()
    except (ApplicationNotFoundError, ApplicationError):
        pass_test = False

    self.assertTrue(pass_test,
                    "Denoiser flowgram aligner not found or not executable. "
                    "This may or may not be a problem depending on "
                    "which components of QIIME you plan to use.")
def setUp(self):
    # abort all tests without the alignment binary
    check_flowgram_ali_exe()

    signal.signal(signal.SIGALRM, timeout)
    # set the 'alarm' to go off in allowed_seconds seconds
    signal.alarm(allowed_seconds_per_test)

    self.test_dir = "denoiser_main_test" + make_tmp_name() + "/"

    self.expected = ">FS8APND01D3TW3 | cluster size: 94 \nCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCC\n"

    self.expected_map_string = """FS8APND01EWRS4:
FS8APND01BSTVP:
FS8APND01DXG45:
FS8APND01D3TW3:\tFS8APND01CSXFN\tFS8APND01DQ8MX\tFS8APND01DY7QW\tFS8APND01B5QNI\tFS8APND01CQ6OG\tFS8APND01C7IGN\tFS8APND01DHSGH\tFS8APND01DJ17E\tFS8APND01CUXOA\tFS8APND01EUTYG\tFS8APND01EKK7T\tFS8APND01D582W\tFS8APND01B5GWU\tFS8APND01D7N2A\tFS8APND01BJGHZ\tFS8APND01D6DYZ\tFS8APND01C6ZIM\tFS8APND01D2X6Y\tFS8APND01BUYCE\tFS8APND01BNUEY\tFS8APND01DKLOE\tFS8APND01C24PP\tFS8APND01EBWQX\tFS8APND01ELDYW\tFS8APND01B0GCS\tFS8APND01D4QXI\tFS8APND01EMYD9\tFS8APND01EA2SK\tFS8APND01DZOSO\tFS8APND01DHYAZ\tFS8APND01C7UD9\tFS8APND01BTZFV\tFS8APND01CR78R\tFS8APND01B39IE\tFS8APND01ECVC0\tFS8APND01DM3PL\tFS8APND01DELWS\tFS8APND01CIEK8\tFS8APND01D7ZOZ\tFS8APND01CZSAI\tFS8APND01DYOVR\tFS8APND01BX9XY\tFS8APND01DEWJA\tFS8APND01BEKIW\tFS8APND01DCKB9\tFS8APND01EEYIS\tFS8APND01DDKEA\tFS8APND01DSZLO\tFS8APND01C6EBC\tFS8APND01EE15M\tFS8APND01ELO9B\tFS8APND01C58QY\tFS8APND01DONCG\tFS8APND01DVXX2\tFS8APND01BL5YT\tFS8APND01BIL2V\tFS8APND01EBSYQ\tFS8APND01CCX8R\tFS8APND01B2YCJ\tFS8APND01B1JG4\tFS8APND01DJ024\tFS8APND01BIJY0\tFS8APND01CIA4G\tFS8APND01DV74M\tFS8APND01ECAX5\tFS8APND01DC3TZ\tFS8APND01EJVO6\tFS8APND01D4VFG\tFS8APND01DYYYO\tFS8APND01D1EDD\tFS8APND01DQUOT\tFS8APND01A2NSJ\tFS8APND01DDC8I\tFS8APND01BP1T2\tFS8APND01DPY6U\tFS8APND01CIQGV\tFS8APND01BPUT8\tFS8APND01BDNH4\tFS8APND01DOZDN\tFS8APND01DS866\tFS8APND01DGS2J\tFS8APND01EDK32\tFS8APND01EPA0T\tFS8APND01CK3JM\tFS8APND01BKLWW\tFS8APND01DV0BO\tFS8APND01DPNXE\tFS8APND01B7LUA\tFS8APND01BTTE2\tFS8APND01CKO4X\tFS8APND01DGGBY\tFS8APND01C4NHX\tFS8APND01DYPQN
FS8APND01EFK0W:
FS8APND01DCIOO:
FS8APND01CKOMZ:
"""

    self.expected_titanium_map_string = """FS8APND01EWRS4:
def denoise_per_sample(sff_fps, fasta_fp, tmpoutdir, cluster=False,
                       num_cpus=1, squeeze=True, percent_id=0.97, bail=1,
                       primer="", low_cutoff=3.75, high_cutoff=4.5,
                       log_fp="denoiser.log", low_memory=False, verbose=False,
                       error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                       max_num_rounds=None, titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for the global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iterations: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)

    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []

    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)

        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster,
                     num_cpus, squeeze, percent_id, bail, primer,
                     low_cutoff, high_cutoff, log_fp, low_memory,
                     verbose, error_profile, max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)

    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping), fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(
            parse_fasta(open(singleton_file, "r")), fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
def denoise_seqs(sff_fps, fasta_fp, tmpoutdir, preprocess_fp=None,
                 cluster=False, num_cpus=1, squeeze=True, percent_id=0.97,
                 bail=1, primer="", low_cutoff=3.75, high_cutoff=4.5,
                 log_fp="denoiser.log", low_memory=False, verbose=False,
                 error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                 max_num_rounds=None, titanium=False, checkpoint_fp=None):
    """The main routine to denoise flowgrams"""

    # abort if binary is missing
    check_flowgram_ali_exe()

    if verbose:
        # switch off buffering for log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)
    else:
        log_fh = None

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
        if checkpoint_fp:
            log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
        log_fh.write("Primer sequence: %s\n" % primer)
        log_fh.write("Running on cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
        log_fh.write("High cut-off: %.2f\n" % high_cutoff)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iterations: %s\n\n" % max_num_rounds)

    # here we go ...

    # Phase I - clean up and truncate input sff
    if checkpoint_fp:
        if preprocess_fp:
            # we already have preprocessed data, so skip preprocessing and use it
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(preprocess_fp)
        else:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")
    else:
        if preprocess_fp:
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(preprocess_fp)
        elif cluster:
            preprocess_on_cluster(sff_fps, log_fp, fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir, verbose=verbose,
                                  squeeze=squeeze, primer=primer)
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, l, mapping, seqs) = \
                preprocess(sff_fps, log_fh, fasta_fp=fasta_fp,
                           out_fp=tmpoutdir, verbose=verbose,
                           squeeze=squeeze, primer=primer)

    # the preprocessor writes into the same file, so better jump to end of file
    if verbose:
        log_fh.close()
        log_fh = open(tmpoutdir + "/" + log_fp, "a", 0)

    # Phase II:
    # use prefix map based clustering as initial centroids and greedily
    # add flowgrams to clusters with a low threshold
    (new_sff_file, bestscores, mapping) = \
        greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir, l,
                          log_fh, num_cpus=num_cpus, on_cluster=cluster,
                          bail_out=bail, pair_id_thresh=percent_id,
                          threshold=low_cutoff, verbose=verbose,
                          fast_method=not low_memory,
                          error_profile=error_profile,
                          max_num_rounds=max_num_rounds,
                          checkpoint_fp=checkpoint_fp)

    # Phase III:
    # Assign seqs to the nearest existing centroid with a high threshold
    secondary_clustering(new_sff_file, mapping, bestscores, log_fh,
                         verbose=verbose, threshold=high_cutoff)
    remove(new_sff_file)

    if verbose:
        log_fh.write("Finished clustering\n")
        log_fh.write("Writing Clusters\n")
        log_fh.write(make_stats(mapping) + "\n")

    store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
    store_mapping(mapping, tmpoutdir, "denoiser")
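# --- Hedged usage sketch (not part of the original module) -------------------
# Shows how the two entry points above are intended to be called. The SFF and
# FASTA file names and the output directories below are placeholders; the real
# QIIME driver scripts pass these in from the command line.
if __name__ == "__main__":
    from os import makedirs
    from os.path import exists

    for out_dir in ("/tmp/denoiser_run1", "/tmp/denoiser_per_sample"):
        if not exists(out_dir):
            makedirs(out_dir)

    # denoise all reads of the run in one pass ...
    denoise_seqs(["run1.sff.txt"], "run1.fna", "/tmp/denoiser_run1",
                 num_cpus=2, verbose=True)

    # ... or denoise each sample of the run separately
    denoise_per_sample(["run1.sff.txt"], "run1.fna",
                       "/tmp/denoiser_per_sample", num_cpus=2, verbose=True)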
def get_flowgram_distances_on_cluster(id, flowgram, flowgrams, fc, ids,
                                      num_cores, num_flows, spread,
                                      client_sockets=[]):
    """Computes distance scores of flowgram to all flowgrams in parser.

    id: The flowgram identifier, also used to name intermediate files

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterable filehandle of flowgram file

    fc: a sink of flowgrams, which serves as source in the next round

    ids: list of flowgram ids that should be used from flowgrams

    num_cores: number of cpus

    num_flows: Number of flows in parser

    client_sockets: A list of open sockets for client-server communication

    spread: historical distribution of processing runtimes
    """
    epoch = time()

    check_flowgram_ali_exe()

    qiime_config = load_qiime_config()
    min_per_core = int(qiime_config['denoiser_min_per_core'])
    # if using 'from __future__ import division' this has to be checked,
    # as we want true integer division here
    per_core = max(min_per_core, (num_flows / num_cores) + 1)

    names = []
    scores = []

    # Need to call this here, since we iterate over the same iterator repeatedly.
    # Otherwise the call in ifilter would reset the iterator by implicitly
    # calling __iter__.
    # TODO: test if iter() does the same
    flowgrams_iter = flowgrams.__iter__()

    # prepare input files and commands
    # synchronous client-server communication
    workload = compute_workload(num_cores, num_flows, spread)

    debug_count = 0
    for i in range(num_cores):
        socket = client_sockets[i]
        # send master flowgram to file first
        send_flowgram_to_socket(id, flowgram, socket)

        if workload[i] < 1:
            # no data left for this poor guy
            save_send(socket, "--END--")
            continue
        else:
            # Then add all others that are still valid, i.e. in ids
            for (k, f) in izip(range(workload[i]),
                               ifilter(lambda f: f.Name in ids,
                                       flowgrams_iter)):
                fc.add(f)
                send_flowgram_to_socket(k, f, socket, trim=False)
                names.append(f.Name)
                debug_count += 1
            # send the termination signal
            save_send(socket, "--END--")

    # asynchronous client-server communication
    # ClientHandlers write data into results
    results = [None] * num_cores
    timing = [0.0 for x in xrange(num_cores)]

    for i in range(num_cores):
        socket = client_sockets[i]
        ClientHandler(socket, i, results, timing)
    loop()
    # end asynchronous loop

    spread = adjust_processing_time(num_cores, workload, timing, epoch)

    # flatten list
    scores = [item for sublist in results for item in sublist]

    if debug_count != len(scores):
        raise RuntimeError(
            "Something bad has happened! I received fewer alignment scores "
            "(%d) than there are flowgrams (%d). Most likely this means that "
            "the alignment program is not set up correctly or is corrupted. "
            "Please run the test scripts to figure out the cause of the error."
            % (len(scores), debug_count))

    return (scores, names, fc)