def test_write_sff_header(self):
    """write_sff_header writes a correct sff header"""
    expected = """Common Header:
Magic Number:\t0x2E736666
Version:\t0001
Index Offset:\t7773224
Index Length:\t93365
# of Reads:\t114
Header Length:\t440
Key Length:\t4
# of Flows:\t400
Flowgram Code:\t1
Flow Chars:\tTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
Key Sequence:\tTCAG
""".split('\n')

    header = {'Version': "0001",
              'Magic Number': '0x2E736666',
              'Index Offset': '7773224',
              'Index Length': '93365',
              '# of Reads': '114',
              'Header Length': '440',
              'Key Length': '4',
              '# of Flows': '400',
              'Flowgram Code': '1',
              'Flow Chars': 'TACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG',
              'Key Sequence': 'TCAG'}

    tmp_name = get_tmp_filename(prefix="test_write_sff_header")
    fh = open(tmp_name, "w")
    write_sff_header(header, fh, num=400)
    fh.close()

    fh = open(tmp_name, "U")
    lines = list(fh)
    remove(tmp_name)
    self.assertEqualItems(lines, map(lambda a: a + "\n", expected))
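
# For orientation, a minimal sketch of a header writer that would satisfy
# the test above -- NOT the real write_sff_header, only an illustration of
# the sff.txt "Common Header" format it is expected to produce. Treating
# `num` as an override for the "# of Flows" field is an assumption based on
# the num=400 argument in the test; field order does not matter because the
# test compares with assertEqualItems.
def write_sff_header_sketch(header, fh, num=None):
    if num is not None:
        header["# of Flows"] = num  # assumption: num overrides the flow count
    fh.write("Common Header:\n")
    for field, value in header.items():
        fh.write("%s:\t%s\n" % (field, value))
    fh.write("\n")  # the expected output ends with a blank line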
def __init__(self, header, outdir="/tmp/"): # set up output file self.filename = get_tmp_filename(tmp_dir=outdir, prefix="fc", suffix=".sff.txt") self.fh = open(self.filename, "w") write_sff_header(header, self.fh) self.write_mode = True
def __init__(self, header, outdir="/tmp/"): #set up output file self.filename = get_tmp_filename(tmp_dir=outdir, prefix="fc", suffix=".sff.txt") self.fh = open(self.filename, "w") write_sff_header(header, self.fh) self.write_mode = True
def build_averaged_flowgrams(mapping, sff_fp, min_coverage=50, out_fp=None):
    """Build averaged flowgrams for each cluster in mapping.

    mapping: a cluster mapping as dictionary of lists

    sff_fp: pointer to sff.txt file, must be consistent with mapping

    min_coverage: number of flowgrams to average over for each cluster

    out_fp: output file name

    NOTE: This function has no test code, since it is mostly IO
    around tested functions
    """
    l = len(mapping)
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    # update some values in the sff header
    header["# of Reads"] = l
    header["Index Length"] = "NA"

    if (out_fp):
        out_filename = out_fp
    else:
        out_filename = get_tmp_filename(tmp_dir="/tmp/",
                                        prefix="prefix_dereplicated",
                                        suffix=".sff.txt")

    outhandle = open(out_filename, "w")

    # write out reduced flowgram set
    write_sff_header(header, outhandle)

    seqs = {}
    # get a random sample for each cluster
    sample_keys = sample_mapped_keys(mapping, min_coverage)
    for ave_f, id in _average_flowgrams(mapping, flowgrams, sample_keys):
        outhandle.write(ave_f.createFlowHeader() + "\n")
        ave_f.Bases = ave_f.toSeq()
        seqs[id] = ave_f.Bases

    outhandle.close()
    return (out_filename, seqs)
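
# Hypothetical example call (file names and cluster ids are made up for
# illustration; `mapping` maps a cluster centroid id to the list of its
# member ids, as described in the docstring above):
#
#   mapping = {"FLP3FBN01ELBSX": ["FLP3FBN01DEWVU", "FLP3FBN01BYOQA"],
#              "FLP3FBN01DGFYQ": []}
#   (out_fp, seqs) = build_averaged_flowgrams(mapping, "reads.sff.txt",
#                                             min_coverage=50,
#                                             out_fp="/tmp/averaged.sff.txt")
#   # seqs maps each centroid id to its averaged, base-called sequence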
def greedy_clustering(sff_fp, seqs, cluster_mapping, outdir, num_flows,
                      log_fh, num_cpus=1, on_cluster=False,
                      bail_out=1, pair_id_thresh=0.97, verbose=False,
                      threshold=3.75, fast_method=True,
                      error_profile=DENOISER_DATA_DIR+'FLX_error_profile.dat',
                      max_num_rounds=None, checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to know before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus: number of cpus to use if on_cluster == True
    on_cluster: run in parallel if True
    bail_out: stop clustering with first cluster having bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies
                    a seq identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    """
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores,
         sorted_keys) = read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)
        sorted_keys = sort_mapping_by_size(cluster_mapping)
        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if (not cluster_mapping.has_key(key)):
            # this guy already has been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write("Max number of rounds reached. " +
                             "Aborting clustering phase II and continuing with phase III.\n")
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if (prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break the code
        if ('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write checkpoint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams, newl) = filter_with_flowgram(key, ideal_flow, flowgrams,
                                                     header, ids, l, bestscores,
                                                     log_fh, outdir,
                                                     on_cluster=on_cluster,
                                                     num_cpus=num_cpus,
                                                     fast_method=fast_method,
                                                     mapping=cluster_mapping,
                                                     verbose=verbose,
                                                     threshold=threshold,
                                                     pair_id_thresh=pair_id_thresh,
                                                     client_sockets=client_sockets,
                                                     error_profile=error_profile,
                                                     spread=spread)
        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if (newl == 0):
            # all flowgrams clustered
            break

        # JR: I think this is too much info for the regular user, I leave it
        # in, so we can simply turn it on for debugging
        # if log_fh:
        #     log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    non_clustered_filename = get_tmp_filename(tmp_dir=outdir, prefix="ff",
                                              suffix=".sff.txt")
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if (ids.has_key(f.Name)):
            non_clustered_fh.write(f.createFlowHeader() + "\n")

    return (non_clustered_filename, bestscores, cluster_mapping)