def test_filter_sff_file(self):
    """filter_sff_file filters out bad reads."""
    try:
        fh = open(self.tiny_test)
    except IOError:
        self.fail(
            "Could not open test file %s. Skipping test" % self.tiny_test)

    # With no filters, all flowgrams should be in the out file
    flowgrams, header = lazy_parse_sff_handle(fh)
    filter_list = []
    fd, out_file_name = mkstemp(
        prefix="test_filter_sff_file",
        suffix=".sff.txt")
    close(fd)
    out_fh = open(out_file_name, "w")
    l = filter_sff_file(flowgrams, header, filter_list, out_fh)
    remove(out_file_name)
    fh.close()
    self.assertEqual(l, 114)

    # With moderate filters, some reads should survive
    fh = open(self.tiny_test)
    flowgrams, header = lazy_parse_sff_handle(fh)
    filter_list = [lambda f: within_length(f, 100, 300)]
    fd, out_file_name = mkstemp(
        prefix="test_filter_sff_file",
        suffix=".sff.txt")
    close(fd)
    out_fh = open(out_file_name, "w")
    l = filter_sff_file(flowgrams, header, filter_list, out_fh)
    remove(out_file_name)
    fh.close()
    self.assertEqual(l, 112)

    # With strict filters, nothing should remain
    fh = open(self.tiny_test)
    flowgrams, header = lazy_parse_sff_handle(fh)
    filter_list = [lambda f: within_length(f, 0, 0)]
    fd, out_file_name = mkstemp(
        prefix="test_filter_sff_file",
        suffix=".sff.txt")
    close(fd)
    out_fh = open(out_file_name, "w")
    l = filter_sff_file(flowgrams, header, filter_list, out_fh)
    remove(out_file_name)
    self.assertEqual(l, 0)
def __iter__(self):
    # make it read-only and reset to start of file
    self.write_mode = False
    self.fh.close()
    (self.flowgrams, self.header) = lazy_parse_sff_handle(
        open(self.filename))
    return self.flowgrams
def build_averaged_flowgrams(mapping, sff_fp,
                             min_coverage=50, out_fp=None):
    """Build averaged flowgrams for each cluster in mapping.

    mapping: a cluster mapping as dictionary of lists

    sff_fp: pointer to sff.txt file, must be consistent with mapping

    min_coverage: number of flowgrams to average over for each cluster

    out_fp: output file name

    NOTE: This function has no test code, since it is mostly IO
    around tested functions
    """
    l = len(mapping)
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    # update some values in the sff header
    header["# of Reads"] = l
    header["Index Length"] = "NA"

    if (out_fp):
        out_filename = out_fp
    else:
        fd, out_filename = mkstemp(dir="/tmp/",
                                   prefix="prefix_dereplicated",
                                   suffix=".sff.txt")
        close(fd)

    outhandle = open(out_filename, "w")

    # write out reduced flowgram set
    write_sff_header(header, outhandle)

    seqs = {}
    # get a random sample for each cluster
    sample_keys = sample_mapped_keys(mapping, min_coverage)
    for ave_f, id in _average_flowgrams(mapping, flowgrams, sample_keys):
        outhandle.write(ave_f.createFlowHeader() + "\n")
        ave_f.Bases = ave_f.toSeq()
        seqs[id] = ave_f.Bases

    outhandle.close()
    return (out_filename, seqs)
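
# Example sketch (illustrative only): how build_averaged_flowgrams might be
# called on a phase-I cluster mapping. The read ids and file paths are
# hypothetical placeholders; the sff.txt file would have to contain the
# mapped reads.
def _example_build_averaged_flowgrams():
    # cluster mapping: centroid id -> list of member ids (singletons map to [])
    mapping = {"FV9NWLF01EX0001": ["FV9NWLF01EX0002", "FV9NWLF01EX0003"],
               "FV9NWLF01EX0004": []}
    averaged_fp, seqs = build_averaged_flowgrams(
        mapping,
        "/tmp/phase1_clusters.sff.txt",   # hypothetical input path
        min_coverage=50,
        out_fp="/tmp/averaged.sff.txt")   # hypothetical output path
    # seqs maps each centroid id to the basecalled, averaged flowgram
    return averaged_fp, seqs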
def secondary_clustering(sff_file, mapping, bestscores, log_fh,
                         threshold=4.5, verbose=False):
    """Clusters sequences based on their best distance to any of the centroids.

    Does not actually compute distances but uses the results of the first
    phase stored in bestscores.

    sff_file: name of unclustered flowgram file

    mapping: preliminary mapping file, dictionary of ids to list of ids

    bestscores: dictionary that stores for each unclustered flowgram the
                best score it has to one of the centroids previously seen
                and the id of that centroid. Used in the second denoising
                phase.

    threshold: Secondary clustering threshold.
    """
    if (len(bestscores) == 0):
        # Either all sequences are already clustered or
        # we had no seq exceeding the bail out limit
        return

    (flowgrams, header) = lazy_parse_sff_handle(open(sff_file))

    counter = 0
    for f in flowgrams:
        (id, score) = bestscores[f.Name]
        if (score < threshold):
            counter += 1
            # update the mapping information
            mapping[id].extend(mapping[f.Name])
            mapping[id].append(f.Name)
            del (mapping[f.Name])
    if verbose:
        log_fh.write("Secondary clustering removed %d flowgrams\n" % counter)
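
# Example sketch (illustrative only): how secondary_clustering folds
# still-unclustered reads into existing clusters using bestscores. The read
# ids, scores and the sff.txt path are hypothetical; the file would need to
# contain exactly the reads that appear in bestscores.
def _example_secondary_clustering(log_fh):
    # mapping after phase II: centroid id -> member ids
    mapping = {"centroid_A": ["read_1"], "read_2": []}
    # bestscores: unclustered read -> (id of closest centroid, best score)
    bestscores = {"read_2": ("centroid_A", 1.2)}
    secondary_clustering("/tmp/unclustered.sff.txt", mapping, bestscores,
                         log_fh, threshold=4.5, verbose=True)
    # score 1.2 < threshold 4.5, so read_2 is merged into centroid_A
    # and removed as a key of its own from mapping
    return mapping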
def store_clusters(mapping, sff_fp, outdir="/tmp/", store_members=False):
    """Stores fasta and flowgram file for each cluster."""

    # get mapping read to cluster
    invert_map = invert_mapping(mapping)
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))

    leftover_fasta_fh = open(outdir + "/singletons.fasta", "w")
    centroids = []
    for f in flowgrams:
        try:
            key = invert_map[f.Name]
        except KeyError:
            # this flowgram has not been clustered
            continue
        if (len(mapping[key]) == 0):
            # do not store singletons in a separate cluster
            leftover_fasta_fh.write(f.toFasta() + "\n")
            continue
        elif (f.Name in mapping):
            # save as a centroid
            centroids.append((len(mapping[f.Name]) + 1, f.Name, f.toSeq()))

        if (store_members):
            flows_fh = open(outdir + key + ".flows", "a")
            fasta_fh = open(outdir + key + ".fasta", "a")
            flows_fh.write("%s\n" % f)
            fasta_fh.write(f.toFasta() + "\n")
            fasta_fh.close()
            flows_fh.close()

    leftover_fasta_fh.close()

    # sort and store ordered by cluster_size
    centroids.sort(reverse=True)
    centroid_fh = open(outdir + "/centroids.fasta", "w")
    for size, name, seq in centroids:
        centroid_fh.write(">%s | cluster size: %d \n%s\n" %
                          (name, size, seq))
    centroid_fh.close()
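
# Example sketch (illustrative only): writing per-cluster output with
# store_clusters. The mapping, read ids and paths are hypothetical, and the
# output directory is assumed to exist.
def _example_store_clusters():
    mapping = {"FV9NWLF01EX0001": ["FV9NWLF01EX0002"],
               "FV9NWLF01EX0003": []}
    store_clusters(mapping, "/tmp/denoised.sff.txt",
                   outdir="/tmp/clusters/", store_members=True)
    # writes centroids.fasta and singletons.fasta into outdir and, with
    # store_members=True, one .fasta/.flows file per cluster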
def preprocess(sff_fps, log_fh, fasta_fp=None, out_fp="/tmp/",
               verbose=False, squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: log messages are written to log_fh if it is set to something
            else than None

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This file is used to filter the flowgrams in sff_fps. Only
              reads in fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: a binary verbose flag

    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.

    primer: The primer sequence of the amplification process. This seq will
            be removed from all reads during the preprocessing
    """
    flowgrams, header = cat_sff_files(map(open, sff_fps))

    if (fasta_fp):
        # remove barcodes and sequences tossed by split_libraries,
        # i.e. not in fasta_fp
        labels = imap(lambda a_b: a_b[0], parse_fasta(open(fasta_fp)))
        barcode_mapping = extract_barcodes_from_mapping(labels)
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(
            flowgrams, header, outdir=out_fp,
            barcode_mapping=barcode_mapping, primer=primer)
        if verbose:
            log_fh.write("Sequences in barcode mapping: %d\n" %
                         len(barcode_mapping))
            log_fh.write("Truncated flowgrams written: %d\n" % l)
    else:
        # just do a simple clean and truncate
        (clean_sff_fp, l) = cleanup_sff(flowgrams, header, outdir=out_fp)
        if verbose:
            log_fh.write("Cleaned flowgrams written: %d\n" % l)
        flowgrams, header = lazy_parse_sff_handle(open(clean_sff_fp))
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(flowgrams, header,
                                                      outdir=out_fp,
                                                      primer=primer)
        if verbose:
            log_fh.write("Truncated flowgrams written: %d\n" % l)
        remove(clean_sff_fp)

    if (l == 0):
        raise ValueError("No flowgrams left after preprocessing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if verbose:
        log_fh.write("Filter flowgrams by prefix matching\n")

    (flowgrams, header) = lazy_parse_sff_handle(open(trunc_sff_fp))
    l, orig_l, mapping = prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping, trunc_sff_fp,
        # Averaging produces flowgrams that are "too good", such that the
        # greedy clustering clusters too much. Use the cluster centroid
        # instead by setting min_coverage to 1.
        min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)
    if verbose:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n"
                     % (orig_l - l, orig_l))
        log_fh.write("Remaining number of sequences: %d\n" % l)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, l, mapping, seqs)
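
# Example sketch (illustrative only): a typical phase-I call to preprocess
# for a single flowgram file. All paths are hypothetical placeholders;
# fasta_fp would be the output of split_libraries.py for the same run.
def _example_preprocess(log_fh):
    averaged_sff_fp, num_seqs, mapping, seqs = preprocess(
        ["/tmp/run1.sff.txt"], log_fh,
        fasta_fp="/tmp/split_libraries/seqs.fna",
        out_fp="/tmp/denoiser_out/",
        verbose=True,
        squeeze=False)
    return averaged_sff_fp, num_seqs, mapping, seqs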
def greedy_clustering(sff_fp, seqs, cluster_mapping, outdir, num_flows,
                      log_fh, num_cpus=1, on_cluster=False,
                      bail_out=1, pair_id_thresh=0.97, verbose=False,
                      threshold=3.75, fast_method=True,
                      error_profile=DENOISER_DATA_DIR +
                      'FLX_error_profile.dat',
                      max_num_rounds=None, checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (needs to be known before
               parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus: number of cpus to use if on_cluster == True
    on_cluster: run in parallel if True
    bail_out: stop clustering with first cluster having bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies
                    a seq identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    """
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores,
         sorted_keys) = read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)
        sorted_keys = sort_mapping_by_size(cluster_mapping)
        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is
    # spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if (key not in cluster_mapping):
            # this guy already has been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write("Max number of rounds reached. " +
                             "Aborting clustering phase II and continuing "
                             "with phase III.\n")
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if (prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break
        # the code
        if ('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write checkpoint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids,
                             bestscores, sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams, newl) = filter_with_flowgram(
            key, ideal_flow, flowgrams, header, ids, l, bestscores, log_fh,
            outdir, on_cluster=on_cluster, num_cpus=num_cpus,
            fast_method=fast_method, mapping=cluster_mapping,
            verbose=verbose, threshold=threshold,
            pair_id_thresh=pair_id_thresh, client_sockets=client_sockets,
            error_profile=error_profile, spread=spread)

        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if (newl == 0):
            # all flowgrams clustered
            break

        # JR: I think this is too much info for the regular user,
        # I leave it in, so we can simply turn it on for debugging
        # if log_fh:
        #     log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir, prefix="ff",
                                         suffix=".sff.txt")
    close(fd)
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if (f.Name in ids):
            non_clustered_fh.write(f.createFlowHeader() + "\n")

    return (non_clustered_filename, bestscores, cluster_mapping)