def test_cat_sff_files(self):
    """cat_sff_files cats sff_files"""
    expected_bases = "tcagGCTAACTGTAACCCTCTTGGCACCCACTAAACGCCAATCTTGCTGGAG" +\
                     "TGTTTACCAGGCACCCAGCAATGTGAATAGTCActgagcgggctggcaaggc"

    # works with no file
    obs_flows, obs_header = cat_sff_files([])
    self.assertEqual(len(obs_flows), 0)
    self.assertEqual(obs_header, None)

    # works with one file
    obs_flows, obs_header = cat_sff_files([sff_file])
    obs_flows = list(obs_flows)
    self.assertEqual(obs_header['Magic Number'], "0x2E736666")
    self.assertEqual(obs_flows[0].Bases, expected_bases)
    self.assertEqual(len(obs_flows), 2)

    # works with two files
    obs_flows, obs_header = cat_sff_files([sff_file, sff_file])
    obs_flows = list(obs_flows)
    self.assertEqual(obs_header['Magic Number'], "0x2E736666")
    self.assertEqual(obs_flows[0].Bases, expected_bases)
    self.assertEqual(obs_flows[2].Bases, expected_bases)
    self.assertEqual(len(obs_flows), 4)
def split_sff(sff_file_handles, map_file_handle, outdir="/tmp/"):
    """Splits a sff.txt file on barcode/mapping file."""
    try:
        (flowgrams, header) = cat_sff_files(sff_file_handles)
    except ValueError:
        # reading in the binary sff usually shows up as ValueError
        raise FileFormatError(
            'Wrong flowgram file format. Make sure you pass the sff.txt '
            'format produced by sffinfo. The binary .sff will not work here.')

    (inverse_map, map_count) = build_inverse_barcode_map(
        parse_fasta(map_file_handle))

    filenames = []
    # we might have many barcodes and reach the Python open file limit,
    # therefore we go the slow way and open and close files each time

    # First set up all files with the headers only
    for barcode_id in map_count.keys():
        fh = open(outdir + barcode_id, "w")
        write_sff_header(header, fh, map_count[barcode_id])
        fh.close()
        filenames.append(outdir + barcode_id)

    # Then direct each flowgram into its barcode file
    for f in flowgrams:
        if f.Name in inverse_map:
            barcode_id = inverse_map[f.Name]
            fh = open(outdir + barcode_id, "a")
            fh.write(f.createFlowHeader() + "\n")
            fh.close()
    return filenames
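# Hedged usage sketch for split_sff (not part of the original module). The
# file names "reads.sff.txt" and "seqs.fna" are hypothetical placeholders:
# the first stands for the text output of sffinfo (binary .sff is rejected
# above), the second for a split_libraries-style fasta file whose labels
# carry the barcode assignment that build_inverse_barcode_map expects.
def _example_split_sff():
    with open("reads.sff.txt") as sff_fh, open("seqs.fna") as map_fh:
        per_barcode_files = split_sff([sff_fh], map_fh, outdir="/tmp/")
    print("Per-sample flowgram files: %s" % ", ".join(per_barcode_files))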
def main(commandline_args=None):
    parser, opts, args = parse_command_line_parameters(**script_info)

    if not opts.sff_fp:
        parser.error('Required option flowgram file path (-i) not specified')
    elif not files_exist(opts.sff_fp):
        parser.error('Flowgram file path does not exist:\n %s \n'
                     ' Pass a valid one via -i.' % opts.sff_fp)

    if opts.checkpoint_fp:
        bp_fp = opts.checkpoint_fp
        if not exists(bp_fp):
            parser.error('Specified checkpoint file does not exist: %s'
                         % bp_fp)

    # peek into sff.txt files to make sure they are parseable
    # cat_sff_files is lazy and only reads the header
    flowgrams, header = cat_sff_files(map(open, opts.sff_fp.split(',')))

    if opts.split and opts.preprocess_fp:
        parser.error('Options --split and --preprocess_fp are exclusive')

    if opts.preprocess_fp:
        pp_fp = opts.preprocess_fp
        if not exists(pp_fp):
            parser.error('Specified preprocess directory does not exist: %s'
                         % pp_fp)
        if not files_exist('%s/prefix_mapping.txt,%s/prefix_dereplicated.fasta'
                           % (pp_fp, pp_fp)):
            parser.error('Specified preprocess directory does not contain '
                         'expected files: prefix_mapping.txt and '
                         'prefix_dereplicated.fasta')

    if opts.titanium:
        opts.error_profile = DENOISER_DATA_DIR + 'Titanium_error_profile.dat'
        opts.low_cutoff = 4
        opts.high_cutoff = 5

    if not exists(opts.error_profile):
        parser.error('Specified error profile %s does not exist'
                     % opts.error_profile)

    if opts.output_dir:
        # make sure it always ends on /
        tmpoutdir = opts.output_dir + "/"
    else:
        # make random dir in current dir
        tmpoutdir = get_tmp_filename(tmp_dir="", prefix="denoiser_",
                                     suffix="/")

    create_dir(tmpoutdir, not opts.force)

    log_fp = 'denoiser.log'

    if opts.split:
        denoise_per_sample(
            opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.cluster,
            opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail,
            opts.primer, opts.low_cutoff, opts.high_cutoff, log_fp,
            opts.low_memory, opts.verbose, opts.error_profile,
            opts.max_num_iter, opts.titanium)
    else:
        denoise_seqs(
            opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.preprocess_fp,
            opts.cluster, opts.num_cpus, opts.squeeze, opts.percent_id,
            opts.bail, opts.primer, opts.low_cutoff, opts.high_cutoff,
            log_fp, opts.low_memory, opts.verbose, opts.error_profile,
            opts.max_num_iter, opts.titanium, opts.checkpoint_fp)
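# Hedged invocation sketch (not part of the original script): it assumes
# parse_command_line_parameters reads options from sys.argv, and the script
# name "denoiser.py" and input file "reads.sff.txt" are hypothetical
# placeholders. Only -i is verified by the checks above; see script_info
# for the full option list.
def _example_main():
    import sys
    sys.argv = ['denoiser.py', '-i', 'reads.sff.txt']
    main()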
def preprocess(sff_fps, log_fh, fasta_fp=None, out_fp="/tmp/",
               verbose=False, squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: log messages are written to log_fh if it is set to something
            other than None

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This file is used to filter the flowgrams in sff_fps. Only
              reads in fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: a binary verbose flag

    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.

    primer: The primer sequence of the amplification process. This seq will
            be removed from all reads during the preprocessing.
    """
    flowgrams, header = cat_sff_files(map(open, sff_fps))

    if fasta_fp:
        # remove barcodes and sequences tossed by split_libraries,
        # i.e. not in fasta_fp
        labels = imap(lambda a_b: a_b[0], parse_fasta(open(fasta_fp)))
        barcode_mapping = extract_barcodes_from_mapping(labels)
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(
            flowgrams, header, outdir=out_fp,
            barcode_mapping=barcode_mapping, primer=primer)
        if verbose:
            log_fh.write("Sequences in barcode mapping: %d\n"
                         % len(barcode_mapping))
            log_fh.write("Truncated flowgrams written: %d\n" % l)
    else:
        # just do a simple clean and truncate
        (clean_sff_fp, l) = cleanup_sff(flowgrams, header, outdir=out_fp)
        if verbose:
            log_fh.write("Cleaned flowgrams written: %d\n" % l)
        flowgrams, header = lazy_parse_sff_handle(open(clean_sff_fp))
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(
            flowgrams, header, outdir=out_fp, primer=primer)
        if verbose:
            log_fh.write("Truncated flowgrams written: %d\n" % l)
        remove(clean_sff_fp)

    if l == 0:
        raise ValueError("No flowgrams left after preprocessing.\n"
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if verbose:
        log_fh.write("Filter flowgrams by prefix matching\n")
    (flowgrams, header) = lazy_parse_sff_handle(open(trunc_sff_fp))
    l, orig_l, mapping = prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping, trunc_sff_fp,
        # Averaging produces flowgrams that are "too good", such that the
        # greedy clustering clusters too much. Use the cluster centroid
        # instead by setting min_coverage to 1.
        min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)

    if verbose:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n"
                     % (orig_l - l, orig_l))
        log_fh.write("Remaining number of sequences: %d\n" % l)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")

    return (averaged_sff_fp, l, mapping, seqs)
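# Hedged usage sketch for preprocess (not part of the original module). The
# file names below are hypothetical placeholders, and the output directory
# is assumed to exist. With fasta_fp given, only reads present in the
# split_libraries output are truncated and carried into phase I clustering.
def _example_preprocess():
    import sys
    averaged_sff_fp, n_seqs, mapping, seqs = preprocess(
        ["reads.sff.txt"], log_fh=sys.stdout,
        fasta_fp="seqs.fna", out_fp="/tmp/denoiser_out/",
        verbose=True)
    print("%d sequences remain after prefix filtering" % n_seqs)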