def test_extract_barcodes_from_mapping(self):
    """extract_barcodes_from_mapping pulls out the barcodes and ids."""
    # valid labels: mapping is built from both dotted and plain read ids
    expected_mapping = {
        'FV9NWLF.01.EVGI8': 'TCGAGCGAATCT',
        'FV9NWLF.01.DROG9': 'TAGTTGCGAGTC',
        'FV9NWLF.01.DZTVJ': 'TCGAGCGAATCT',
        'FV9NWLF.01.DI8SC': 'TCTGCTAGATGT',
        'FV9NWLF.01.DW381': 'TCATCGCGATAT',
        'FV9NWLF01DP96S': 'TCATCGCGATAT',
        'FV9NWLF01BOY7E': 'TCGTTCACATGA',
        'FV9NWLF01A0OG1': 'TAGTTGCGAGTC',
        'FV9NWLF01DJZFF': 'TCACGATTAGCG',
        'FV9NWLF01D4LTB': 'TCGAGCGAATCT',
    }
    observed = extract_barcodes_from_mapping(self.labels)
    self.assertEqual(observed, expected_mapping)

    # invalid sequence identifiers, each element will raise an exception
    for bad_label in self.invalid_sequence_identifiers:
        with self.assertRaises(AttributeError):
            extract_barcodes_from_mapping(bad_label)
def test_extract_barcodes_from_mapping(self):
    """extract_barcodes_from_mapping pulls out the barcodes and ids."""
    # cases that are valid
    expected = {'FV9NWLF.01.EVGI8': 'TCGAGCGAATCT',
                'FV9NWLF.01.DROG9': 'TAGTTGCGAGTC',
                'FV9NWLF.01.DZTVJ': 'TCGAGCGAATCT',
                'FV9NWLF.01.DI8SC': 'TCTGCTAGATGT',
                'FV9NWLF.01.DW381': 'TCATCGCGATAT',
                'FV9NWLF01DP96S': 'TCATCGCGATAT',
                'FV9NWLF01BOY7E': 'TCGTTCACATGA',
                'FV9NWLF01A0OG1': 'TAGTTGCGAGTC',
                'FV9NWLF01DJZFF': 'TCACGATTAGCG',
                'FV9NWLF01D4LTB': 'TCGAGCGAATCT'}
    self.assertEqual(extract_barcodes_from_mapping(self.labels), expected)

    # invalid sequence identifiers, each element will raise an exception
    for invalid in self.invalid_sequence_identifiers:
        with self.assertRaises(AttributeError):
            extract_barcodes_from_mapping(invalid)
def test_extract_barcodes_from_mapping(self):
    """extract_barcodes_from_mapping pulls out the barcodes and ids."""
    # expected barcode for each 454 read id in self.labels
    wanted = {
        'FV9NWLF01EVGI8': 'TCGAGCGAATCT',
        'FV9NWLF01DROG9': 'TAGTTGCGAGTC',
        'FV9NWLF01DZTVJ': 'TCGAGCGAATCT',
        'FV9NWLF01DI8SC': 'TCTGCTAGATGT',
        'FV9NWLF01DW381': 'TCATCGCGATAT',
        'FV9NWLF01DP96S': 'TCATCGCGATAT',
        'FV9NWLF01BOY7E': 'TCGTTCACATGA',
        'FV9NWLF01A0OG1': 'TAGTTGCGAGTC',
        'FV9NWLF01DJZFF': 'TCACGATTAGCG',
        'FV9NWLF01D4LTB': 'TCGAGCGAATCT',
    }
    got = extract_barcodes_from_mapping(self.labels)
    self.assertEqual(got, wanted)
def preprocess(sff_fps, log_fh, fasta_fp=None, out_fp="/tmp/",
               verbose=False, squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: log messages are written to log_fh if it is set to
            something else than None

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This file is used to filter the flowgrams in sff_fps.
              Only reads in fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: a binary verbose flag

    squeeze: a flag that controls if sequences are squeezed before
             phase I. Squeezing means consecutive identical nucs are
             collapsed to one.

    primer: The primer sequences of the amplification process. This seq
            will be removed from all reads during the preprocessing

    Returns (averaged_sff_fp, num_seqs, mapping, seqs).

    Raises ValueError if no flowgram survives cleaning/truncation.
    """
    flowgrams, header = cat_sff_files(map(open, sff_fps))

    if fasta_fp:
        # remove barcodes and sequences tossed by split_libraries,
        # i.e. not in fasta_fp
        labels = imap(lambda a_b: a_b[0],
                      MinimalFastaParser(open(fasta_fp)))
        barcode_mapping = extract_barcodes_from_mapping(labels)
        (trunc_sff_fp, num_flows) = truncate_flowgrams_in_SFF(
            flowgrams, header, outdir=out_fp,
            barcode_mapping=barcode_mapping, primer=primer)
        # FIX: also require log_fh — the docstring allows log_fh=None,
        # and writing to None would raise AttributeError
        if verbose and log_fh:
            log_fh.write("Sequences in barcode mapping: %d\n"
                         % len(barcode_mapping))
            log_fh.write("Truncated flowgrams written: %d\n" % num_flows)
    else:
        # just do a simple clean and truncate
        (clean_sff_fp, num_flows) = cleanup_sff(flowgrams, header,
                                                outdir=out_fp)
        if verbose and log_fh:
            log_fh.write("Cleaned flowgrams written: %d\n" % num_flows)
        flowgrams, header = lazy_parse_sff_handle(open(clean_sff_fp))
        (trunc_sff_fp, num_flows) = truncate_flowgrams_in_SFF(
            flowgrams, header, outdir=out_fp, primer=primer)
        if verbose and log_fh:
            log_fh.write("Truncated flowgrams written: %d\n" % num_flows)
        remove(clean_sff_fp)

    if num_flows == 0:
        raise ValueError("No flowgrams left after preprocessing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if verbose and log_fh:
        log_fh.write("Filter flowgrams by prefix matching\n")

    (flowgrams, header) = lazy_parse_sff_handle(open(trunc_sff_fp))
    num_flows, orig_num_flows, mapping = \
        prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    # averaging produces too good flowgrams such that the greedy
    # clustering clusters too much. Use the cluster centroid instead
    # by using min_coverage 1
    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping, trunc_sff_fp, min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)

    if verbose and log_fh:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n"
                     % (orig_num_flows - num_flows, orig_num_flows))
        log_fh.write("Remaining number of sequences: %d\n" % num_flows)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, num_flows, mapping, seqs)
def preprocess(sff_fps, log_fh, fasta_fp=None, out_fp="/tmp/",
               verbose=False, squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filter and truncate flowgrams, then run denoiser phase I.

    sff_fps: list of paths to flowgram (SFF) files

    log_fh: open file handle that progress messages are written to
            (may be None when verbose is not set)

    fasta_fp: path to a fasta file as produced by split_libraries.py;
              when given, only reads present in this file are kept from
              sff_fps and their barcodes are stripped

    out_fp: output directory path

    verbose: binary verbosity flag

    squeeze: when set, consecutive identical nucleotides are collapsed
             to one before phase I

    primer: primer sequence of the amplification process, removed from
            all reads during preprocessing

    Returns (averaged_sff_fp, num_seqs, mapping, seqs).
    """
    flowgrams, header = cat_sff_files(map(open, sff_fps))

    if fasta_fp:
        # keep only reads that survived split_libraries (i.e. appear in
        # fasta_fp) and drop their barcodes
        seq_ids = imap(lambda a_b: a_b[0], parse_fasta(open(fasta_fp)))
        barcode_mapping = extract_barcodes_from_mapping(seq_ids)
        trunc_sff_fp, num_seqs = truncate_flowgrams_in_SFF(
            flowgrams, header, outdir=out_fp,
            barcode_mapping=barcode_mapping, primer=primer)
        if verbose:
            log_fh.write("Sequences in barcode mapping: %d\n"
                         % len(barcode_mapping))
            log_fh.write("Truncated flowgrams written: %d\n" % num_seqs)
    else:
        # no fasta filter: simple clean-up followed by truncation
        clean_sff_fp, num_seqs = cleanup_sff(flowgrams, header,
                                             outdir=out_fp)
        if verbose:
            log_fh.write("Cleaned flowgrams written: %d\n" % num_seqs)
        flowgrams, header = lazy_parse_sff_handle(open(clean_sff_fp))
        trunc_sff_fp, num_seqs = truncate_flowgrams_in_SFF(
            flowgrams, header, outdir=out_fp, primer=primer)
        if verbose:
            log_fh.write("Truncated flowgrams written: %d\n" % num_seqs)
        remove(clean_sff_fp)

    if num_seqs == 0:
        raise ValueError("No flowgrams left after preprocesing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster sequences that are exact prefixes of each other
    if verbose:
        log_fh.write("Filter flowgrams by prefix matching\n")

    flowgrams, header = lazy_parse_sff_handle(open(trunc_sff_fp))
    num_seqs, orig_num_seqs, mapping = prefix_filter_flowgrams(
        flowgrams, squeeze=squeeze)

    # min_coverage=1: averaging produces flowgrams that are "too good",
    # making the greedy clustering cluster too much, so use the cluster
    # centroid instead
    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping, trunc_sff_fp, min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)

    if verbose:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n"
                     % (orig_num_seqs - num_seqs, orig_num_seqs))
        log_fh.write("Remaining number of sequences: %d\n" % num_seqs)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, num_seqs, mapping, seqs)