Пример #1
0
    def test_extract_barcodes_from_mapping(self):
        """extract_barcodes_from_mapping pulls out the barcodes and ids."""

        # valid cases: every sequence identifier maps to its barcode
        expected = dict([
            ('FV9NWLF.01.EVGI8', 'TCGAGCGAATCT'),
            ('FV9NWLF.01.DROG9', 'TAGTTGCGAGTC'),
            ('FV9NWLF.01.DZTVJ', 'TCGAGCGAATCT'),
            ('FV9NWLF.01.DI8SC', 'TCTGCTAGATGT'),
            ('FV9NWLF.01.DW381', 'TCATCGCGATAT'),
            ('FV9NWLF01DP96S', 'TCATCGCGATAT'),
            ('FV9NWLF01BOY7E', 'TCGTTCACATGA'),
            ('FV9NWLF01A0OG1', 'TAGTTGCGAGTC'),
            ('FV9NWLF01DJZFF', 'TCACGATTAGCG'),
            ('FV9NWLF01D4LTB', 'TCGAGCGAATCT'),
        ])
        observed = extract_barcodes_from_mapping(self.labels)
        self.assertEqual(observed, expected)

        # every invalid sequence identifier must raise AttributeError
        for bad_label in self.invalid_sequence_identifiers:
            self.assertRaises(AttributeError,
                              extract_barcodes_from_mapping, bad_label)
Пример #2
0
    def test_extract_barcodes_from_mapping(self):
        """extract_barcodes_from_mapping pulls out the barcodes and ids."""

        # valid cases: pair each sequence identifier with its barcode
        seq_ids = [
            'FV9NWLF.01.EVGI8', 'FV9NWLF.01.DROG9', 'FV9NWLF.01.DZTVJ',
            'FV9NWLF.01.DI8SC', 'FV9NWLF.01.DW381', 'FV9NWLF01DP96S',
            'FV9NWLF01BOY7E', 'FV9NWLF01A0OG1', 'FV9NWLF01DJZFF',
            'FV9NWLF01D4LTB',
        ]
        barcodes = [
            'TCGAGCGAATCT', 'TAGTTGCGAGTC', 'TCGAGCGAATCT', 'TCTGCTAGATGT',
            'TCATCGCGATAT', 'TCATCGCGATAT', 'TCGTTCACATGA', 'TAGTTGCGAGTC',
            'TCACGATTAGCG', 'TCGAGCGAATCT',
        ]
        expected = dict(zip(seq_ids, barcodes))
        self.assertEqual(extract_barcodes_from_mapping(self.labels), expected)

        # invalid sequence identifiers, each element will raise an exception
        for label in self.invalid_sequence_identifiers:
            with self.assertRaises(AttributeError):
                extract_barcodes_from_mapping(label)
Пример #3
0
   def test_extract_barcodes_from_mapping(self):
       """extract_barcodes_from_mapping pulls out the barcodes and ids."""

       # expected id -> barcode mapping for the fixture labels
       expected = {
           'FV9NWLF01EVGI8': 'TCGAGCGAATCT',
           'FV9NWLF01DROG9': 'TAGTTGCGAGTC',
           'FV9NWLF01DZTVJ': 'TCGAGCGAATCT',
           'FV9NWLF01DI8SC': 'TCTGCTAGATGT',
           'FV9NWLF01DW381': 'TCATCGCGATAT',
           'FV9NWLF01DP96S': 'TCATCGCGATAT',
           'FV9NWLF01BOY7E': 'TCGTTCACATGA',
           'FV9NWLF01A0OG1': 'TAGTTGCGAGTC',
           'FV9NWLF01DJZFF': 'TCACGATTAGCG',
           'FV9NWLF01D4LTB': 'TCGAGCGAATCT',
       }
       self.assertEqual(extract_barcodes_from_mapping(self.labels), expected)
Пример #4
0
def preprocess(sff_fps, log_fh, fasta_fp=None, out_fp="/tmp/",
               verbose=False, squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: log messages are written to log_fh if it is set to something else than None

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This file is used to filter the flowgrams in sff_fps. Only reads in
              fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: a binary verbose flag

    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.

    primer: The primer sequences of the amplification process. This seq will be
            removed from all reads during the preprocessing

    Returns (averaged_sff_fp, l, mapping, seqs): path to the phase-I
    dereplicated flowgram file, the number of remaining sequences, the
    prefix-cluster mapping, and the representative sequences.

    Raises ValueError if no flowgrams survive filtering/truncation.
    """
    flowgrams, header = cat_sff_files(map(open, sff_fps))

    if fasta_fp:
        # remove barcodes and sequences tossed by split_libraries, i.e. not in
        # fasta_fp
        labels = imap(lambda a_b: a_b[0], MinimalFastaParser(open(fasta_fp)))
        barcode_mapping = extract_barcodes_from_mapping(labels)
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(flowgrams, header,
                                                      outdir=out_fp,
                                                      barcode_mapping=barcode_mapping,
                                                      primer=primer)
        # guard on log_fh too: the docstring allows log_fh=None, so
        # verbose=True without a log handle must not crash
        if verbose and log_fh:
            log_fh.write(
                "Sequences in barcode mapping: %d\n" %
                len(barcode_mapping))
            log_fh.write("Truncated flowgrams written: %d\n" % l)
    else:
        # just do a simple clean and truncate
        (clean_sff_fp, l) = cleanup_sff(flowgrams, header, outdir=out_fp)
        if verbose and log_fh:
            log_fh.write("Cleaned flowgrams written: %d\n" % l)
        flowgrams, header = lazy_parse_sff_handle(open(clean_sff_fp))
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(flowgrams, header,
                                                      outdir=out_fp, primer=primer)
        if verbose and log_fh:
            log_fh.write("Truncated flowgrams written: %d\n" % l)
        remove(clean_sff_fp)

    if (l == 0):
        raise ValueError("No flowgrams left after preprocessing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if verbose and log_fh:
        log_fh.write("Filter flowgrams by prefix matching\n")

    (flowgrams, header) = lazy_parse_sff_handle(open(trunc_sff_fp))
    l, orig_l, mapping =\
        prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    averaged_sff_fp, seqs = build_averaged_flowgrams(mapping, trunc_sff_fp,
                                                     min_coverage=1,
                                                     # averaging produces too good flowgrams
                                                     # such that the greedy clustering clusters too much.
                                                     # Use the cluster centroid
                                                     # instead by using
                                                     # min_coverage 1
                                                     out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)
    if verbose and log_fh:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n"
                     % (orig_l - l, orig_l))
        log_fh.write("Remaining number of sequences: %d\n" % l)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, l, mapping, seqs)
Пример #5
0
def preprocess(sff_fps,
               log_fh,
               fasta_fp=None,
               out_fp="/tmp/",
               verbose=False,
               squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: log messages are written to log_fh if it is set to something else than None

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This file is used to filter the flowgrams in sff_fps. Only reads in
              fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: a binary verbose flag

    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.

    primer: The primer sequences of the amplification process. This seq will be
            removed from all reads during the preprocessing

    Returns (averaged_sff_fp, l, mapping, seqs): path to the phase-I
    dereplicated flowgram file, the number of remaining sequences, the
    prefix-cluster mapping, and the representative sequences.

    Raises ValueError if no flowgrams survive filtering/truncation.
    """
    flowgrams, header = cat_sff_files(map(open, sff_fps))

    if fasta_fp:
        # remove barcodes and sequences tossed by split_libraries, i.e. not in
        # fasta_fp
        labels = imap(lambda a_b: a_b[0], parse_fasta(open(fasta_fp)))
        barcode_mapping = extract_barcodes_from_mapping(labels)
        (trunc_sff_fp,
         l) = truncate_flowgrams_in_SFF(flowgrams,
                                        header,
                                        outdir=out_fp,
                                        barcode_mapping=barcode_mapping,
                                        primer=primer)
        # guard on log_fh too: the docstring allows log_fh=None, so
        # verbose=True without a log handle must not crash
        if verbose and log_fh:
            log_fh.write("Sequences in barcode mapping: %d\n" %
                         len(barcode_mapping))
            log_fh.write("Truncated flowgrams written: %d\n" % l)
    else:
        # just do a simple clean and truncate
        (clean_sff_fp, l) = cleanup_sff(flowgrams, header, outdir=out_fp)
        if verbose and log_fh:
            log_fh.write("Cleaned flowgrams written: %d\n" % l)
        flowgrams, header = lazy_parse_sff_handle(open(clean_sff_fp))
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(flowgrams,
                                                      header,
                                                      outdir=out_fp,
                                                      primer=primer)
        if verbose and log_fh:
            log_fh.write("Truncated flowgrams written: %d\n" % l)
        remove(clean_sff_fp)

    if (l == 0):
        raise ValueError("No flowgrams left after preprocessing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if verbose and log_fh:
        log_fh.write("Filter flowgrams by prefix matching\n")

    (flowgrams, header) = lazy_parse_sff_handle(open(trunc_sff_fp))
    l, orig_l, mapping =\
        prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping,
        trunc_sff_fp,
        min_coverage=1,
        # averaging produces too good flowgrams
        # such that the greedy clustering clusters too much.
        # Use the cluster centroid
        # instead by using
        # min_coverage 1
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)
    if verbose and log_fh:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n" %
                     (orig_l - l, orig_l))
        log_fh.write("Remaining number of sequences: %d\n" % l)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, l, mapping, seqs)