Exemplo n.º 1
0
    def test_cat_sff_files(self):
        """cat_sff_files handles zero, one and two sff files."""
        magic_number = "0x2E736666"
        # Bases asserted for the flowgram at index 0 of sff_file.
        first_read = "".join([
            "tcagGCTAACTGTAACCCTCTTGGCACCCACTAAACGCCAATCTTGCTGGAG",
            "TGTTTACCAGGCACCCAGCAATGTGAATAGTCActgagcgggctggcaaggc",
        ])

        # No input: nothing to iterate over and no header.
        flows, header = cat_sff_files([])
        self.assertEqual(len(flows), 0)
        self.assertEqual(header, None)

        # Each scenario: (input handles,
        #                 indices whose Bases must equal first_read,
        #                 expected total number of flowgrams)
        scenarios = (
            ([sff_file], (0,), 2),
            ([sff_file, sff_file], (0, 2), 4),
        )
        for handles, read_positions, flow_count in scenarios:
            flows, header = cat_sff_files(handles)
            # Materialize the result so it can be indexed and counted.
            flows = list(flows)

            self.assertEqual(header['Magic Number'], magic_number)
            for position in read_positions:
                self.assertEqual(flows[position].Bases, first_read)
            self.assertEqual(len(flows), flow_count)
Exemplo n.º 2
0
   def test_cat_sff_files(self):
      """Check cat_sff_files on an empty list, one file and two files."""

      magic = "0x2E736666"
      # Bases asserted for the flowgram at index 0 of sff_file.
      bases_of_first_read = (
          "tcagGCTAACTGTAACCCTCTTGGCACCCACTAAACGCCAATCTTGCTGGAG"
          "TGTTTACCAGGCACCCAGCAATGTGAATAGTCActgagcgggctggcaaggc")

      def load(handles):
         # Run cat_sff_files and turn the flowgrams into an indexable list.
         flow_iter, hdr = cat_sff_files(handles)
         return list(flow_iter), hdr

      # Empty input: the returned flowgram collection has length 0 and the
      # header is None.
      no_flows, no_header = cat_sff_files([])
      self.assertEqual(len(no_flows), 0)
      self.assertEqual(no_header, None)

      # Single file.
      single_flows, single_header = load([sff_file])
      self.assertEqual(single_header['Magic Number'], magic)
      self.assertEqual(single_flows[0].Bases, bases_of_first_read)
      self.assertEqual(len(single_flows), 2)

      # The same file passed twice: flowgram count doubles and index 2
      # carries the same bases as index 0.
      double_flows, double_header = load([sff_file, sff_file])
      self.assertEqual(double_header['Magic Number'], magic)
      self.assertEqual(double_flows[0].Bases, bases_of_first_read)
      self.assertEqual(double_flows[2].Bases, bases_of_first_read)
      self.assertEqual(len(double_flows), 4)
Exemplo n.º 3
0
def split_sff(sff_file_handles, map_file_handle, outdir="/tmp/"):
    """Splits a sff.txt file on barcode/mapping file.

    sff_file_handles: handles of the sff.txt files; passed to cat_sff_files

    map_file_handle: handle of the mapping file; parsed with parse_fasta and
                     turned into lookups by build_inverse_barcode_map

    outdir: prefix of every output path. It is concatenated directly with the
            barcode id, so it should end with a path separator.

    Returns the list of output file paths, one per barcode id in the mapping.

    Raises FileFormatError if cat_sff_files raises a ValueError.
    """

    try:
        (flowgrams, header) = cat_sff_files(sff_file_handles)
    except ValueError:
        # reading in the binary sff usually shows up as ValueError
        raise FileFormatError(
            'Wrong flogram file format. Make sure you pass the sff.txt format '
            + 'produced by sffinfo. The binary .sff will not work here.')

    (inverse_map,
     map_count) = build_inverse_barcode_map(parse_fasta(map_file_handle))

    filenames = []
    # We might have many barcodes and reach the open file limit, therefore
    # we go the slow way and open and close the files each time.
    # First set up all files with the headers only.
    for barcode_id in map_count.keys():
        out_fp = outdir + barcode_id
        with open(out_fp, "w") as fh:
            write_sff_header(header, fh, map_count[barcode_id])
        filenames.append(out_fp)

    # Then direct each flowgram into its barcode file. The handle is closed
    # right after the write; leaving it open would accumulate one open file
    # per flowgram and defeat the purpose of reopening.
    for f in flowgrams:
        if f.Name in inverse_map:
            barcode_id = inverse_map[f.Name]
            with open(outdir + barcode_id, "a") as fh:
                fh.write(f.createFlowHeader() + "\n")
    return filenames
Exemplo n.º 4
0
def split_sff(sff_file_handles, map_file_handle, outdir="/tmp/"):
    """Splits a sff.txt file on barcode/mapping file.

    sff_file_handles: handles of the sff.txt files; passed to cat_sff_files

    map_file_handle: handle of the mapping file; parsed with
                     MinimalFastaParser and turned into lookups by
                     build_inverse_barcode_map

    outdir: prefix of every output path. It is concatenated directly with the
            barcode id, so it should end with a path separator.

    Returns the list of output file paths, one per barcode id in the mapping.

    Raises FileFormatError if cat_sff_files raises a ValueError.
    """

    try:
        (flowgrams, header) = cat_sff_files(sff_file_handles)
    except ValueError:
        # Reading in the binary sff usually shows up as ValueError.
        # The call form of raise is valid on both Python 2 and Python 3.
        raise FileFormatError(
            'Wrong flogram file format. Make sure you pass the sff.txt format ' +
            'produced by sffinfo. The binary .sff will not work here.')

    (inverse_map, map_count) = build_inverse_barcode_map(
        MinimalFastaParser(map_file_handle))

    filenames = []
    # We might have many barcodes and reach the open file limit, therefore
    # we go the slow way and open and close the files each time.
    # First set up all files with the headers only.
    for barcode_id in map_count.keys():
        out_fp = outdir + barcode_id
        with open(out_fp, "w") as fh:
            write_sff_header(header, fh, map_count[barcode_id])
        filenames.append(out_fp)

    # Then direct each flowgram into its barcode file. The handle is closed
    # right after the write; leaving it open would accumulate one open file
    # per flowgram and defeat the purpose of reopening.
    for f in flowgrams:
        # `in` replaces the Python-2-only dict.has_key().
        if f.Name in inverse_map:
            barcode_id = inverse_map[f.Name]
            with open(outdir + barcode_id, "a") as fh:
                fh.write(f.createFlowHeader() + "\n")
    return filenames
Exemplo n.º 5
0
def main(commandline_args=None):
    """Validate the command line options and start the denoiser.

    commandline_args: not read by this function; the options come from
                      parse_command_line_parameters(**script_info).

    Invalid option combinations and missing files are reported through
    parser.error. After validation the output directory is created and either
    denoise_per_sample (with --split) or denoise_seqs is run.
    """
    parser, opts, args = parse_command_line_parameters(**script_info)

    if not opts.sff_fp:
        parser.error('Required option flowgram file path (-i) not specified')
    elif not files_exist(opts.sff_fp):
        parser.error('Flowgram file path does not exist:\n %s \n Pass a valid one via -i.'
                     % opts.sff_fp)

    if opts.checkpoint_fp:
        bp_fp = opts.checkpoint_fp
        if not exists(bp_fp):
            parser.error('Specified checkpoint file does not exist: %s' % bp_fp)

    # Peek into the sff.txt files to make sure they are parseable.
    # cat_sff_files is lazy and only reads the header. The result is not
    # needed here, so the handles are closed again right away instead of
    # staying open for the rest of the run.
    sff_handles = []
    try:
        for sff_fp in opts.sff_fp.split(','):
            sff_handles.append(open(sff_fp))
        cat_sff_files(sff_handles)
    finally:
        for sff_handle in sff_handles:
            sff_handle.close()

    if opts.split and opts.preprocess_fp:
        parser.error('Options --split and --preprocess_fp are exclusive')

    if opts.preprocess_fp:
        pp_fp = opts.preprocess_fp
        if not exists(opts.preprocess_fp):
            parser.error('Specified preprocess directory does not exist: %s' % opts.preprocess_fp)
        if not files_exist('%s/prefix_mapping.txt,%s/prefix_dereplicated.fasta' % (pp_fp, pp_fp)):
            parser.error('Specified preprocess directory does not contain expected files: ' +
                         'prefix_mapping.txt and prefix_dereplicated.fasta')

    if opts.titanium:
        # --titanium overrides the error profile and both cutoffs.
        opts.error_profile = DENOISER_DATA_DIR + 'Titanium_error_profile.dat'
        opts.low_cutoff = 4
        opts.high_cutoff = 5

    if not exists(opts.error_profile):
        parser.error('Specified error profile %s does not exist' % opts.error_profile)

    if opts.output_dir:
        # make sure it always ends on /
        tmpoutdir = opts.output_dir + "/"
    else:
        # make random dir in current dir
        tmpoutdir = get_tmp_filename(tmp_dir="", prefix="denoiser_", suffix="/")

    create_dir(tmpoutdir, not opts.force)

    log_fp = 'denoiser.log'

    if opts.split:
        denoise_per_sample(opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.cluster,
                           opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail,
                           opts.primer, opts.low_cutoff, opts.high_cutoff, log_fp,
                           opts.low_memory, opts.verbose, opts.error_profile, opts.max_num_iter,
                           opts.titanium)
    else:
        denoise_seqs(opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.preprocess_fp, opts.cluster,
                     opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail, opts.primer,
                     opts.low_cutoff, opts.high_cutoff, log_fp, opts.low_memory,
                     opts.verbose, opts.error_profile, opts.max_num_iter, opts.titanium,
                     opts.checkpoint_fp)
Exemplo n.º 6
0
def main(commandline_args=None):
    """Check the command line options, then run the denoiser.

    commandline_args: not read by this function; the options come from
                      parse_command_line_parameters(**script_info).

    Problems with the options (missing paths, exclusive flags) are reported
    through parser.error. Once the options are accepted, the output directory
    is created and denoise_per_sample (with --split) or denoise_seqs is
    called with the parsed options.
    """
    parser, opts, args = parse_command_line_parameters(**script_info)

    if not opts.sff_fp:
        parser.error('Required option flowgram file path (-i) not specified')
    elif not files_exist(opts.sff_fp):
        parser.error('Flowgram file path does not exist:\n %s \n Pass a valid one via -i.'
                     % opts.sff_fp)

    if opts.checkpoint_fp:
        bp_fp = opts.checkpoint_fp
        if not exists(bp_fp):
            parser.error('Specified checkpoint file does not exist: %s' % bp_fp)

    # Peek into the sff.txt files to make sure they are parseable.
    # cat_sff_files is lazy and only reads the header. Its return value is
    # discarded, so every handle opened for the check is closed in `finally`,
    # also when opening or parsing one of the files fails.
    opened_sff_files = []
    try:
        for path in opts.sff_fp.split(','):
            opened_sff_files.append(open(path))
        cat_sff_files(opened_sff_files)
    finally:
        for opened in opened_sff_files:
            opened.close()

    if opts.split and opts.preprocess_fp:
        parser.error('Options --split and --preprocess_fp are exclusive')

    if opts.preprocess_fp:
        pp_fp = opts.preprocess_fp
        if not exists(opts.preprocess_fp):
            parser.error('Specified preprocess directory does not exist: %s' % opts.preprocess_fp)
        if not files_exist('%s/prefix_mapping.txt,%s/prefix_dereplicated.fasta' % (pp_fp, pp_fp)):
            parser.error('Specified preprocess directory does not contain expected files: ' +
                         'prefix_mapping.txt and prefix_dereplicated.fasta')

    if opts.titanium:
        # --titanium overrides the error profile and both cutoffs.
        opts.error_profile = DENOISER_DATA_DIR + 'Titanium_error_profile.dat'
        opts.low_cutoff = 4
        opts.high_cutoff = 5

    if not exists(opts.error_profile):
        parser.error('Specified error profile %s does not exist' % opts.error_profile)

    if opts.output_dir:
        # make sure it always ends on /
        tmpoutdir = opts.output_dir + "/"
    else:
        # make random dir in current dir
        tmpoutdir = get_tmp_filename(tmp_dir="", prefix="denoiser_", suffix="/")

    create_dir(tmpoutdir, not opts.force)

    log_fp = 'denoiser.log'

    if opts.split:
        denoise_per_sample(opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.cluster,
                           opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail,
                           opts.primer, opts.low_cutoff, opts.high_cutoff, log_fp,
                           opts.low_memory, opts.verbose, opts.error_profile, opts.max_num_iter,
                           opts.titanium)
    else:
        denoise_seqs(opts.sff_fp, opts.fasta_fp, tmpoutdir, opts.preprocess_fp, opts.cluster,
                     opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail, opts.primer,
                     opts.low_cutoff, opts.high_cutoff, log_fp, opts.low_memory,
                     opts.verbose, opts.error_profile, opts.max_num_iter, opts.titanium,
                     opts.checkpoint_fp)
Exemplo n.º 7
0
def preprocess(sff_fps, log_fh, fasta_fp=None, out_fp="/tmp/",
               verbose=False, squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: open handle for log messages. It is only written to when verbose
            is true.

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This files is used to filter the flowgrams in sff_fps. Only reads in
              fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: if true, progress messages are written to log_fh

    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.

    primer: The primer sequences of the amplification process. This seq will be
            removed from all reads during the preprocessing

    Returns a tuple (averaged_sff_fp, l, mapping, seqs): averaged_sff_fp and
    seqs come from build_averaged_flowgrams; l (the number of sequences left
    after prefix filtering) and mapping come from prefix_filter_flowgrams.

    Raises ValueError if no flowgrams are left after truncation.
    """
    # Every file opened here is closed again before returning; the
    # intermediate files in particular are closed before they are removed.
    sff_handles = []
    try:
        for sff_fp in sff_fps:
            sff_handles.append(open(sff_fp))
        flowgrams, header = cat_sff_files(sff_handles)

        if fasta_fp:
            # remove barcodes and sequences tossed by split_libraries, i.e.
            # not in fasta_fp
            with open(fasta_fp) as fasta_fh:
                labels = imap(lambda a_b: a_b[0],
                              MinimalFastaParser(fasta_fh))
                barcode_mapping = extract_barcodes_from_mapping(labels)
                (trunc_sff_fp, num_truncated) = truncate_flowgrams_in_SFF(
                    flowgrams, header,
                    outdir=out_fp,
                    barcode_mapping=barcode_mapping,
                    primer=primer)
            if verbose:
                log_fh.write(
                    "Sequences in barcode mapping: %d\n" %
                    len(barcode_mapping))
                log_fh.write(
                    "Truncated flowgrams written: %d\n" % num_truncated)
        else:
            # just do a simple clean and truncate
            (clean_sff_fp, num_cleaned) = cleanup_sff(flowgrams, header,
                                                      outdir=out_fp)
            if verbose:
                log_fh.write("Cleaned flowgrams written: %d\n" % num_cleaned)
            with open(clean_sff_fp) as clean_fh:
                flowgrams, header = lazy_parse_sff_handle(clean_fh)
                (trunc_sff_fp, num_truncated) = truncate_flowgrams_in_SFF(
                    flowgrams, header, outdir=out_fp, primer=primer)
            if verbose:
                log_fh.write(
                    "Truncated flowgrams written: %d\n" % num_truncated)
            remove(clean_sff_fp)
    finally:
        for sff_handle in sff_handles:
            sff_handle.close()

    if num_truncated == 0:
        raise ValueError("No flowgrams left after preprocesing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if verbose:
        log_fh.write("Filter flowgrams by prefix matching\n")

    with open(trunc_sff_fp) as trunc_fh:
        (flowgrams, header) = lazy_parse_sff_handle(trunc_fh)
        num_remaining, num_original, mapping = \
            prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    # Averaging produces too good flowgrams such that the greedy clustering
    # clusters too much. Use the cluster centroid instead by using
    # min_coverage 1.
    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping, trunc_sff_fp,
        min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)
    if verbose:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n"
                     % (num_original - num_remaining, num_original))
        log_fh.write("Remaining number of sequences: %d\n" % num_remaining)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, num_remaining, mapping, seqs)
Exemplo n.º 8
0
def preprocess(sff_fps,
               log_fh,
               fasta_fp=None,
               out_fp="/tmp/",
               verbose=False,
               squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: open handle for log messages. It is only written to when verbose
            is true.

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This files is used to filter the flowgrams in sff_fps. Only reads in
              fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: if true, progress messages are written to log_fh

    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.

    primer: The primer sequences of the amplification process. This seq will be
            removed from all reads during the preprocessing

    Returns a tuple (averaged_sff_fp, l, mapping, seqs): averaged_sff_fp and
    seqs come from build_averaged_flowgrams; l (the number of sequences left
    after prefix filtering) and mapping come from prefix_filter_flowgrams.

    Raises ValueError if no flowgrams are left after truncation.
    """
    # Every file opened here is closed again before returning; the
    # intermediate files in particular are closed before they are removed.
    sff_handles = []
    try:
        for sff_fp in sff_fps:
            sff_handles.append(open(sff_fp))
        flowgrams, header = cat_sff_files(sff_handles)

        if fasta_fp:
            # remove barcodes and sequences tossed by split_libraries, i.e.
            # not in fasta_fp
            with open(fasta_fp) as fasta_fh:
                labels = imap(lambda a_b: a_b[0], parse_fasta(fasta_fh))
                barcode_mapping = extract_barcodes_from_mapping(labels)
                (trunc_sff_fp,
                 num_truncated) = truncate_flowgrams_in_SFF(
                     flowgrams,
                     header,
                     outdir=out_fp,
                     barcode_mapping=barcode_mapping,
                     primer=primer)
            if verbose:
                log_fh.write("Sequences in barcode mapping: %d\n" %
                             len(barcode_mapping))
                log_fh.write(
                    "Truncated flowgrams written: %d\n" % num_truncated)
        else:
            # just do a simple clean and truncate
            (clean_sff_fp, num_cleaned) = cleanup_sff(flowgrams,
                                                      header,
                                                      outdir=out_fp)
            if verbose:
                log_fh.write("Cleaned flowgrams written: %d\n" % num_cleaned)
            with open(clean_sff_fp) as clean_fh:
                flowgrams, header = lazy_parse_sff_handle(clean_fh)
                (trunc_sff_fp,
                 num_truncated) = truncate_flowgrams_in_SFF(flowgrams,
                                                            header,
                                                            outdir=out_fp,
                                                            primer=primer)
            if verbose:
                log_fh.write(
                    "Truncated flowgrams written: %d\n" % num_truncated)
            remove(clean_sff_fp)
    finally:
        for sff_handle in sff_handles:
            sff_handle.close()

    if num_truncated == 0:
        raise ValueError("No flowgrams left after preprocesing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if verbose:
        log_fh.write("Filter flowgrams by prefix matching\n")

    with open(trunc_sff_fp) as trunc_fh:
        (flowgrams, header) = lazy_parse_sff_handle(trunc_fh)
        num_remaining, num_original, mapping = \
            prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    # Averaging produces too good flowgrams such that the greedy clustering
    # clusters too much. Use the cluster centroid instead by using
    # min_coverage 1.
    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping,
        trunc_sff_fp,
        min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)
    if verbose:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n" %
                     (num_original - num_remaining, num_original))
        log_fh.write("Remaining number of sequences: %d\n" % num_remaining)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, num_remaining, mapping, seqs)