Example #1
    def test_sort_seqs_by_clustersize(self):
        """sort_seqs_by_clustersize works"""

        seqs = {
            "0": "AAA",
            "1": "AAT",
            "2": "ATT",
            "3": "TTT",
            "4": "TAA",
            "5": "TTA",
            "6": "CCC",
            "7": "GGG",
            "8": "GCG",
        }

        mapping = {"8": ["7", "6"], "1": ["0", "2", "5"], "4": ["3"]}

        observed = list(sort_seqs_by_clustersize(seqs.iteritems(), mapping))
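        # centroids come first, ordered by descending cluster size ("1": 3, "8": 2, "4": 1),
        # followed by the remaining member sequences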
        expected = [
            ("1", "AAT"),
            ("8", "GCG"),
            ("4", "TAA"),
            ("7", "GGG"),
            ("6", "CCC"),
            ("5", "TTA"),
            ("3", "TTT"),
            ("2", "ATT"),
            ("0", "AAA"),
        ]
        self.assertEqual(observed, expected)
Example #2
    def test_sort_seqs_by_clustersize(self):
        """sort_seqs_by_clustersize works"""

        seqs = {
            '0': "AAA",
            '1': "AAT",
            '2': "ATT",
            '3': "TTT",
            '4': "TAA",
            '5': "TTA",
            '6': "CCC",
            '7': "GGG",
            '8': "GCG"
        }

        mapping = {"8": ["7", "6"], "1": ["0", "2", "5"], "4": ["3"]}

        observed = list(sort_seqs_by_clustersize(seqs.iteritems(), mapping))
        expected = [('1', "AAT"), ('8', "GCG"), ('4', 'TAA'), ('7', 'GGG'),
                    ('6', 'CCC'), ('5', 'TTA'), ('3', 'TTT'), ('2', 'ATT'),
                    ('0', 'AAA')]
        self.assertEqual(observed, expected)
Example #3
File: test_utils.py  Project: Jorge-C/qiime
    def test_sort_seqs_by_clustersize(self):
        """sort_seqs_by_clustersize works"""

        seqs = {'0': "AAA",
                '1': "AAT",
                '2': "ATT",
                '3': "TTT",
                '4': "TAA",
                '5': "TTA",
                '6': "CCC",
                '7': "GGG",
                '8': "GCG"}

        mapping = {"8":["7","6"],
                   "1":["0","2","5"],
                   "4":["3"]}

        observed = list(sort_seqs_by_clustersize(seqs.iteritems(), mapping))
        expected = [('1',"AAT"),('8',"GCG"),('4', 'TAA'), ('7', 'GGG'),
                    ('6', 'CCC'), ('5', 'TTA'), ('3', 'TTT'), ('2', 'ATT'),
                    ('0', 'AAA')]
        self.assertEqual(observed, expected)
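
The three test variants above pin down the contract of sort_seqs_by_clustersize: cluster centroids come first, ordered by descending cluster size, followed by the remaining sequences. Below is a minimal sketch that reproduces that expected output, assuming ties are broken by descending sequence id; it is reconstructed from the tests, not copied from the QIIME sources, so the real implementation may differ in detail.

def sort_seqs_by_clustersize(seqs, mapping):
    """Yield (seq_id, seq) pairs ordered by descending cluster size.

    Hypothetical sketch reconstructed from the tests above.
    seqs    -- iterable of (seq_id, sequence) pairs
    mapping -- dict mapping a centroid id to the list of its member ids
    Ids absent from mapping get cluster size 0; ties are broken by
    descending sequence id, which matches the expected ordering above.
    """
    def sort_key(item):
        seq_id, _ = item
        return (len(mapping.get(seq_id, [])), seq_id)

    for seq_id, seq in sorted(seqs, key=sort_key, reverse=True):
        yield (seq_id, seq)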
Example #4
def denoise_per_sample(sff_fps, fasta_fp, tmpoutdir, cluster=False,
                       num_cpus=1, squeeze=True, percent_id=0.97, bail=1,
                       primer="", low_cutoff=3.75, high_cutoff=4.5,
                       log_fp="denoiser.log", low_memory=False, verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None, titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for the global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)
    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []
    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster,
                     num_cpus, squeeze, percent_id, bail, primer,
                     low_cutoff, high_cutoff, log_fp, low_memory,
                     verbose, error_profile, max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(
            parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)
    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping),
        fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(
            parse_fasta(open(singleton_file, "r")),
            fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
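
For orientation, here is a hedged usage sketch of denoise_per_sample; the file names and the output directory are placeholders invented for this example, not paths taken from the code above.

# Hypothetical call: the input names are placeholders; the keyword arguments
# map onto the signature shown above.
out_dir = denoise_per_sample(
    ["run1.sff.txt", "run2.sff.txt"],  # flowgram files to denoise
    "seqs.fna",                        # demultiplexed FASTA (per-sample labels assumed)
    "/tmp/denoiser_per_sample",        # scratch/output directory (tmpoutdir)
    num_cpus=2,                        # CPUs passed through to denoise_seqs
    titanium=True,                     # switch to the Titanium error profile and cutoffs
    verbose=True)                      # log the parameter summary to denoiser.log
# out_dir is tmpoutdir; it now holds the combined denoised.fasta plus the
# per-sample subdirectories 0/, 1/, ... created in the loop above.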
Example #5
def denoise_per_sample(sff_fps,
                       fasta_fp,
                       tmpoutdir,
                       cluster=False,
                       num_cpus=1,
                       squeeze=True,
                       percent_id=0.97,
                       bail=1,
                       primer="",
                       low_cutoff=3.75,
                       high_cutoff=4.5,
                       log_fp="denoiser.log",
                       low_memory=False,
                       verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None,
                       titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for the global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)
    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []
    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster, num_cpus,
                     squeeze, percent_id, bail, primer, low_cutoff,
                     high_cutoff, log_fp, low_memory, verbose, error_profile,
                     max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)
    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping), fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(parse_fasta(open(singleton_file, "r")),
                                        fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir