Пример #1
0
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file

        Three records are split with one sequence per file; both the returned
        file paths and the sequences read back from those files are checked.
        """
        # mkstemp is only used here to reserve a unique filename prefix
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = [
            '>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA', '>seq3',
            'CCTT--AA'
        ]

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            # read through a context manager so no file handle is leaked
            with open(fp) as split_f:
                actual_seqs.extend(split_f)
        # remove the files now, so if the test fails they still get cleaned
        # up; the placeholder file created by mkstemp is removed as well
        remove_files(actual)
        remove_files([filename_prefix])

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                                  DNA))
Пример #2
0
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs

        The same 59 sequences are split repeatedly with every seqs_per_file
        value in the tested range, and the sequences read back from the split
        files must always equal the input.
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = SequenceCollection.from_fasta_records(
            [('seq%s' % k, 'AACCTTAA') for k in range(59)], DNA)
        infile = in_seqs.to_fasta().split('\n')

        # test seqs_per_file from 1 to 999
        for i in range(1, 1000):
            # mkstemp is only used here to reserve a unique filename prefix
            fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                          prefix='split_fasta_tests',
                                          suffix='')
            close(fd)

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            for fp in actual:
                # read through a context manager so no handle is leaked
                with open(fp) as split_f:
                    actual_seqs.extend(split_f)
            # remove the files now, so if the test fails they still get
            # cleaned up; the mkstemp placeholder is removed too, otherwise
            # every iteration would leave one temp file behind
            remove_files(actual)
            remove_files([filename_prefix])

            # building seq collections from infile and the split files result in
            # equivalent seq collections
            self.assertEqual(
                SequenceCollection.from_fasta_records(parse_fasta(infile),
                                                      DNA),
                SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                                      DNA))
Пример #3
0
def fast_denoiser(
        sff_fps, fasta_fp, tmp_outdir, num_cpus, primer, verbose=True,
        titanium=False):
    """wrapper function calling methods from the Denoiser package.

    sff_fps, fasta_fp, primer, verbose, titanium: passed through unchanged
        to denoise_seqs

    tmp_outdir: directory handed to denoise_seqs; centroids.fasta,
        singletons.fasta and denoiser_mapping.txt are read from it afterwards

    num_cpus: when greater than 1, denoise_seqs is called with cluster=True
        and this number of cpus

    Returns (seqs, mapping): seqs chains the parsed centroids and singletons;
    mapping maps the line index of each line in denoiser_mapping.txt to the
    list [cluster, member, member, ...] parsed from that line.
    """
    if num_cpus > 1:
        denoise_seqs(sff_fps, fasta_fp, tmp_outdir,
                     primer=primer, cluster=True, num_cpus=num_cpus,
                     verbose=verbose, titanium=titanium)
    else:
        denoise_seqs(sff_fps, fasta_fp, tmp_outdir, primer=primer,
                     verbose=verbose, titanium=titanium)

    # read centroids and singletons
    # NOTE: these two handles stay open on purpose - the parse_fasta results
    # are returned to the caller unconsumed (presumably lazy; confirm), so
    # they cannot safely be closed here
    centroids = parse_fasta(open(tmp_outdir + "/centroids.fasta"))
    singletons = parse_fasta(open(tmp_outdir + "/singletons.fasta"))

    seqs = chain(centroids, singletons)

    # read mapping; the file is fully consumed here, so close it right away
    mapping = {}
    with open(tmp_outdir + "/denoiser_mapping.txt") as cluster_mapping:
        for i, line in enumerate(cluster_mapping):
            cluster, members = line.split(':')
            mapping[i] = [cluster] + members.split()

    return seqs, mapping
Пример #4
0
    def test_deblur_with_non_default_error_profile(self):
        """deblur gives the expected result for a non-default error profile

        The same profile is supplied first as a list and then as a numpy
        array; each result is checked against the expected sequence.
        """
        error_dist = [
            1, 0.05, 0.000005, 0.000005, 0.000005, 0.000005, 0.0000025,
            0.0000025, 0.0000025, 0.0000025, 0.0000025, 0.0000005, 0.0000005,
            0.0000005, 0.0000005
        ]
        seqs_f = StringIO(TEST_SEQS_2)

        obs = deblur(parse_fasta(seqs_f), error_dist=error_dist)
        exp = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]

        # check the list-based run before obs is overwritten below; without
        # this assertion the first deblur call was never verified
        self.assertEqual(obs, exp)

        # Trying with a numpy array holding the same values
        error_dist = np.array(error_dist)
        seqs_f = StringIO(TEST_SEQS_2)
        obs = deblur(parse_fasta(seqs_f), error_dist=error_dist)

        self.assertEqual(obs, exp)
Пример #5
0
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file

        Three records are split with two sequences per file, so two files
        are expected; the sequences read back must equal the input.
        """
        # mkstemp is only used here to reserve a unique filename prefix
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                     prefix='split_fasta_tests',
                                     suffix='')
        close(fd)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        for fp in actual:
            # read through a context manager so no file handle is leaked
            with open(fp) as split_f:
                actual_seqs.extend(split_f)
        # clean up before asserting so a failing test leaves nothing behind;
        # the placeholder file created by mkstemp is removed as well
        remove_files(actual)
        remove_files([filename_prefix])

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs), DNA))
Пример #6
0
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file

        Three records are split with two sequences per file, so two files
        are expected; the sequences read back must equal the input.
        """
        # mkstemp is only used here to reserve a unique filename prefix
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = [
            '>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA', '>seq3',
            'CCTT--AA'
        ]

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        for fp in actual:
            # read through a context manager so no file handle is leaked
            with open(fp) as split_f:
                actual_seqs.extend(split_f)
        # clean up before asserting so a failing test leaves nothing behind;
        # the placeholder file created by mkstemp is removed as well
        remove_files(actual)
        remove_files([filename_prefix])

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                                  DNA))
Пример #7
0
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs

        The same 59 sequences are split repeatedly with every seqs_per_file
        value in the tested range, and the sequences read back from the split
        files must always equal the input.
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = SequenceCollection.from_fasta_records(
            [('seq%s' % k, 'AACCTTAA') for k in range(59)], DNA)
        infile = in_seqs.to_fasta().split('\n')

        # test seqs_per_file from 1 to 999
        for i in range(1, 1000):
            # mkstemp is only used here to reserve a unique filename prefix
            fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                         prefix='split_fasta_tests',
                                         suffix='')
            close(fd)

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            for fp in actual:
                # read through a context manager so no handle is leaked
                with open(fp) as split_f:
                    actual_seqs.extend(split_f)
            # remove the files now, so if the test fails they still get
            # cleaned up; the mkstemp placeholder is removed too, otherwise
            # every iteration would leave one temp file behind
            remove_files(actual)
            remove_files([filename_prefix])

            # building seq collections from infile and the split files result in
            # equivalent seq collections
            self.assertEqual(
                SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
                SequenceCollection.from_fasta_records(parse_fasta(actual_seqs), DNA))
Пример #8
0
def fast_denoiser(sff_fps, fasta_fp, tmp_outdir, num_cpus, primer, verbose=True, titanium=False):
    """wrapper function calling methods from the Denoiser package.

    sff_fps, fasta_fp, primer, verbose, titanium: passed through unchanged
        to denoise_seqs

    tmp_outdir: directory handed to denoise_seqs; centroids.fasta,
        singletons.fasta and denoiser_mapping.txt are read from it afterwards

    num_cpus: when greater than 1, denoise_seqs is called with cluster=True
        and this number of cpus

    Returns (seqs, mapping): seqs chains the parsed centroids and singletons;
    mapping maps the line index of each line in denoiser_mapping.txt to the
    list [cluster, member, member, ...] parsed from that line.
    """
    if num_cpus > 1:
        denoise_seqs(
            sff_fps,
            fasta_fp,
            tmp_outdir,
            primer=primer,
            cluster=True,
            num_cpus=num_cpus,
            verbose=verbose,
            titanium=titanium,
        )
    else:
        denoise_seqs(sff_fps, fasta_fp, tmp_outdir, primer=primer, verbose=verbose, titanium=titanium)

    # read centroids and singletons
    # NOTE: these two handles stay open on purpose - the parse_fasta results
    # are returned to the caller unconsumed (presumably lazy; confirm), so
    # they cannot safely be closed here
    centroids = parse_fasta(open(tmp_outdir + "/centroids.fasta"))
    singletons = parse_fasta(open(tmp_outdir + "/singletons.fasta"))

    seqs = chain(centroids, singletons)

    # read mapping; the file is fully consumed here, so close it right away
    mapping = {}
    with open(tmp_outdir + "/denoiser_mapping.txt") as cluster_mapping:
        for i, line in enumerate(cluster_mapping):
            cluster, members = line.split(":")
            mapping[i] = [cluster] + members.split()

    return seqs, mapping
Пример #9
0
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 failure_path=None):
        """Align the sequences in seq_path against the template alignment.

        seq_path: path to the fasta file of candidate sequences

        result_path: if not None, the aligned sequences are written to this
            path as fasta and None is returned; otherwise the Alignment is
            returned

        log_path: passed to NastLogger

        failure_path: if not None, the sequences that failed to align are
            written to this path as fasta
        """
        # the candidate file is opened first (as before) and the with-block
        # guarantees it is closed once pynast_seqs has returned or raised
        with open(seq_path, 'U') as seq_file:
            # load candidate sequences
            candidate_sequences = parse_fasta(seq_file)

            # load template sequences
            template_alignment_fp = self.Params['template_filepath']
            with open(template_alignment_fp) as template_f:
                # replace '.' characters with '-' characters
                template_records = [
                    (seq_id, seq.replace('.', '-').upper())
                    for seq_id, seq in parse_fasta(template_f)]
            template_alignment = Alignment.from_fasta_records(
                template_records, DNASequence, validate=True)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            # NOTE: the results are treated as fully built sequences (the
            # previous code assigned into them by index), so the input file
            # is not needed after this call
            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger,
                temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        # re-wrap every result as a DNASequence before building collections
        pynast_failed = SequenceCollection(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
        pynast_aligned = Alignment(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

        if failure_path is not None:
            with open(failure_path, 'w') as fail_file:
                fail_file.write(pynast_failed.to_fasta())

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                result_file.write(pynast_aligned.to_fasta())
            return None
        else:
            return pynast_aligned
    def setUp(self):
        """Prepare the FASTA fixtures used by the tests.

        The input fixtures are stored as lists of lines; the expected
        fixtures are stored as lists of parsed fasta records.
        """
        def as_lines(fasta_text):
            # fixtures are module-level strings with one record line per row
            return fasta_text.split("\n")

        def as_records(fasta_text):
            return list(parse_fasta(as_lines(fasta_text)))

        self.fasta_lines1 = as_lines(fasta_lines1)
        self.fasta_lines1_mixed_case = as_lines(fasta_lines1_mixed_case)

        self.fasta_lines1_exp = as_records(fasta_lines1_exp)
        self.fasta_lines1_mixed_case_exp = as_records(
            fasta_lines1_mixed_case_exp)
        self.fasta_lines1_exp_null_desc_mapper = as_records(
            fasta_lines1_exp_null_desc_mapper)
Пример #11
0
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None):
        """Align the sequences in seq_path against the template alignment.

        seq_path: path to the fasta file of candidate sequences

        result_path: if not None, the aligned sequences are written to this
            path as fasta and None is returned; otherwise the aligned
            sequences are returned as loaded by LoadSeqs, or {} when
            LoadSeqs raises ValueError

        log_path: passed to NastLogger

        failure_path: if not None, the sequences that failed to align are
            written to this path as fasta

        Raises KeyError if LoadSeqs rejects the template alignment.
        """
        # the candidate file is kept open until the results have been
        # written/loaded, since nothing here shows whether pynast_seqs
        # consumes its input eagerly; the with-block closes it on every path
        with open(seq_path, 'U') as seq_file:
            # load candidate sequences
            candidate_sequences = parse_fasta(seq_file)

            # load template sequences
            template_alignment_fp = self.Params['template_filepath']
            with open(template_alignment_fp) as template_f:
                # replace '.' characters with '-' characters
                template_alignment = [
                    (seq_id, seq.replace('.', '-').upper())
                    for seq_id, seq in parse_fasta(template_f)]
            try:
                template_alignment = LoadSeqs(data=template_alignment,
                                              moltype=DNA,
                                              aligned=DenseAlignment)
            except KeyError as e:
                raise KeyError('Only ACGT-. characters can be contained in template alignments.' +
                               ' The offending character was: %s' % e)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger,
                temp_dir=get_qiime_temp_dir())

            logger.record(str(self))

            if failure_path is not None:
                with open(failure_path, 'w') as fail_file:
                    for seq in pynast_failed:
                        fail_file.write(seq.toFasta())
                        fail_file.write('\n')

            if result_path is not None:
                with open(result_path, 'w') as result_file:
                    for seq in pynast_aligned:
                        result_file.write(seq.toFasta())
                        result_file.write('\n')
                return None

            try:
                return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
            except ValueError:
                return {}
Пример #12
0
    def setUp(self):
        """Prepare the FASTA fixtures used by the tests.

        The input fixtures are stored as lists of lines; the expected
        fixtures are stored as lists of parsed fasta records.
        """
        def as_lines(fasta_text):
            # fixtures are module-level strings with one record line per row
            return fasta_text.split('\n')

        def as_records(fasta_text):
            return list(parse_fasta(as_lines(fasta_text)))

        self.fasta_lines1 = as_lines(fasta_lines1)
        self.fasta_lines1_mixed_case = as_lines(fasta_lines1_mixed_case)

        self.fasta_lines1_exp = as_records(fasta_lines1_exp)
        self.fasta_lines1_mixed_case_exp = as_records(
            fasta_lines1_mixed_case_exp)
        self.fasta_lines1_exp_null_desc_mapper = as_records(
            fasta_lines1_exp_null_desc_mapper)
Пример #13
0
    def setUp(self):
        """Create the temp files, aligner and expected results for the tests.

        Writes the input fasta and four template variants (unchanged, '-'
        replaced by '.', 'T' replaced by 'U', lower-cased) to temp files,
        creates empty result/failure/log files, records every path for
        clean-up, and builds the aligner plus the expected collections.
        """
        def new_temp_file(suffix, contents=None):
            # reserve a unique path, then (re)open it for writing; with no
            # contents the file is merely truncated, i.e. "touched"
            handle, path = mkstemp(prefix="PyNastAlignerTests_", suffix=suffix)
            close(handle)
            with open(path, "w") as out_f:
                if contents is not None:
                    out_f.write(contents)
            return path

        self.pynast_test1_input_fp = new_temp_file(
            ".fasta", pynast_test1_input_fasta)

        self.pynast_test1_template_fp = new_temp_file(
            "template.fasta", pynast_test1_template_fasta)

        self.pynast_test_template_w_dots_fp = new_temp_file(
            "template.fasta", pynast_test1_template_fasta.replace("-", "."))

        self.pynast_test_template_w_u_fp = new_temp_file(
            "template.fasta", pynast_test1_template_fasta.replace("T", "U"))

        self.pynast_test_template_w_lower_fp = new_temp_file(
            "template.fasta", pynast_test1_template_fasta.lower())

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = new_temp_file(".fasta")
        self.failure_fp = new_temp_file(".fasta")
        self.log_fp = new_temp_file(".log")

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp,
        ]

        aligner_params = {
            "template_filepath": self.pynast_test1_template_fp,
            "min_len": 15,
        }
        self.pynast_test1_aligner = PyNastAligner(aligner_params)

        expected_aln_records = parse_fasta(pynast_test1_expected_alignment)
        self.pynast_test1_expected_aln = Alignment.from_fasta_records(
            expected_aln_records, DNA)
        expected_fail_records = parse_fasta(pynast_test1_expected_failure)
        self.pynast_test1_expected_fail = SequenceCollection.from_fasta_records(
            expected_fail_records, DNA)
Пример #14
0
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None):
        """Align the sequences in seq_path against the template alignment.

        seq_path: path to the fasta file of candidate sequences

        result_path: if not None, the aligned sequences are written to this
            path as fasta and None is returned; otherwise the Alignment is
            returned

        log_path: passed to NastLogger

        failure_path: if not None, the sequences that failed to align are
            written to this path as fasta
        """
        # the candidate file is opened first (as before) and the with-block
        # guarantees it is closed once pynast_seqs has returned or raised
        with open(seq_path, 'U') as seq_file:
            # load candidate sequences
            candidate_sequences = parse_fasta(seq_file)

            # load template sequences
            template_alignment_fp = self.Params['template_filepath']
            with open(template_alignment_fp) as template_f:
                # replace '.' characters with '-' characters
                template_records = [
                    (seq_id, seq.replace('.', '-').upper())
                    for seq_id, seq in parse_fasta(template_f)]
            template_alignment = Alignment.from_fasta_records(
                template_records, DNASequence, validate=True)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            # NOTE: the results are treated as fully built sequences (the
            # previous code assigned into them by index), so the input file
            # is not needed after this call
            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger,
                temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        # re-wrap every result as a DNASequence before building collections
        pynast_failed = SequenceCollection(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
        pynast_aligned = Alignment(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

        if failure_path is not None:
            with open(failure_path, 'w') as fail_file:
                fail_file.write(pynast_failed.to_fasta())

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                result_file.write(pynast_aligned.to_fasta())
            return None
        else:
            return pynast_aligned
Пример #15
0
    def test_deblur_with_non_default_error_profile(self):
        """deblur gives the expected result for a non-default error profile

        The same profile is supplied first as a list and then as a numpy
        array; each result is checked against the expected sequence.
        """
        error_dist = [
            1,
            0.05,
            0.000005,
            0.000005,
            0.000005,
            0.000005,
            0.0000025,
            0.0000025,
            0.0000025,
            0.0000025,
            0.0000025,
            0.0000005,
            0.0000005,
            0.0000005,
            0.0000005,
        ]
        seqs_f = StringIO(TEST_SEQS_2)

        obs = deblur(parse_fasta(seqs_f), error_dist=error_dist)
        exp = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag",
            )
        ]

        # check the list-based run before obs is overwritten below; without
        # this assertion the first deblur call was never verified
        self.assertEqual(obs, exp)

        # Trying with a numpy array holding the same values
        error_dist = np.array(error_dist)
        seqs_f = StringIO(TEST_SEQS_2)
        obs = deblur(parse_fasta(seqs_f), error_dist=error_dist)

        self.assertEqual(obs, exp)
Пример #16
0
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    Writes two files into out_dir: denoised_otu_map.txt (the combined
    otu_picker+denoiser map) and denoised_all.fasta (the denoised sequences
    relabelled as "<sample_id> <flowgram_id>").

    fasta_fh: a fasta file with labels as produced by Qiime's split_libraries.py
             used to replace flowgram id with the unique se_sample_id

    mapping_fh: The cluster mapping from the denoiser.py

    denoised_seqs_fh: the Fasta output files from denoiser.py

    otu_picker_otu_map_fh: cluster map from otu picker on denoised_seqs_fh

    out_dir: output directory

    NOTE: the process is terminated via exit() when an id from the denoiser
    output has no sample id in the split_libraries fasta file.
    """

    # read in mapping from split_library file
    labels = imap(lambda a_b: a_b[0], parse_fasta(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)
    get_sample_id = sample_id_mapping.get

    denoiser_mapping = read_denoiser_mapping(mapping_fh)
    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    # (the with-block flushes and closes the file on every exit path)
    with open(out_dir + "/denoised_otu_map.txt", "w") as otu_fh:
        for otu_line in otu_picker_otu_map_fh:
            otu_split = otu_line.split()

            otu = otu_split[0]
            ids = otu_split[1:]

            # concat lists
            # make sure the biggest one is first for pick_repr
            all_ids = sort_ids(ids, denoiser_mapping)
            # gather the cluster members into a fresh list first (linear,
            # unlike sum(lists, [])) and only then append them
            member_ids = []
            for centroid_id in ids:
                member_ids.extend(denoiser_mapping[centroid_id])
            all_ids.extend(member_ids)
            try:
                otu_fh.write("%s\t" % otu +
                             "\t".join(map(get_sample_id, all_ids)) + "\n")
            except TypeError:
                # get returns None if denoiser_mapping id not present in
                # sample_id_mapping
                print("Found id in denoiser output, which was not found in split_libraries " +
                      "output FASTA file. Wrong file?")
                exit()

    with open(out_dir + "/denoised_all.fasta", "w") as fasta_out_fh:
        for label, seq in parse_fasta(denoised_seqs_fh):
            seq_id = label.split()[0]
            newlabel = "%s %s" % (sample_id_mapping[seq_id], seq_id)
            fasta_out_fh.write(BiologicalSequence(seq, id=newlabel).to_fasta())
Пример #17
0
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    Writes two files into out_dir: denoised_otu_map.txt (the combined
    otu_picker+denoiser map) and denoised_all.fasta (the denoised sequences
    relabelled as "<sample_id> <flowgram_id>").

    fasta_fh: a fasta file with labels as produced by Qiime's split_libraries.py
             used to replace flowgram id with the unique se_sample_id

    mapping_fh: The cluster mapping from the denoiser.py

    denoised_seqs_fh: the Fasta output files from denoiser.py

    otu_picker_otu_map_fh: cluster map from otu picker on denoised_seqs_fh

    out_dir: output directory

    NOTE: the process is terminated via exit() when an id from the denoiser
    output has no sample id in the split_libraries fasta file.
    """

    # read in mapping from split_library file
    labels = imap(lambda a_b: a_b[0], parse_fasta(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)
    get_sample_id = sample_id_mapping.get

    denoiser_mapping = read_denoiser_mapping(mapping_fh)
    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    # (the with-block flushes and closes the file on every exit path)
    with open(out_dir + "/denoised_otu_map.txt", "w") as otu_fh:
        for otu_line in otu_picker_otu_map_fh:
            otu_split = otu_line.split()

            otu = otu_split[0]
            ids = otu_split[1:]

            # concat lists
            # make sure the biggest one is first for pick_repr
            all_ids = sort_ids(ids, denoiser_mapping)
            # gather the cluster members into a fresh list first (linear,
            # unlike sum(lists, [])) and only then append them
            member_ids = []
            for centroid_id in ids:
                member_ids.extend(denoiser_mapping[centroid_id])
            all_ids.extend(member_ids)
            try:
                otu_fh.write("%s\t" % otu +
                             "\t".join(map(get_sample_id, all_ids)) + "\n")
            except TypeError:
                # get returns None if denoiser_mapping id not present in
                # sample_id_mapping
                print("Found id in denoiser output, which was not found in split_libraries " +
                      "output FASTA file. Wrong file?")
                exit()

    with open(out_dir + "/denoised_all.fasta", "w") as fasta_out_fh:
        for label, seq in parse_fasta(denoised_seqs_fh):
            seq_id = label.split()[0]
            newlabel = "%s %s" % (sample_id_mapping[seq_id], seq_id)
            fasta_out_fh.write(BiologicalSequence(seq, id=newlabel).to_fasta())
Пример #18
0
 def test_call_write_to_file(self):
     """ReferenceRepSetPicker.__call__ otu map correctly written to file

     Runs the picker with a result_path and compares the sequences read
     back from that file with the expected fixture, ignoring order.
     """
     picker_params = {'Algorithm': 'first', 'ChoiceF': first_id}
     picker = ReferenceRepSetPicker(params=picker_params)
     picker(self.tmp_seq_filepath,
            self.tmp_otu_filepath,
            self.ref_seq_filepath,
            result_path=self.result_filepath)

     with open(self.result_filepath) as result_f:
         observed_records = parse_fasta(result_f)
         observed = SequenceCollection.from_fasta_records(observed_records,
                                                          DNA)

     expected_lines = rep_seqs_reference_result_file_exp.split('\n')
     expected = SequenceCollection.from_fasta_records(
         parse_fasta(expected_lines), DNA)

     # we don't care about order in the results
     self.assertEqual(set(observed), set(expected))
def check_fasta_seqs_lens(input_fasta_fp):
    """ Creates bins of sequence lens

    Useful for checking for valid aligned sequences.

    input_fasta_fp:  input fasta filepath

    Returns a list of (count, seq_len) tuples sorted in descending order,
    i.e. the most frequent sequence length comes first.
    """

    seq_lens = defaultdict(int)

    # the with-block closes the file even if parsing fails part-way through
    with open(input_fasta_fp, "U") as input_fasta_f:
        for _label, seq in parse_fasta(input_fasta_f):
            seq_lens[len(seq)] += 1

    # count goes first in each tuple so the sort orders by frequency
    return sorted(
        ((count, seq_len) for seq_len, count in seq_lens.items()),
        reverse=True)
Пример #20
0
 def output_test(self, aligned_basename):
     """ Test results of test_load_zip() and test_load_gzip()

     aligned_basename: common path prefix of the output files; the files
     <basename>.log, <basename>_otus.txt and <basename>_denovo.fasta are
     read and checked against the expected counts.
     """
     # the with-block closes the log even when an assertion below fails
     with open(aligned_basename + ".log", "U") as f_log:
         f_log_str = f_log.read()
         self.assertTrue("Total reads passing E-value threshold" in f_log_str)
         self.assertTrue("Total reads for de novo clustering" in f_log_str)
         self.assertTrue("Total OTUs" in f_log_str)
         # rewind so the log can be scanned line by line
         f_log.seek(0)
         for line in f_log:
             if line.startswith("    Total reads passing E-value threshold"):
                 # raw string: '\(' must reach the regex engine unchanged
                 num_hits = (re.split(
                     r'Total reads passing E-value threshold = | \(',
                     line)[1]).strip()
             elif line.startswith("    Total reads for de novo clustering"):
                 num_failures_log = (re.split(
                     'Total reads for de novo clustering = ', line)[1]).strip()
             elif line.startswith(" Total OTUs"):
                 num_clusters_log = (re.split('Total OTUs = ', line)[1]).strip()
     # Correct number of reads mapped
     self.assertEqual("99999", num_hits)
     # Correct number of clusters recorded
     self.assertEqual("272", num_clusters_log)
     # Correct number of clusters in OTU-map
     with open(aligned_basename + "_otus.txt", 'U') as f_otumap:
         num_clusters_file = sum(1 for line in f_otumap)
     self.assertEqual(272, num_clusters_file)
     with open(aligned_basename + "_denovo.fasta", 'U') as f_denovo:
         num_failures_file = sum(1 for _record in parse_fasta(f_denovo))
     # Correct number of reads for de novo clustering
     self.assertEqual(num_failures_log, str(num_failures_file))
Пример #21
0
 def output_test(self, aligned_basename):
     """ Test results of test_load_zip() and test_load_gzip()

     aligned_basename: common path prefix of the output files; the files
     <basename>.log, <basename>_otus.txt and <basename>_denovo.fasta are
     read and checked against the expected counts.
     """
     # the with-block closes the log even when an assertion below fails
     with open(aligned_basename + ".log", "U") as f_log:
         f_log_str = f_log.read()
         self.assertTrue("Total reads passing E-value threshold" in f_log_str)
         self.assertTrue("Total reads for de novo clustering" in f_log_str)
         self.assertTrue("Total OTUs" in f_log_str)
         # rewind so the log can be scanned line by line
         f_log.seek(0)
         for line in f_log:
             if line.startswith("    Total reads passing E-value threshold"):
                 # raw string: '\(' must reach the regex engine unchanged
                 num_hits = (re.split(
                     r'Total reads passing E-value threshold = | \(',
                     line)[1]).strip()
             elif line.startswith("    Total reads for de novo clustering"):
                 num_failures_log = (re.split(
                     'Total reads for de novo clustering = ', line)[1]).strip()
             elif line.startswith(" Total OTUs"):
                 num_clusters_log = (re.split('Total OTUs = ', line)[1]).strip()
     # Correct number of reads mapped
     self.assertEqual("99999", num_hits)
     # Correct number of clusters recorded
     self.assertEqual("272", num_clusters_log)
     # Correct number of clusters in OTU-map
     with open(aligned_basename + "_otus.txt", 'U') as f_otumap:
         num_clusters_file = sum(1 for line in f_otumap)
     self.assertEqual(272, num_clusters_file)
     with open(aligned_basename + "_denovo.fasta", 'U') as f_denovo:
         num_failures_file = sum(1 for _record in parse_fasta(f_denovo))
     # Correct number of reads for de novo clustering
     self.assertEqual(num_failures_log, str(num_failures_file))
Пример #22
0
def align_two_alignments(aln1, aln2, moltype, params=None):
    """Profile-align two alignments with Mafft and return one Alignment.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
    used to build them.
        - Mafft profile alignment only works with aligned sequences. Alignment
        object used to handle unaligned sequences.

    moltype: passed as MolType to every Alignment built here.

    params: dict of parameters to pass in to the Mafft app controller.
    """
    # Wrap the first input and swap its IDs for abbreviated ones.
    first = Alignment(aln1, MolType=moltype)
    first_short, id_lookup = first.getIntMap()
    first_short = Alignment(first_short, MolType=moltype)

    # Same for the second input, with a distinct prefix so the abbreviated
    # IDs of the two inputs cannot collide.
    second = Alignment(aln2, MolType=moltype)
    second_short, second_lookup = second.getIntMap(prefix='seqn_')
    second_short = Alignment(second_short, MolType=moltype)

    # One lookup table from abbreviated ID back to original ID.
    id_lookup.update(second_lookup)

    # Configure the app controller to run the profile aligner.
    mafft = Mafft(InputHandler='_input_as_paths',
                  params=params,
                  SuppressStderr=False)
    mafft._command = 'mafft-profile'

    # Write each abbreviated alignment to its own temporary FASTA file.
    first_path = mafft._tempfile_as_multiline_string(first_short.toFasta())
    second_path = mafft._tempfile_as_multiline_string(second_short.toFasta())

    run = mafft([first_path, second_path])

    # Map every aligned record back to its original ID; the '_seed_' marker
    # is stripped from output labels before the lookup.
    restored = {}
    for label, aligned_seq in dict(parse_fasta(run['StdOut'])).items():
        restored[id_lookup[label.replace('_seed_', '')]] = aligned_seq
    restored = Alignment(restored, MolType=moltype)

    # Clean up: app results, our temp files, and the 'pre'/'trace' files
    # removed from the current working directory.
    run.cleanUp()
    for leftover in (first_path, second_path, 'pre', 'trace'):
        remove(leftover)

    return restored
Пример #23
0
    def test_main(self):
        """Denoiser should always give same result on test data"""

        # Expected content of centroids.fasta: a single centroid record.
        expected = ">FS8APND01D3TW3 | cluster size: 94 \nCTCCCGTAGGAGTCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCC\n"

        # Expected content of denoiser_mapping.txt: one line per cluster,
        # "<centroid id>:" followed by tab-separated member ids.
        expected_map = """FS8APND01EWRS4:
FS8APND01DXG45:
FS8APND01D3TW3:\tFS8APND01CSXFN\tFS8APND01DQ8MX\tFS8APND01DY7QW\tFS8APND01B5QNI\tFS8APND01CQ6OG\tFS8APND01C7IGN\tFS8APND01DHSGH\tFS8APND01DJ17E\tFS8APND01CUXOA\tFS8APND01EUTYG\tFS8APND01EKK7T\tFS8APND01D582W\tFS8APND01B5GWU\tFS8APND01D7N2A\tFS8APND01BJGHZ\tFS8APND01D6DYZ\tFS8APND01C6ZIM\tFS8APND01D2X6Y\tFS8APND01BUYCE\tFS8APND01BNUEY\tFS8APND01DKLOE\tFS8APND01C24PP\tFS8APND01EBWQX\tFS8APND01ELDYW\tFS8APND01B0GCS\tFS8APND01D4QXI\tFS8APND01EMYD9\tFS8APND01EA2SK\tFS8APND01DZOSO\tFS8APND01DHYAZ\tFS8APND01C7UD9\tFS8APND01BTZFV\tFS8APND01CR78R\tFS8APND01B39IE\tFS8APND01ECVC0\tFS8APND01DM3PL\tFS8APND01DELWS\tFS8APND01CIEK8\tFS8APND01D7ZOZ\tFS8APND01CZSAI\tFS8APND01DYOVR\tFS8APND01BX9XY\tFS8APND01DEWJA\tFS8APND01BEKIW\tFS8APND01DCKB9\tFS8APND01EEYIS\tFS8APND01DDKEA\tFS8APND01DSZLO\tFS8APND01C6EBC\tFS8APND01EE15M\tFS8APND01ELO9B\tFS8APND01C58QY\tFS8APND01DONCG\tFS8APND01DVXX2\tFS8APND01BL5YT\tFS8APND01BIL2V\tFS8APND01EBSYQ\tFS8APND01CCX8R\tFS8APND01B2YCJ\tFS8APND01B1JG4\tFS8APND01DJ024\tFS8APND01BIJY0\tFS8APND01CIA4G\tFS8APND01DV74M\tFS8APND01ECAX5\tFS8APND01DC3TZ\tFS8APND01EJVO6\tFS8APND01D4VFG\tFS8APND01DYYYO\tFS8APND01D1EDD\tFS8APND01DQUOT\tFS8APND01A2NSJ\tFS8APND01DDC8I\tFS8APND01BP1T2\tFS8APND01DPY6U\tFS8APND01CIQGV\tFS8APND01BPUT8\tFS8APND01BDNH4\tFS8APND01DOZDN\tFS8APND01DS866\tFS8APND01DGS2J\tFS8APND01EDK32\tFS8APND01EPA0T\tFS8APND01CK3JM\tFS8APND01BKLWW\tFS8APND01DV0BO\tFS8APND01DPNXE\tFS8APND01B7LUA\tFS8APND01BTTE2\tFS8APND01CKO4X\tFS8APND01DGGBY\tFS8APND01C4NHX\tFS8APND01DYPQN
FS8APND01BSTVP:
FS8APND01EFK0W:
FS8APND01DCIOO:
FS8APND01CKOMZ:
"""

        # Shell command line: run denoiser.py on the bundled test set and
        # write its output into self.test_dir (--force is passed as well).
        command = " ".join(["denoiser.py",
                            "--force", "-o", self.test_dir, "-i",
                            "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt" % PROJECT_HOME])

        # stderr is merged into stdout; read() blocks until the child closes
        # its stdout. The captured text is stored but not inspected below.
        result = Popen(command, shell=True, universal_newlines=True,
                       stdout=PIPE, stderr=STDOUT).stdout.read()
        self.result_dir = self.test_dir

        # NOTE(review): paths are built by plain concatenation, so test_dir
        # presumably ends with a path separator -- confirm against setUp.
        observed = "".join(list(open(self.result_dir + "centroids.fasta")))
        self.assertEqual(observed, expected)

        # singletons.fasta must hold exactly six records
        self.assertEqual(
            len(list(parse_fasta(open(self.result_dir + "singletons.fasta")))),
            6)

        # the cluster mapping must match the expected text exactly
        observed = "".join(
            list(open(self.result_dir + "denoiser_mapping.txt")))
        self.assertEqual(observed, expected_map)
Пример #24
0
 def setUp(self):
     """Build the query alignment and write the reference fixture files.

     Sets self.seqs, self.outfile, self.outfasta and self.outtree, and
     records the written files in self._paths_to_clean_up.
     """
     # create a list of files to cleanup
     self._paths_to_clean_up = []
     self._dirs_to_clean_up = []

     # load query seqs
     self.seqs = Alignment(parse_fasta(QUERY_SEQS.split()))

     # generate temp filename
     tmp_dir = '/tmp'
     self.outfile = get_tmp_filename(tmp_dir)
     base_path = splitext(self.outfile)[0]

     # create and write out reference sequence file; the path is registered
     # before writing so a failed write still leaves it listed for cleanup,
     # and the context manager closes the handle on any error
     self.outfasta = base_path + '.fasta'
     self._paths_to_clean_up.append(self.outfasta)
     with open(self.outfasta, 'w') as fastaout:
         fastaout.write(REF_SEQS)

     # create and write out starting tree file (same pattern as above)
     self.outtree = base_path + '.tree'
     self._paths_to_clean_up.append(self.outtree)
     with open(self.outtree, 'w') as treeout:
         treeout.write(REF_TREE)
Пример #25
0
    def _split_along_prefix(self,
                            input_fp,
                            params,
                            jobs_to_start,
                            job_prefix,
                            output_dir):
        """Split input sequences into sets with identical prefix.

        Every record of the FASTA file at input_fp is appended to the file
        "<output_dir>/<job_prefix><prefix>", where <prefix> is the first
        params['prefix_length'] characters of the sequence (1 if that value
        is falsy). self.prefix_counts is updated with the number of
        sequences seen for each prefix. jobs_to_start is not used here.

        Returns (out_files, remove_files): the list of files written, in
        order of first appearance of their prefix, and the constant True.

        Raises ValueError if a sequence is shorter than the prefix length.
        """
        out_files = []
        buffered_handles = {}
        prefix_length = params['prefix_length'] or 1
        # Context manager so the input handle is closed even when a short
        # sequence makes us raise part-way through.
        with open(input_fp) as input_f:
            for seq_id, seq in parse_fasta(input_f):

                if len(seq) < prefix_length:
                    raise ValueError("Prefix length must be equal or longer than sequence.\n"
                                     + " Found seq %s with length %d" % (seq_id, len(seq)))
                prefix = seq[:prefix_length]

                if prefix not in buffered_handles:
                    # never seen this prefix before
                    out_fp = "%s/%s%s" % (output_dir, job_prefix, prefix)
                    buffered_handles[prefix] = BufferedWriter(out_fp)
                    out_files.append(out_fp)
                    self.prefix_counts[prefix] = 0

                self.prefix_counts[prefix] += 1
                buffered_handles[prefix].write('>%s\n%s\n' % (seq_id, seq))

        # make sure all buffers are closed and flushed; values() works on
        # both Python 2 and Python 3, unlike itervalues()
        for buf_fh in buffered_handles.values():
            buf_fh.close()

        remove_files = True
        return out_files, remove_files
Пример #26
0
    def _generate_training_files(self):
        """Returns a tuple of file objects suitable for passing to the
        RdpTrainer application controller.

        Reads the reference sequences (self.Params['reference_sequences_fp'])
        and the tab-separated id-to-taxonomy file
        (self.Params['id_to_taxonomy_fp']) into an RdpTrainingSet, stores
        that set on self._training_set, and returns
        (rdp_taxonomy_file, rdp_training_seqs_file): two NamedTemporaryFile
        objects, each rewound to its start.
        """
        tmp_dir = get_qiime_temp_dir()
        training_set = RdpTrainingSet()

        # Both inputs are opened up front, as before, but are now closed
        # as soon as they have been consumed, or on error.
        with open(self.Params['reference_sequences_fp'], 'U') as \
                reference_seqs_file, \
                open(self.Params['id_to_taxonomy_fp'], 'U') as \
                id_to_taxonomy_file:

            for seq_id, seq in parse_fasta(reference_seqs_file):
                training_set.add_sequence(seq_id, seq)

            for line in id_to_taxonomy_file:
                # each line must have exactly two tab-separated fields
                seq_id, lineage_str = map(strip, line.split('\t'))
                training_set.add_lineage(seq_id, lineage_str)

        training_set.dereplicate_taxa()

        rdp_taxonomy_file = NamedTemporaryFile(
            prefix='RdpTaxonAssigner_taxonomy_', suffix='.txt', dir=tmp_dir)
        rdp_taxonomy_file.write(training_set.get_rdp_taxonomy())
        # rewind so the consumer reads from the beginning
        rdp_taxonomy_file.seek(0)

        rdp_training_seqs_file = NamedTemporaryFile(
            prefix='RdpTaxonAssigner_training_seqs_', suffix='.fasta',
            dir=tmp_dir)
        for rdp_id, seq in training_set.get_training_seqs():
            rdp_training_seqs_file.write('>%s\n%s\n' % (rdp_id, seq))
        rdp_training_seqs_file.seek(0)

        self._training_set = training_set

        return rdp_taxonomy_file, rdp_training_seqs_file
Пример #27
0
def seqs_from_file(ids, file_lines):
    """Yield (label, seq) for each FASTA record whose id is in ids.

    The id of a record is whatever id_from_fasta_label_line() returns for
    its label; records are yielded in file order.
    """
    matching = ((label, seq)
                for label, seq in parse_fasta(file_lines)
                if id_from_fasta_label_line(label) in ids)
    for record in matching:
        yield record
Пример #28
0
    def test_dereplicate_seqs_remove_singletons(self):
        """ dereplicate_seqs() keeps only sequences seen more than once

            Three distinct sequences appear exactly once in the input and
            are expected to be absent from the output.
        """
        triple = "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"
        double = "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT"
        seqs = [("seq1", triple),
                ("seq2", triple),
                ("seq3", triple),
                ("seq4", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCT"),
                ("seq5", "TACCAGCCCCTTAAGTGGTAGGGACGATTATTTGGCCTAAAGCGTCCG"),
                ("seq6", double),
                ("seq7", double)]

        # write the input FASTA file into the working directory
        seqs_fp = join(self.working_dir, "seqs.fasta")
        with open(seqs_fp, 'w') as seqs_f:
            seqs_f.write("".join(">%s\n%s\n" % record for record in seqs))

        output_fp = join(self.working_dir, "seqs_derep.fasta")
        log_fp = join(self.working_dir, "seqs_derep.log")

        dereplicate_seqs(seqs_fp=seqs_fp,
                         output_fp=output_fp)
        # both the dereplicated FASTA and its log must have been created
        for produced_fp in (output_fp, log_fp):
            self.assertTrue(isfile(produced_fp))

        # one record per repeated sequence, labelled with its abundance
        exp = [("seq1;size=3;", triple),
               ("seq6;size=2;", double)]

        with open(output_fp, 'U') as out_f:
            act = list(parse_fasta(out_f))

        self.assertEqual(act, exp)
Пример #29
0
def split_sff(sff_file_handles, map_file_handle, outdir="/tmp/"):
    """Splits a sff.txt file on barcode/mapping file.

    sff_file_handles: handles passed on to cat_sff_files()
    map_file_handle: FASTA-formatted mapping, parsed with parse_fasta() and
     turned into a lookup by build_inverse_barcode_map()
    outdir: prefix prepended verbatim to each barcode id to form the output
     path (so it should end with a path separator)

    Returns the list of output file paths, one per barcode id.

    Raises FileFormatError if cat_sff_files() raises ValueError.
    """

    try:
        (flowgrams, header) = cat_sff_files(sff_file_handles)
    except ValueError:
        # reading in the binary sff usually shows up as ValueError
        raise FileFormatError('Wrong flogram file format. Make sure you pass the sff.txt format ' +
                              'produced by sffinfo. The binary .sff will not work here.')

    (inverse_map, map_count) = build_inverse_barcode_map(
        parse_fasta(map_file_handle))

    filenames = []
    # we might have many barcodes and reach python open file limit
    # therefor we go the slow way and open and close files each time
    # First set up all files with the headers only
    for barcode_id in map_count.keys():
        out_path = outdir + barcode_id
        with open(out_path, "w") as fh:
            write_sff_header(header, fh, map_count[barcode_id])
        filenames.append(out_path)
    # Then direct each flowgram into its barcode file. The handle is now
    # closed after every write; it used to be left open, which defeated
    # the open-file-limit precaution described above.
    for f in flowgrams:
        if f.Name in inverse_map:
            barcode_id = inverse_map[f.Name]
            with open(outdir + barcode_id, "a") as fh:
                fh.write(f.createFlowHeader() + "\n")
    return filenames
Пример #30
0
def get_seqs_to_keep_lookup_from_sample_ids(fasta_f, sample_ids):
    """Return a dict keyed by the ids of sequences from the given samples.

    A sequence belongs to a sample when the part of its id before the first
    underscore is in sample_ids. All values of the returned dict are None.
    """
    wanted_samples = set(sample_ids)
    kept_ids = set(seq_id
                   for seq_id, _ in parse_fasta(fasta_f)
                   if seq_id.split('_')[0] in wanted_samples)
    return dict.fromkeys(kept_ids)
Пример #31
0
def sort_fasta_by_abundance(fasta_lines, fasta_out_f):
    """ Sort seqs in fasta_line by abundance, write all seqs to fasta_out_f

     Note that all sequences are written out, not just unique ones.

     fasta_lines: input file handle (or similar object)
     fasta_out_f: output file handle (or similar object)

     Records are written most-abundant sequence first; sequences with the
     same abundance are written in descending order of the sequence string.
     Ids sharing a sequence keep their input order.

    ** The current implementation works well for fairly large data sets,
       (e.g., several combined 454 runs) but we may want to revisit if it
       chokes on very large (e.g., Illumina) files. --Greg **

    """
    # map each distinct sequence to the ids that carry it, in input order
    seq_index = {}
    for seq_id, seq in parse_fasta(fasta_lines):
        seq_index.setdefault(seq, []).append(seq_id)

    # Drain the index into a sortable list. popitem() releases each entry
    # as it is moved and is safe on Python 3, where deleting keys while
    # iterating items() raises RuntimeError.
    seqs = []
    while seq_index:
        seq, seq_ids = seq_index.popitem()
        seqs.append((len(seq_ids), seq, seq_ids))

    # sequences are unique, so the (count, seq) prefix gives a total order
    seqs.sort(reverse=True)
    for _, seq, seq_ids in seqs:
        for seq_id in seq_ids:
            fasta_out_f.write('>%s\n%s\n' % (seq_id, seq))
Пример #32
0
def filter_fasta(input_seqs_f, output_seqs_f, seqs_to_keep, negate=False,
                 seqid_f=None):
    """ Write the selected records of input_seqs_f to output_seqs_f

        input_seqs_f: parsed with parse_fasta
        output_seqs_f: writable handle; it is closed before returning
        seqs_to_keep: ids to select; only the first whitespace-delimited
         word of each id (and of each record label) is compared. Ignored
         when seqid_f is given.
        negate: if True, write the records that are NOT selected
        seqid_f: optional predicate called with the full record label; when
         given it replaces the seqs_to_keep lookup
    """
    if seqid_f is None:
        wanted = dict.fromkeys(seq_id.split()[0] for seq_id in seqs_to_keep)

        def is_selected(seq_id):
            return seq_id.split()[0] in wanted
    else:
        is_selected = seqid_f

    invert = bool(negate)
    for seq_id, seq in parse_fasta(input_seqs_f):
        # write when the selection result and the negate flag disagree
        if bool(is_selected(seq_id)) != invert:
            output_seqs_f.write('>%s\n%s\n' % (seq_id, seq))
    output_seqs_f.close()
Пример #33
0
def remove_outliers(seqs, num_stds, fraction_seqs_for_stats=.95):
    """ drop aligned sequences that are far from the majority consensus

    Steps:
     1. build a DNA alignment from `seqs` and take its majority consensus;
     2. measure the distance of every sequence to that consensus and
        compute the mean and standard deviation of those distances;
     3. keep only sequences whose distance is at most
        mean + `num_stds` * std.

    `fraction_seqs_for_stats` is accepted but not used by this
    implementation.

    Returns the filtered alignment.
    """
    alignment = Alignment.from_fasta_records(parse_fasta(seqs), DNA)
    consensus = alignment.majority_consensus()

    # one distance per sequence, in alignment order
    distances = [member.distance(consensus) for member in alignment]

    # sequences at or below this distance are retained
    cutoff = mean(distances) + num_stds * std(distances)

    retained_ids = [seq_id
                    for seq_id, distance in izip(alignment.ids(), distances)
                    if distance <= cutoff]

    return alignment.subalignment(seqs_to_keep=retained_ids)
Пример #34
0
def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """ Build a lane mask string from per-position entropies

    infile: open file object for aligned fasta file
    entropy_threshold:  float value that designates the percentage of entropic
     positions to be removed, i.e., 0.10 means the 10% most entropic positions
     are removed.
    existing_mask: accepted but not used by this implementation.

    Returns a string with one character per alignment position: "0" for a
    masked (removed) position, "1" for a retained one.
    """
    alignment = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    entropies = alignment.position_entropies(nan_on_non_standard_chars=False)

    ranked = sorted(entropies)

    # entropy value found at the threshold position of the sorted list
    threshold_rank = int(round((len(ranked) - 1) * (1 - entropy_threshold)))
    entropy_cutoff = ranked[threshold_rank]

    # Positions at the lowest observed entropy are never masked; this
    # matters for small datasets with few distinct entropy values.
    lowest_entropy = min(ranked)

    def mask_char(entropy):
        if entropy >= entropy_cutoff and entropy != lowest_entropy:
            return "0"
        return "1"

    return "".join(mask_char(entropy) for entropy in entropies)
def truncate_rev_primers(fasta_f,
                         output_fp,
                         reverse_primers,
                         truncate_option='truncate_only',
                         primer_mismatches=2):
    """ Locally align reverse primers and truncate or drop sequences

    fasta_f:  open file of fasta file
    output_fp: open, writable file object the resulting fasta is written to
    reverse_primers: dictionary keyed by SampleID; each value is iterated,
     so presumably a collection of reverse primer sequences
    truncate_option: with "truncate_remove", sequences without a primer hit
     are dropped; with any other value they are written unchanged
    primer_mismatches: number of allowed primer mismatches

    Returns a dict of counters: 'sample_id_not_found',
    'reverse_primer_not_found', 'total_seqs' and 'seqs_written'.
    """

    log_data = {
        'sample_id_not_found': 0,
        'reverse_primer_not_found': 0,
        'total_seqs': 0,
        'seqs_written': 0
    }

    def emit(label, seq):
        # write one record and account for it
        output_fp.write('>%s\n%s\n' % (label, seq))
        log_data['seqs_written'] += 1

    for label, seq in parse_fasta(fasta_f):
        # the SampleID is the label text before the first underscore
        sample_id = label.split('_')[0]
        log_data['total_seqs'] += 1

        # Unknown SampleID: pass the sequence through untouched.
        try:
            primers = reverse_primers[sample_id]
        except KeyError:
            log_data['sample_id_not_found'] += 1
            emit(label, seq)
            continue

        # mismatch count -> hit index; a later primer with the same
        # mismatch count replaces an earlier one
        hits = {}
        for primer in primers:
            mismatches, hit_index = local_align_primer_seq(primer, seq)
            hits[mismatches] = hit_index

        best_mismatches = min(hits)
        cut_at = hits[best_mismatches]

        if best_mismatches > primer_mismatches:
            # no acceptable primer hit
            log_data['reverse_primer_not_found'] += 1
            if truncate_option != "truncate_remove":
                emit(label, seq)
        elif cut_at > 0:
            # skip sequences that would be empty after truncation
            emit(label, seq[0:cut_at])

    return log_data
Пример #36
0
    def _generate_training_files(self):
        """Returns a tuple of file objects suitable for passing to the
        RdpTrainer application controller.

        The reference sequences (self.Params['reference_sequences_fp']) and
        the tab-separated id-to-taxonomy file
        (self.Params['id_to_taxonomy_fp']) are loaded into an
        RdpTrainingSet, which is also stored on self._training_set. The
        return value is (rdp_taxonomy_file, rdp_training_seqs_file): two
        NamedTemporaryFile objects, each rewound to its start.
        """
        tmp_dir = get_qiime_temp_dir()
        training_set = RdpTrainingSet()

        # Open both inputs together, as the code always did, but let the
        # context manager close them once read or when an error occurs.
        with open(self.Params['reference_sequences_fp'], 'U') as \
                reference_seqs_file, \
                open(self.Params['id_to_taxonomy_fp'], 'U') as \
                id_to_taxonomy_file:

            for seq_id, seq in parse_fasta(reference_seqs_file):
                training_set.add_sequence(seq_id, seq)

            for line in id_to_taxonomy_file:
                # exactly two tab-separated fields are expected per line
                seq_id, lineage_str = map(strip, line.split('\t'))
                training_set.add_lineage(seq_id, lineage_str)

        training_set.dereplicate_taxa()

        rdp_taxonomy_file = NamedTemporaryFile(
            prefix='RdpTaxonAssigner_taxonomy_', suffix='.txt', dir=tmp_dir)
        rdp_taxonomy_file.write(training_set.get_rdp_taxonomy())
        # rewind so the consumer starts reading at the top
        rdp_taxonomy_file.seek(0)

        rdp_training_seqs_file = NamedTemporaryFile(
            prefix='RdpTaxonAssigner_training_seqs_',
            suffix='.fasta',
            dir=tmp_dir)
        for rdp_id, seq in training_set.get_training_seqs():
            rdp_training_seqs_file.write('>%s\n%s\n' % (rdp_id, seq))
        rdp_training_seqs_file.seek(0)

        self._training_set = training_set

        return rdp_taxonomy_file, rdp_training_seqs_file
Пример #37
0
def check_fasta_seqs_lens(input_fasta_fp):
    """ Creates bins of sequence lens

    Useful for checking for valid aligned sequences.

    input_fasta_fp:  input fasta filepath

    Returns a list of (count, sequence length) tuples, sorted in descending
    order (most frequent length first).
    """

    seq_lens = defaultdict(int)

    # the context manager closes the file even if parsing raises
    with open(input_fasta_fp, "U") as input_fasta_f:
        for label, seq in parse_fasta(input_fasta_f):
            seq_lens[len(seq)] += 1

    # count first, so sorting ranks by frequency and then by length
    return sorted(((count, length) for length, count in seq_lens.items()),
                  reverse=True)
Пример #38
0
def seqs_from_file(ids, file_lines):
    """Generate the (label, seq) records of file_lines selected by ids.

    A record is selected when id_from_fasta_label_line(label) is in ids;
    input order is preserved.
    """
    for record in parse_fasta(file_lines):
        label = record[0]
        if id_from_fasta_label_line(label) not in ids:
            continue
        yield label, record[1]
Пример #39
0
def truncate_rev_primers(fasta_f,
                         output_fp,
                         reverse_primers,
                         truncate_option='truncate_only',
                         primer_mismatches=2):
    """ Locally align reverse primers and truncate or drop sequences

    fasta_f:  open file of fasta file
    output_fp: open, writable file object the resulting fasta is written to
    reverse_primers: dictionary keyed by SampleID; each value is iterated,
     so presumably a collection of reverse primer sequences
    truncate_option: with "truncate_remove", sequences without a primer hit
     are dropped; with any other value they are written unchanged
    primer_mismatches: number of allowed primer mismatches

    Returns a dict of counters: 'sample_id_not_found',
    'reverse_primer_not_found', 'total_seqs' and 'seqs_written'.
    """

    log_data = {
        'sample_id_not_found': 0,
        'reverse_primer_not_found': 0,
        'total_seqs': 0,
        'seqs_written': 0
    }

    def write_record(label, seq):
        # write one record and count it as written
        output_fp.write('>%s\n%s\n' % (label, seq))
        log_data['seqs_written'] += 1

    for label, seq in parse_fasta(fasta_f):
        # text before the first underscore is taken as the SampleID
        sample_id = label.split('_')[0]
        log_data['total_seqs'] += 1

        # Sequences from unknown samples are written as they are.
        try:
            candidate_primers = reverse_primers[sample_id]
        except KeyError:
            log_data['sample_id_not_found'] += 1
            write_record(label, seq)
            continue

        # mismatch count -> hit index; on equal mismatch counts the last
        # primer tried wins
        index_by_mismatches = {}
        for primer in candidate_primers:
            mismatches, hit_index = local_align_primer_seq(primer, seq)
            index_by_mismatches[mismatches] = hit_index

        fewest_mismatches = min(index_by_mismatches)
        truncate_at = index_by_mismatches[fewest_mismatches]

        if fewest_mismatches > primer_mismatches:
            # primer not found within the allowed mismatches
            log_data['reverse_primer_not_found'] += 1
            if truncate_option != "truncate_remove":
                write_record(label, seq)
        elif truncate_at > 0:
            # a hit at index 0 would leave nothing to write
            write_record(label, seq[0:truncate_at])

    return log_data
Пример #40
0
    def getResult(self, aln_path, *args, **kwargs):
        """Build a tree from the FASTA alignment stored at aln_path.

        The tree is built by the module held in self.Params['Module'],
        using its default parameters -- this is bad and should be fixed.

        Keyword arguments:
         root_method: 'midpoint' re-roots the tree with root_midpt(); any
          other value, or leaving it out, returns the tree as built.

        #TODO: allow command-line access to important aln params.
        """
        tree_builder = self.Params['Module']

        def first_word(label):
            # standard qiime says we just consider the first word as the
            # unique ID; the rest of the defline of the fasta alignment
            # often doesn't match the otu names in the otu table
            return label.split()[0]

        with open(aln_path) as aln_f:
            records = parse_fasta(aln_f, label_to_name=first_word)
            seqs = Alignment.from_fasta_records(records, DNA)

        # Expose the skbio method under the cogent name so a skbio Alignment
        # can be passed where a cogent alignment is expected.
        seqs.getIntMap = seqs.int_map
        result = tree_builder.build_tree_from_alignment(
            seqs, moltype=DNA_cogent)

        # A missing 'root_method' keyword means "leave the tree as is".
        # The re-rooting call stays inside the try block on purpose, so a
        # KeyError raised there is ignored just as it always was.
        try:
            if kwargs['root_method'] == 'midpoint':
                result = root_midpt(result)
        except KeyError:
            pass
        return result
Пример #41
0
    def test_dereplicate_seqs_remove_singletons(self):
        """ dereplicate_seqs() output omits sequences seen only once

            The input has one sequence three times, one twice and three
            others once each; only the two repeated ones are expected.
        """
        seen_thrice = "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCG"
        seen_twice = "CTGCAAGGCTAGGGGGCGGGAGAGGCGGGTGGTACTTGAGGGGAGAAT"
        seqs = [("seq1", seen_thrice),
                ("seq2", seen_thrice),
                ("seq3", seen_thrice),
                ("seq4", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAAAGCGTCCT"),
                ("seq5", "TACCAGCCCCTTAAGTGGTAGGGACGATTATTTGGCCTAAAGCGTCCG"),
                ("seq6", seen_twice),
                ("seq7", seen_twice)]

        # write the input FASTA file into the working directory
        seqs_fp = join(self.working_dir, "seqs.fasta")
        fasta_text = "".join(">%s\n%s\n" % record for record in seqs)
        with open(seqs_fp, 'w') as seqs_f:
            seqs_f.write(fasta_text)

        output_fp = join(self.working_dir, "seqs_derep.fasta")
        log_fp = join(self.working_dir, "seqs_derep.log")

        dereplicate_seqs(seqs_fp=seqs_fp, output_fp=output_fp)
        # the dereplicated FASTA and its log must both exist afterwards
        for produced_fp in (output_fp, log_fp):
            self.assertTrue(isfile(produced_fp))

        # one record per repeated sequence, labelled with its abundance
        exp = [("seq1;size=3;", seen_thrice),
               ("seq6;size=2;", seen_twice)]

        with open(output_fp, 'U') as out_f:
            act = list(parse_fasta(out_f))

        self.assertEqual(act, exp)
Пример #42
0
 def test_call_write_to_file(self):
     """ReferenceRepSetPicker.__call__ otu map correctly written to file"""
     picker_params = {'Algorithm': 'first', 'ChoiceF': first_id}
     picker = ReferenceRepSetPicker(params=picker_params)
     # run the picker so that it writes its result to self.result_filepath
     picker(self.tmp_seq_filepath,
            self.tmp_otu_filepath,
            self.ref_seq_filepath,
            result_path=self.result_filepath)

     expected_lines = rep_seqs_reference_result_file_exp.split('\n')
     expected = SequenceCollection.from_fasta_records(
         parse_fasta(expected_lines), DNA)

     with open(self.result_filepath) as result_f:
         actual = SequenceCollection.from_fasta_records(
             parse_fasta(result_f), DNA)

     # we don't care about order in the results
     self.assertEqual(set(actual), set(expected))
Пример #43
0
    def setUp(self):
        """Create the temporary input, template, result and log files and
        build the aligner and expected alignment used by the tests.
        """
        def make_temp_file(suffix, contents=""):
            # Reserve a uniquely named temp file, release the OS-level
            # descriptor and (re)write the file with the given contents.
            fd, path = mkstemp(prefix="InfernalAlignerTests_", suffix=suffix)
            close(fd)
            with open(path, "w") as temp_f:
                temp_f.write(contents)
            return path

        self.infernal_test1_input_fp = make_temp_file(
            ".fasta", "\n".join(infernal_test1_input_fasta))
        self.infernal_test1_template_fp = make_temp_file(
            "template.sto", infernal_test1_template_stockholm)

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = make_temp_file(".fasta")
        self.log_fp = make_temp_file(".log")

        self._paths_to_clean_up = [
            self.infernal_test1_input_fp,
            self.result_fp,
            self.log_fp,
            self.infernal_test1_template_fp,
        ]

        aligner_params = {"template_filepath": self.infernal_test1_template_fp}
        self.infernal_test1_aligner = InfernalAligner(aligner_params)
        expected_records = parse_fasta(infernal_test1_expected_alignment)
        self.infernal_test1_expected_aln = Alignment.from_fasta_records(
            expected_records, DNA)
Пример #44
0
def split_sff(sff_file_handles, map_file_handle, outdir="/tmp/"):
    """Splits a sff.txt file on barcode/mapping file.

    sff_file_handles: handles passed on to cat_sff_files()
    map_file_handle: FASTA-formatted mapping, parsed with parse_fasta() and
     turned into a lookup by build_inverse_barcode_map()
    outdir: prefix prepended verbatim to each barcode id to form the output
     path (so it should end with a path separator)

    Returns the list of output file paths, one per barcode id.

    Raises FileFormatError if cat_sff_files() raises ValueError.
    """

    try:
        (flowgrams, header) = cat_sff_files(sff_file_handles)
    except ValueError:
        # reading in the binary sff usually shows up as ValueError
        raise FileFormatError(
            'Wrong flogram file format. Make sure you pass the sff.txt format '
            + 'produced by sffinfo. The binary .sff will not work here.')

    (inverse_map,
     map_count) = build_inverse_barcode_map(parse_fasta(map_file_handle))

    filenames = []
    # we might have many barcodes and reach python open file limit
    # therefor we go the slow way and open and close files each time
    # First set up all files with the headers only
    for barcode_id in map_count.keys():
        out_path = outdir + barcode_id
        with open(out_path, "w") as fh:
            write_sff_header(header, fh, map_count[barcode_id])
        filenames.append(out_path)
    # Then direct each flowgram into its barcode file. Each append handle
    # is closed right after its write; leaving them open, as before, could
    # exhaust the very file limit this two-pass approach tries to avoid.
    for f in flowgrams:
        if f.Name in inverse_map:
            barcode_id = inverse_map[f.Name]
            with open(outdir + barcode_id, "a") as fh:
                fh.write(f.createFlowHeader() + "\n")
    return filenames
Пример #45
0
def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """ Derive a lane mask from the entropy of each alignment position

    infile: open file object for aligned fasta file
    entropy_threshold:  float value that designates the percentage of entropic
     positions to be removed, i.e., 0.10 means the 10% most entropic positions
     are removed.
    existing_mask: accepted but not used by this implementation.

    Returns a string of "0" (position masked) and "1" (position kept), one
    character per alignment position.
    """
    aligned = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    per_position = aligned.position_entropies(nan_on_non_standard_chars=False)

    ascending = sorted(per_position)

    # index into the sorted entropies that corresponds to the threshold
    last_index = len(ascending) - 1
    cutoff_index = int(round(last_index * (1 - entropy_threshold)))
    cutoff_value = ascending[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values: the lowest observed entropy is never masked.
    floor_value = min(ascending)

    flags = []
    for value in per_position:
        masked = value >= cutoff_value and value != floor_value
        flags.append("0" if masked else "1")

    return "".join(flags)
Пример #46
0
def get_seqs_to_keep_lookup_from_sample_ids(fasta_f, sample_ids):
    """Build a lookup of the sequence ids that belong to sample_ids.

    The sample of a sequence is the part of its id before the first
    underscore. The result is a dict whose keys are the matching sequence
    ids and whose values are all None.
    """
    samples = set(sample_ids)
    matching_ids = set()
    for record in parse_fasta(fasta_f):
        seq_id = record[0]
        sample = seq_id.split('_')[0]
        if sample in samples:
            matching_ids.add(seq_id)
    return dict.fromkeys(matching_ids)
Пример #47
0
def filter_fasta(input_seqs_f,
                 output_seqs_f,
                 seqs_to_keep,
                 negate=False,
                 seqid_f=None):
    """ Copy the selected records of input_seqs_f to output_seqs_f

        input_seqs_f: parsed with parse_fasta
        output_seqs_f: writable handle; it is closed before returning
        seqs_to_keep: ids to select; ids and record labels are compared by
         their first whitespace-delimited word. Ignored when seqid_f is
         given.
        negate: if True, the selection is inverted
        seqid_f: optional predicate called with the full record label,
         used instead of the seqs_to_keep lookup
    """
    if seqid_f is not None:
        selects = seqid_f
    else:
        first_words = dict.fromkeys(
            seq_id.split()[0] for seq_id in seqs_to_keep)

        def selects(seq_id):
            return seq_id.split()[0] in first_words

    # Fold the negate flag into one predicate.
    if negate:
        def keep_seq(seq_id):
            return not selects(seq_id)
    else:
        keep_seq = selects

    for seq_id, seq in parse_fasta(input_seqs_f):
        if keep_seq(seq_id):
            output_seqs_f.write('>%s\n%s\n' % (seq_id, seq))
    output_seqs_f.close()
Пример #48
0
def remove_outliers(seqs, num_stds, fraction_seqs_for_stats=.95):
    """ drop aligned sequences that are far from the majority consensus

    seqs: aligned FASTA lines or file handle, passed to parse_fasta
    num_stds: number of standard deviations above the mean distance that a
     sequence may be from the consensus and still be kept
    fraction_seqs_for_stats: accepted but not used by this implementation

    The steps are:
     1. build the alignment and its majority consensus;
     2. measure the distance from every sequence to that consensus;
     3. keep the sequences whose distance is less than or equal to
        mean + num_stds * std of those distances.

    Returns the sub-alignment holding only the retained sequences.
    """
    aln = Alignment.from_fasta_records(parse_fasta(seqs), DNA)
    consensus = aln.majority_consensus()

    # one distance per sequence, in the alignment's iteration order
    distances = [s.distance(consensus) for s in aln]
    cutoff = mean(distances) + num_stds * std(distances)

    # identifiers are paired with distances by position
    retained_ids = [seq_id
                    for seq_id, dist in izip(aln.ids(), distances)
                    if dist <= cutoff]
    return aln.subalignment(seqs_to_keep=retained_ids)
Пример #49
0
    def get_seqs_act_split_sequence_on_sample_ids(self, output_dir):
        """Collect the sequences written by
        split_sequence_file_on_sample_ids_to_files()

        Every file in output_dir is parsed as FASTA. The sample ID taken
        from each label (the text before the first underscore) is asserted
        to equal the file name without its extension.

        Parameters
        ----------
        output_dir: string
            output directory path storing FASTA files

        Returns
        -------
        seqs_act: dict
            dictionary with keys being sample IDs and values being lists of
            (label, sequence) tuples belonging to that sample ID
        """
        seqs_act = {}
        for fn in listdir(output_dir):
            sample_file = splitext(fn)[0]
            with open(join(output_dir, fn), 'U') as fasta_f:
                for label, seq in parse_fasta(fasta_f):
                    sample = label.split('_')[0]
                    self.assertEqual(sample_file, sample)
                    seqs_act.setdefault(sample, []).append((label, seq))
        return seqs_act
Пример #50
0
def sort_fasta_by_abundance(fasta_lines, fasta_out_f):
    """ Sort seqs in fasta_lines by abundance, write all seqs to fasta_out_f

     Note that all sequences are written out, not just unique ones.
     The most abundant sequence is written first; sequences with the same
     abundance are written in descending order of the sequence string.
     Identifiers sharing a sequence keep their input order.

     fasta_lines: input file handle (or similar object)
     fasta_out_f: output file handle (or similar object)

    ** The current implementation works well for fairly large data sets,
       (e.g., several combined 454 runs) but we may want to revisit if it
       chokes on very large (e.g., Illumina) files. --Greg **

    """
    # map each distinct sequence to the identifiers that carry it
    seq_index = {}
    for seq_id, seq in parse_fasta(fasta_lines):
        seq_index.setdefault(seq, []).append(seq_id)

    # Drain the index with popitem() rather than deleting entries while
    # iterating over it, which only works when items() returns a copy.
    seqs = []
    while seq_index:
        seq, seq_ids = seq_index.popitem()
        seqs.append((len(seq_ids), seq, seq_ids))

    # every tuple has a distinct sequence, so a reversed sort is the same
    # as sorting ascending and walking the list backwards
    seqs.sort(reverse=True)
    for _, seq, seq_ids in seqs:
        for seq_id in seq_ids:
            fasta_out_f.write('>%s\n%s\n' % (seq_id, seq))
Пример #51
0
    def getResult(self, aln_path, *args, **kwargs):
        """Returns a tree built from the alignment stored at aln_path.

        aln_path: path to the FASTA alignment file
        kwargs: only 'root_method' is read. 'midpoint' passes the tree
         through root_midpt; any other value (including
         'tree_method_default') or no value leaves the tree as returned by
         the tree-building module.

        Currently does not allow parameter tuning of program and uses
        default parameters -- this is bad and should be fixed.

        #TODO: allow command-line access to important aln params.
        """
        module = self.Params['Module']
        # standard qiime says we just consider the first word as the unique ID
        # the rest of the defline of the fasta alignment often doesn't match
        # the otu names in the otu table
        with open(aln_path) as aln_f:
            seqs = Alignment.from_fasta_records(
                parse_fasta(aln_f, label_to_name=lambda x: x.split()[0]), DNA)
        # This ugly little line of code lets us pass a skbio Alignment when a
        # a cogent alignment is expected.
        seqs.getIntMap = seqs.int_map
        result = module.build_tree_from_alignment(seqs, moltype=DNA_cogent)

        # Look the option up with get() instead of wrapping the rerooting in
        # a try/except KeyError: a KeyError raised while rerooting must not be
        # mistaken for a missing option and silently discarded.
        if kwargs.get('root_method') == 'midpoint':
            result = root_midpt(result)
        return result
Пример #52
0
    def _split_along_prefix(self, input_fp, params, jobs_to_start, job_prefix,
                            output_dir):
        """ Split input sequences into sets with identical prefix

        input_fp: path to the input FASTA file
        params: dict; params['prefix_length'] gives the number of leading
         characters that define a set (a false value means 1)
        jobs_to_start: not used by this method
        job_prefix: string prepended to the prefix to name each output file
        output_dir: directory in which the output files are created

        One file per distinct prefix is written, and self.prefix_counts is
        updated with the number of sequences seen for each prefix.

        Returns (out_files, remove_files): the list of output file paths in
        order of first appearance, and True.

        Raises ValueError if a sequence is shorter than the prefix length.
        """
        out_files = []
        buffered_handles = {}
        prefix_length = params['prefix_length'] or 1
        # the with block guarantees the input handle is released, also when
        # a too-short sequence aborts the split
        with open(input_fp) as input_f:
            for seq_id, seq in parse_fasta(input_f):

                if len(seq) < prefix_length:
                    raise ValueError(
                        "Prefix length must be shorter than or equal to the "
                        "sequence length.\n"
                        " Found seq %s with length %d" % (seq_id, len(seq)))
                prefix = seq[:prefix_length]

                if prefix not in buffered_handles:
                    # never seen this prefix before
                    out_fp = "%s/%s%s" % (output_dir, job_prefix, prefix)
                    buffered_handles[prefix] = BufferedWriter(out_fp)
                    out_files.append(out_fp)
                    self.prefix_counts[prefix] = 0

                self.prefix_counts[prefix] += 1
                buffered_handles[prefix].write('>%s\n%s\n' % (seq_id, seq))

        # make sure all buffers are closed and flushed
        for buf_fh in buffered_handles.values():
            buf_fh.close()

        remove_files = True
        return out_files, remove_files
Пример #53
0
def rc_fasta_lines(fasta_lines, seq_desc_mapper=append_rc):
    """ Yield (seq_id, seq) pairs for the transformed seqs in fasta_lines

     fasta_lines: input file handle (or similar object), passed to
      parse_fasta
     seq_desc_mapper: function applied to each sequence identifier to
      produce the identifier that is yielded

     Each sequence is upper-cased, wrapped in DNA and replaced by the
     string form of its rc() result (presumably the reverse complement).
    """
    for original_id, raw_seq in parse_fasta(fasta_lines):
        yield seq_desc_mapper(original_id), str(DNA(raw_seq.upper()).rc())
Пример #54
0
def main():
    """Parse the command line and split the input FASTA by sample id.

    The parsed records of opts.input_fasta_fp are handed to
    split_fasta_on_sample_ids_to_files together with opts.output_dir and
    opts.buffer_size.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # keep the input open only for the duration of the split so the handle
    # is closed even if the split fails
    with open(opts.input_fasta_fp, 'U') as input_fasta_f:
        split_fasta_on_sample_ids_to_files(
            parse_fasta(input_fasta_f),
            opts.output_dir,
            opts.buffer_size)
Пример #55
0
 def test_longest_id(self):
     """longest_id picks the id whose sequence is the longest"""
     # whitespace-separated candidate ids; split() ignores the extra spaces
     ids = "R27DLI_4812 R27DLI_600  R27DLI_727  U1PLI_403   U1PLI_8969".split()
     records = parse_fasta(dna_seqs.splitlines(),
                           label_to_name=label_to_name)
     seqs = dict(records)
     self.assertEqual(longest_id(ids, seqs), 'U1PLI_403')
Пример #56
0
    def test_store_cluster(self):
        """store_clusters stores the centroid seqs for each cluster."""

        self.tmpdir = get_tmp_filename(tmp_dir="./", suffix="_store_clusters/")
        create_dir(self.tmpdir)

        self.files_to_remove.append(self.tmpdir + "singletons.fasta")
        self.files_to_remove.append(self.tmpdir + "centroids.fasta")

        # empty map results in empty files
        store_clusters({}, self.tiny_test, self.tmpdir)
        actual_centroids = list(
            parse_fasta(open(self.tmpdir + "centroids.fasta")))
        self.assertEqual(actual_centroids, [])
        actual_singletons = list(
            parse_fasta(open(self.tmpdir + "singletons.fasta")))
        self.assertEqual(actual_singletons, [])

        # non-empty map creates non-empty files, centroids sorted by size
        mapping = {
            'FZTHQMS01B8T1H': [],
            'FZTHQMS01DE1KN': ['FZTHQMS01EHAJG'],
            'FZTHQMS01EHAJG': [1, 2, 3]
        }  # content doesn't really matter

        centroids = [
            ('FZTHQMS01EHAJG | cluster size: 4',
             'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAACAAGACCATGCGGTCTGATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCA'
             ),
            ('FZTHQMS01DE1KN | cluster size: 2',
             'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAACAAGACCATGCGGTCTGATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCA'
             )
        ]

        singletons = [(
            'FZTHQMS01B8T1H',
            'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAATTAAACCATGCGGTTTTATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCATCACTTA'
        )]

        store_clusters(mapping, self.tiny_test, self.tmpdir)
        actual_centroids = list(
            parse_fasta(open(self.tmpdir + "centroids.fasta")))
        self.assertEqual(actual_centroids, centroids)
        actual_singletons = list(
            parse_fasta(open(self.tmpdir + "singletons.fasta")))
        self.assertEqual(actual_singletons, singletons)