示例#1
0
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs

        Splits a 59-sequence FASTA using every seqs_per_file value from 1
        through 999 and checks that the sequences read back from the split
        files form a collection equal to the one built from the input.
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = SequenceCollection.from_fasta_records(
            [('seq%s' % k, 'AACCTTAA') for k in range(59)], DNA)
        infile = in_seqs.to_fasta().split('\n')

        # the reference collection does not depend on seqs_per_file, so it
        # is built once rather than on every iteration
        expected_seqs = SequenceCollection.from_fasta_records(
            parse_fasta(infile), DNA)

        # test seqs_per_file from 1 to 999
        for i in range(1, 1000):
            # NOTE(review): mkstemp creates a placeholder file at
            # filename_prefix; only the paths returned by split_fasta are
            # removed below -- confirm whether the placeholder should be
            # removed as well
            fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                          prefix='split_fasta_tests',
                                          suffix='')
            close(fd)

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            for fp in actual:
                # close each split file as soon as its lines are read
                with open(fp) as split_f:
                    actual_seqs.extend(split_f)
            # remove the files now, so if the test fails they still get
            # cleaned up
            remove_files(actual)

            # building seq collections from infile and the split files result in
            # equivalent seq collections
            self.assertEqual(
                expected_seqs,
                SequenceCollection.from_fasta_records(
                    parse_fasta(actual_seqs), DNA))
示例#2
0
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file

        Three sequences are split with seqs_per_file=2; two output files
        are expected.
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = [
            '>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA', '>seq3',
            'CCTT--AA'
        ]

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        for fp in actual:
            # close each split file as soon as its lines are read
            with open(fp) as split_f:
                actual_seqs.extend(split_f)
        # clean up before asserting so a failure does not leave files behind
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                                  DNA))
示例#3
0
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file

        Three sequences are split with seqs_per_file=1; three output files
        are expected.
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = [
            '>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA', '>seq3',
            'CCTT--AA'
        ]

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            # close each split file as soon as its lines are read
            with open(fp) as split_f:
                actual_seqs.extend(split_f)
        # clean up before asserting so a failure does not leave files behind
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # the split files hold the same sequences as the input
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                                  DNA))
示例#4
0
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs

        Splits a 59-sequence FASTA using every seqs_per_file value from 1
        through 999 and checks that the sequences read back from the split
        files form a collection equal to the one built from the input.
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = SequenceCollection.from_fasta_records(
            [('seq%s' % k, 'AACCTTAA') for k in range(59)], DNA)
        infile = in_seqs.to_fasta().split('\n')

        # the reference collection does not depend on seqs_per_file, so it
        # is built once rather than on every iteration
        expected_seqs = SequenceCollection.from_fasta_records(
            parse_fasta(infile), DNA)

        # test seqs_per_file from 1 to 999
        for i in range(1, 1000):
            # NOTE(review): mkstemp creates a placeholder file at
            # filename_prefix; only the paths returned by split_fasta are
            # removed below -- confirm whether the placeholder should be
            # removed as well
            fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                          prefix='split_fasta_tests',
                                          suffix='')
            close(fd)

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            for fp in actual:
                # close each split file as soon as its lines are read
                with open(fp) as split_f:
                    actual_seqs.extend(split_f)
            # remove the files now, so if the test fails they still get
            # cleaned up
            remove_files(actual)

            # building seq collections from infile and the split files result in
            # equivalent seq collections
            self.assertEqual(
                expected_seqs,
                SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                                      DNA))
示例#5
0
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file

        Three sequences are split with seqs_per_file=2; two output files
        are expected.
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        for fp in actual:
            # close each split file as soon as its lines are read
            with open(fp) as split_f:
                actual_seqs.extend(split_f)
        # clean up before asserting so a failure does not leave files behind
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                                  DNA))
示例#6
0
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 failure_path=None):
        """Run pynast_seqs on the sequences in seq_path.

        Parameters
        ----------
        seq_path : str
            Path of the FASTA file holding the candidate sequences.
        result_path : str, optional
            When given, the aligned sequences are written to this path as
            FASTA and None is returned.
        log_path : str, optional
            Passed to NastLogger.
        failure_path : str, optional
            When given, the sequences pynast_seqs reports as failed are
            written to this path as FASTA.

        Returns
        -------
        Alignment or None
            The aligned sequences when result_path is None, else None.
        """
        # parse_fasta may read lazily, so the candidate file is kept open
        # until pynast_seqs has returned; the context managers guarantee
        # both input files are closed even if alignment raises
        with open(seq_path, 'U') as seq_file:
            # load candidate sequences
            candidate_sequences = parse_fasta(seq_file)

            # load template sequences
            template_alignment_fp = self.Params['template_filepath']
            with open(template_alignment_fp) as template_file:
                # replace '.' characters with '-' characters
                template_records = [
                    (seq_id, seq.replace('.', '-').upper())
                    for seq_id, seq in parse_fasta(template_file)]
            template_alignment = Alignment.from_fasta_records(
                template_records, DNASequence, validate=True)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger,
                temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        # convert the pynast_seqs results into DNASequence objects
        pynast_failed = SequenceCollection(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
        pynast_aligned = Alignment(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

        if failure_path is not None:
            with open(failure_path, 'w') as fail_file:
                fail_file.write(pynast_failed.to_fasta())

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                result_file.write(pynast_aligned.to_fasta())
            return None
        else:
            return pynast_aligned
示例#7
0
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None):
        """Run pynast_seqs on the sequences in seq_path.

        Parameters
        ----------
        seq_path : str
            Path of the FASTA file holding the candidate sequences.
        result_path : str, optional
            When given, the aligned sequences are written to this path as
            FASTA and None is returned.
        log_path : str, optional
            Passed to NastLogger.
        failure_path : str, optional
            When given, the sequences pynast_seqs reports as failed are
            written to this path as FASTA.

        Returns
        -------
        Alignment or None
            The aligned sequences when result_path is None, else None.
        """
        # parse_fasta may read lazily, so the candidate file is kept open
        # until pynast_seqs has returned; the context managers guarantee
        # both input files are closed even if alignment raises
        with open(seq_path, 'U') as seq_file:
            # load candidate sequences
            candidate_sequences = parse_fasta(seq_file)

            # load template sequences
            template_alignment_fp = self.Params['template_filepath']
            with open(template_alignment_fp) as template_file:
                # replace '.' characters with '-' characters
                template_records = [
                    (seq_id, seq.replace('.', '-').upper())
                    for seq_id, seq in parse_fasta(template_file)]
            template_alignment = Alignment.from_fasta_records(
                template_records, DNASequence, validate=True)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger,
                temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        # convert the pynast_seqs results into DNASequence objects
        pynast_failed = SequenceCollection(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
        pynast_aligned = Alignment(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

        if failure_path is not None:
            with open(failure_path, 'w') as fail_file:
                fail_file.write(pynast_failed.to_fasta())

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                result_file.write(pynast_aligned.to_fasta())
            return None
        else:
            return pynast_aligned
示例#8
0
def filter_samples(prefs, data, dir_path='', filename=None):
    """Filter the otus and the representative seq set, then write the results.

    Three files are written under dir_path, each named from filename:
    ``<filename>_sfiltered_otus.txt`` (the filtered otus, one tab-separated
    row per entry), ``<filename>_sremoved.fasta`` (the removed sequences)
    and ``<filename>_sfiltered.fasta`` (the remaining sequences).

    prefs is passed through to filter_otus and filter_aln_by_otus; data must
    provide the keys 'aln' and 'otus'.

    Raises ValueError when no sequences were removed or when no sequences
    remain. The checks run in that order, after the files preceding them
    have already been written.
    """
    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    filtered_otus_output_filepath = '%s/%s_sfiltered_otus.txt' \
                                    % (dir_path, filename)

    # Write out a new otus file; the context manager closes the handle even
    # if a write fails part-way through
    with open(filtered_otus_output_filepath, 'w') as otus_out:
        for key in new_otus_list:
            otus_out.write(key[0])
            for j in key[1]:
                otus_out.write('\t' + str(j))
            otus_out.write('\n')

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    if len(removed_seqs) == 0:
        raise ValueError(
            'No sequences were removed.  Did you specify the correct Sample ID?'
        )
    removed_seqs = SequenceCollection.from_fasta_records(
        [(e[0], str(e[1])) for e in removed_seqs], DNA)
    output_filepath2 = '%s/%s_sremoved.fasta' % (dir_path, filename)
    with open(output_filepath2, 'w') as output_file2:
        output_file2.write(removed_seqs.to_fasta())

    # write a fasta containing the filtered representative seqs
    if len(filtered_seqs) == 0:
        raise ValueError(
            'No sequences were remaining in the fasta file.  Did you remove all Sample ID\'s?'
        )
    filtered_seqs = SequenceCollection.from_fasta_records(
        [(e[0], str(e[1])) for e in filtered_seqs], DNA)

    output_filepath = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    with open(output_filepath, 'w') as output_file:
        output_file.write(filtered_seqs.to_fasta())
示例#9
0
 def test_call_write_to_file(self):
     """ReferenceRepSetPicker.__call__ writes the expected seqs to result_path"""
     picker_params = {'Algorithm': 'first', 'ChoiceF': first_id}
     picker = ReferenceRepSetPicker(params=picker_params)
     picker(self.tmp_seq_filepath,
            self.tmp_otu_filepath,
            self.ref_seq_filepath,
            result_path=self.result_filepath)

     # read back what the picker wrote
     with open(self.result_filepath) as result_f:
         written_records = parse_fasta(result_f)
         actual = SequenceCollection.from_fasta_records(written_records, DNA)

     expected_lines = rep_seqs_reference_result_file_exp.split('\n')
     expected = SequenceCollection.from_fasta_records(
         parse_fasta(expected_lines), DNA)

     # compare as sets: the order of the results is irrelevant
     self.assertEqual(set(actual), set(expected))
示例#10
0
def filter_samples(prefs, data, dir_path='', filename=None):
    """Filter the otus and the representative seq set, then write the results.

    Three files are written under dir_path, each named from filename:
    ``<filename>_sfiltered_otus.txt`` (the filtered otus, one tab-separated
    row per entry), ``<filename>_sremoved.fasta`` (the removed sequences)
    and ``<filename>_sfiltered.fasta`` (the remaining sequences).

    prefs is passed through to filter_otus and filter_aln_by_otus; data must
    provide the keys 'aln' and 'otus'.

    Raises ValueError when no sequences were removed or when no sequences
    remain. The checks run in that order, after the files preceding them
    have already been written.
    """
    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    filtered_otus_output_filepath = '%s/%s_sfiltered_otus.txt' \
                                    % (dir_path, filename)

    # Write out a new otus file; the context manager closes the handle even
    # if a write fails part-way through
    with open(filtered_otus_output_filepath, 'w') as otus_out:
        for key in new_otus_list:
            otus_out.write(key[0])
            for j in key[1]:
                otus_out.write('\t' + str(j))
            otus_out.write('\n')

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    if len(removed_seqs) == 0:
        raise ValueError(
            'No sequences were removed.  Did you specify the correct Sample ID?')
    removed_seqs = SequenceCollection.from_fasta_records(
        [(e[0], str(e[1])) for e in removed_seqs], DNA)
    output_filepath2 = '%s/%s_sremoved.fasta' % (dir_path, filename)
    with open(output_filepath2, 'w') as output_file2:
        output_file2.write(removed_seqs.to_fasta())

    # write a fasta containing the filtered representative seqs
    if len(filtered_seqs) == 0:
        raise ValueError(
            'No sequences were remaining in the fasta file.  Did you remove all Sample ID\'s?')
    filtered_seqs = SequenceCollection.from_fasta_records(
        [(e[0], str(e[1])) for e in filtered_seqs], DNA)

    output_filepath = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    with open(output_filepath, 'w') as output_file:
        output_file.write(filtered_seqs.to_fasta())
示例#11
0
def _qseq_to_sequence_collection(fh, constructor=BiologicalSequence,
                                 filter=_will_filter,
                                 phred_offset=_default_phred_offset,
                                 variant=_default_variant):
    """Collect everything _qseq_to_generator yields into a SequenceCollection.

    All keyword arguments are forwarded unchanged to _qseq_to_generator.
    """
    records = _qseq_to_generator(fh,
                                 constructor=constructor,
                                 filter=filter,
                                 phred_offset=phred_offset,
                                 variant=variant)
    # materialise the generator before handing it to SequenceCollection
    return SequenceCollection(list(records))
示例#12
0
 def test_call_write_to_file(self):
     """ReferenceRepSetPicker.__call__ writes the expected seqs to result_path"""
     picker_params = {'Algorithm': 'first', 'ChoiceF': first_id}
     picker = ReferenceRepSetPicker(params=picker_params)
     picker(self.tmp_seq_filepath,
            self.tmp_otu_filepath,
            self.ref_seq_filepath,
            result_path=self.result_filepath)

     # read back what the picker wrote
     with open(self.result_filepath) as result_f:
         written_records = parse_fasta(result_f)
         actual = SequenceCollection.from_fasta_records(written_records, DNA)

     expected_lines = rep_seqs_reference_result_file_exp.split('\n')
     expected = SequenceCollection.from_fasta_records(
         parse_fasta(expected_lines), DNA)

     # compare as sets: the order of the results is irrelevant
     self.assertEqual(set(actual), set(expected))
示例#13
0
def _fastq_to_sequence_collection(fh,
                                  variant=None,
                                  phred_offset=None,
                                  constructor=BiologicalSequence):
    """Collect everything _fastq_to_generator yields into a SequenceCollection.

    All keyword arguments are forwarded unchanged to _fastq_to_generator.
    """
    records = _fastq_to_generator(fh, variant=variant,
                                  phred_offset=phred_offset,
                                  constructor=constructor)
    # materialise the generator before handing it to SequenceCollection
    return SequenceCollection(list(records))
示例#14
0
def _fasta_to_sequence_collection(fh,
                                  qual=FileSentinel,
                                  constructor=Sequence,
                                  **kwargs):
    """Collect everything _fasta_to_generator yields into a SequenceCollection.

    qual, constructor and any extra keyword arguments are forwarded
    unchanged to _fasta_to_generator.
    """
    records = _fasta_to_generator(fh, qual=qual, constructor=constructor,
                                  **kwargs)
    # materialise the generator before handing it to SequenceCollection
    return SequenceCollection(list(records))
示例#15
0
def _fastq_to_sequence_collection(fh,
                                  variant=None,
                                  phred_offset=None,
                                  constructor=Sequence,
                                  **kwargs):
    """Collect everything _fastq_to_generator yields into a SequenceCollection.

    variant, phred_offset, constructor and any extra keyword arguments are
    forwarded unchanged to _fastq_to_generator.
    """
    records = _fastq_to_generator(fh, variant=variant,
                                  phred_offset=phred_offset,
                                  constructor=constructor, **kwargs)
    # materialise the generator before handing it to SequenceCollection
    return SequenceCollection(list(records))
示例#16
0
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file

        Three sequences are split with seqs_per_file=1; three output files
        are expected.
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            # close each split file as soon as its lines are read
            with open(fp) as split_f:
                actual_seqs.extend(split_f)
        # clean up before asserting so a failure does not leave files behind
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # the split files hold the same sequences as the input
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                                  DNA))
示例#17
0
 def test_multiple_sequence_alignment(self):
     """multiple_sequence_alignment returns the expected aligned sequences.
     """
     # (id, unaligned sequence) pairs written to the input FASTA
     raw_seqs = [
         ('seq_1', 'caccggcggcccggtggtggccattattattgggtctaaag'),
         ('seq_2', 'caccggcggcccgagtggtggccattattattgggtcaagg'),
         ('seq_3', 'caccggcggcccgagtgatggccattattattgggtctaaag'),
         ('seq_4', 'aaccggcggcccaagtggtggccattattattgggtctaaag'),
         ('seq_5', 'caccgggcccgagtggtggccattattattgggtctaaag'),
     ]
     seqs = [DNA(seq, id=seq_id) for seq_id, seq in raw_seqs]
     seqs_col = SequenceCollection(seqs)

     seqs_fp = join(self.working_dir, "seqs.fna")
     with open(seqs_fp, 'w') as seqs_f:
         seqs_f.write(seqs_col.to_fasta())

     alignment = multiple_sequence_alignment(seqs_fp)

     # (id, aligned sequence) pairs expected back
     aligned_seqs = [
         ('seq_1', 'caccggcggcccg-gtggtggccattattattgggtctaaag'),
         ('seq_2', 'caccggcggcccgagtggtggccattattattgggtcaagg-'),
         ('seq_3', 'caccggcggcccgagtgatggccattattattgggtctaaag'),
         ('seq_4', 'aaccggcggcccaagtggtggccattattattgggtctaaag'),
         ('seq_5', 'caccg--ggcccgagtggtggccattattattgggtctaaag'),
     ]
     align_exp = [DNA(seq, id=seq_id) for seq_id, seq in aligned_seqs]
     self.assertItemsEqual(alignment, align_exp)
    def test_filter_aln_by_otus(self):
        """filter_aln_by_otus: returns the expected pair of sequence lists"""
        self.sample_to_extract = 'SampleA,SampleB'

        # expected first and second return values, as (id, sequence) pairs
        expected_first = [('SampleA', 'AAAAAAAAAAAAAAA')]
        expected_second = [('SampleB', 'CCCCCCC'),
                           ('SampleC', 'GGGGGGGGGGGGGG')]

        alignment = SequenceCollection.from_fasta_records(self.aln, DNA)
        observed_first, observed_second = filter_aln_by_otus(alignment,
                                                             self.prefs)

        self.assertEqual(observed_first, expected_first)
        self.assertEqual(observed_second, expected_second)
示例#19
0
    def test_filter_aln_by_otus(self):
        """filter_aln_by_otus: returns the expected pair of sequence lists"""
        self.sample_to_extract = 'SampleA,SampleB'

        # expected first and second return values, as (id, sequence) pairs
        expected_first = [('SampleA', 'AAAAAAAAAAAAAAA')]
        expected_second = [('SampleB', 'CCCCCCC'),
                           ('SampleC', 'GGGGGGGGGGGGGG')]

        alignment = SequenceCollection.from_fasta_records(self.aln, DNA)
        observed_first, observed_second = filter_aln_by_otus(alignment,
                                                             self.prefs)

        self.assertEqual(observed_first, expected_first)
        self.assertEqual(observed_second, expected_second)
def main():
    """Load the inputs named on the command line and run filter_samples.

    Reads the input FASTA and the otu map, derives the output file name
    stem from the input FASTA path, resolves the output directory (created
    if missing; './' when none is given) and calls filter_samples.

    Raises ValueError when no samples to extract were supplied, and OSError
    when the output directory cannot be created.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    data = {}

    fasta_file = opts.input_fasta_fp

    # load the input alignment; the context manager closes the handle
    with open(fasta_file) as fasta_f:
        data['aln'] = SequenceCollection.from_fasta_records(
            parse_fasta(fasta_f), DNA)

    # Load the otu file
    otu_path = opts.otu_map_fp
    with open(otu_path, 'U') as otu_f:
        otus = fields_to_dict(otu_f)

    data['otus'] = otus
    # Determine which samples to extract from representative seqs
    # and from otus file
    if not opts.samples_to_extract:
        # previously prefs stayed unbound here and the script failed later
        # with an unrelated UnboundLocalError
        raise ValueError('No samples to extract were specified.')
    prefs = process_extract_samples(opts.samples_to_extract)

    # file name stem: last path component up to its first '.'
    filepath = opts.input_fasta_fp
    filename = filepath.strip().split('/')[-1]
    filename = filename.split('.')[0]

    if opts.output_dir:
        dir_path = opts.output_dir
        if not os.path.exists(dir_path):
            try:
                os.mkdir(dir_path)
            except OSError:
                # tolerate the directory having appeared in the meantime;
                # anything else is a real failure and must not be hidden
                if not os.path.isdir(dir_path):
                    raise
    else:
        dir_path = './'

    try:
        action = filter_samples
    except NameError:
        action = None
    # Place this outside try/except so we don't mask NameError in action
    if action:
        action(prefs, data, dir_path, filename)
示例#21
0
def main():
    """Load the inputs named on the command line and run filter_samples.

    Reads the input FASTA and the otu map, derives the output file name
    stem from the input FASTA path, resolves the output directory (created
    if missing; './' when none is given) and calls filter_samples.

    Raises ValueError when no samples to extract were supplied, and OSError
    when the output directory cannot be created.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    data = {}

    fasta_file = opts.input_fasta_fp

    # load the input alignment; the context manager closes the handle
    with open(fasta_file) as fasta_f:
        data['aln'] = SequenceCollection.from_fasta_records(
            parse_fasta(fasta_f), DNA)

    # Load the otu file
    otu_path = opts.otu_map_fp
    with open(otu_path, 'U') as otu_f:
        otus = fields_to_dict(otu_f)

    data['otus'] = otus
    # Determine which samples to extract from representative seqs
    # and from otus file
    if not opts.samples_to_extract:
        # previously prefs stayed unbound here and the script failed later
        # with an unrelated UnboundLocalError
        raise ValueError('No samples to extract were specified.')
    prefs = process_extract_samples(opts.samples_to_extract)

    # file name stem: last path component up to its first '.'
    filepath = opts.input_fasta_fp
    filename = filepath.strip().split('/')[-1]
    filename = filename.split('.')[0]

    if opts.output_dir:
        dir_path = opts.output_dir
        if not os.path.exists(dir_path):
            try:
                os.mkdir(dir_path)
            except OSError:
                # tolerate the directory having appeared in the meantime;
                # anything else is a real failure and must not be hidden
                if not os.path.isdir(dir_path):
                    raise
    else:
        dir_path = './'

    try:
        action = filter_samples
    except NameError:
        action = None
    # Place this outside try/except so we don't mask NameError in action
    if action:
        action(prefs, data, dir_path, filename)
示例#22
0
    def test_call_pynast_test1_file_output_alt_params(self):
        """PyNastAligner output files are as expected when min_len is 1000
        """
        aligner_params = {
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 1000,
        }
        aligner = PyNastAligner(aligner_params)

        returned = aligner(self.pynast_test1_input_fp,
                           result_path=self.result_fp,
                           log_path=self.log_fp,
                           failure_path=self.failure_fp)

        self.assertTrue(returned is None,
                        "Result should be None when result path provided.")

        result_size = getsize(self.result_fp)
        self.assertEqual(result_size, 0,
                         "No alignable seqs should result in an empty file.")

        # every input sequence should show up in the failure file
        with open(self.failure_fp) as failure_f:
            failed_records = parse_fasta(failure_f)
            actual_fail = SequenceCollection.from_fasta_records(
                failed_records, DNA)
        self.assertEqual(actual_fail.sequence_count(), 3)
示例#23
0
    def test_call_pynast_test1_file_output(self):
        """PyNastAligner result and failure files match the expected seqs
        """
        # the return value is not the alignment here; the files are checked
        returned = self.pynast_test1_aligner(self.pynast_test1_input_fp,
                                             result_path=self.result_fp,
                                             log_path=self.log_fp,
                                             failure_path=self.failure_fp)

        self.assertTrue(returned is None,
                        "Result should be None when result path provided.")

        # aligned sequences written to the result file
        with open(self.result_fp) as result_f:
            aligned_records = parse_fasta(result_f)
            actual_aln = Alignment.from_fasta_records(aligned_records, DNA)
        expected_aln = self.pynast_test1_expected_aln
        self.assertEqual(actual_aln, expected_aln)

        # failed sequences written to the failure file, compared as FASTA
        with open(self.failure_fp) as failure_f:
            failed_records = parse_fasta(failure_f)
            actual_fail = SequenceCollection.from_fasta_records(
                failed_records, DNA)
        expected_fail_fasta = self.pynast_test1_expected_fail.to_fasta()
        self.assertEqual(actual_fail.to_fasta(), expected_fail_fasta)
示例#24
0
    def test_call_pynast_test1_file_output_alt_params(self):
        """PyNastAligner output files are as expected when min_len is 1000
        """
        aligner_params = {
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 1000,
        }
        aligner = PyNastAligner(aligner_params)

        returned = aligner(self.pynast_test1_input_fp,
                           result_path=self.result_fp,
                           log_path=self.log_fp,
                           failure_path=self.failure_fp)

        self.assertTrue(returned is None,
                        "Result should be None when result path provided.")

        result_size = getsize(self.result_fp)
        self.assertEqual(result_size, 0,
                         "No alignable seqs should result in an empty file.")

        # every input sequence should show up in the failure file
        with open(self.failure_fp) as failure_f:
            failed_records = parse_fasta(failure_f)
            actual_fail = SequenceCollection.from_fasta_records(
                failed_records, DNA)
        self.assertEqual(actual_fail.sequence_count(), 3)
示例#25
0
    def test_call_pynast_test1_file_output(self):
        """PyNastAligner result and failure files match the expected seqs
        """
        # the return value is not the alignment here; the files are checked
        returned = self.pynast_test1_aligner(self.pynast_test1_input_fp,
                                             result_path=self.result_fp,
                                             log_path=self.log_fp,
                                             failure_path=self.failure_fp)

        self.assertTrue(returned is None,
                        "Result should be None when result path provided.")

        # aligned sequences written to the result file
        with open(self.result_fp) as result_f:
            aligned_records = parse_fasta(result_f)
            actual_aln = Alignment.from_fasta_records(aligned_records, DNA)
        expected_aln = self.pynast_test1_expected_aln
        self.assertEqual(actual_aln, expected_aln)

        # failed sequences written to the failure file, compared as FASTA
        with open(self.failure_fp) as failure_f:
            failed_records = parse_fasta(failure_f)
            actual_fail = SequenceCollection.from_fasta_records(
                failed_records, DNA)
        expected_fail_fasta = self.pynast_test1_expected_fail.to_fasta()
        self.assertEqual(actual_fail.to_fasta(), expected_fail_fasta)
示例#26
0
def extensions_onto_foundation(otu_file_fh, extension_taxonomy_fh,
                               extension_seq_fh, foundation_alignment_fh,
                               ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output of
        "ghost-tree group-extensions"). This file refers to the "extension
        trees". File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.

    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an
        accession number (same accession numbers in otu_file_fh and
        extension_taxonomy_fh) and the 2nd column is the taxonomy ranking in
        the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina

    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.

    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.

    ghost_tree_fp : folder
        Output folder contains files including:
        a) The Newick formatted ghost-tree, which is the final output of the
           ghost-tree tool. This is a phylogenetic tree designed for
           downstream diversity analyses.
        b) Accession IDs from the ghost-tree.nwk file that you can use for
           downstream analyses tools
        c) log error file (this is an optional file that you can have if you
           type '--stderr')

    Returns
    -------
    tuple of (str, str)
        The stripped string form of the final tree, and the accumulated
        error/warning text collected while building the trees.

    Notes
    -----
    A scratch directory named "tmp" is created in the current working
    directory and removed again at the end; ghost_tree_fp is created with
    os.mkdir. Both os.mkdir calls raise if the directory already exists.
    """
    global foundation_accession_genus_dic  # needs global assignment for flake8
    foundation_accession_genus_dic = {}
    # Probe both external tools up front. Each tool's stderr must be checked
    # before it is overwritten or reset, otherwise the warning never fires.
    for tool, description in (("muscle", "multiple sequence aligner"),
                              ("fasttree", "phylogenetic tree builder")):
        process = subprocess.Popen(tool,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        std_output, std_error = process.communicate()
        if re.search("command not found", std_error):
            print(tool + ", " + description + ", is not found. Is it"
                  " installed? Is it in your path?")
    # the foundation tree step is started with an empty error string
    std_output, std_error = "", ""
    os.mkdir("tmp")
    os.mkdir(ghost_tree_fp)
    extension_genus_accession_list_dic = \
        _extension_genus_accession_dic(otu_file_fh,
                                       extension_taxonomy_fh)
    nr_alignment_fp = ghost_tree_fp + "/nr_foundation_alignment_gt.fasta"
    skbio.write(_make_nr_foundation_alignment(
        foundation_alignment_fh, extension_genus_accession_list_dic),
                into=nr_alignment_fp,
                format="fasta")
    foundation_tree, all_std_error = _make_foundation_tree(
        nr_alignment_fp, std_error, ghost_tree_fp)
    seqs = SequenceCollection.read(extension_seq_fh)
    for node in foundation_tree.tips():
        key_node, _ = str(node).split(":")
        key_node = foundation_accession_genus_dic[key_node]
        try:
            _make_mini_otu_files(key_node, extension_genus_accession_list_dic,
                                 seqs)
            process = subprocess.Popen(
                "muscle -in tmp/mini_seq_gt.fasta" + " -out" +
                " tmp/mini_alignment_gt.fasta -quiet" + " -maxiters 2 -diags1",
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            std_output, std_error = process.communicate()
            process = subprocess.Popen("fasttree -nt -quiet" +
                                       " tmp/mini_alignment_gt.fasta >" +
                                       " tmp/mini_tree_gt.nwk",
                                       shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            std_output, std_error = process.communicate()
            all_std_error += ("FastTree warnings for genus " + key_node +
                              " are:\n" + std_error + "\n")
            mini_tree = read("tmp/mini_tree_gt.nwk",
                             format='newick',
                             into=TreeNode)
            node.extend(mini_tree.root_at_midpoint().children[:])
        except Exception:
            # Best effort: a tip whose extension tree cannot be built is left
            # as-is. Only ordinary errors are skipped, so Ctrl-C and
            # SystemExit still propagate.
            continue
    shutil.rmtree("tmp")
    with open(ghost_tree_fp + "/ghost_tree.nwk", "w") as ghost_tree_nwk:
        ghost_tree_nwk.write(str(foundation_tree))
    _make_accession_id_file(ghost_tree_fp)
    return str(foundation_tree).strip(), all_std_error
示例#27
0
    def setUp(self):
        """Create the temp files, aligner and expected results used by the
        tests, and record the temp paths in self._paths_to_clean_up
        """
        def _temp_file(suffix, contents=''):
            # mkstemp returns an open descriptor; close it and write the
            # contents through a regular file object instead
            handle, path = mkstemp(prefix='PyNastAlignerTests_',
                                   suffix=suffix)
            close(handle)
            with open(path, 'w') as out:
                out.write(contents)
            return path

        # input sequences
        self.pynast_test1_input_fp = _temp_file(
            '.fasta', pynast_test1_input_fasta)

        # template alignment, plus variants with '.' gaps, U instead of T,
        # and lowercase characters
        self.pynast_test1_template_fp = _temp_file(
            'template.fasta', pynast_test1_template_fasta)
        self.pynast_test_template_w_dots_fp = _temp_file(
            'template.fasta', pynast_test1_template_fasta.replace('-', '.'))
        self.pynast_test_template_w_u_fp = _temp_file(
            'template.fasta', pynast_test1_template_fasta.replace('T', 'U'))
        self.pynast_test_template_w_lower_fp = _temp_file(
            'template.fasta', pynast_test1_template_fasta.lower())

        # empty output files, created now so they can reliably be
        # cleaned up
        self.result_fp = _temp_file('.fasta')
        self.failure_fp = _temp_file('.fasta')
        self.log_fp = _temp_file('.log')

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp,
        ]

        aligner_params = {
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 15,
        }
        self.pynast_test1_aligner = PyNastAligner(aligner_params)

        expected_aln_records = parse_fasta(pynast_test1_expected_alignment)
        self.pynast_test1_expected_aln = Alignment.from_fasta_records(
            expected_aln_records, DNA)
        expected_fail_records = parse_fasta(pynast_test1_expected_failure)
        self.pynast_test1_expected_fail = \
            SequenceCollection.from_fasta_records(expected_fail_records, DNA)
示例#28
0
tggctcagattgaacgctggcggcaggcctaacacatgcaagtcgagcggaaacgantnntntgaaccttcggggnacgatnacggcgtcgagcggcggacgggtgagtaatgcctgggaaattgccctgatgtgggggataactattggaaacgatagctaataccgcataatgtctacggaccaaagagggggaccttcgggcctctcgcttcaggatatgcccaggtgggattagctagttggtgaggtaatggctcaccaaggcgacgatccctagctggtctgagaggatgatcagccacactggaactgag
"""

# Test fixture text: one record per line, made of an identifier, a tab, and
# a semicolon-separated taxonomy lineage.
blast_id_to_taxonomy = \
    """AY800210\tArchaea;Euryarchaeota;Halobacteriales;uncultured
EU883771\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.
EF503699\tArchaea;Crenarchaeota;uncultured;uncultured
DQ260310\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium
EF503697\tArchaea;Crenarchaeota;uncultured;uncultured"""

blast_test_seqs = SequenceCollection.from_fasta_records([
    ('s1',
     'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'),
    ('s2',
     'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA'),
    ('s3',
     'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG'),
    ('s4',
     'GATACCCCCGGAAACTGGGGATTATACCGGATATGTGGGGCTGCCTGGAATGGTACCTCATTGAAATGCTCCCGCGCCTAAAGATGGATCTGCCGCAGAATAAGTAGTTTGCGGGGTAAATGGCCACCCAGCCAGTAATCCGTACCGGTTGTGAAAACCAGAACCCCGAGATGGAAACTGAAACAAAGGTTCAAGGCCTACCGGGCACAACAAGCGCCAAAACTCCGCCATGCGAGCCATCGCGACGGGGGAAAACCAAGTACCACTCCTAACGGGGTGGTTTTTCCGAAGTGGAAAAAGCCTCCAGGAATAAGAACCTGGGCCAGAACCGTGGCCAGCCGCCGCCGTTACACCCGCCAGCTCGAGTTGTTGGCCGGTTTTATTGGGGCCTAAAGCCGGTCCGTAGCCCGTTTTGATAAGGTCTCTCTGGTGAAATTCTACAGCTTAACCTGTGGGAATTGCTGGAGGATACTATTCAAGCTTGAAGCCGGGAGAAGCCTGGAAGTACTCCCGGGGGTAAGGGGTGAAATTCTATTATCCCCGGAAGACCAACTGGTGCCGAAGCGGTCCAGCCTGGAACCGAACTTGACCGTGAGTTACGAAAAGCCAAGGGGCGCGGACCGGAATAAAATAACCAGGGTAGTCCTGGCCGTAAACGATGTGAACTTGGTGGTGGGAATGGCTTCGAACTGCCCAATTGCCGAAAGGAAGCTGTAAATTCACCCGCCTTGGAAGTACGGTCGCAAGACTGGAACCTAAAAGGAATTGGCGGGGGGACACCACAACGCGTGGAGCCTGGCGGTTTTATTGGGATTCCACGCAGACATCTCACTCAGGGGCGACAGCAGAAATGATGGGCAGGTTGATGACCTTGCTTGACAAGCTGAAAAGGAGGTGCAT'),
    ('s5',
     'TAAAATGACTAGCCTGCGAGTCACGCCGTAAGGCGTGGCATACAGGCTCAGTAACACGTAGTCAACATGCCCAAAGGACGTGGATAACCTCGGGAAACTGAGGATAAACCGCGATAGGCCAAGGTTTCTGGAATGAGCTATGGCCGAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCGCGAAACCTCTGCAATAGGCGAAAGCCTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCTGCTCAACGGATGGGCTGCGGAGGATACCGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCATTGATCTACTGAAGACCACCAGTGGCGAAGGCGGTTTGCCAGAACGCGCTCGACGGTGAGGGATGAAAGCTGGGGGAGCAAACCGGATTAGATACCCGGGGTAGTCCCAGCTGTAAACGGATGCAGACTCGGGTGATGGGGTTGGCTTCCGGCCCAACCCCAATTGCCCCCAGGCGAAGCCCGTTAAGATCTTGCCGCCCTGTCAGATGTCAGGGCCGCCAATACTCGAAACCTTAAAAGGAAATTGGGCGCGGGAAAAGTCACCAAAAGGGGGTTGAAACCCTGCGGGTTATATATTGTAAACC'),
    ('s6', 'ATAGTAGGTGATTGCGAAGACCGCGGAACCGGGACCTAGCACCCAGCCTGTACCGAGGGATGGGGAGCTGTGGCGGTCCACCGACGACCCTTTGTGACAGCCGATTCCTACAATCCCAGCAACTGCAATGATCCACTCTAGTCGGCATAACCGGGAATCGTTAACCTGGTAGGGTTCTCTACGTCTGAGTCTACAGCCCAGAGCAGTCAGGCTACTATACGGTTTGCTGCATTGCATAGGCATCGGTCGCGGGCACTCCTCGCGGTTTCAGCTAGGGTTTAAATGGAGGGTCGCTGCATGAGTATGCAAATAGTGCCACTGCTCTGATACAGAGAAGTGTTGATATGACACCTAAGACCTGGTCACAGTTTTAACCTGCCTACGCACACCAGTGTGCTATTGATTAACGATATCGGTAGACACGACCTTGGTAACCTGACTAACCTCATGGAAAGTGACTAGATAAATGGACCGGAGCCAACTTTCACCCGGAAAACGGACCGACGAATCGTCGTAGACTACCGATCTGACAAAATAAGCACGAGGGAGCATGTTTTGCGCAGGCTAGCCTATTCCCACCTCAAGCCTCGAGAACCAAGACGCCTGATCCGGTGCTGCACGAAGGGTCGCCTCTAGGTAAGGAGAGCTGGCATCTCCAGATCCGATATTTTACCCAACCTTTGCGCGCTCAGATTGTTATAGTGAAACGATTTAAGCCTGAACGGAGTTCCGCTCCATATGTGGGTTATATATGTGAGATGTATTAACTTCCGCAGTTGTCTCTTTCGGTGCAGTACGCTTGGTATGTGTCTCAAATAATCGGTATTATAGTGATCTGAGAGGTTTTAAG')], DNA)


blast_reference_seqs = SequenceCollection.from_fasta_records([
    ('AY800210',
     'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'),
    ('EU883771',
     'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA'),
    ('EF503699',
     'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG'),
    ('DQ260310',
示例#29
0
def _fasta_to_sequence_collection(fh, qual=FileSentinel,
                                  constructor=BiologicalSequence):
    """Build a SequenceCollection from everything _fasta_to_generator yields.

    ``fh``, ``qual`` and ``constructor`` are forwarded unchanged to
    ``_fasta_to_generator``; its output is materialized into a list before
    being handed to ``SequenceCollection``.
    """
    records = _fasta_to_generator(fh, qual=qual, constructor=constructor)
    return SequenceCollection(list(records))
示例#30
0
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None, cmbuild_params=None, cmalign_params=None):
        """Align the sequences in seq_path to the template alignment.

        seq_path: path to a fasta file holding the candidate sequences
        result_path: if provided, the alignment is written to this path in
         fasta format and None is returned; otherwise the Alignment object
         is returned
        log_path: if provided, the cmbuild and cmalign parameters that were
         used are written to this path
        failure_path: not used by this method
        cmbuild_params, cmalign_params: optional dicts of parameters; they
         are copied before the required options are added, so the dicts
         passed by the caller are left unmodified

        Raises ValueError if parsing the template file raises RecordError.
        """
        log_params = []
        # load candidate sequences (handle is closed once they are read)
        with open(seq_path, 'U') as seq_f:
            candidate_sequences = dict(parse_fasta(seq_f))

        # load template sequences
        try:
            with open(self.Params['template_filepath'], 'U') as template_f:
                info, template_alignment, struct = list(MinimalRfamParser(
                    template_f, seq_constructor=ChangedSequence))[0]
        except RecordError:
            raise ValueError(
                "Template alignment must be in Stockholm format with "
                "corresponding secondary structure annotation when using "
                "InfernalAligner.")

        # Need to make separate mapping for unaligned sequences
        unaligned = SequenceCollection.from_fasta_records(
            candidate_sequences.iteritems(), DNASequence)
        mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_')
        mapped_seq_tuples = [(k, str(v)) for k, v in mapped_seqs.iteritems()]

        # Turn on --gapthresh option in cmbuild to force alignment to full
        # model. Work on a copy so the caller's dict is not mutated.
        if cmbuild_params is None:
            cmbuild_params = {}
        else:
            cmbuild_params = dict(cmbuild_params)
        cmbuild_params.update({'--gapthresh': 1.0})

        # record cmbuild parameters
        log_params.append('cmbuild parameters:')
        log_params.append(str(cmbuild_params))

        # Turn on --sub option in Infernal, since we know the unaligned
        # sequences are fragments.
        # Also turn on --gapthresh to use same gapthresh as was used to build
        # model. Again a copy, to leave the caller's dict untouched.
        if cmalign_params is None:
            cmalign_params = {}
        else:
            cmalign_params = dict(cmalign_params)
        cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

        # record cmalign parameters
        log_params.append('cmalign parameters:')
        log_params.append(str(cmalign_params))

        # Align sequences to alignment including alignment gaps.
        aligned, struct_string = cmalign_from_alignment(
            aln=template_alignment,
            structure_string=struct,
            seqs=mapped_seq_tuples,
            include_aln=True,
            params=cmalign_params,
            cmbuild_params=cmbuild_params)

        # Pull out original sequences from full alignment.
        # Get a dict of the ids to sequences (note that this is a
        # cogent alignment object, hence the call to NamedSeqs)
        aligned_dict = aligned.NamedSeqs
        infernal_aligned = [(o, aligned_dict[n])
                            for n, o in new_to_old_ids.iteritems()]

        # Create an Alignment object from alignment dict
        infernal_aligned = Alignment.from_fasta_records(
            infernal_aligned, DNASequence)

        if log_path is not None:
            with open(log_path, 'w') as log_file:
                log_file.write('\n'.join(log_params))

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                result_file.write(infernal_aligned.to_fasta())
            return None

        return infernal_aligned
示例#31
0
    """AY800210\tArchaea;Euryarchaeota;Halobacteriales;uncultured
EU883771\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.
EF503699\tArchaea;Crenarchaeota;uncultured;uncultured
DQ260310\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium
EF503697\tArchaea;Crenarchaeota;uncultured;uncultured"""

blast_test_seqs = SequenceCollection.from_fasta_records([
    ('s1',
     'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'
     ),
    ('s2',
     'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA'
     ),
    ('s3',
     'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG'
     ),
    ('s4',
     'GATACCCCCGGAAACTGGGGATTATACCGGATATGTGGGGCTGCCTGGAATGGTACCTCATTGAAATGCTCCCGCGCCTAAAGATGGATCTGCCGCAGAATAAGTAGTTTGCGGGGTAAATGGCCACCCAGCCAGTAATCCGTACCGGTTGTGAAAACCAGAACCCCGAGATGGAAACTGAAACAAAGGTTCAAGGCCTACCGGGCACAACAAGCGCCAAAACTCCGCCATGCGAGCCATCGCGACGGGGGAAAACCAAGTACCACTCCTAACGGGGTGGTTTTTCCGAAGTGGAAAAAGCCTCCAGGAATAAGAACCTGGGCCAGAACCGTGGCCAGCCGCCGCCGTTACACCCGCCAGCTCGAGTTGTTGGCCGGTTTTATTGGGGCCTAAAGCCGGTCCGTAGCCCGTTTTGATAAGGTCTCTCTGGTGAAATTCTACAGCTTAACCTGTGGGAATTGCTGGAGGATACTATTCAAGCTTGAAGCCGGGAGAAGCCTGGAAGTACTCCCGGGGGTAAGGGGTGAAATTCTATTATCCCCGGAAGACCAACTGGTGCCGAAGCGGTCCAGCCTGGAACCGAACTTGACCGTGAGTTACGAAAAGCCAAGGGGCGCGGACCGGAATAAAATAACCAGGGTAGTCCTGGCCGTAAACGATGTGAACTTGGTGGTGGGAATGGCTTCGAACTGCCCAATTGCCGAAAGGAAGCTGTAAATTCACCCGCCTTGGAAGTACGGTCGCAAGACTGGAACCTAAAAGGAATTGGCGGGGGGACACCACAACGCGTGGAGCCTGGCGGTTTTATTGGGATTCCACGCAGACATCTCACTCAGGGGCGACAGCAGAAATGATGGGCAGGTTGATGACCTTGCTTGACAAGCTGAAAAGGAGGTGCAT'
     ),
    ('s5',
     'TAAAATGACTAGCCTGCGAGTCACGCCGTAAGGCGTGGCATACAGGCTCAGTAACACGTAGTCAACATGCCCAAAGGACGTGGATAACCTCGGGAAACTGAGGATAAACCGCGATAGGCCAAGGTTTCTGGAATGAGCTATGGCCGAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCGCGAAACCTCTGCAATAGGCGAAAGCCTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCTGCTCAACGGATGGGCTGCGGAGGATACCGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCATTGATCTACTGAAGACCACCAGTGGCGAAGGCGGTTTGCCAGAACGCGCTCGACGGTGAGGGATGAAAGCTGGGGGAGCAAACCGGATTAGATACCCGGGGTAGTCCCAGCTGTAAACGGATGCAGACTCGGGTGATGGGGTTGGCTTCCGGCCCAACCCCAATTGCCCCCAGGCGAAGCCCGTTAAGATCTTGCCGCCCTGTCAGATGTCAGGGCCGCCAATACTCGAAACCTTAAAAGGAAATTGGGCGCGGGAAAAGTCACCAAAAGGGGGTTGAAACCCTGCGGGTTATATATTGTAAACC'
     ),
    ('s6',
     'ATAGTAGGTGATTGCGAAGACCGCGGAACCGGGACCTAGCACCCAGCCTGTACCGAGGGATGGGGAGCTGTGGCGGTCCACCGACGACCCTTTGTGACAGCCGATTCCTACAATCCCAGCAACTGCAATGATCCACTCTAGTCGGCATAACCGGGAATCGTTAACCTGGTAGGGTTCTCTACGTCTGAGTCTACAGCCCAGAGCAGTCAGGCTACTATACGGTTTGCTGCATTGCATAGGCATCGGTCGCGGGCACTCCTCGCGGTTTCAGCTAGGGTTTAAATGGAGGGTCGCTGCATGAGTATGCAAATAGTGCCACTGCTCTGATACAGAGAAGTGTTGATATGACACCTAAGACCTGGTCACAGTTTTAACCTGCCTACGCACACCAGTGTGCTATTGATTAACGATATCGGTAGACACGACCTTGGTAACCTGACTAACCTCATGGAAAGTGACTAGATAAATGGACCGGAGCCAACTTTCACCCGGAAAACGGACCGACGAATCGTCGTAGACTACCGATCTGACAAAATAAGCACGAGGGAGCATGTTTTGCGCAGGCTAGCCTATTCCCACCTCAAGCCTCGAGAACCAAGACGCCTGATCCGGTGCTGCACGAAGGGTCGCCTCTAGGTAAGGAGAGCTGGCATCTCCAGATCCGATATTTTACCCAACCTTTGCGCGCTCAGATTGTTATAGTGAAACGATTTAAGCCTGAACGGAGTTCCGCTCCATATGTGGGTTATATATGTGAGATGTATTAACTTCCGCAGTTGTCTCTTTCGGTGCAGTACGCTTGGTATGTGTCTCAAATAATCGGTATTATAGTGATCTGAGAGGTTTTAAG'
     )
], DNA)

blast_reference_seqs = SequenceCollection.from_fasta_records([
    ('AY800210',
     'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'
     ),
    ('EU883771',
示例#32
0
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None, cmbuild_params=None, cmalign_params=None):
        """Align the sequences in seq_path to the template alignment.

        seq_path: path to a fasta file holding the candidate sequences
        result_path: if provided, the alignment is written to this path in
         fasta format and None is returned; otherwise the Alignment object
         is returned
        log_path: if provided, the cmbuild and cmalign parameters that were
         used are written to this path
        failure_path: not used by this method
        cmbuild_params, cmalign_params: optional dicts of parameters; they
         are copied before the required options are added, so the dicts
         passed by the caller are left unmodified

        Raises ValueError if parsing the template file raises RecordError.
        """
        log_params = []
        # load candidate sequences (handle is closed once they are read)
        with open(seq_path, 'U') as seq_f:
            candidate_sequences = dict(parse_fasta(seq_f))

        # load template sequences
        try:
            with open(self.Params['template_filepath'], 'U') as template_f:
                info, template_alignment, struct = list(MinimalRfamParser(
                    template_f, seq_constructor=ChangedSequence))[0]
        except RecordError:
            raise ValueError(
                "Template alignment must be in Stockholm format with "
                "corresponding secondary structure annotation when using "
                "InfernalAligner.")

        # Need to make separate mapping for unaligned sequences
        unaligned = SequenceCollection.from_fasta_records(
            candidate_sequences.iteritems(), DNASequence)
        mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_')
        mapped_seq_tuples = [(k, str(v)) for k, v in mapped_seqs.iteritems()]

        # Turn on --gapthresh option in cmbuild to force alignment to full
        # model. Work on a copy so the caller's dict is not mutated.
        if cmbuild_params is None:
            cmbuild_params = {}
        else:
            cmbuild_params = dict(cmbuild_params)
        cmbuild_params.update({'--gapthresh': 1.0})

        # record cmbuild parameters
        log_params.append('cmbuild parameters:')
        log_params.append(str(cmbuild_params))

        # Turn on --sub option in Infernal, since we know the unaligned
        # sequences are fragments.
        # Also turn on --gapthresh to use same gapthresh as was used to build
        # model. Again a copy, to leave the caller's dict untouched.
        if cmalign_params is None:
            cmalign_params = {}
        else:
            cmalign_params = dict(cmalign_params)
        cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

        # record cmalign parameters
        log_params.append('cmalign parameters:')
        log_params.append(str(cmalign_params))

        # Align sequences to alignment including alignment gaps.
        aligned, struct_string = cmalign_from_alignment(
            aln=template_alignment,
            structure_string=struct,
            seqs=mapped_seq_tuples,
            include_aln=True,
            params=cmalign_params,
            cmbuild_params=cmbuild_params)

        # Pull out original sequences from full alignment.
        # Get a dict of the ids to sequences (note that this is a
        # cogent alignment object, hence the call to NamedSeqs)
        aligned_dict = aligned.NamedSeqs
        infernal_aligned = [(o, aligned_dict[n])
                            for n, o in new_to_old_ids.iteritems()]

        # Create an Alignment object from alignment dict
        infernal_aligned = Alignment.from_fasta_records(
            infernal_aligned, DNASequence)

        if log_path is not None:
            with open(log_path, 'w') as log_file:
                log_file.write('\n'.join(log_params))

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                result_file.write(infernal_aligned.to_fasta())
            return None

        return infernal_aligned
示例#33
0
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output of
        "ghost-tree group-extensions"). This file refers to the "extension
        trees". File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.

    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an accession number (same accession numbers in
        otu_file_fh and extension_taxonomy_fh) and the 2nd column is the
        taxonomy ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina

    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.

    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.

    ghost_tree_fp : filehandle
        The Newick formatted ghost-tree is the final output of the ghost-tree
        tool. This is a phylogenetic tree designed for downstream diversity
        analyses. The string form of the combined tree is written to it.

    Returns
    -------
    str
        The string form of the combined tree with surrounding whitespace
        stripped.

    Notes
    -----
    Works in the current directory: a "tmp" directory is created (and
    removed again, even when an error occurs), and
    "nr_foundation_alignment_gt.fasta" plus a "ghost-tree_log_*.txt" log
    file are written. The module-level ``foundation_accession_genus_dic``
    is reset to an empty dict on every call.

    """
    global foundation_accession_genus_dic  # needs global assignment for flake8
    foundation_accession_genus_dic = {}
    # NOTE(review): fixed offsets are sliced off str(ghost_tree_fp);
    # presumably this strips a repr wrapper around the file name -- confirm
    # against the kind of handle callers actually pass in.
    ghost_tree_output = str(ghost_tree_fp)[16:-4]

    # Probe for the external tools up front. A missing tool only produces a
    # warning here; the run itself is not aborted.
    for tool, role in (("muscle", "multiple sequence aligner"),
                       ("fasttree", "phylogenetic tree builder")):
        process = subprocess.Popen(tool, shell=True, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        _, error = process.communicate()
        if re.search("command not found", error):
            print(tool + ", " + role + ", is not found. Is it"
                  " installed? Is it in your path?")

    muscle_cmd = ("muscle -in tmp/mini_seq_gt.fasta -out"
                  " tmp/mini_alignment_gt.fasta -quiet -maxiters 2 -diags1")
    # The shell redirect is what writes the tree file, hence shell=True.
    fasttree_cmd = ("fasttree -nt -quiet tmp/mini_alignment_gt.fasta >"
                    " tmp/mini_tree_gt.nwk")

    os.mkdir("tmp")
    try:
        log_name = "ghost-tree_log_" + ghost_tree_output + ".txt"
        with open(log_name, "w") as logfile:
            extension_genus_accession_list_dic = \
                _extension_genus_accession_dic(otu_file_fh,
                                               extension_taxonomy_fh)
            skbio.write(_make_nr_foundation_alignment(
                            foundation_alignment_fh,
                            extension_genus_accession_list_dic),
                        into="nr_foundation_alignment_gt.fasta",
                        format="fasta")
            foundation_tree = _make_foundation_tree(
                "nr_foundation_alignment_gt.fasta", logfile)
            seqs = SequenceCollection.read(extension_seq_fh)
            for node in foundation_tree.tips():
                key_node, _ = str(node).split(":")
                key_node = foundation_accession_genus_dic[key_node]
                try:
                    _make_mini_otu_files(key_node,
                                         extension_genus_accession_list_dic,
                                         seqs)
                    process = subprocess.Popen(muscle_cmd, shell=True,
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)
                    process.communicate()
                    process = subprocess.Popen(fasttree_cmd, shell=True,
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)
                    _, error = process.communicate()
                    logfile.write("FastTree warnings for genus " + key_node +
                                  " are:\n" + error + "\n")
                    mini_tree = read("tmp/mini_tree_gt.nwk", format='newick',
                                     into=TreeNode)
                    node.extend(mini_tree.root_at_midpoint().children[:])
                except Exception as exc:
                    # Best effort: a tip that cannot be extended is left as
                    # is. Only Exception is caught so KeyboardInterrupt and
                    # SystemExit still stop the run; the reason is logged
                    # instead of being dropped silently.
                    logfile.write("Genus {0} was not extended: {1!r}\n"
                                  .format(key_node, exc))
                    continue
    finally:
        # Always remove the scratch directory, even on failure.
        shutil.rmtree("tmp")

    tree_string = str(foundation_tree)
    ghost_tree_fp.write(tree_string)
    return tree_string.strip()
示例#34
0
    def setUp(self):
        """Create the temporary fixture files, aligner and expected results.

        Each file is made with mkstemp using the 'PyNastAlignerTests_'
        prefix, and every path is recorded in self._paths_to_clean_up.
        """
        def make_temp_file(suffix, contents=None):
            # mkstemp hands back an open descriptor; close it and reopen the
            # path for writing, which also leaves an (at least empty) file
            # on disk for the files that get no contents.
            fd, path = mkstemp(prefix='PyNastAlignerTests_', suffix=suffix)
            close(fd)
            with open(path, 'w') as handle:
                if contents is not None:
                    handle.write(contents)
            return path

        template_fasta = pynast_test1_template_fasta

        # input sequences and the template in several spellings
        self.pynast_test1_input_fp = make_temp_file(
            '.fasta', pynast_test1_input_fasta)
        self.pynast_test1_template_fp = make_temp_file(
            'template.fasta', template_fasta)
        self.pynast_test_template_w_dots_fp = make_temp_file(
            'template.fasta', template_fasta.replace('-', '.'))
        self.pynast_test_template_w_u_fp = make_temp_file(
            'template.fasta', template_fasta.replace('T', 'U'))
        self.pynast_test_template_w_lower_fp = make_temp_file(
            'template.fasta', template_fasta.lower())

        # output locations: created empty up front so they can be reliably
        # cleaned up
        self.result_fp = make_temp_file('.fasta')
        self.failure_fp = make_temp_file('.fasta')
        self.log_fp = make_temp_file('.log')

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp,
        ]

        aligner_params = {
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 15,
        }
        self.pynast_test1_aligner = PyNastAligner(aligner_params)

        expected_aln_records = parse_fasta(pynast_test1_expected_alignment)
        self.pynast_test1_expected_aln = Alignment.from_fasta_records(
            expected_aln_records, DNA)
        expected_fail_records = parse_fasta(pynast_test1_expected_failure)
        self.pynast_test1_expected_fail = \
            SequenceCollection.from_fasta_records(expected_fail_records, DNA)
示例#35
0
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output of
        "ghost-tree group-extensions"). This file refers to the "extension
        trees". File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.

    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an accession number (same accession numbers in
        otu_file_fh and extension_taxonomy_fh) and the 2nd column is the
        taxonomy ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina

    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.

    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.

    ghost_tree_fp : filehandle
        The Newick formatted ghost-tree is the final output of the ghost-tree
        tool. This is a phylogenetic tree designed for downstream diversity
        analyses. The string form of the combined tree is written to it.

    Returns
    -------
    str
        The string form of the combined tree with surrounding whitespace
        stripped.

    Notes
    -----
    Works in the current directory: a "tmp" directory is created through the
    shell and removed again (even when an error occurs), and
    "nr_foundation_alignment_gt.fasta" is written. The module-level names
    ``foundation_accession_genus_dic`` and ``seqs`` are rebound on every
    call.

    """
    # Both names are (re)bound at module level by this function.
    global foundation_accession_genus_dic
    global seqs
    os.system("mkdir tmp")
    foundation_accession_genus_dic = {}
    try:
        extension_genus_accession_list_dic = \
            _extension_genus_accession_dic(otu_file_fh,
                                           extension_taxonomy_fh)
        skbio.write(_make_nr_foundation_alignment(
            foundation_alignment_fh, extension_genus_accession_list_dic),
                    into="nr_foundation_alignment_gt.fasta",
                    format="fasta")
        foundation_tree = _make_foundation_tree(
            "nr_foundation_alignment_gt.fasta")
        seqs = SequenceCollection.read(extension_seq_fh)
        for node in foundation_tree.tips():
            key_node, _ = str(node).split(":")
            key_node = foundation_accession_genus_dic[key_node]
            try:
                _make_mini_otu_files(key_node,
                                     extension_genus_accession_list_dic,
                                     seqs)
                os.system("muscle -in tmp/mini_seq_gt.fasta -out" +
                          " tmp/mini_alignment_gt.fasta -quiet" +
                          " -maxiters 2 -diags1")
                # The shell redirect is what writes the tree file.
                os.system("fasttree -nt -quiet tmp/mini_alignment_gt.fasta >" +
                          " tmp/mini_tree_gt.nwk")
                mini_tree = read("tmp/mini_tree_gt.nwk",
                                 format='newick',
                                 into=TreeNode)
                node.extend(mini_tree.children[:])
            except Exception:
                # Best effort: a tip that cannot be extended is left as is.
                # Only Exception is caught so that KeyboardInterrupt and
                # SystemExit still stop the run.
                continue
    finally:
        # Always remove the scratch directory, even on failure.
        os.system("rm -r tmp")
    tree_string = str(foundation_tree)
    ghost_tree_fp.write(tree_string)
    return tree_string.strip()