示例#1
0
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = SequenceCollection.from_fasta_records(
            [('seq%s' % k, 'AACCTTAA') for k in range(59)], DNA)
        infile = in_seqs.to_fasta().split('\n')

        # test seqs_per_file from 1 to 1000
        for i in range(1, 1000):
            fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                         prefix='split_fasta_tests',
                                         suffix='')
            close(fd)

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            for fp in actual:
                actual_seqs += list(open(fp))
            # remove the files now, so if the test fails they still get
            # cleaned up
            remove_files(actual)

            # building seq collections from infile and the split files result in
            # equivalent seq collections
            self.assertEqual(
                SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
                SequenceCollection.from_fasta_records(parse_fasta(actual_seqs), DNA))
示例#2
0
 def test_init(self):
     """Initialization functions as expected with varied input types
     """
     SequenceCollection(self.seqs1)
     SequenceCollection(self.seqs2)
     SequenceCollection(self.seqs3)
     SequenceCollection([])
示例#3
0
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                     prefix='split_fasta_tests',
                                     suffix='')
        close(fd)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs), DNA))
示例#4
0
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 failure_path=None):
        # load candidate sequences
        seq_file = open(seq_path, 'U')
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment = []
        template_alignment_fp = self.Params['template_filepath']
        for seq_id, seq in parse_fasta(open(template_alignment_fp)):
            # replace '.' characters with '-' characters
            template_alignment.append((seq_id, seq.replace('.', '-').upper()))
        template_alignment = Alignment.from_fasta_records(template_alignment,
                                                          DNASequence,
                                                          validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        for i, seq in enumerate(pynast_failed):
            skb_seq = DNASequence(str(seq), id=seq.Name)
            pynast_failed[i] = skb_seq
        pynast_failed = SequenceCollection(pynast_failed)

        for i, seq in enumerate(pynast_aligned):
            skb_seq = DNASequence(str(seq), id=seq.Name)
            pynast_aligned[i] = skb_seq
        pynast_aligned = Alignment(pynast_aligned)

        if failure_path is not None:
            fail_file = open(failure_path, 'w')
            fail_file.write(pynast_failed.to_fasta())
            fail_file.close()

        if result_path is not None:
            result_file = open(result_path, 'w')
            result_file.write(pynast_aligned.to_fasta())
            result_file.close()
            return None
        else:
            return pynast_aligned
示例#5
0
 def test_init_validate(self):
     """initialization with validation functions as expected
     """
     SequenceCollection(self.seqs1, validate=True)
     SequenceCollection(self.seqs1, validate=True)
     # can't validate self.seqs2 as a DNASequence
     self.assertRaises(SequenceCollectionError,
                       SequenceCollection,
                       self.invalid_s1,
                       validate=True)
示例#6
0
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None):
        # load candidate sequences
        seq_file = open(seq_path, 'U')
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment = []
        template_alignment_fp = self.Params['template_filepath']
        for seq_id, seq in parse_fasta(open(template_alignment_fp)):
            # replace '.' characters with '-' characters
            template_alignment.append((seq_id, seq.replace('.', '-').upper()))
        template_alignment = Alignment.from_fasta_records(
                    template_alignment, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        for i, seq in enumerate(pynast_failed):
            skb_seq = DNASequence(str(seq), identifier=seq.Name)
            pynast_failed[i] = skb_seq
        pynast_failed = SequenceCollection(pynast_failed)

        for i, seq in enumerate(pynast_aligned):
            skb_seq = DNASequence(str(seq), identifier=seq.Name)
            pynast_aligned[i] = skb_seq
        pynast_aligned = Alignment(pynast_aligned)

        if failure_path is not None:
            fail_file = open(failure_path, 'w')
            fail_file.write(pynast_failed.to_fasta())
            fail_file.close()

        if result_path is not None:
            result_file = open(result_path, 'w')
            result_file.write(pynast_aligned.to_fasta())
            result_file.close()
            return None
        else:
            return pynast_aligned
示例#7
0
def filter_samples(prefs, data, dir_path='', filename=None):
    """processes the filtering of the otus file and representative seq set, then
        writes filtered otus and filtered representative seq set files"""

    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    filtered_otus_output_filepath = '%s/%s_sfiltered_otus.txt' \
                                    % (dir_path, filename)
    filtered_otus_output_filepath = open(filtered_otus_output_filepath, 'w')

    # Write out a new otus file
    for key in (new_otus_list):
        filtered_otus_output_filepath.write(key[0])
        for j in key[1]:
            filtered_otus_output_filepath.write('\t' + str(j))
        filtered_otus_output_filepath.write('\n')
    filtered_otus_output_filepath.close()

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    if len(removed_seqs) > 0:
        removed_seqs = SequenceCollection.from_fasta_records(
            [(e[0], str(e[1])) for e in removed_seqs], DNA)
    else:
        raise ValueError(
            'No sequences were removed.  Did you specify the correct Sample ID?'
        )
    output_filepath2 = '%s/%s_sremoved.fasta' % (dir_path, filename)
    output_file2 = open(output_filepath2, 'w')
    output_file2.write(removed_seqs.to_fasta())
    output_file2.close()

    # write a fasta containing the filtered representative seqs
    if len(filtered_seqs) > 0:
        filtered_seqs = SequenceCollection.from_fasta_records(
            [(e[0], str(e[1])) for e in filtered_seqs], DNA)
    else:
        raise ValueError(
            'No sequences were remaining in the fasta file.  Did you remove all Sample ID\'s?'
        )

    output_filepath = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    output_file = open(output_filepath, 'w')
    output_file.write(filtered_seqs.to_fasta())
    output_file.close()
 def test_call_write_to_file(self):
     """ReferenceRepSetPicker.__call__ otu map correctly written to file"""
     app = ReferenceRepSetPicker(params={'Algorithm': 'first',
                                         'ChoiceF': first_id})
     app(self.tmp_seq_filepath,
         self.tmp_otu_filepath,
         self.ref_seq_filepath,
         result_path=self.result_filepath)
     with open(self.result_filepath) as f:
         actual = SequenceCollection.from_fasta_records(parse_fasta(f), DNA)
     expected = SequenceCollection.from_fasta_records(
         parse_fasta(rep_seqs_reference_result_file_exp.split('\n')), DNA)
     # we don't care about order in the results
     self.assertEqual(set(actual), set(expected))
示例#9
0
    def test_degap(self):
        """degap functions as expected
        """
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs1_t]
        expected = SequenceCollection.from_fasta_records(expected, DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)
示例#10
0
    def test_degap(self):
        """degap functions as expected
        """
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs1_t]
        expected = SequenceCollection.from_fasta_records(expected, DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)
def filter_samples(prefs, data, dir_path='', filename=None):
    """processes the filtering of the otus file and representative seq set, then
        writes filtered otus and filtered representative seq set files"""

    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    filtered_otus_output_filepath = '%s/%s_sfiltered_otus.txt' \
                                    % (dir_path, filename)
    filtered_otus_output_filepath = open(filtered_otus_output_filepath, 'w')

    # Write out a new otus file
    for key in (new_otus_list):
        filtered_otus_output_filepath.write(key[0])
        for j in key[1]:
            filtered_otus_output_filepath.write('\t' + str(j))
        filtered_otus_output_filepath.write('\n')
    filtered_otus_output_filepath.close()

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    if len(removed_seqs) > 0:
        removed_seqs = SequenceCollection.from_fasta_records(
                [(e[0], str(e[1])) for e in removed_seqs], DNA)
    else:
        raise ValueError(
            'No sequences were removed.  Did you specify the correct Sample ID?')
    output_filepath2 = '%s/%s_sremoved.fasta' % (dir_path, filename)
    output_file2 = open(output_filepath2, 'w')
    output_file2.write(removed_seqs.to_fasta())
    output_file2.close()

    # write a fasta containing the filtered representative seqs
    if len(filtered_seqs) > 0:
        filtered_seqs = SequenceCollection.from_fasta_records(
                [(e[0], str(e[1])) for e in filtered_seqs], DNA)
    else:
        raise ValueError(
            'No sequences were remaining in the fasta file.  Did you remove all Sample ID\'s?')

    output_filepath = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    output_file = open(output_filepath, 'w')
    output_file.write(filtered_seqs.to_fasta())
    output_file.close()
示例#12
0
 def test_call_write_to_file(self):
     """ReferenceRepSetPicker.__call__ otu map correctly written to file"""
     app = ReferenceRepSetPicker(params={
         'Algorithm': 'first',
         'ChoiceF': first_id
     })
     app(self.tmp_seq_filepath,
         self.tmp_otu_filepath,
         self.ref_seq_filepath,
         result_path=self.result_filepath)
     with open(self.result_filepath) as f:
         actual = SequenceCollection.from_fasta_records(parse_fasta(f), DNA)
     expected = SequenceCollection.from_fasta_records(
         parse_fasta(rep_seqs_reference_result_file_exp.split('\n')), DNA)
     # we don't care about order in the results
     self.assertEqual(set(actual), set(expected))
示例#13
0
    def test_ne(self):
        """inequality operator functions as expected
        """
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertTrue(self.s1 != FakeSequenceCollection([self.d1, self.d2]))
        self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))
示例#14
0
    def test_distances(self):
        """distances functions as expected
        """
        s1 = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(hamming)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)
示例#15
0
    def setUp(self):
        fd, self.pynast_test1_input_fp = mkstemp(prefix="PyNastAlignerTests_", suffix=".fasta")
        close(fd)
        with open(self.pynast_test1_input_fp, "w") as f:
            f.write(pynast_test1_input_fasta)

        fd, self.pynast_test1_template_fp = mkstemp(prefix="PyNastAlignerTests_", suffix="template.fasta")
        close(fd)
        with open(self.pynast_test1_template_fp, "w") as f:
            f.write(pynast_test1_template_fasta)

        fd, self.pynast_test_template_w_dots_fp = mkstemp(prefix="PyNastAlignerTests_", suffix="template.fasta")
        close(fd)
        with open(self.pynast_test_template_w_dots_fp, "w") as f:
            f.write(pynast_test1_template_fasta.replace("-", "."))

        fd, self.pynast_test_template_w_u_fp = mkstemp(prefix="PyNastAlignerTests_", suffix="template.fasta")
        close(fd)
        with open(self.pynast_test_template_w_u_fp, "w") as f:
            f.write(pynast_test1_template_fasta.replace("T", "U"))

        fd, self.pynast_test_template_w_lower_fp = mkstemp(prefix="PyNastAlignerTests_", suffix="template.fasta")
        close(fd)
        with open(self.pynast_test_template_w_lower_fp, "w") as f:
            f.write(pynast_test1_template_fasta.lower())

        # create temp file names (and touch them so we can reliably
        # clean them up)
        fd, self.result_fp = mkstemp(prefix="PyNastAlignerTests_", suffix=".fasta")
        close(fd)
        open(self.result_fp, "w").close()
        fd, self.failure_fp = mkstemp(prefix="PyNastAlignerTests_", suffix=".fasta")
        close(fd)
        open(self.failure_fp, "w").close()
        fd, self.log_fp = mkstemp(prefix="PyNastAlignerTests_", suffix=".log")
        close(fd)
        open(self.log_fp, "w").close()

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp,
        ]

        self.pynast_test1_aligner = PyNastAligner({"template_filepath": self.pynast_test1_template_fp, "min_len": 15})

        self.pynast_test1_expected_aln = Alignment.from_fasta_records(parse_fasta(pynast_test1_expected_alignment), DNA)
        self.pynast_test1_expected_fail = SequenceCollection.from_fasta_records(
            parse_fasta(pynast_test1_expected_failure), DNA
        )
示例#16
0
    def setUp(self):
        """Initialize values to be used in tests
        """
        self.d1 = DNASequence('GATTACA', identifier="d1")
        self.d2 = DNASequence('TTG', identifier="d2")
        self.d1_lower = DNASequence('gattaca', identifier="d1")
        self.d2_lower = DNASequence('ttg', identifier="d2")
        self.r1 = RNASequence('GAUUACA', identifier="r1")
        self.r2 = RNASequence('UUG', identifier="r2")
        self.r3 = RNASequence('U-----UGCC--', identifier="r3")

        self.i1 = DNASequence('GATXACA', identifier="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2

        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])
示例#17
0
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                     prefix='split_fasta_tests',
                                     suffix='')
        close(fd)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        self.assertEqual(actual, expected)
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs), DNA))
    def test_filter_aln_by_otus(self):
        """filter_aln_by_otus: determines which sequences to keep and which
sequences to remove"""

        self.sample_to_extract = "SampleA,SampleB"
        exp1 = []
        exp1.append(("SampleA", "AAAAAAAAAAAAAAA"))
        exp2 = []
        exp2.append(("SampleB", "CCCCCCC"))
        exp2.append(("SampleC", "GGGGGGGGGGGGGG"))
        aln = SequenceCollection.from_fasta_records(self.aln, DNA)

        obs1, obs2 = filter_aln_by_otus(aln, self.prefs)

        self.assertEqual(obs1, exp1)
        self.assertEqual(obs2, exp2)
示例#19
0
    def test_filter_aln_by_otus(self):
        """filter_aln_by_otus: determines which sequences to keep and which
sequences to remove"""

        self.sample_to_extract = 'SampleA,SampleB'
        exp1 = []
        exp1.append(('SampleA', 'AAAAAAAAAAAAAAA'))
        exp2 = []
        exp2.append(('SampleB', 'CCCCCCC'))
        exp2.append(('SampleC', 'GGGGGGGGGGGGGG'))
        aln = SequenceCollection.from_fasta_records(self.aln, DNA)

        obs1, obs2 = filter_aln_by_otus(aln, self.prefs)

        self.assertEqual(obs1, exp1)
        self.assertEqual(obs2, exp2)
示例#20
0
def main():
    """opens files as necessary based on prefs"""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    data = {}

    fasta_file = opts.input_fasta_fp

    # load the input alignment
    data['aln'] = SequenceCollection.from_fasta_records(
        parse_fasta(open(fasta_file)), DNA)

    # Load the otu file
    otu_path = opts.otu_map_fp
    otu_f = open(otu_path, 'U')
    otus = fields_to_dict(otu_f)
    otu_f.close()

    data['otus'] = otus
    # Determine which which samples to extract from representative seqs
    # and from otus file
    if opts.samples_to_extract:
        prefs = process_extract_samples(opts.samples_to_extract)

    filepath = opts.input_fasta_fp
    filename = filepath.strip().split('/')[-1]
    filename = filename.split('.')[0]

    if opts.output_dir:
        if os.path.exists(opts.output_dir):
            dir_path = opts.output_dir
        else:
            try:
                os.mkdir(opts.output_dir)
                dir_path = opts.output_dir
            except OSError:
                pass
    else:
        dir_path = './'

    try:
        action = filter_samples
    except NameError:
        action = None
    # Place this outside try/except so we don't mask NameError in action
    if action:
        action(prefs, data, dir_path, filename)
示例#21
0
    def test_call_pynast_test1_file_output_alt_params(self):
        """PyNastAligner writes correct output files when no seqs align
        """
        aligner = PyNastAligner({"template_filepath": self.pynast_test1_template_fp, "min_len": 1000})

        actual = aligner(
            self.pynast_test1_input_fp, result_path=self.result_fp, log_path=self.log_fp, failure_path=self.failure_fp
        )

        self.assertTrue(actual is None, "Result should be None when result path provided.")

        self.assertEqual(getsize(self.result_fp), 0, "No alignable seqs should result in an empty file.")

        # all seqs reported to fail
        with open(self.failure_fp) as failure_f:
            actual_fail = SequenceCollection.from_fasta_records(parse_fasta(failure_f), DNA)
        self.assertEqual(actual_fail.sequence_count(), 3)
示例#22
0
    def test_call_pynast_test1_file_output(self):
        """PyNastAligner writes correct output files for pynast_test1 seqs
        """
        # do not collect results; check output files instead
        actual = self.pynast_test1_aligner(
            self.pynast_test1_input_fp, result_path=self.result_fp, log_path=self.log_fp, failure_path=self.failure_fp
        )

        self.assertTrue(actual is None, "Result should be None when result path provided.")

        expected_aln = self.pynast_test1_expected_aln
        with open(self.result_fp) as result_f:
            actual_aln = Alignment.from_fasta_records(parse_fasta(result_f), DNA)
        self.assertEqual(actual_aln, expected_aln)

        with open(self.failure_fp) as failure_f:
            actual_fail = SequenceCollection.from_fasta_records(parse_fasta(failure_f), DNA)
        self.assertEqual(actual_fail.to_fasta(), self.pynast_test1_expected_fail.to_fasta())
示例#23
0
    def test_call_pynast_test1_file_output(self):
        """PyNastAligner writes correct output files for pynast_test1 seqs
        """
        # do not collect results; check output files instead
        actual = self.pynast_test1_aligner(
            self.pynast_test1_input_fp, result_path=self.result_fp,
            log_path=self.log_fp, failure_path=self.failure_fp)

        self.assertTrue(actual is None,
                        "Result should be None when result path provided.")

        expected_aln = self.pynast_test1_expected_aln
        with open(self.result_fp) as result_f:
            actual_aln = Alignment.from_fasta_records(parse_fasta(
                    result_f), DNA)
        self.assertEqual(actual_aln, expected_aln)

        with open(self.failure_fp) as failure_f:
            actual_fail = SequenceCollection.from_fasta_records(
                    parse_fasta(failure_f), DNA)
        self.assertEqual(actual_fail.to_fasta(),
                         self.pynast_test1_expected_fail.to_fasta())
示例#24
0
    def test_call_pynast_test1_file_output_alt_params(self):
        """PyNastAligner writes correct output files when no seqs align
        """
        aligner = PyNastAligner({
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 1000})

        actual = aligner(
            self.pynast_test1_input_fp, result_path=self.result_fp,
            log_path=self.log_fp, failure_path=self.failure_fp)

        self.assertTrue(actual is None,
                        "Result should be None when result path provided.")

        self.assertEqual(getsize(self.result_fp), 0,
                         "No alignable seqs should result in an empty file.")

        # all seqs reported to fail
        with open(self.failure_fp) as failure_f:
            actual_fail = SequenceCollection.from_fasta_records(
                parse_fasta(failure_f), DNA)
        self.assertEqual(actual_fail.sequence_count(), 3)
示例#25
0
 def test_from_fasta_records(self):
     """Initialization from list of tuples functions as expected
     """
     SequenceCollection.from_fasta_records(self.seqs1_t, DNASequence)
     SequenceCollection.from_fasta_records(self.seqs2_t, RNASequence)
     SequenceCollection.from_fasta_records(self.seqs3_t, NucleotideSequence)
示例#26
0
class SequenceCollectionTests(TestCase):
    """Tests of the SequenceCollection class """

    def setUp(self):
        """Initialize values to be used in tests
        """
        self.d1 = DNASequence('GATTACA', identifier="d1")
        self.d2 = DNASequence('TTG', identifier="d2")
        self.d1_lower = DNASequence('gattaca', identifier="d1")
        self.d2_lower = DNASequence('ttg', identifier="d2")
        self.r1 = RNASequence('GAUUACA', identifier="r1")
        self.r2 = RNASequence('UUG', identifier="r2")
        self.r3 = RNASequence('U-----UGCC--', identifier="r3")

        self.i1 = DNASequence('GATXACA', identifier="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2

        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        """Initialization functions as expected with varied input types
        """
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        """initialization with sequences with overlapping identifiers fails
        """
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        SequenceCollection(self.seqs1, validate=True)
        SequenceCollection(self.seqs1, validate=True)
        # can't validate self.seqs2 as a DNASequence
        self.assertRaises(SequenceCollectionError, SequenceCollection,
                          self.invalid_s1, validate=True)

    def test_from_fasta_records(self):
        """Initialization from list of tuples functions as expected
        """
        SequenceCollection.from_fasta_records(self.seqs1_t, DNASequence)
        SequenceCollection.from_fasta_records(self.seqs2_t, RNASequence)
        SequenceCollection.from_fasta_records(self.seqs3_t, NucleotideSequence)

    def test_contains(self):
        """in operator functions as expected
        """
        self.assertTrue('d1' in self.s1)
        self.assertTrue('r2' in self.s2)
        self.assertFalse('r2' in self.s1)

    def test_eq(self):
        """equality operator functions as expected
        """
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertFalse(self.s1 == FakeSequenceCollection([self.d1, self.d2]))
        self.assertFalse(self.s1 == Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        """getitem functions as expected
        """
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        """iter functions as expected
        """
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_len(self):
        """len functions as expected
        """
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        """inequality operator functions as expected
        """
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertTrue(self.s1 != FakeSequenceCollection([self.d1, self.d2]))
        self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 !=
                        SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        """repr functions as expected
        """
        self.assertEqual(repr(self.s1),
                         "<SequenceCollection: n=2; "
                         "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(repr(self.s2),
                         "<SequenceCollection: n=3; "
                         "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(repr(self.s3),
                         "<SequenceCollection: n=5; "
                         "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(repr(self.empty),
                         "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        """reversed functions as expected
        """
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        expected1 = defaultdict(int)
        expected1['A'] = 3/7.
        expected1['C'] = 1/7.
        expected1['G'] = 1/7.
        expected1['T'] = 2/7.
        expected2 = defaultdict(int)
        expected2['G'] = 1/3.
        expected2['T'] = 2/3.
        self.assertEqual(self.s1.k_word_frequencies(k=1),
                         [expected1, expected2])

        expected1 = defaultdict(int)
        expected1['GAT'] = 1/2.
        expected1['TAC'] = 1/2.
        expected2 = defaultdict(int)
        expected2['TTG'] = 1/1.
        self.assertEqual(self.s1.k_word_frequencies(k=3, overlapping=False),
                         [expected1, expected2])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    def test_str(self):
        """str functions as expected
        """
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp4 = ""
        self.assertEqual(str(self.empty), exp4)

    def test_distribution_stats(self):
        """distribution_stats functions as expected
        """
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        """degap functions as expected
        """
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        """getseq functions asexpected
        """
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_identifiers(self):
        """identifiers functions as expected
        """
        self.assertEqual(self.s1.identifiers(), ['d1', 'd2'])
        self.assertEqual(self.s2.identifiers(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.identifiers(),
                         ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.identifiers(), [])

    def test_int_map(self):
        """int_map functions as expected
        """
        expected1 = {"1": self.d1, "2": self.d2}
        expected2 = {"1": "d1", "2": "d2"}
        self.assertEqual(self.s1.int_map(), (expected1, expected2))

        expected1 = {"h-1": self.d1, "h-2": self.d2}
        expected2 = {"h-1": "d1", "h-2": "d2"}
        self.assertEqual(self.s1.int_map(prefix='h-'), (expected1, expected2))

    def test_is_empty(self):
        """is_empty functions as expected
        """
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.s1.is_valid())
        self.assertTrue(self.s2.is_valid())
        self.assertTrue(self.s3.is_valid())
        self.assertTrue(self.empty.is_valid())

        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        """iteritems functions as expected
        """
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.identifier, s) for s in self.s1])

    def test_lower(self):
        """lower functions as expected
        """
        self.assertEqual(self.s1.lower(), self.s1_lower)

    def test_sequence_count(self):
        """num_seqs functions as expected
        """
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        """sequence_lengths functions as expected
        """
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])

    def test_to_fasta(self):
        """to_fasta functions as expected
        """
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(self.s1.to_fasta(), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(self.s2.to_fasta(), exp2)

    def test_upper(self):
        """upper functions as expected
        """
        self.assertEqual(self.s1_lower.upper(), self.s1)
示例#27
0
    """AY800210\tArchaea;Euryarchaeota;Halobacteriales;uncultured
EU883771\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.
EF503699\tArchaea;Crenarchaeota;uncultured;uncultured
DQ260310\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium
EF503697\tArchaea;Crenarchaeota;uncultured;uncultured"""

blast_test_seqs = SequenceCollection.from_fasta_records([
    ('s1',
     'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'
     ),
    ('s2',
     'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA'
     ),
    ('s3',
     'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG'
     ),
    ('s4',
     'GATACCCCCGGAAACTGGGGATTATACCGGATATGTGGGGCTGCCTGGAATGGTACCTCATTGAAATGCTCCCGCGCCTAAAGATGGATCTGCCGCAGAATAAGTAGTTTGCGGGGTAAATGGCCACCCAGCCAGTAATCCGTACCGGTTGTGAAAACCAGAACCCCGAGATGGAAACTGAAACAAAGGTTCAAGGCCTACCGGGCACAACAAGCGCCAAAACTCCGCCATGCGAGCCATCGCGACGGGGGAAAACCAAGTACCACTCCTAACGGGGTGGTTTTTCCGAAGTGGAAAAAGCCTCCAGGAATAAGAACCTGGGCCAGAACCGTGGCCAGCCGCCGCCGTTACACCCGCCAGCTCGAGTTGTTGGCCGGTTTTATTGGGGCCTAAAGCCGGTCCGTAGCCCGTTTTGATAAGGTCTCTCTGGTGAAATTCTACAGCTTAACCTGTGGGAATTGCTGGAGGATACTATTCAAGCTTGAAGCCGGGAGAAGCCTGGAAGTACTCCCGGGGGTAAGGGGTGAAATTCTATTATCCCCGGAAGACCAACTGGTGCCGAAGCGGTCCAGCCTGGAACCGAACTTGACCGTGAGTTACGAAAAGCCAAGGGGCGCGGACCGGAATAAAATAACCAGGGTAGTCCTGGCCGTAAACGATGTGAACTTGGTGGTGGGAATGGCTTCGAACTGCCCAATTGCCGAAAGGAAGCTGTAAATTCACCCGCCTTGGAAGTACGGTCGCAAGACTGGAACCTAAAAGGAATTGGCGGGGGGACACCACAACGCGTGGAGCCTGGCGGTTTTATTGGGATTCCACGCAGACATCTCACTCAGGGGCGACAGCAGAAATGATGGGCAGGTTGATGACCTTGCTTGACAAGCTGAAAAGGAGGTGCAT'
     ),
    ('s5',
     'TAAAATGACTAGCCTGCGAGTCACGCCGTAAGGCGTGGCATACAGGCTCAGTAACACGTAGTCAACATGCCCAAAGGACGTGGATAACCTCGGGAAACTGAGGATAAACCGCGATAGGCCAAGGTTTCTGGAATGAGCTATGGCCGAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCGCGAAACCTCTGCAATAGGCGAAAGCCTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCTGCTCAACGGATGGGCTGCGGAGGATACCGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCATTGATCTACTGAAGACCACCAGTGGCGAAGGCGGTTTGCCAGAACGCGCTCGACGGTGAGGGATGAAAGCTGGGGGAGCAAACCGGATTAGATACCCGGGGTAGTCCCAGCTGTAAACGGATGCAGACTCGGGTGATGGGGTTGGCTTCCGGCCCAACCCCAATTGCCCCCAGGCGAAGCCCGTTAAGATCTTGCCGCCCTGTCAGATGTCAGGGCCGCCAATACTCGAAACCTTAAAAGGAAATTGGGCGCGGGAAAAGTCACCAAAAGGGGGTTGAAACCCTGCGGGTTATATATTGTAAACC'
     ),
    ('s6',
     'ATAGTAGGTGATTGCGAAGACCGCGGAACCGGGACCTAGCACCCAGCCTGTACCGAGGGATGGGGAGCTGTGGCGGTCCACCGACGACCCTTTGTGACAGCCGATTCCTACAATCCCAGCAACTGCAATGATCCACTCTAGTCGGCATAACCGGGAATCGTTAACCTGGTAGGGTTCTCTACGTCTGAGTCTACAGCCCAGAGCAGTCAGGCTACTATACGGTTTGCTGCATTGCATAGGCATCGGTCGCGGGCACTCCTCGCGGTTTCAGCTAGGGTTTAAATGGAGGGTCGCTGCATGAGTATGCAAATAGTGCCACTGCTCTGATACAGAGAAGTGTTGATATGACACCTAAGACCTGGTCACAGTTTTAACCTGCCTACGCACACCAGTGTGCTATTGATTAACGATATCGGTAGACACGACCTTGGTAACCTGACTAACCTCATGGAAAGTGACTAGATAAATGGACCGGAGCCAACTTTCACCCGGAAAACGGACCGACGAATCGTCGTAGACTACCGATCTGACAAAATAAGCACGAGGGAGCATGTTTTGCGCAGGCTAGCCTATTCCCACCTCAAGCCTCGAGAACCAAGACGCCTGATCCGGTGCTGCACGAAGGGTCGCCTCTAGGTAAGGAGAGCTGGCATCTCCAGATCCGATATTTTACCCAACCTTTGCGCGCTCAGATTGTTATAGTGAAACGATTTAAGCCTGAACGGAGTTCCGCTCCATATGTGGGTTATATATGTGAGATGTATTAACTTCCGCAGTTGTCTCTTTCGGTGCAGTACGCTTGGTATGTGTCTCAAATAATCGGTATTATAGTGATCTGAGAGGTTTTAAG'
     )
], DNA)

blast_reference_seqs = SequenceCollection.from_fasta_records([
    ('AY800210',
     'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'
     ),
    ('EU883771',
示例#28
0
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None, cmbuild_params=None, cmalign_params=None):

        log_params = []
        # load candidate sequences
        candidate_sequences = dict(parse_fasta(open(seq_path, 'U')))

        # load template sequences
        try:
            info, template_alignment, struct = list(MinimalRfamParser(open(
                self.Params['template_filepath'], 'U'),
                seq_constructor=ChangedSequence))[0]
        except RecordError:
            raise ValueError(
                "Template alignment must be in Stockholm format with corresponding secondary structure annotation when using InfernalAligner.")

        moltype = self.Params['moltype']

        # Need to make separate mapping for unaligned sequences
        unaligned = SequenceCollection.from_fasta_records(
            candidate_sequences.iteritems(), DNASequence)
        mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_')
        mapped_seq_tuples = [(k, str(v)) for k,v in mapped_seqs.iteritems()]

        # Turn on --gapthresh option in cmbuild to force alignment to full
        # model
        if cmbuild_params is None:
            cmbuild_params = {}
        cmbuild_params.update({'--gapthresh': 1.0})

        # record cmbuild parameters
        log_params.append('cmbuild parameters:')
        log_params.append(str(cmbuild_params))

        # Turn on --sub option in Infernal, since we know the unaligned sequences
        # are fragments.
        # Also turn on --gapthresh to use same gapthresh as was used to build
        # model
        if cmalign_params is None:
            cmalign_params = {}
        cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

        # record cmalign parameters
        log_params.append('cmalign parameters:')
        log_params.append(str(cmalign_params))

        # Align sequences to alignment including alignment gaps.
        aligned, struct_string = cmalign_from_alignment(aln=template_alignment,
                                                        structure_string=struct,
                                                        seqs=mapped_seq_tuples,
                                                        moltype=moltype,
                                                        include_aln=True,
                                                        params=cmalign_params,
                                                        cmbuild_params=cmbuild_params)

        # Pull out original sequences from full alignment.
        infernal_aligned = []
        # Get a dict of the identifiers to sequences (note that this is a
        # cogent alignment object, hence the call to NamedSeqs)
        aligned_dict = aligned.NamedSeqs
        for n, o in new_to_old_ids.iteritems():
            aligned_seq = aligned_dict[n]
            infernal_aligned.append((o, aligned_seq))

        # Create an Alignment object from alignment dict
        infernal_aligned = Alignment.from_fasta_records(infernal_aligned, DNASequence)

        if log_path is not None:
            log_file = open(log_path, 'w')
            log_file.write('\n'.join(log_params))
            log_file.close()

        if result_path is not None:
            result_file = open(result_path, 'w')
            result_file.write(infernal_aligned.to_fasta())
            result_file.close()
            return None
        else:
            try:
                return infernal_aligned
            except ValueError:
                return {}
示例#29
0
 def test_from_fasta_records(self):
     """Initialization from list of tuples functions as expected
     """
     SequenceCollection.from_fasta_records(self.seqs1_t, DNASequence)
     SequenceCollection.from_fasta_records(self.seqs2_t, RNASequence)
     SequenceCollection.from_fasta_records(self.seqs3_t, NucleotideSequence)
tggctcagattgaacgctggcggcaggcctaacacatgcaagtcgagcggaaacgantnntntgaaccttcggggnacgatnacggcgtcgagcggcggacgggtgagtaatgcctgggaaattgccctgatgtgggggataactattggaaacgatagctaataccgcataatgtctacggaccaaagagggggaccttcgggcctctcgcttcaggatatgcccaggtgggattagctagttggtgaggtaatggctcaccaaggcgacgatccctagctggtctgagaggatgatcagccacactggaactgag
"""

blast_id_to_taxonomy = \
    """AY800210\tArchaea;Euryarchaeota;Halobacteriales;uncultured
EU883771\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.
EF503699\tArchaea;Crenarchaeota;uncultured;uncultured
DQ260310\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium
EF503697\tArchaea;Crenarchaeota;uncultured;uncultured"""

blast_test_seqs = SequenceCollection.from_fasta_records([
    ('s1',
     'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'),
    ('s2',
     'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA'),
    ('s3',
     'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG'),
    ('s4',
     'GATACCCCCGGAAACTGGGGATTATACCGGATATGTGGGGCTGCCTGGAATGGTACCTCATTGAAATGCTCCCGCGCCTAAAGATGGATCTGCCGCAGAATAAGTAGTTTGCGGGGTAAATGGCCACCCAGCCAGTAATCCGTACCGGTTGTGAAAACCAGAACCCCGAGATGGAAACTGAAACAAAGGTTCAAGGCCTACCGGGCACAACAAGCGCCAAAACTCCGCCATGCGAGCCATCGCGACGGGGGAAAACCAAGTACCACTCCTAACGGGGTGGTTTTTCCGAAGTGGAAAAAGCCTCCAGGAATAAGAACCTGGGCCAGAACCGTGGCCAGCCGCCGCCGTTACACCCGCCAGCTCGAGTTGTTGGCCGGTTTTATTGGGGCCTAAAGCCGGTCCGTAGCCCGTTTTGATAAGGTCTCTCTGGTGAAATTCTACAGCTTAACCTGTGGGAATTGCTGGAGGATACTATTCAAGCTTGAAGCCGGGAGAAGCCTGGAAGTACTCCCGGGGGTAAGGGGTGAAATTCTATTATCCCCGGAAGACCAACTGGTGCCGAAGCGGTCCAGCCTGGAACCGAACTTGACCGTGAGTTACGAAAAGCCAAGGGGCGCGGACCGGAATAAAATAACCAGGGTAGTCCTGGCCGTAAACGATGTGAACTTGGTGGTGGGAATGGCTTCGAACTGCCCAATTGCCGAAAGGAAGCTGTAAATTCACCCGCCTTGGAAGTACGGTCGCAAGACTGGAACCTAAAAGGAATTGGCGGGGGGACACCACAACGCGTGGAGCCTGGCGGTTTTATTGGGATTCCACGCAGACATCTCACTCAGGGGCGACAGCAGAAATGATGGGCAGGTTGATGACCTTGCTTGACAAGCTGAAAAGGAGGTGCAT'),
    ('s5',
     'TAAAATGACTAGCCTGCGAGTCACGCCGTAAGGCGTGGCATACAGGCTCAGTAACACGTAGTCAACATGCCCAAAGGACGTGGATAACCTCGGGAAACTGAGGATAAACCGCGATAGGCCAAGGTTTCTGGAATGAGCTATGGCCGAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCGCGAAACCTCTGCAATAGGCGAAAGCCTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCTGCTCAACGGATGGGCTGCGGAGGATACCGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCATTGATCTACTGAAGACCACCAGTGGCGAAGGCGGTTTGCCAGAACGCGCTCGACGGTGAGGGATGAAAGCTGGGGGAGCAAACCGGATTAGATACCCGGGGTAGTCCCAGCTGTAAACGGATGCAGACTCGGGTGATGGGGTTGGCTTCCGGCCCAACCCCAATTGCCCCCAGGCGAAGCCCGTTAAGATCTTGCCGCCCTGTCAGATGTCAGGGCCGCCAATACTCGAAACCTTAAAAGGAAATTGGGCGCGGGAAAAGTCACCAAAAGGGGGTTGAAACCCTGCGGGTTATATATTGTAAACC'),
    ('s6', 'ATAGTAGGTGATTGCGAAGACCGCGGAACCGGGACCTAGCACCCAGCCTGTACCGAGGGATGGGGAGCTGTGGCGGTCCACCGACGACCCTTTGTGACAGCCGATTCCTACAATCCCAGCAACTGCAATGATCCACTCTAGTCGGCATAACCGGGAATCGTTAACCTGGTAGGGTTCTCTACGTCTGAGTCTACAGCCCAGAGCAGTCAGGCTACTATACGGTTTGCTGCATTGCATAGGCATCGGTCGCGGGCACTCCTCGCGGTTTCAGCTAGGGTTTAAATGGAGGGTCGCTGCATGAGTATGCAAATAGTGCCACTGCTCTGATACAGAGAAGTGTTGATATGACACCTAAGACCTGGTCACAGTTTTAACCTGCCTACGCACACCAGTGTGCTATTGATTAACGATATCGGTAGACACGACCTTGGTAACCTGACTAACCTCATGGAAAGTGACTAGATAAATGGACCGGAGCCAACTTTCACCCGGAAAACGGACCGACGAATCGTCGTAGACTACCGATCTGACAAAATAAGCACGAGGGAGCATGTTTTGCGCAGGCTAGCCTATTCCCACCTCAAGCCTCGAGAACCAAGACGCCTGATCCGGTGCTGCACGAAGGGTCGCCTCTAGGTAAGGAGAGCTGGCATCTCCAGATCCGATATTTTACCCAACCTTTGCGCGCTCAGATTGTTATAGTGAAACGATTTAAGCCTGAACGGAGTTCCGCTCCATATGTGGGTTATATATGTGAGATGTATTAACTTCCGCAGTTGTCTCTTTCGGTGCAGTACGCTTGGTATGTGTCTCAAATAATCGGTATTATAGTGATCTGAGAGGTTTTAAG')], DNA)


blast_reference_seqs = SequenceCollection.from_fasta_records([
    ('AY800210',
     'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'),
    ('EU883771',
     'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA'),
    ('EF503699',
     'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG'),
    ('DQ260310',
示例#31
0
    def setUp(self):
        fd, self.pynast_test1_input_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='.fasta')
        close(fd)
        with open(self.pynast_test1_input_fp, 'w') as f:
            f.write(pynast_test1_input_fasta)

        fd, self.pynast_test1_template_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='template.fasta')
        close(fd)
        with open(self.pynast_test1_template_fp, 'w') as f:
            f.write(pynast_test1_template_fasta)

        fd, self.pynast_test_template_w_dots_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='template.fasta')
        close(fd)
        with open(self.pynast_test_template_w_dots_fp, 'w') as f:
            f.write(pynast_test1_template_fasta.replace('-', '.'))

        fd, self.pynast_test_template_w_u_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='template.fasta')
        close(fd)
        with open(self.pynast_test_template_w_u_fp, 'w') as f:
            f.write(pynast_test1_template_fasta.replace('T', 'U'))

        fd, self.pynast_test_template_w_lower_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='template.fasta')
        close(fd)
        with open(self.pynast_test_template_w_lower_fp, 'w') as f:
            f.write(pynast_test1_template_fasta.lower())

        # create temp file names (and touch them so we can reliably
        # clean them up)
        fd, self.result_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='.fasta')
        close(fd)
        open(self.result_fp, 'w').close()
        fd, self.failure_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='.fasta')
        close(fd)
        open(self.failure_fp, 'w').close()
        fd, self.log_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='.log')
        close(fd)
        open(self.log_fp, 'w').close()

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp
        ]

        self.pynast_test1_aligner = PyNastAligner({
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 15,
        })

        self.pynast_test1_expected_aln = Alignment.from_fasta_records(
                parse_fasta(pynast_test1_expected_alignment),
                    DNA)
        self.pynast_test1_expected_fail = SequenceCollection.from_fasta_records(
                parse_fasta(pynast_test1_expected_failure), DNA)
示例#32
0
    def setUp(self):
        """Initialize values to be used in tests
        """
        self.d1 = DNASequence('GATTACA', id="d1")
        self.d2 = DNASequence('TTG', id="d2")
        self.d1_lower = DNASequence('gattaca', id="d1")
        self.d2_lower = DNASequence('ttg', id="d2")
        self.r1 = RNASequence('GAUUACA', id="r1")
        self.r2 = RNASequence('UUG', id="r2")
        self.r3 = RNASequence('U-----UGCC--', id="r3")

        self.i1 = DNASequence('GATXACA', id="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2

        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])
示例#33
0
class SequenceCollectionTests(TestCase):
    """Tests of the SequenceCollection class """
    def setUp(self):
        """Initialize values to be used in tests
        """
        self.d1 = DNASequence('GATTACA', id="d1")
        self.d2 = DNASequence('TTG', id="d2")
        self.d1_lower = DNASequence('gattaca', id="d1")
        self.d2_lower = DNASequence('ttg', id="d2")
        self.r1 = RNASequence('GAUUACA', id="r1")
        self.r2 = RNASequence('UUG', id="r2")
        self.r3 = RNASequence('U-----UGCC--', id="r3")

        self.i1 = DNASequence('GATXACA', id="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2

        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        """Initialization functions as expected with varied input types
        """
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        """initialization with sequences with overlapping ids fails
        """
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        SequenceCollection(self.seqs1, validate=True)
        SequenceCollection(self.seqs1, validate=True)
        # can't validate self.seqs2 as a DNASequence
        self.assertRaises(SequenceCollectionError,
                          SequenceCollection,
                          self.invalid_s1,
                          validate=True)

    def test_from_fasta_records(self):
        """Initialization from list of tuples functions as expected
        """
        SequenceCollection.from_fasta_records(self.seqs1_t, DNASequence)
        SequenceCollection.from_fasta_records(self.seqs2_t, RNASequence)
        SequenceCollection.from_fasta_records(self.seqs3_t, NucleotideSequence)

    def test_contains(self):
        """in operator functions as expected
        """
        self.assertTrue('d1' in self.s1)
        self.assertTrue('r2' in self.s2)
        self.assertFalse('r2' in self.s1)

    def test_eq(self):
        """equality operator functions as expected
        """
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertFalse(self.s1 == FakeSequenceCollection([self.d1, self.d2]))
        self.assertFalse(self.s1 == Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        """getitem functions as expected
        """
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        """iter functions as expected
        """
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_len(self):
        """len functions as expected
        """
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        """inequality operator functions as expected
        """
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertTrue(self.s1 != FakeSequenceCollection([self.d1, self.d2]))
        self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        """repr functions as expected
        """
        self.assertEqual(
            repr(self.s1), "<SequenceCollection: n=2; "
            "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(
            repr(self.s2), "<SequenceCollection: n=3; "
            "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(
            repr(self.s3), "<SequenceCollection: n=5; "
            "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(
            repr(self.empty), "<SequenceCollection: n=0; "
            "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        """reversed functions as expected
        """
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        expected1 = defaultdict(int)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(int)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.k_word_frequencies(k=1),
                         [expected1, expected2])

        expected1 = defaultdict(int)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(int)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(self.s1.k_word_frequencies(k=3, overlapping=False),
                         [expected1, expected2])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    def test_str(self):
        """str functions as expected
        """
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp4 = ""
        self.assertEqual(str(self.empty), exp4)

    def test_distribution_stats(self):
        """distribution_stats functions as expected
        """
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        """degap functions as expected
        """
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        """getseq functions asexpected
        """
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_ids(self):
        """ids functions as expected
        """
        self.assertEqual(self.s1.ids(), ['d1', 'd2'])
        self.assertEqual(self.s2.ids(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.ids(), ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.ids(), [])

    def test_int_map(self):
        """int_map functions as expected
        """
        expected1 = {"1": self.d1, "2": self.d2}
        expected2 = {"1": "d1", "2": "d2"}
        self.assertEqual(self.s1.int_map(), (expected1, expected2))

        expected1 = {"h-1": self.d1, "h-2": self.d2}
        expected2 = {"h-1": "d1", "h-2": "d2"}
        self.assertEqual(self.s1.int_map(prefix='h-'), (expected1, expected2))

    def test_is_empty(self):
        """is_empty functions as expected
        """
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.s1.is_valid())
        self.assertTrue(self.s2.is_valid())
        self.assertTrue(self.s3.is_valid())
        self.assertTrue(self.empty.is_valid())

        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        """iteritems functions as expected
        """
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.id, s) for s in self.s1])

    def test_lower(self):
        """lower functions as expected
        """
        self.assertEqual(self.s1.lower(), self.s1_lower)

    def test_sequence_count(self):
        """num_seqs functions as expected
        """
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        """sequence_lengths functions as expected
        """
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])

    def test_to_fasta(self):
        """to_fasta functions as expected
        """
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(self.s1.to_fasta(), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(self.s2.to_fasta(), exp2)

    def test_upper(self):
        """upper functions as expected
        """
        self.assertEqual(self.s1_lower.upper(), self.s1)