예제 #1
0
    def test_omit_gap_positions(self):
        # At these thresholds the alignment is expected back unchanged.
        for cutoff in (1.0, 0.51):
            self.assertEqual(self.a2.omit_gap_positions(cutoff), self.a2)

        # At the lower thresholds the expected result is the four-position
        # alignment below; a fresh expectation is built for each check.
        for cutoff in (0.49, 0.0):
            trimmed = Alignment([
                RNA('UUAU', metadata={'id': "r1"}),
                RNA('ACGU', metadata={'id': "r2"}),
            ])
            self.assertEqual(self.a2.omit_gap_positions(cutoff), trimmed)

        # The empty alignment is expected to stay empty at every threshold.
        for cutoff in (0.0, 0.49, 1.0):
            self.assertEqual(self.empty.omit_gap_positions(cutoff),
                             self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        n_seqs = 33
        gap_only = Alignment([DNA('-.', metadata={'id': str(idx)})
                              for idx in range(n_seqs)])
        no_positions = Alignment([DNA('', metadata={'id': str(idx)})
                                  for idx in range(n_seqs)])
        just_below_one = 1 - np.finfo(float).eps
        self.assertEqual(gap_only.omit_gap_positions(just_below_one),
                         no_positions)
예제 #2
0
    def test_update_ids_sequence_attributes_propagated(self):
        # After update_ids, only the ids should differ: description and
        # quality are expected to carry over to the returned collection.

        # --- one sequence ---
        source = Alignment([
            DNA('ACGT', id="seq1", description='desc', quality=range(4))
        ])
        renamed = Alignment([
            DNA('ACGT', id="abc", description='desc', quality=range(4))
        ])

        result, new_to_old = source.update_ids(ids=('abc',))
        self._assert_sequence_collections_equal(result, renamed)
        self.assertEqual(new_to_old, {'abc': 'seq1'})

        # --- two sequences ---
        source = Alignment([
            DNA('ACGT', id="seq1", description='desc1', quality=(0, 1, 2, 3)),
            DNA('TGCA', id="seq2", description='desc2', quality=(3, 2, 1, 0))
        ])
        renamed = Alignment([
            DNA('ACGT', id="abc", description='desc1', quality=range(4)),
            DNA('TGCA', id="def", description='desc2', quality=range(4)[::-1])
        ])

        result, new_to_old = source.update_ids(ids=('abc', 'def'))
        self._assert_sequence_collections_equal(result, renamed)
        self.assertEqual(new_to_old, {'abc': 'seq1', 'def': 'seq2'})
예제 #3
0
 def test_compute_score_and_traceback_matrices_invalid(self):
     # A sequence character that is absent from the substitution matrix
     # should produce an informative error (ValueError).
     sub_matrix = make_identity_substitution_matrix(2, -1)

     # Inputs are built up front so that only the matrix computation itself
     # runs under the assertRaises guard.
     aln_a = Alignment([DNA('AWG')])
     aln_b = Alignment([DNA('ACGT')])

     with self.assertRaises(ValueError):
         _compute_score_and_traceback_matrices(aln_a, aln_b, 5, 2,
                                               sub_matrix)
예제 #4
0
def compute_distance_matrix(msa_file, csvfile="distance_mat.csv"):
    """
    Load aligned sequences and write their distance matrix to a csv file.

    Each record parsed from the fasta alignment is wrapped in an ``RNA``
    object, the records are collected into an ``Alignment`` and the matrix
    returned by ``Alignment.distances()`` is written to ``csvfile``: first a
    header row with the matrix ids, then whatever rows iterating the matrix
    yields.

    The original note on this function states that the distances between the
    sequences are computed with the hamming function.

    see also:
    scipy.spatial.distance.hamming

    @args msa_file: multiple sequence alignment in fasta format
    @type msa_file: str
    @args csvfile: output distance matrix file in csv format
    @type csvfile: str
    """

    records = [RNA(rec.seq, rec.id) for rec in SeqIO.parse(msa_file, "fasta")]

    master_dm = Alignment(records).distances()

    ## writing the result to a csv file
    # The ``with`` block closes the file when it exits, so no explicit
    # close() call is needed (or made) afterwards.
    with open(csvfile, "w") as output:
        writer = csv.writer(output, lineterminator="\n")

        # header row: the ids of the distance matrix
        writer.writerow(list(master_dm.ids))

        ## result as a list of list
        writer.writerows(master_dm)
예제 #5
0
    def test_to_phylip_no_positions(self):
        # Both sequences are empty strings; to_phylip() is expected to
        # raise SequenceCollectionError for such an alignment.
        empty_seqs = [DNASequence('', id="d1"), DNASequence('', id="d2")]
        aln = Alignment(empty_seqs)

        self.assertRaises(SequenceCollectionError, aln.to_phylip)
예제 #6
0
    def test_to_phylip_no_positions(self):
        # Both sequences are empty strings; to_phylip() is expected to
        # raise SequenceCollectionError for such an alignment.
        empty_seqs = [DNASequence('', id="d1"), DNASequence('', id="d2")]
        aln = Alignment(empty_seqs)

        self.assertRaises(SequenceCollectionError, aln.to_phylip)
예제 #7
0
def compute_distance_matrix(msa_file, csvfile="distance_mat.csv"):
    """
    Load aligned sequences and write their distance matrix to a csv file.

    Each record parsed from the fasta alignment is wrapped in an ``RNA``
    object, the records are collected into an ``Alignment`` and the matrix
    returned by ``Alignment.distances()`` is written to ``csvfile``: first a
    header row with the matrix ids, then whatever rows iterating the matrix
    yields.

    The original note on this function states that the distances between the
    sequences are computed with the hamming function.

    see also:
    scipy.spatial.distance.hamming

    @args msa_file: multiple sequence alignment in fasta format
    @type msa_file: str
    @args csvfile: output distance matrix file in csv format
    @type csvfile: str
    """

    records = [RNA(rec.seq, rec.id) for rec in SeqIO.parse(msa_file, "fasta")]

    master_dm = Alignment(records).distances()

    ## writing the result to a csv file
    # The ``with`` block closes the file when it exits, so no explicit
    # close() call is needed (or made) afterwards.
    with open(csvfile, "w") as output:
        writer = csv.writer(output, lineterminator="\n")

        # header row: the ids of the distance matrix
        writer.writerow(list(master_dm.ids))

        ## result as a list of list
        writer.writerows(master_dm)
예제 #8
0
 def test_init_matrices_sw(self):
     # Expected output for a 3-long vs. 4-long input pair: an all-zero 5x4
     # score matrix, and a traceback matrix whose first row and first
     # column are 0 with -1 in every other cell.
     n_rows, n_cols = 5, 4
     exp_scores = np.zeros((n_rows, n_cols))
     exp_traceback = [[0, 0, 0, 0]]
     exp_traceback.extend([0, -1, -1, -1] for _ in range(n_rows - 1))

     aln_a = Alignment([DNA('AAA')])
     aln_b = Alignment([DNA('AAAA')])
     obs_scores, obs_traceback = _init_matrices_sw(aln_a, aln_b, 5, 2)

     np.testing.assert_array_equal(obs_scores, exp_scores)
     np.testing.assert_array_equal(obs_traceback, exp_traceback)
예제 #9
0
    def test_to_phylip_unequal_sequence_lengths(self):
        # The middle sequence is one character shorter than the others;
        # to_phylip() is expected to raise SequenceCollectionError.
        ragged_seqs = [
            DNASequence('A-CT', id="d1"),
            DNASequence('TTA', id="d2"),
            DNASequence('.-AC', id="d3"),
        ]
        aln = Alignment(ragged_seqs)

        self.assertRaises(SequenceCollectionError, aln.to_phylip)
예제 #10
0
    def test_to_phylip_unequal_sequence_lengths(self):
        # The middle sequence is one character shorter than the others;
        # to_phylip() is expected to raise SequenceCollectionError.
        ragged_seqs = [
            DNASequence('A-CT', id="d1"),
            DNASequence('TTA', id="d2"),
            DNASequence('.-AC', id="d3"),
        ]
        aln = Alignment(ragged_seqs)

        self.assertRaises(SequenceCollectionError, aln.to_phylip)
예제 #11
0
 def test_init_matrices_nw(self):
     # Expected output for a 3-long vs. 4-long input pair: only the first
     # row and first column of the score matrix are non-zero; the
     # traceback matrix holds 3 along the first row, 2 down the first
     # column and -1 in the interior.
     first_col_scores = (-5, -7, -9, -11)
     exp_scores = [[0, -5, -7, -9]]
     exp_scores.extend([score, 0, 0, 0] for score in first_col_scores)

     exp_traceback = [[0, 3, 3, 3]]
     exp_traceback.extend([2, -1, -1, -1] for _ in first_col_scores)

     aln_a = Alignment([DNA('AAA')])
     aln_b = Alignment([DNA('AAAA')])
     obs_scores, obs_traceback = _init_matrices_nw(aln_a, aln_b, 5, 2)

     np.testing.assert_array_equal(obs_scores, exp_scores)
     np.testing.assert_array_equal(obs_traceback, exp_traceback)
예제 #12
0
    def test_validate_lengths(self):
        # The shared fixtures are all expected to pass the length check.
        for aln in (self.a1, self.a2, self.empty):
            self.assertTrue(aln._validate_lengths())

        # A single-sequence alignment should pass as well.
        single = Alignment([DNASequence('TTT', id="d1")])
        self.assertTrue(single._validate_lengths())

        # Sequences of length 3 and 2 should fail the check.
        ragged = Alignment([DNASequence('TTT', id="d1"),
                            DNASequence('TT', id="d2")])
        self.assertFalse(ragged._validate_lengths())
예제 #13
0
    def test_to_phylip_map_labels(self):
        """to_phylip with map_labels=True relabels ids using the prefix
        """
        seqs = [
            DNASequence("..ACC-GTTGG..", id="d1"),
            DNASequence("TTACCGGT-GGCC", id="d2"),
            DNASequence(".-ACC-GTTGC--", id="d3"),
        ]
        aln = Alignment(seqs)

        obs_phylip, obs_id_map = aln.to_phylip(map_labels=True,
                                               label_prefix="s")

        # new label -> original id
        exp_id_map = {"s1": "d1", "s2": "d2", "s3": "d3"}
        self.assertEqual(obs_id_map, exp_id_map)

        # header line ("<n seqs> <n positions>") followed by one line per
        # sequence, with no trailing newline
        exp_phylip = ("3 13\n"
                      "s1 ..ACC-GTTGG..\n"
                      "s2 TTACCGGT-GGCC\n"
                      "s3 .-ACC-GTTGC--")
        self.assertEqual(obs_phylip, exp_phylip)
예제 #14
0
    def test_to_phylip(self):
        """to_phylip with map_labels=False keeps the original ids
        """
        seqs = [
            DNASequence("..ACC-GTTGG..", id="d1"),
            DNASequence("TTACCGGT-GGCC", id="d2"),
            DNASequence(".-ACC-GTTGC--", id="d3"),
        ]
        aln = Alignment(seqs)

        obs_phylip, obs_id_map = aln.to_phylip(map_labels=False)

        # without label mapping every id maps to itself
        exp_id_map = {"d1": "d1", "d2": "d2", "d3": "d3"}
        self.assertEqual(obs_id_map, exp_id_map)

        # header line ("<n seqs> <n positions>") followed by one line per
        # sequence, with no trailing newline
        exp_phylip = ("3 13\n"
                      "d1 ..ACC-GTTGG..\n"
                      "d2 TTACCGGT-GGCC\n"
                      "d3 .-ACC-GTTGC--")
        self.assertEqual(obs_phylip, exp_phylip)
예제 #15
0
    def test_to_phylip_map_labels(self):
        """to_phylip with map_labels=True relabels ids using the prefix
        """
        seqs = [
            DNASequence('..ACC-GTTGG..', id="d1"),
            DNASequence('TTACCGGT-GGCC', id="d2"),
            DNASequence('.-ACC-GTTGC--', id="d3"),
        ]
        aln = Alignment(seqs)

        obs_phylip, obs_id_map = aln.to_phylip(map_labels=True,
                                               label_prefix="s")

        # new label -> original id
        exp_id_map = {'s1': 'd1', 's2': 'd2', 's3': 'd3'}
        self.assertEqual(obs_id_map, exp_id_map)

        # header line ("<n seqs> <n positions>") followed by one line per
        # sequence, with no trailing newline
        exp_phylip = ("3 13\n"
                      "s1 ..ACC-GTTGG..\n"
                      "s2 TTACCGGT-GGCC\n"
                      "s3 .-ACC-GTTGC--")
        self.assertEqual(obs_phylip, exp_phylip)
예제 #16
0
    def test_majority_consensus(self):
        # Three sequences: the expected consensus is 'TT-'.
        seqs = [
            DNASequence('TTT', id="d1"),
            DNASequence('TT-', id="d2"),
            DNASequence('TC-', id="d3"),
        ]
        consensus = Alignment(seqs).majority_consensus()
        self.assertTrue(consensus.equals(DNASequence('TT-')))

        # Two different single-character sequences: either one is accepted
        # as the consensus.
        tied = Alignment([DNASequence('T', id="d1"),
                          DNASequence('A', id="d2")])
        acceptable = [DNASequence('T'), DNASequence('A')]
        self.assertTrue(tied.majority_consensus() in acceptable)

        # The empty alignment's consensus should compare equal to ''.
        self.assertEqual(self.empty.majority_consensus(), '')
예제 #17
0
    def test_to_phylip(self):
        """to_phylip with map_labels=False keeps the original ids
        """
        seqs = [
            DNASequence('..ACC-GTTGG..', id="d1"),
            DNASequence('TTACCGGT-GGCC', id="d2"),
            DNASequence('.-ACC-GTTGC--', id="d3"),
        ]
        aln = Alignment(seqs)

        obs_phylip, obs_id_map = aln.to_phylip(map_labels=False)

        # without label mapping every id maps to itself
        exp_id_map = {'d1': 'd1', 'd2': 'd2', 'd3': 'd3'}
        self.assertEqual(obs_id_map, exp_id_map)

        # header line ("<n seqs> <n positions>") followed by one line per
        # sequence, with no trailing newline
        exp_phylip = ("3 13\n"
                      "d1 ..ACC-GTTGG..\n"
                      "d2 TTACCGGT-GGCC\n"
                      "d3 .-ACC-GTTGC--")
        self.assertEqual(obs_phylip, exp_phylip)
예제 #18
0
    def test_update_ids_sequence_attributes_propagated(self):
        # After update_ids, only the ids should differ: the description
        # metadata and the positional 'quality' metadata are expected to
        # carry over to the returned collection.

        def make_dna(residues, seq_id, description, quality):
            # Local factory for a DNA object with id/description metadata
            # and positional quality metadata.
            return DNA(residues,
                       metadata={'id': seq_id, 'description': description},
                       positional_metadata={'quality': quality})

        # --- one sequence ---
        renamed = Alignment([make_dna('ACGT', "abc", 'desc', range(4))])
        source = Alignment([make_dna('ACGT', "seq1", 'desc', range(4))])

        result, new_to_old = source.update_ids(ids=('abc', ))
        self.assertEqual(result, renamed)
        self.assertEqual(new_to_old, {'abc': 'seq1'})

        # --- two sequences ---
        renamed = Alignment([
            make_dna('ACGT', "abc", 'desc1', range(4)),
            make_dna('TGCA', "def", 'desc2', range(4)[::-1]),
        ])
        source = Alignment([
            make_dna('ACGT', "seq1", 'desc1', (0, 1, 2, 3)),
            make_dna('TGCA', "seq2", 'desc2', (3, 2, 1, 0)),
        ])

        result, new_to_old = source.update_ids(ids=('abc', 'def'))
        self.assertEqual(result, renamed)
        self.assertEqual(new_to_old, {'abc': 'seq1', 'def': 'seq2'})
예제 #19
0
    def test_update_ids_sequence_attributes_propagated(self):
        # After update_ids, only the ids should differ: description and
        # quality are expected to carry over to the returned collection.

        # --- one sequence ---
        source = Alignment([
            DNA('ACGT', id="seq1", description='desc', quality=range(4))
        ])
        renamed = Alignment([
            DNA('ACGT', id="abc", description='desc', quality=range(4))
        ])

        result, new_to_old = source.update_ids(ids=('abc',))
        self._assert_sequence_collections_equal(result, renamed)
        self.assertEqual(new_to_old, {'abc': 'seq1'})

        # --- two sequences ---
        source = Alignment([
            DNA('ACGT', id="seq1", description='desc1', quality=(0, 1, 2, 3)),
            DNA('TGCA', id="seq2", description='desc2', quality=(3, 2, 1, 0))
        ])
        renamed = Alignment([
            DNA('ACGT', id="abc", description='desc1', quality=range(4)),
            DNA('TGCA', id="def", description='desc2', quality=range(4)[::-1])
        ])

        result, new_to_old = source.update_ids(ids=('abc', 'def'))
        self._assert_sequence_collections_equal(result, renamed)
        self.assertEqual(new_to_old, {'abc': 'seq1', 'def': 'seq2'})
예제 #20
0
    def setUp(self):
        # Builds the fixtures for the phylip writer tests:
        #   self.objs          alignments that can be written in phylip format
        #   self.fps           data-file paths, one per entry of self.objs
        #   self.invalid_objs  (alignment, error message regexp) pairs

        # ids all same length, seqs longer than 10 chars
        dna_3_seqs = Alignment([
            DNA('..ACC-GTTGG..', id="d1"),
            DNA('TTACCGGT-GGCC', id="d2"),
            DNA('.-ACC-GTTGC--', id="d3")
        ])

        # id lengths from 0 to 10, with mixes of numbers, characters, and
        # spaces. sequence characters are a mix of cases and gap characters.
        # sequences are shorter than 10 chars
        variable_length_ids = Alignment([
            RNA('.-ACGU'),
            RNA('UGCA-.', id='a'),
            RNA('.ACGU-', id='bb'),
            RNA('ugca-.', id='1'),
            RNA('AaAaAa', id='abcdefghij'),
            RNA('GGGGGG', id='ab def42ij')
        ])

        # sequences with 20 chars = exactly two chunks of size 10
        two_chunks = Alignment([
            DNA('..ACC-GTTGG..AATGC.C', id='foo'),
            DNA('TTACCGGT-GGCCTA-GCAT', id='bar')
        ])

        # single sequence with more than two chunks
        single_seq_long = Alignment(
            [DNA('..ACC-GTTGG..AATGC.C----', id='foo')])

        # single sequence with only a single character (minimal writeable
        # alignment)
        single_seq_short = Alignment([DNA('-')])

        # alignments that can be written in phylip format
        self.objs = [
            dna_3_seqs, variable_length_ids, two_chunks, single_seq_long,
            single_seq_short
        ]

        # Materialise the paths as a list: on Python 3 a bare map() is a
        # one-shot iterator, so it could only be walked once and would not
        # support len() or indexing. On Python 2 this is a no-op copy.
        self.fps = list(map(get_data_path, [
            'phylip_dna_3_seqs', 'phylip_variable_length_ids',
            'phylip_two_chunks', 'phylip_single_seq_long',
            'phylip_single_seq_short'
        ]))

        # alignments that cannot be written in phylip format, paired with their
        # expected error message regexps
        self.invalid_objs = [
            # no seqs
            (Alignment([]), 'one sequence'),

            # no positions
            (Alignment([DNA('', id="d1"), DNA('', id="d2")]), 'one position'),

            # ids too long
            (Alignment(
                [RNA('ACGU', id="foo"),
                 RNA('UGCA', id="alongsequenceid")]), '10.*alongsequenceid')
        ]
예제 #21
0
    def test_majority_consensus(self):
        """majority_consensus returns the expected consensus sequence
        """
        # Three sequences: the expected consensus is "TT-".
        seqs = [
            DNASequence("TTT", id="d1"),
            DNASequence("TT-", id="d2"),
            DNASequence("TC-", id="d3"),
        ]
        consensus = Alignment(seqs).majority_consensus()
        self.assertEqual(consensus, DNASequence("TT-"))

        # Two different single-character sequences: either one is accepted
        # as the consensus.
        tied = Alignment([DNASequence("T", id="d1"),
                          DNASequence("A", id="d2")])
        acceptable = [DNASequence("T"), DNASequence("A")]
        self.assertTrue(tied.majority_consensus() in acceptable)

        # The empty alignment's consensus should compare equal to "".
        self.assertEqual(self.empty.majority_consensus(), "")
예제 #22
0
    def test_subalignment(self):
        """subalignment filters by sequence and/or position as expected
        """
        aln = self.a1
        kept_positions = [0, 2, 3]

        # sequences selected by id
        obs = aln.subalignment(seqs_to_keep=['d1', 'd3'])
        self.assertEqual(obs, Alignment([self.d1, self.d3]))

        # sequences selected by index
        obs = aln.subalignment(seqs_to_keep=[0, 2])
        self.assertEqual(obs, Alignment([self.d1, self.d3]))

        # sequences selected by id, selection inverted
        obs = aln.subalignment(seqs_to_keep=['d1', 'd3'],
                               invert_seqs_to_keep=True)
        self.assertEqual(obs, Alignment([self.d2]))

        # sequences selected by index, selection inverted
        obs = aln.subalignment(seqs_to_keep=[0, 2],
                               invert_seqs_to_keep=True)
        self.assertEqual(obs, Alignment([self.d2]))

        # positions only
        obs = aln.subalignment(positions_to_keep=kept_positions)
        exp = Alignment([
            DNASequence('.AC', id="d1"),
            DNASequence('TAC', id="d2"),
            DNASequence('.AC', id="d3"),
        ])
        self.assertEqual(obs, exp)

        # positions only, selection inverted
        obs = aln.subalignment(positions_to_keep=kept_positions,
                               invert_positions_to_keep=True)
        exp = Alignment([
            DNASequence('.C-GTTGG..', id="d1"),
            DNASequence('TCGGT-GGCC', id="d2"),
            DNASequence('-C-GTTGC--', id="d3"),
        ])
        self.assertEqual(obs, exp)

        # sequences and positions together
        obs = aln.subalignment(seqs_to_keep=[0, 2],
                               positions_to_keep=kept_positions)
        exp = Alignment([
            DNASequence('.AC', id="d1"),
            DNASequence('.AC', id="d3"),
        ])
        self.assertEqual(obs, exp)

        # sequences and positions together, both selections inverted
        obs = aln.subalignment(seqs_to_keep=[0, 2],
                               positions_to_keep=kept_positions,
                               invert_seqs_to_keep=True,
                               invert_positions_to_keep=True)
        exp = Alignment([DNASequence('TCGGT-GGCC', id="d2")])
        self.assertEqual(obs, exp)
예제 #23
0
    def test_is_valid(self):
        """is_valid accepts the fixtures and rejects malformed alignments
        """
        for aln in (self.a1, self.a2, self.empty):
            self.assertTrue(aln.is_valid())

        # invalid because of length mismatch (13 vs. 12 characters)
        mismatched = Alignment([
            DNASequence('..ACC-GTTGG..', id="d1"),
            DNASequence('TTACCGGT-GGC', id="d2"),
        ])
        self.assertFalse(mismatched.is_valid())

        # invalid because of invalid characters ('X' in the first sequence)
        bad_chars = Alignment([
            DNASequence('..ACC-GTXGG..', id="d1"),
            DNASequence('TTACCGGT-GGCC', id="d2"),
        ])
        self.assertFalse(bad_chars.is_valid())
예제 #24
0
    def test_fastq_to_alignment(self):
        # For every valid file / reader-kwargs combination, the alignment
        # read from the file must equal one built by hand from the expected
        # (id, description, sequence, quality) components.
        for fastq_files, kwarg_sets, records in self.valid_configurations:
            for fastq_fp in fastq_files:
                for reader_kwargs in kwarg_sets:
                    _drop_kwargs(reader_kwargs, 'seq_num')
                    seq_cls = reader_kwargs.get('constructor', Sequence)

                    # the same 'lowercase' setting goes to the expected
                    # sequences and to the reader
                    seq_kwargs = {'lowercase': 'introns'}
                    reader_kwargs['lowercase'] = 'introns'

                    exp_seqs = []
                    for rec in records:
                        quality = np.array(rec[3], dtype=np.uint8)
                        exp_seqs.append(seq_cls(
                            rec[2],
                            metadata={'id': rec[0], 'description': rec[1]},
                            positional_metadata={'quality': quality},
                            **seq_kwargs))
                    exp = Alignment(exp_seqs)

                    obs = _fastq_to_alignment(fastq_fp, **reader_kwargs)
                    self.assertEqual(obs, exp)
예제 #25
0
    def test_generate_lane_mask(self):
        # Four-sequence, five-column alignment given as fasta lines.
        fasta_lines = """>1
        AAAAT
        >2
        AAAGG
        >3
        AACCC
        >4
        A----""".split('\n')
        aln = Alignment.from_fasta_records(parse_fasta(fasta_lines), DNA)

        # (threshold, expected lane mask) pairs, checked in this order
        cases = [
            (0.00, "11111"),
            (0.10, "11100"),
            (0.20, "11100"),
            (0.40, "11000"),
            (0.60, "11000"),
            (0.80, "10000"),
            (1.00, "00000"),
        ]
        for threshold, exp_mask in cases:
            obs_mask = generate_lane_mask(aln, threshold)
            self.assertEqual(obs_mask, exp_mask)
예제 #26
0
def align_two_alignments(aln1_fp, aln2_fp, moltype, params=None):
    """Align two existing alignments against each other with mafft-profile.

    Parameters
    ----------
    aln1_fp : string
        file path of 1st alignment
    aln2_fp : string
        file path of 2nd alignment
    moltype : callable
        passed to ``Alignment.read`` as ``constructor`` when the output is
        parsed
    params : dict of parameters to pass in to the Mafft app controller.

    Returns
    -------
        An Alignment read from the standard output of the command.
    """
    # Configure the Mafft controller to take file paths as input, then
    # switch the executable it runs to the profile aligner.
    profile_aligner = Mafft(InputHandler='_input_as_paths',
                            params=params,
                            SuppressStderr=False)
    profile_aligner._command = 'mafft-profile'

    # Run on the two alignment files and parse what was written to stdout.
    result = profile_aligner([aln1_fp, aln2_fp])
    stdout = result['StdOut']

    return Alignment.read(stdout, constructor=moltype)
예제 #27
0
    def test_validate_lengths(self):
        # The shared fixtures are all expected to pass the length check.
        for aln in (self.a1, self.a2, self.empty):
            self.assertTrue(aln._validate_lengths())

        # A single-sequence alignment should pass as well.
        single = Alignment([DNA('TTT', metadata={'id': "d1"})])
        self.assertTrue(single._validate_lengths())
예제 #28
0
    def test_fastq_to_alignment(self):
        # For every valid file / reader-kwargs combination, the alignment
        # read from the file must equal one built by hand from the expected
        # (id, description, sequence, quality) components.
        for fastq_files, kwarg_sets, records in self.valid_configurations:
            for fastq_fp in fastq_files:
                for reader_kwargs in kwarg_sets:
                    _drop_kwargs(reader_kwargs, 'seq_num')
                    seq_cls = reader_kwargs.get('constructor', Sequence)

                    # Can't use partials for this because the read
                    # function below can't operate on partials
                    seq_kwargs = {}
                    if hasattr(seq_cls, 'lowercase'):
                        # same 'lowercase' setting for the expected
                        # sequences and for the reader
                        seq_kwargs['lowercase'] = 'introns'
                        reader_kwargs['lowercase'] = 'introns'

                    exp_seqs = []
                    for rec in records:
                        quality = np.array(rec[3], dtype=np.uint8)
                        exp_seqs.append(seq_cls(
                            rec[2],
                            metadata={'id': rec[0], 'description': rec[1]},
                            positional_metadata={'quality': quality},
                            **seq_kwargs))
                    exp = Alignment(exp_seqs)

                    obs = _fastq_to_alignment(fastq_fp, **reader_kwargs)
                    self.assertEqual(obs, exp)
예제 #29
0
 def setUp(self):
     """Build the string, sequence and alignment fixtures for Fasta tests."""
     # raw sequence strings and their fasta renderings
     self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']
     self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
     self.fasta_with_label = (
         '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU')
     # same records, sequence lines wrapped at two characters
     self.fasta_with_label_lw2 = (
         '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU')

     # label -> sequence mapping
     self.alignment_dict = {'1st': 'AAAA', '2nd': 'CCCC',
                            '3rd': 'GGGG', '4th': 'UUUU'}

     # the same two records as DNASequence and as BiologicalSequence objects
     self.sequence_objects_a = [DNASequence('ACTCGAGATC', 'seq1'),
                                DNASequence('GGCCT', 'seq2')]
     self.sequence_objects_b = [BiologicalSequence('ACTCGAGATC', 'seq1'),
                                BiologicalSequence('GGCCT', 'seq2')]

     # two-sequence gapped alignment
     self.alignment = Alignment([
         DNASequence("ACC--G-GGTA..", id="seq1"),
         DNASequence("TCC--G-GGCA..", id="seqs2"),
     ])
예제 #30
0
    def test_generate_lane_mask(self):
        # Four-sequence, five-column alignment given as fasta lines.
        fasta_lines = """>1
        AAAAT
        >2
        AAAGG
        >3
        AACCC
        >4
        A----""".split('\n')
        aln = Alignment.from_fasta_records(parse_fasta(fasta_lines), DNA)

        # (threshold, expected lane mask) pairs, checked in this order
        cases = [
            (0.00, "11111"),
            (0.10, "11100"),
            (0.20, "11100"),
            (0.40, "11000"),
            (0.60, "11000"),
            (0.80, "10000"),
            (1.00, "00000"),
        ]
        for threshold, exp_mask in cases:
            obs_mask = generate_lane_mask(aln, threshold)
            self.assertEqual(obs_mask, exp_mask)
예제 #31
0
    def test_to_phylip_no_positions(self):
        # Both sequences are empty strings. to_phylip() is invoked through
        # npt.assert_warns(DeprecationWarning, ...) and the whole call is
        # expected to raise SequenceCollectionError.
        empty_seqs = [DNASequence('', id="d1"), DNASequence('', id="d2")]
        aln = Alignment(empty_seqs)

        self.assertRaises(SequenceCollectionError,
                          npt.assert_warns, DeprecationWarning, aln.to_phylip)
예제 #32
0
    def test_subalignment(self):
        # Exercises subalignment() with sequence filters, position filters,
        # both combined, and the inverted form of each.
        aln = self.a1
        kept_positions = [0, 2, 3]

        # sequences selected by id
        obs = aln.subalignment(seqs_to_keep=['d1', 'd3'])
        self.assertEqual(obs, Alignment([self.d1, self.d3]))

        # sequences selected by index
        obs = aln.subalignment(seqs_to_keep=[0, 2])
        self.assertEqual(obs, Alignment([self.d1, self.d3]))

        # sequences selected by id, selection inverted
        obs = aln.subalignment(seqs_to_keep=['d1', 'd3'],
                               invert_seqs_to_keep=True)
        self.assertEqual(obs, Alignment([self.d2]))

        # sequences selected by index, selection inverted
        obs = aln.subalignment(seqs_to_keep=[0, 2],
                               invert_seqs_to_keep=True)
        self.assertEqual(obs, Alignment([self.d2]))

        # positions only
        obs = aln.subalignment(positions_to_keep=kept_positions)
        exp = Alignment([
            DNA('.AC', metadata={'id': "d1"}),
            DNA('TAC', metadata={'id': "d2"}),
            DNA('.AC', metadata={'id': "d3"}),
        ])
        self.assertEqual(obs, exp)

        # positions only, selection inverted
        obs = aln.subalignment(positions_to_keep=kept_positions,
                               invert_positions_to_keep=True)
        exp = Alignment([
            DNA('.C-GTTGG..', metadata={'id': "d1"}),
            DNA('TCGGT-GGCC', metadata={'id': "d2"}),
            DNA('-C-GTTGC--', metadata={'id': "d3"}),
        ])
        self.assertEqual(obs, exp)

        # sequences and positions together
        obs = aln.subalignment(seqs_to_keep=[0, 2],
                               positions_to_keep=kept_positions)
        exp = Alignment([
            DNA('.AC', metadata={'id': "d1"}),
            DNA('.AC', metadata={'id': "d3"}),
        ])
        self.assertEqual(obs, exp)

        # sequences and positions together, both selections inverted
        obs = aln.subalignment(seqs_to_keep=[0, 2],
                               positions_to_keep=kept_positions,
                               invert_seqs_to_keep=True,
                               invert_positions_to_keep=True)
        exp = Alignment([DNA('TCGGT-GGCC', metadata={'id': "d2"})])
        self.assertEqual(obs, exp)
예제 #33
0
    def test_omit_gap_sequences(self):
        # At these thresholds the alignment is expected back unchanged.
        for cutoff in (1.0, 0.20):
            self.assertEqual(self.a2.omit_gap_sequences(cutoff), self.a2)

        # Just below 0.20 only r2 is expected to remain.
        only_r2 = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), only_r2)

        # The empty alignment is expected to stay empty at every threshold.
        for cutoff in (0.0, 0.2, 1.0):
            self.assertEqual(self.empty.omit_gap_sequences(cutoff),
                             self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        gap_only = Alignment([DNA('.' * 33, id='abc'),
                              DNA('-' * 33, id='def')])
        just_below_one = 1 - np.finfo(float).eps
        self.assertEqual(gap_only.omit_gap_sequences(just_below_one),
                         Alignment([]))
예제 #34
0
    def test_omit_gap_sequences(self):
        # At these thresholds the alignment is expected back unchanged.
        for cutoff in (1.0, 0.20):
            self.assertEqual(self.a2.omit_gap_sequences(cutoff), self.a2)

        # Just below 0.20 only r2 is expected to remain.
        only_r2 = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), only_r2)

        # The empty alignment is expected to stay empty at every threshold.
        for cutoff in (0.0, 0.2, 1.0):
            self.assertEqual(self.empty.omit_gap_sequences(cutoff),
                             self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        gap_only = Alignment([DNA('.' * 33, id='abc'),
                              DNA('-' * 33, id='def')])
        just_below_one = 1 - np.finfo(float).eps
        self.assertEqual(gap_only.omit_gap_sequences(just_below_one),
                         Alignment([]))
예제 #35
0
    def test_majority_consensus_constructor(self):
        # Passing constructor=str is expected to emit a UserWarning while
        # still returning the consensus, here as the plain string 'TT-'.
        seqs = [
            DNASequence('TTT', id="d1"),
            DNASequence('TT-', id="d2"),
            DNASequence('TC-', id="d3"),
        ]
        aln = Alignment(seqs)

        consensus = npt.assert_warns(UserWarning, aln.majority_consensus,
                                     constructor=str)
        self.assertEqual(consensus, 'TT-')
예제 #36
0
 def test_phylip_to_alignment_valid_files(self):
     # Every valid file must parse to an Alignment equal to one assembled
     # from its expected (sequence, id) components.
     for phylip_files, seq_id_pairs in self.valid_configurations:
         for phylip_fp in phylip_files:
             obs = _phylip_to_alignment(phylip_fp)

             exp_seqs = []
             for (residues, seq_id) in seq_id_pairs:
                 exp_seqs.append(
                     Sequence(residues, metadata={'id': seq_id}))
             exp = Alignment(exp_seqs)

             self.assertEqual(obs, exp)
예제 #37
0
    def test_init_validate(self):
        """construction with validate=True accepts valid and rejects
        invalid sequences
        """
        # the valid fixture sequences must construct without raising
        Alignment(self.seqs1, validate=True)

        # invalid DNA character ('X' in the extra sequence)
        with_bad_seq = [self.d1, self.d2, self.d3]
        with_bad_seq.append(DNASequence('.-ACC-GTXGC--', id="i1"))
        with self.assertRaises(SequenceCollectionError):
            Alignment(with_bad_seq, validate=True)
예제 #38
0
    def setUp(self):
        # DNA fixture sequences (13 characters each)
        self.d1 = DNASequence("..ACC-GTTGG..", id="d1")
        self.d2 = DNASequence("TTACCGGT-GGCC", id="d2")
        self.d3 = DNASequence(".-ACC-GTTGC--", id="d3")

        # RNA fixture sequences (5 characters each)
        self.r1 = RNASequence("UUAU-", id="r1")
        self.r2 = RNASequence("ACGUU", id="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        # the same data as plain (id, sequence) tuples
        self.seqs1_t = [
            ("d1", "..ACC-GTTGG.."),
            ("d2", "TTACCGGT-GGCC"),
            ("d3", ".-ACC-GTTGC--"),
        ]
        self.seqs2_t = [
            ("r1", "UUAU-"),
            ("r2", "ACGUU"),
        ]

        # a1/a2: plain alignments; a3/a4: the RNA sequences again, this time
        # with a score and start/end positions attached; empty: no sequences
        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.a3 = Alignment(self.seqs2,
                            score=42.0,
                            start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(self.seqs2,
                            score=-42.0,
                            start_end_positions=[(1, 4), (6, 10)])
        self.empty = Alignment([])
예제 #39
0
    def test_omit_gap_positions(self):
        """omit_gap_positions drops columns depending on the gap threshold.

        In ``self.a2`` only the last column contains a gap (in one of the two
        sequences), so it is the only column a threshold can remove.
        """
        # Thresholds above one half leave the alignment unchanged.
        for cutoff in (1.0, 0.51):
            self.assertEqual(self.a2.omit_gap_positions(cutoff), self.a2)

        # Thresholds below one half strip the final, half-gap column.
        for cutoff in (0.49, 0.0):
            trimmed = Alignment([RNASequence('UUAU', id="r1"),
                                 RNASequence('ACGU', id="r2")])
            self.assertEqual(self.a2.omit_gap_positions(cutoff), trimmed)

        # An empty alignment stays empty at any threshold.
        for cutoff in (0.0, 0.49, 1.0):
            self.assertEqual(self.empty.omit_gap_positions(cutoff),
                             self.empty)
예제 #40
0
    def test_subalignment_filter_out_everything(self):
        """Inverting a ``None`` keep-list yields an empty alignment."""
        nothing_left = Alignment([])

        filters = (
            # no sequences
            dict(seqs_to_keep=None, invert_seqs_to_keep=True),
            # no positions
            dict(positions_to_keep=None, invert_positions_to_keep=True),
        )
        for kwargs in filters:
            self.assertEqual(self.a1.subalignment(**kwargs), nothing_left)
예제 #41
0
 def test_position_frequencies_floating_point_precision(self):
     """A position with no variation must have a frequency of exactly 1.0.

     assertEqual (not assertAlmostEqual) is deliberate: the value has to be
     exactly 1.0. An earlier Alignment.position_frequencies implementation
     added (1 / sequence_count) once per character occurrence (see
     https://github.com/biocore/scikit-bio/issues/801), which could land
     slightly below 1.0 through roundoff. Ten identical sequences expose
     that, because 1/10 is not exactly representable as a float and adding
     it ten times gives a number slightly less than 1.0.
     """
     aln = Alignment([DNA('A', id=str(i)) for i in range(10)])
     self.assertEqual(aln.position_frequencies(),
                      [defaultdict(float, {'A': 1.0})])
예제 #42
0
 def test_position_frequencies_floating_point_precision(self):
     """A position with no variation must have a frequency of exactly 1.0.

     assertEqual (not assertAlmostEqual) is deliberate: the value has to be
     exactly 1.0. An earlier Alignment.position_frequencies implementation
     added (1 / sequence_count) once per character occurrence (see
     https://github.com/biocore/scikit-bio/issues/801), which could land
     slightly below 1.0 through roundoff. Ten identical sequences expose
     that, because 1/10 is not exactly representable as a float and adding
     it ten times gives a number slightly less than 1.0.
     """
     aln = Alignment([DNA('A', metadata={'id': str(i)})
                      for i in range(10)])
     self.assertEqual(aln.position_frequencies(),
                      [defaultdict(float, {'A': 1.0})])
예제 #43
0
 def test_filter_gap_high_entropy_low(self):
     """filter_positions with the 100 gap / 10 entropy fixtures.

     The filtered result is expected to equal four 'A-' sequences named
     seq1 through seq4.
     """
     observed = filter_positions(self.alignment_with_gaps,
                                 self.maximum_gap_frequency_100,
                                 self.maximum_position_entropy_10)
     expected = Alignment([
         BiologicalSequence('A-', id=seq_id)
         for seq_id in ("seq1", "seq2", "seq3", "seq4")
     ])
     self.assertEqual(observed, expected)
예제 #44
0
    def test_majority_consensus(self):
        """majority_consensus returns the most common character per column."""
        # empty cases: no sequences, and sequences without positions
        self.assertEqual(self.empty.majority_consensus(), Sequence(''))
        self.assertEqual(self.no_positions.majority_consensus(), RNA(''))

        # every sequence identical -> consensus is that sequence
        identical = Alignment([DNA('AG', metadata={'id': seq_id})
                               for seq_id in ('a', 'b')])
        self.assertEqual(identical.majority_consensus(), DNA('AG'))

        # a clear winner in every column
        clear_winner = Alignment([
            DNA('TTT', metadata={'id': "d1"}),
            DNA('TT-', metadata={'id': "d2"}),
            DNA('TC-', metadata={'id': "d3"}),
        ])
        self.assertEqual(clear_winner.majority_consensus(), DNA('TT-'))

        # a tie may resolve to either character
        tied = Alignment([
            DNA('T', metadata={'id': "d1"}),
            DNA('A', metadata={'id': "d2"}),
        ])
        self.assertIn(tied.majority_consensus(), [DNA('T'), DNA('A')])
예제 #45
0
    def setUp(self):
        """Create the sequences and alignments shared by the test methods."""
        # (id, characters) records; also exposed unchanged as seqs1_t/seqs2_t.
        dna_records = [('d1', '..ACC-GTTGG..'),
                       ('d2', 'TTACCGGT-GGCC'),
                       ('d3', '.-ACC-GTTGC--')]
        rna_records = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.d1, self.d2, self.d3 = [DNASequence(chars, id=seq_id)
                                     for seq_id, chars in dna_records]
        self.r1, self.r2 = [RNASequence(chars, id=seq_id)
                            for seq_id, chars in rna_records]

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]
        self.seqs1_t = dna_records
        self.seqs2_t = rna_records

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        # Same RNA sequences, plus a score and start/end positions.
        self.a3 = Alignment(
            self.seqs2, score=42.0, start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(
            self.seqs2, score=-42.0, start_end_positions=[(1, 4), (6, 10)])

        # Two flavours of "empty": no sequences at all, and sequences that
        # have no positions.
        self.empty = Alignment([])
        self.no_positions = Alignment([RNA('', id=seq_id)
                                       for seq_id in ('a', 'b')])
예제 #46
0
    def test_majority_consensus(self):
        """majority_consensus returns the most common character per column."""
        # empty cases: no sequences, and sequences without positions
        no_seqs_consensus = self.empty.majority_consensus()
        self.assertTrue(no_seqs_consensus.equals(Sequence('')))
        no_positions_consensus = self.no_positions.majority_consensus()
        self.assertTrue(no_positions_consensus.equals(RNA('')))

        # every sequence identical -> consensus is that sequence
        identical = Alignment([DNA('AG', id=seq_id)
                               for seq_id in ('a', 'b')])
        self.assertTrue(identical.majority_consensus().equals(DNA('AG')))

        # a clear winner in every column
        clear_winner = Alignment([
            DNA('TTT', id="d1"),
            DNA('TT-', id="d2"),
            DNA('TC-', id="d3"),
        ])
        self.assertTrue(
            clear_winner.majority_consensus().equals(DNA('TT-')))

        # a tie may resolve to either character
        tied = Alignment([DNA('T', id="d1"), DNA('A', id="d2")])
        self.assertIn(tied.majority_consensus(), [DNA('T'), DNA('A')])
예제 #47
0
    def test_traceback(self):
        """_traceback rebuilds aligned sequences from the DP matrices.

        Each case compares the 5-tuple returned by ``_traceback``: the two
        lists of aligned sequences, the score, and two trailing integers
        (presumably the row/column where the traceback stopped -- confirm
        against ``_traceback``). The last two call arguments are the cell
        the traceback starts from.
        """
        score_rows = [[0, -5, -7, -9], [-5, 2, -3, -5], [-7, -3, 4, -1],
                      [-9, -5, -1, 6], [-11, -7, -3, 1]]
        tback_rows = [[0, 3, 3, 3], [2, 1, 3, 3], [2, 2, 1, 3], [2, 2, 2, 1],
                      [2, 2, 2, 2]]

        score_m = np.array(score_rows)
        tback_m = np.array(tback_rows)
        # start at bottom-right
        expected = ([BiologicalSequence("ACG-")], [BiologicalSequence("ACGT")],
                    1, 0, 0)
        actual = _traceback(tback_m, score_m, Alignment([DNA('ACG')]),
                            Alignment([DNA('ACGT')]), 4, 3)
        self.assertEqual(actual, expected)

        # four sequences in two alignments (fresh arrays, same values)
        score_m = np.array(score_rows)
        tback_m = np.array(tback_rows)
        # start at bottom-right
        expected = ([BiologicalSequence("ACG-"),
                     BiologicalSequence("ACG-")],
                    [BiologicalSequence("ACGT"),
                     BiologicalSequence("ACGT")], 1, 0, 0)
        actual = _traceback(tback_m, score_m,
                            Alignment([DNA('ACG', 's1'),
                                       DNA('ACG', 's2')]),
                            Alignment([DNA('ACGT', 's3'),
                                       DNA('ACGT', 's4')]), 4, 3)
        self.assertEqual(actual, expected)

        # start at highest-score
        expected = ([BiologicalSequence("ACG")], [BiologicalSequence("ACG")],
                    6, 0, 0)
        actual = _traceback(tback_m, score_m, Alignment([DNA('ACG')]),
                            Alignment([DNA('ACGT')]), 3, 3)
        self.assertEqual(actual, expected)

        # The 0 at (2, 2) terminates the traceback before it reaches the
        # top-left corner, so only the final 'G' is aligned.
        tback_m = np.array([[0, 3, 3, 3], [2, 1, 3, 3], [2, 2, 0, 3],
                            [2, 2, 2, 1], [2, 2, 2, 2]])
        expected = ([BiologicalSequence("G")], [BiologicalSequence("G")], 6, 2,
                    2)
        actual = _traceback(tback_m, score_m, Alignment([DNA('ACG')]),
                            Alignment([DNA('ACGT')]), 3, 3)
        self.assertEqual(actual, expected)
예제 #48
0
def align_unaligned_seqs(seqs_fp, moltype=DNA, params=None, accurate=False):
    """Align the unaligned sequences in a FASTA file using Mafft.

    Parameters
    ----------
    seqs_fp : string
        File path of the input FASTA file.
    moltype : {skbio.DNA, skbio.RNA, skbio.Protein}
        Sequence type. It selects the application flag to switch on (via
        ``MOLTYPE_MAP``) and is the constructor used to read the result.
    params : dict-like type
        Additional parameter settings passed to the application.
        Default is None.
    accurate : boolean
        Perform accurate alignment or not. It will sacrifice performance
        if set to True. Default is False.

    Returns
    -------
    Alignment object
        The aligned sequences.

    See Also
    --------
    skbio.Alignment
    skbio.DNA
    skbio.RNA
    skbio.Protein
    """
    # Create Mafft app; the input is handed over as a file path.
    app = Mafft(InputHandler='_input_as_path', params=params)

    # Turn on correct sequence type
    app.Parameters[MOLTYPE_MAP[moltype]].on()

    # Do not report progress
    app.Parameters['--quiet'].on()

    # More accurate alignment, sacrificing performance.
    if accurate:
        app.Parameters['--globalpair'].on()
        app.Parameters['--maxiterate'].Value = 1000

    # Run the application on the input file
    res = app(seqs_fp)

    try:
        # Parse the application's standard output into an Alignment
        alignment = Alignment.read(res['StdOut'], constructor=moltype)
    finally:
        # Always clean up the results, even when parsing fails
        res.cleanUp()

    return alignment
예제 #49
0
def parse_deblur_output(seqs_fp, derep_clusters):
    """ Parse deblur output file into an OTU map.

    Parameters
    ----------
    seqs_fp: string
        file path to deblurred sequences (read as FASTA)
    derep_clusters: dictionary
        dictionary of dereplicated sequences map, keyed by cluster ID

    Returns
    -------
    clusters: dictionary
        dictionary mapping each degapped sequence string to the list of
        dereplicated sequence labels belonging to it

    Raises
    ------
    ValueError
        If the cluster ID taken from a sequence label is not a key of
        derep_clusters.

    Notes
    -----
    For each deblurred sequence in seqs_fp, use the sequence label to
    obtain all dereplicated sequence labels belonging to it
    (from derep_clusters) to create entries in a new dictionary where the keys
    are actual sequences (not the labels). Note not all sequences
    in derep_clusters will be in seqs_fp since they could have been removed in
    the artifact filtering step.
    """
    clusters = {}
    # Replace representative sequence name with actual sequence in cluster
    msa_fa = Alignment.read(seqs_fp, format='fasta')
    for label, seq in msa_fa.iteritems():
        # The cluster ID is the part of the label before the first ';'
        cluster_id = label.split(';')[0]
        degapped = str(seq.degap())
        if cluster_id not in derep_clusters:
            raise ValueError(
                'Seed ID %s does not exist in .uc file' % cluster_id)
        # Sequences that are identical once degapped share one entry
        clusters.setdefault(degapped, []).extend(derep_clusters[cluster_id])
    return clusters
예제 #50
0
    def test_update_ids_sequence_attributes_propagated(self):
        """update_ids keeps description and quality on the renamed seqs."""
        def dna(chars, seq_id, description, quality):
            # Build a DNA record carrying id, description and quality.
            return DNA(chars,
                       metadata={'id': seq_id, 'description': description},
                       positional_metadata={'quality': quality})

        # 1 seq
        expected_aln = Alignment([dna('ACGT', "abc", 'desc', range(4))])
        expected_map = {'abc': 'seq1'}

        original = Alignment([dna('ACGT', "seq1", 'desc', range(4))])

        renamed, id_map = original.update_ids(ids=('abc',))
        self.assertEqual(renamed, expected_aln)
        self.assertEqual(id_map, expected_map)

        # 2 seqs
        expected_aln = Alignment([
            dna('ACGT', "abc", 'desc1', range(4)),
            dna('TGCA', "def", 'desc2', range(4)[::-1]),
        ])
        expected_map = {'abc': 'seq1', 'def': 'seq2'}

        original = Alignment([
            dna('ACGT', "seq1", 'desc1', (0, 1, 2, 3)),
            dna('TGCA', "seq2", 'desc2', (3, 2, 1, 0)),
        ])

        renamed, id_map = original.update_ids(ids=('abc', 'def'))
        self.assertEqual(renamed, expected_aln)
        self.assertEqual(id_map, expected_map)
예제 #51
0
def reformat_treepuzzle(gene_tree,
                        species_tree,
                        gene_msa_fa_fp,
                        output_tree_fp,
                        output_msa_phy_fp):
    """ Reformat input trees to the format accepted by Tree-Puzzle

    Both trees are modified in place: the length of each root node is
    cleared before the trees are written out.

    Parameters
    ----------
    gene_tree: skbio.TreeNode
        TreeNode instance for gene tree
    species_tree: skbio.TreeNode
        TreeNode instance for species tree
    gene_msa_fa_fp: string
        file path to gene alignments in FASTA format
    output_tree_fp: string
        file path to output trees (Nexus format)
    output_msa_phy_fp: string
        file path to output MSA in PHYLIP format

    See Also
    --------
    skbio.TreeNode
    """
    # remove the root branch length (output with ALF)
    for tree in (gene_tree, species_tree):
        for node in tree.postorder():
            if node.is_root():
                node.length = None
    # trim gene tree leaves to exclude '_GENENAME' (if exists)
    trim_gene_tree_leaves(gene_tree)
    join_trees(gene_tree, species_tree, output_tree_fp)
    # trim FASTA sequence labels to exclude '/GENENAME' (if exists)
    msa_fa = Alignment.read(gene_msa_fa_fp, format='fasta')
    # update_ids also returns an ID map, which is not needed here
    msa_fa_update_ids, _ = msa_fa.update_ids(func=id_mapper)
    msa_fa_update_ids.write(output_msa_phy_fp, format='phylip')
예제 #52
0
    def setUp(self):
        """Create the sequences and alignments shared by the test methods."""
        dna_records = [("d1", '..ACC-GTTGG..'),
                       ("d2", 'TTACCGGT-GGCC'),
                       ("d3", '.-ACC-GTTGC--')]
        rna_records = [("r1", 'UUAU-'), ("r2", 'ACGUU')]

        self.d1, self.d2, self.d3 = [DNA(chars, metadata={'id': seq_id})
                                     for seq_id, chars in dna_records]
        self.r1, self.r2 = [RNA(chars, metadata={'id': seq_id})
                            for seq_id, chars in rna_records]

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        # Same RNA sequences, plus a score and start/end positions.
        self.a3 = Alignment(
            self.seqs2, score=42.0, start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(
            self.seqs2, score=-42.0, start_end_positions=[(1, 4), (6, 10)])

        # Two flavours of "empty": no sequences at all, and sequences that
        # have no positions.
        self.empty = Alignment([])
        self.no_positions = Alignment([RNA('', metadata={'id': seq_id})
                                       for seq_id in ('a', 'b')])
예제 #53
0
class AlignmentTests(TestCase):
    """Unit tests for Alignment built from DNASequence/RNASequence objects."""

    def setUp(self):
        """Create the sequences and alignments shared by the test methods."""
        self.d1 = DNASequence('..ACC-GTTGG..', id="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', id="d3")

        self.r1 = RNASequence('UUAU-', id="r1")
        self.r2 = RNASequence('ACGUU', id="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        # (id, characters) records matching seqs1/seqs2
        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        # a3/a4 reuse the RNA sequences but carry a score and
        # start/end positions
        self.a3 = Alignment(self.seqs2, score=42.0,
                            start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(self.seqs2, score=-42.0,
                            start_end_positions=[(1, 4), (6, 10)])
        self.empty = Alignment([])

    def test_degap(self):
        """degap functions as expected
        """
        # Expected records are the fixture records with '.' and '-' removed.
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs1_t]
        expected = SequenceCollection.from_fasta_records(expected, DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)

    def test_distances(self):
        """distances functions as expected
        """
        expected = [[0, 6. / 13, 4. / 13],
                    [6. / 13, 0, 7. / 13],
                    [4. / 13, 7. / 13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.
        expected = [[0, 42., 42.],
                    [42., 0, 42.],
                    [42., 42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_score(self):
        """score returns the value passed to the constructor
        """
        self.assertEqual(self.a3.score(), 42.0)
        self.assertEqual(self.a4.score(), -42.0)

    def test_start_end_positions(self):
        """start_end_positions returns the value passed to the constructor
        """
        self.assertEqual(self.a3.start_end_positions(), [(0, 3), (5, 9)])
        self.assertEqual(self.a4.start_end_positions(), [(1, 4), (6, 10)])

    def test_subalignment(self):
        """subalignment functions as expected
        """
        # keep seqs by ids
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by ids (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d2 = DNASequence('TAC', id="d2")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNASequence('.C-GTTGG..', id="d1")
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        d3 = DNASequence('-C-GTTGC--', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        expected = Alignment([d2])
        self.assertEqual(actual, expected)

    def test_subalignment_filter_out_everything(self):
        """inverting a None keep-list yields an empty alignment
        """
        exp = Alignment([])

        # no sequences
        obs = self.a1.subalignment(seqs_to_keep=None, invert_seqs_to_keep=True)
        self.assertEqual(obs, exp)

        # no positions
        obs = self.a1.subalignment(positions_to_keep=None,
                                   invert_positions_to_keep=True)
        self.assertEqual(obs, exp)

    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        # expected to succeed without raising
        Alignment(self.seqs1, validate=True)

        # invalid DNA character
        invalid_seqs1 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTXGC--', id="i1")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs1, validate=True)

        # invalid lengths (they're not all equal)
        invalid_seqs2 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTGC--', id="i2")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs2, validate=True)

    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.a1.is_valid())
        self.assertTrue(self.a2.is_valid())
        self.assertTrue(self.empty.is_valid())

        # invalid because of length mismatch
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

        # invalid because of invalid characters
        d1 = DNASequence('..ACC-GTXGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

    def test_iter_positions(self):
        """iter_positions functions as expected
        """
        # default: each position is a list of single-character RNASequences
        actual = list(self.a2.iter_positions())
        expected = [[RNASequence(j) for j in i] for i in
                    ['UA', 'UC', 'AG', 'UU', '-U']]
        self.assertEqual(actual, expected)

        # constructor=str: each position is a list of one-character strings
        actual = list(self.a2.iter_positions(constructor=str))
        expected = [list('UA'),
                    list('UC'),
                    list('AG'),
                    list('UU'),
                    list('-U')]
        self.assertEqual(actual, expected)

    def test_majority_consensus(self):
        """majority_consensus functions as expected
        """
        d1 = DNASequence('TTT', id="d1")
        d2 = DNASequence('TT-', id="d2")
        d3 = DNASequence('TC-', id="d3")
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNASequence('TT-'))

        # a tie may resolve to either character
        d1 = DNASequence('T', id="d1")
        d2 = DNASequence('A', id="d2")
        a1 = Alignment([d1, d2])
        self.assertTrue(a1.majority_consensus() in
                        [DNASequence('T'), DNASequence('A')])

        self.assertEqual(self.empty.majority_consensus(), '')

    def test_omit_gap_positions(self):
        """omitting gap positions functions as expected
        """
        # only the last column of a2 contains a gap (in one of two seqs)
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)

        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)

    def test_omit_gap_sequences(self):
        """omitting gap sequences functions as expected
        """
        # r1 has one gap in five positions; r2 has none
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)

    def test_position_counters(self):
        """position_counters functions as expected
        """
        expected = [Counter({'U': 1, 'A': 1}),
                    Counter({'U': 1, 'C': 1}),
                    Counter({'A': 1, 'G': 1}),
                    Counter({'U': 2}),
                    Counter({'-': 1, 'U': 1})]
        self.assertEqual(self.a2.position_counters(), expected)

        self.assertEqual(self.empty.position_counters(), [])

    def test_position_frequencies(self):
        """computing position frequencies functions as expected
        """
        expected = [defaultdict(int, {'U': 0.5, 'A': 0.5}),
                    defaultdict(int, {'U': 0.5, 'C': 0.5}),
                    defaultdict(int, {'A': 0.5, 'G': 0.5}),
                    defaultdict(int, {'U': 1.0}),
                    defaultdict(int, {'-': 0.5, 'U': 0.5})]
        self.assertEqual(self.a2.position_frequencies(), expected)

        self.assertEqual(self.empty.position_frequencies(), [])

    def test_position_entropies(self):
        """computing positional uncertainties functions as expected

        tested by calculating values as described in this post:
         http://stackoverflow.com/a/15476958/3424666
        """
        # the last position (the one containing a gap) is expected to be nan
        expected = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(),
                                       expected, 5)

        expected = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(base=2),
                                       expected, 5)

        np.testing.assert_almost_equal(self.empty.position_entropies(base=2),
                                       [])

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        # NOTE(review): the expected values rely on true division of ints
        # (e.g. ``from __future__ import division`` on Python 2) -- confirm
        # against the module's imports.
        expected = [defaultdict(int, {'U': 3 / 5, 'A': 1 / 5, '-': 1 / 5}),
                    defaultdict(int, {'A': 1 / 5, 'C': 1 / 5, 'G': 1 / 5,
                                      'U': 2 / 5})]
        actual = self.a2.k_word_frequencies(k=1)
        for a, e in zip(actual, expected):
            # same set of words, then (approximately) the same frequencies
            self.assertEqual(sorted(a), sorted(e))
            np.testing.assert_almost_equal(sorted(a.values()),
                                           sorted(e.values()), 5)

    def test_sequence_length(self):
        """sequence_length functions as expected
        """
        self.assertEqual(self.a1.sequence_length(), 13)
        self.assertEqual(self.a2.sequence_length(), 5)
        self.assertEqual(self.empty.sequence_length(), 0)

    def test_to_phylip(self):
        """to_phylip functions as expected
        """
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        # without label mapping the id map is the identity
        phylip_str, id_map = a.to_phylip(map_labels=False)
        self.assertEqual(id_map, {'d1': 'd1',
                                  'd3': 'd3',
                                  'd2': 'd2'})
        expected = "\n".join(["3 13",
                              "d1 ..ACC-GTTGG..",
                              "d2 TTACCGGT-GGCC",
                              "d3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_map_labels(self):
        """to_phylip functions as expected with label mapping
        """
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        # the id map goes from the generated label to the original id
        phylip_str, id_map = a.to_phylip(map_labels=True, label_prefix="s")
        self.assertEqual(id_map, {'s1': 'd1',
                                  's3': 'd3',
                                  's2': 'd2'})
        expected = "\n".join(["3 13",
                              "s1 ..ACC-GTTGG..",
                              "s2 TTACCGGT-GGCC",
                              "s3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_unequal_sequence_lengths(self):
        """to_phylip raises when sequence lengths differ
        """
        d1 = DNASequence('A-CT', id="d1")
        d2 = DNASequence('TTA', id="d2")
        d3 = DNASequence('.-AC', id="d3")
        a = Alignment([d1, d2, d3])

        with self.assertRaises(SequenceCollectionError):
            a.to_phylip()

    def test_to_phylip_no_sequences(self):
        """to_phylip raises when there are no sequences
        """
        with self.assertRaises(SequenceCollectionError):
            Alignment([]).to_phylip()

    def test_to_phylip_no_positions(self):
        """to_phylip raises when the sequences have no positions
        """
        d1 = DNASequence('', id="d1")
        d2 = DNASequence('', id="d2")
        a = Alignment([d1, d2])

        with self.assertRaises(SequenceCollectionError):
            a.to_phylip()

    def test_validate_lengths(self):
        """_validate_lengths is True only when all lengths are equal
        """
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        self.assertTrue(Alignment([
            DNASequence('TTT', id="d1")])._validate_lengths())
        self.assertFalse(Alignment([
            DNASequence('TTT', id="d1"),
            DNASequence('TT', id="d2")])._validate_lengths())
예제 #54
0
class AlignmentTests(TestCase):

    def setUp(self):
        # Gapped DNA sequences, 13 characters each, and their alignment.
        d1 = DNA('..ACC-GTTGG..', metadata={'id': "d1"})
        d2 = DNA('TTACCGGT-GGCC', metadata={'id': "d2"})
        d3 = DNA('.-ACC-GTTGC--', metadata={'id': "d3"})
        self.d1, self.d2, self.d3 = d1, d2, d3
        self.seqs1 = [d1, d2, d3]
        self.a1 = Alignment(self.seqs1)

        # RNA sequences, 5 characters each, and their alignment.
        r1 = RNA('UUAU-', metadata={'id': "r1"})
        r2 = RNA('ACGUU', metadata={'id': "r2"})
        self.r1, self.r2 = r1, r2
        self.seqs2 = [r1, r2]
        self.a2 = Alignment(self.seqs2)

        # The same RNA sequences with a score and start/end positions set.
        self.a3 = Alignment(self.seqs2, score=42.0,
                            start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(self.seqs2, score=-42.0,
                            start_end_positions=[(1, 4), (6, 10)])

        # Edge cases: no sequences at all ...
        self.empty = Alignment([])

        # ... and sequences that have no positions.
        self.no_positions = Alignment([RNA('', metadata={'id': 'a'}),
                                       RNA('', metadata={'id': 'b'})])

    def test_degap(self):
        # Degapping should give a SequenceCollection holding the same
        # sequences with the '.' and '-' characters removed.
        dna_degapped = SequenceCollection([
            DNA('ACCGTTGG', metadata={'id': "d1"}),
            DNA('TTACCGGTGGCC', metadata={'id': "d2"}),
            DNA('ACCGTTGC', metadata={'id': "d3"})])
        self.assertEqual(self.a1.degap(), dna_degapped)

        rna_degapped = SequenceCollection([
            RNA('UUAU', metadata={'id': "r1"}),
            RNA('ACGUU', metadata={'id': "r2"})])
        self.assertEqual(self.a2.degap(), rna_degapped)

    def test_distances(self):
        # Default distance function.
        default_dm = DistanceMatrix([[0, 6. / 13, 4. / 13],
                                     [6. / 13, 0, 7. / 13],
                                     [4. / 13, 7. / 13, 0]],
                                    ['d1', 'd2', 'd3'])
        self.assertEqual(self.a1.distances(), default_dm)

        # Caller-supplied distance function: every off-diagonal entry should
        # be whatever the function returns.
        def constant_distance(s1, s2):
            return 42.

        constant_dm = DistanceMatrix([[0, 42., 42.],
                                      [42., 0, 42.],
                                      [42., 42., 0]],
                                     ['d1', 'd2', 'd3'])
        self.assertEqual(self.a1.distances(constant_distance), constant_dm)

    def test_score(self):
        # a3 and a4 were built in setUp with scores 42.0 and -42.0.
        for aln, expected in ((self.a3, 42.0), (self.a4, -42.0)):
            self.assertEqual(aln.score(), expected)

    def test_start_end_positions(self):
        # The positions given to the constructor in setUp should come back.
        cases = ((self.a3, [(0, 3), (5, 9)]),
                 (self.a4, [(1, 4), (6, 10)]))
        for aln, expected in cases:
            self.assertEqual(aln.start_end_positions(), expected)

    def test_subalignment(self):
        # sequences selected by id
        obs = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        self.assertEqual(obs, Alignment([self.d1, self.d3]))

        # sequences selected by index
        obs = self.a1.subalignment(seqs_to_keep=[0, 2])
        self.assertEqual(obs, Alignment([self.d1, self.d3]))

        # inverted selection by id: only d2 is left
        obs = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                   invert_seqs_to_keep=True)
        self.assertEqual(obs, Alignment([self.d2]))

        # inverted selection by index: only d2 is left
        obs = self.a1.subalignment(seqs_to_keep=[0, 2],
                                   invert_seqs_to_keep=True)
        self.assertEqual(obs, Alignment([self.d2]))

        # positions 0, 2 and 3 only
        obs = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        exp = Alignment([DNA('.AC', metadata={'id': "d1"}),
                         DNA('TAC', metadata={'id': "d2"}),
                         DNA('.AC', metadata={'id': "d3"})])
        self.assertEqual(obs, exp)

        # every position except 0, 2 and 3
        obs = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                   invert_positions_to_keep=True)
        exp = Alignment([DNA('.C-GTTGG..', metadata={'id': "d1"}),
                         DNA('TCGGT-GGCC', metadata={'id': "d2"}),
                         DNA('-C-GTTGC--', metadata={'id': "d3"})])
        self.assertEqual(obs, exp)

        # sequence and position filters combined
        obs = self.a1.subalignment(seqs_to_keep=[0, 2],
                                   positions_to_keep=[0, 2, 3])
        exp = Alignment([DNA('.AC', metadata={'id': "d1"}),
                         DNA('.AC', metadata={'id': "d3"})])
        self.assertEqual(obs, exp)

        # both filters combined and inverted
        obs = self.a1.subalignment(seqs_to_keep=[0, 2],
                                   positions_to_keep=[0, 2, 3],
                                   invert_seqs_to_keep=True,
                                   invert_positions_to_keep=True)
        exp = Alignment([DNA('TCGGT-GGCC', metadata={'id': "d2"})])
        self.assertEqual(obs, exp)

    def test_subalignment_filter_out_everything(self):
        # Inverting a ``None`` filter is expected to leave nothing behind, so
        # both calls should compare equal to an alignment of no sequences.
        nothing = Alignment([])

        # sequence filter inverted
        filtered = self.a1.subalignment(seqs_to_keep=None,
                                        invert_seqs_to_keep=True)
        self.assertEqual(filtered, nothing)

        # position filter inverted
        filtered = self.a1.subalignment(positions_to_keep=None,
                                        invert_positions_to_keep=True)
        self.assertEqual(filtered, nothing)

    def test_init_not_equal_lengths(self):
        # The extra sequence has 12 characters; d1, d2 and d3 have 13.
        too_short = DNA('.-ACC-GTGC--', metadata={'id': "i2"})
        with self.assertRaises(AlignmentError):
            Alignment([self.d1, self.d2, self.d3, too_short])

    def test_init_equal_lengths(self):
        # Nothing to assert: the test passes if construction does not raise.
        Alignment([self.d1, self.d2, self.d3])

    def test_iter_positions(self):
        # Columns of a2 read top (r1) to bottom (r2).
        columns = ['UA', 'UC', 'AG', 'UU', '-U']

        # Default call: each column is a list of single-character RNA
        # sequences carrying the id of the sequence they came from.
        observed = list(self.a2.iter_positions())
        as_rna = [[RNA(top, metadata={'id': 'r1'}),
                   RNA(bottom, metadata={'id': 'r2'})]
                  for top, bottom in columns]
        self.assertEqual(observed, as_rna)

        # With constructor=str each column is a list of plain characters.
        observed = list(self.a2.iter_positions(constructor=str))
        as_str = [list(column) for column in columns]
        self.assertEqual(observed, as_str)

    def test_majority_consensus(self):
        # Covers four situations: alignments with nothing to summarise,
        # identical sequences, a clear majority in every column, and a tie.

        # empty cases: no sequences gives a generic Sequence, while sequences
        # without positions give an empty sequence of their own type
        self.assertEqual(self.empty.majority_consensus(), Sequence(''))
        self.assertEqual(self.no_positions.majority_consensus(), RNA(''))

        # alignment where all sequences are the same
        identical = Alignment([DNA('AG', metadata={'id': 'a'}),
                               DNA('AG', metadata={'id': 'b'})])
        self.assertEqual(identical.majority_consensus(), DNA('AG'))

        # no ties: columns are T/T/T, T/T/C and T/-/-
        clear = Alignment([DNA('TTT', metadata={'id': "d1"}),
                           DNA('TT-', metadata={'id': "d2"}),
                           DNA('TC-', metadata={'id': "d3"})])
        self.assertEqual(clear.majority_consensus(), DNA('TT-'))

        # ties: either tied character is acceptable. assertIn is used rather
        # than assertTrue(x in ...) so a failure reports the offending value.
        tied = Alignment([DNA('T', metadata={'id': "d1"}),
                          DNA('A', metadata={'id': "d2"})])
        self.assertIn(tied.majority_consensus(), [DNA('T'), DNA('A')])

    def test_omit_gap_positions(self):
        # The last column of a2 is half gaps ('-' over 'U'); thresholds above
        # 0.5 therefore leave the alignment as it is.
        for threshold in (1.0, 0.51):
            self.assertEqual(self.a2.omit_gap_positions(threshold), self.a2)

        # Thresholds below 0.5 drop that last column.
        for threshold in (0.49, 0.0):
            trimmed = Alignment([RNA('UUAU', metadata={'id': "r1"}),
                                 RNA('ACGU', metadata={'id': "r2"})])
            self.assertEqual(self.a2.omit_gap_positions(threshold), trimmed)

        # An alignment without sequences compares equal to itself afterwards.
        for threshold in (0.0, 0.49, 1.0):
            self.assertEqual(self.empty.omit_gap_positions(threshold),
                             self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        all_gaps = Alignment([DNA('-.', metadata={'id': str(i)})
                              for i in range(33)])
        observed = all_gaps.omit_gap_positions(1 - np.finfo(float).eps)
        no_columns = Alignment([DNA('', metadata={'id': str(i)})
                                for i in range(33)])
        self.assertEqual(observed, no_columns)

    def test_omit_gap_sequences(self):
        # r1 is one-fifth gaps ('UUAU-'), so thresholds of 0.20 and above
        # keep both sequences.
        for threshold in (1.0, 0.20):
            self.assertEqual(self.a2.omit_gap_sequences(threshold), self.a2)

        # Just below that fraction only r2 survives.
        only_r2 = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), only_r2)

        # An alignment without sequences compares equal to itself afterwards.
        for threshold in (0.0, 0.2, 1.0):
            self.assertEqual(self.empty.omit_gap_sequences(threshold),
                             self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        all_gaps = Alignment([DNA('.' * 33, metadata={'id': 'abc'}),
                              DNA('-' * 33, metadata={'id': 'def'})])
        observed = all_gaps.omit_gap_sequences(1 - np.finfo(float).eps)
        self.assertEqual(observed, Alignment([]))

    def test_position_counters(self):
        # No sequences, or sequences without positions: empty list.
        for aln in (self.empty, self.no_positions):
            self.assertEqual(aln.position_counters(), [])

        # One Counter per column of a2 ('UUAU-' over 'ACGUU').
        per_column = [
            Counter({'U': 1, 'A': 1}),
            Counter({'U': 1, 'C': 1}),
            Counter({'A': 1, 'G': 1}),
            Counter({'U': 2}),
            Counter({'-': 1, 'U': 1}),
        ]
        self.assertEqual(self.a2.position_counters(), per_column)

    def test_position_frequencies(self):
        # No sequences, or sequences without positions: empty list.
        for aln in (self.empty, self.no_positions):
            self.assertEqual(aln.position_frequencies(), [])

        # a2 holds two sequences, so a character seen once in a column has
        # frequency 0.5 and the 'U' shared by the fourth column has 1.0.
        per_column = [
            defaultdict(float, {'U': 0.5, 'A': 0.5}),
            defaultdict(float, {'U': 0.5, 'C': 0.5}),
            defaultdict(float, {'A': 0.5, 'G': 0.5}),
            defaultdict(float, {'U': 1.0}),
            defaultdict(float, {'-': 0.5, 'U': 0.5}),
        ]
        self.assertEqual(self.a2.position_frequencies(), per_column)

    def test_position_frequencies_floating_point_precision(self):
        # A column with no variation must yield a frequency of exactly 1.0,
        # which is why self.assertEqual is used here rather than
        # self.assertAlmostEqual. A previous implementation of
        # Alignment.position_frequencies added (1 / sequence_count) once per
        # occurrence of a character in a position (see
        # https://github.com/biocore/scikit-bio/issues/801), and in certain
        # cases roundoff error left the total slightly below 1.0. Ten
        # identical single-character sequences expose that error: 1/10 cannot
        # be represented exactly as a floating point number, so adding it ten
        # times yields a number slightly less than 1.0.
        identical = Alignment([DNA('A', metadata={'id': str(i)})
                               for i in range(10)])
        observed = identical.position_frequencies()
        self.assertEqual(observed, [defaultdict(float, {'A': 1.0})])

    def test_position_entropies(self):
        # tested by calculating values as described in this post:
        #  http://stackoverflow.com/a/15476958/3424666
        places = 5

        # Default call: columns with two equally frequent characters score
        # 0.69314, the uniform column scores 0 and the column containing a
        # gap is expected to be nan.
        default_base = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(),
                                       default_base, decimal=places)

        # base=2: the same columns score exactly 1.
        base_two = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(base=2),
                                       base_two, decimal=places)

        # An alignment with no sequences has no entropies.
        np.testing.assert_almost_equal(self.empty.position_entropies(base=2),
                                       [])

    def test_kmer_frequencies(self):
        # Relative 1-mer frequencies, one mapping per sequence of a2:
        # r1 is 'UUAU-' and r2 is 'ACGUU', five characters each. Float
        # literals keep the expected values correct whether or not true
        # division is enabled for this module.
        expected = [
            defaultdict(float, {'U': 3. / 5, 'A': 1. / 5, '-': 1. / 5}),
            defaultdict(float, {'A': 1. / 5, 'C': 1. / 5, 'G': 1. / 5,
                                'U': 2. / 5}),
        ]
        actual = list(self.a2.kmer_frequencies(k=1, relative=True))

        # zip() stops at the shorter input, so a missing result would go
        # unnoticed without an explicit length check.
        self.assertEqual(len(actual), len(expected))

        for a, e in zip(actual, expected):
            # Same characters observed ...
            self.assertEqual(sorted(a), sorted(e))
            # ... and each character has the expected frequency (to 5
            # decimal places). Comparing per key, rather than comparing the
            # sorted values, catches frequencies attached to the wrong
            # character.
            for kmer in e:
                np.testing.assert_almost_equal(a[kmer], e[kmer], 5)

    def test_sequence_length(self):
        # Number of positions per fixture; the alignment with no sequences
        # is expected to report 0.
        for aln, expected in ((self.a1, 13), (self.a2, 5), (self.empty, 0)):
            self.assertEqual(aln.sequence_length(), expected)

    def test_validate_lengths(self):
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        self.assertTrue(Alignment([
            DNA('TTT', metadata={'id': "d1"})])._validate_lengths())