Пример #1
0
 def test_multi_alpha_diagonalness_of_blockdiagonal_blocks(self):
     """Check that each diagonal block of the couplings matrix is diagonal.

     A protein, a DNA and an RNA alignment are loaded from the FASTA
     fixtures and concatenated. For every range returned by
     `binary_index_map`, the matching square block of
     `MaxentModel.couplings` must have off-diagonal entries smaller than
     1e-10 in absolute value.
     """
     from os.path import join
     from multicov.align_io import load_fasta
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
     from multicov.binary import binary_index_map
     from multicov.statistics import Statistics, MaxentModel

     protein_part = load_fasta(
         join('test_data', 'test_aln1.fasta'), protein_alphabet,
         invalid_letter_policy='gap')
     dna_part = load_fasta(
         join('test_data', 'test_aln2.fasta'), dna_alphabet,
         invalid_letter_policy='gap')
     rna_part = load_fasta(
         join('test_data', 'test_aln2.fasta'), rna_alphabet,
         invalid_letter_policy='gap')

     combined = Alignment(protein_part)
     combined.add(dna_part).add(rna_part)

     stats = Statistics(combined, regularization_amount=0.5)
     model = MaxentModel(stats)

     for block_range in binary_index_map(stats):
         span = slice(block_range[0], block_range[1])
         block = model.couplings[span, span]
         # What remains after removing the diagonal must be numerically zero.
         off_diagonal = block - np.diag(np.diag(block))
         self.assertLess(np.max(np.abs(off_diagonal)), 1e-10)
Пример #2
0
    def test_delayed_evaluation(self):
        """Check that `Statistics` reflects changes made after construction.

        The `Statistics` object is created first, then two rows of the
        alignment data are overwritten. The frequencies and covariance
        matrix read from the object afterwards must match the values
        computed from the modified alignment, which in turn must differ
        from those of the alignment as it was before the modification.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet
        from multicov.statistics import Statistics

        sequences = [
            'WKHNA', 'KHRCD', 'LGVVG', 'LIGDD',
            'CMPRY', 'QWFWR', 'VTMPE', 'LNYIN',
            'WHV-E', 'PIWGG', 'PPCWV', 'E-MWR',
            'RFGKF', 'CGRCG', 'T-PMV', 'LNCPY',
        ]
        align = Alignment(sequences, protein_alphabet)

        stats = Statistics(align)

        # Reference values for the alignment before it is touched.
        f1_before = _slow_get_freq1(align)
        f2_before = _slow_get_freq2(align)
        cmat_before = f2_before - np.outer(f1_before, f1_before)

        # Overwrite two rows in place; stats was built before this point.
        align.data[1, :] = list('CMPRY')
        align.data[10, :] = list('KHRCD')

        f1_after = _slow_get_freq1(align)
        f2_after = _slow_get_freq2(align)
        cmat_after = f2_after - np.outer(f1_after, f1_after)

        # Sanity check: the modification really changed the statistics.
        for after, before in ((f1_after, f1_before),
                              (f2_after, f2_before),
                              (cmat_after, cmat_before)):
            self.assertFalse(np.allclose(after, before))

        self.assertTrue(np.allclose(stats.freq1, f1_after))
        self.assertTrue(np.allclose(stats.freq2, f2_after))
        self.assertTrue(np.allclose(stats.cmat, cmat_after))
Пример #3
0
 def test_multi_alpha_shape_and_symmetry_of_couplings(self):
     """Check that the couplings matrix is symmetric and has the right size.

     A DNA, a protein and an RNA alignment are concatenated. The
     couplings must equal their transpose to within 1e-10, and be square
     with side 4 per DNA/RNA column plus 20 per protein column.
     """
     from os.path import join
     from multicov.align_io import load_fasta
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
     from multicov.statistics import Statistics, MaxentModel

     dna_part = load_fasta(
         join('test_data', 'test_aln2.fasta'), dna_alphabet,
         invalid_letter_policy='gap')
     protein_part = load_fasta(
         join('test_data', 'test_aln1.fasta'), protein_alphabet,
         invalid_letter_policy='gap')
     rna_part = load_fasta(
         join('test_data', 'test_aln2.fasta'), rna_alphabet,
         invalid_letter_policy='uppergap')

     combined = Alignment(dna_part)
     combined.add(protein_part).add(rna_part)

     model = MaxentModel(Statistics(combined, regularization_amount=0.5))

     asymmetry = np.max(np.abs(model.couplings - model.couplings.T))
     self.assertLess(asymmetry, 1e-10)

     expected_side = (4 * (dna_part.get_width() + rna_part.get_width()) +
                      20 * protein_part.get_width())
     self.assertSequenceEqual(
         np.shape(model.couplings), 2 * [expected_side])
Пример #4
0
    def test_protein(self):
        """Check pseudocount regularization on a protein alignment.

        With regularization amount `alpha`, the regularized single-site
        frequencies are expected to be `(1 - alpha) * f1 + alpha / 21` and
        the pair frequencies `(1 - alpha) * f2 + alpha / 21**2`, except
        for the 20x20 same-column blocks, which are expected to be the
        diagonal matrix of the regularized single-site frequencies.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet
        from multicov.statistics import Statistics

        sequences = [
            'WKHNA', 'KHRCD', 'LGVVG', 'LIGDD',
            'CMPRY', 'QWFWR', 'VTMPE', 'LNYIN',
            'WHV-E', 'PIWGG', 'PPCWV', 'E-MWR',
            'RFGKF', 'CGRCG', 'T-PMV', 'LNCPY',
        ]
        align = Alignment(sequences, protein_alphabet)

        alpha = 0.3

        plain = Statistics(align)
        regularized = Statistics(
            align, regularization_amount=alpha, regularizer='pseudocount')

        keep = 1 - alpha
        expected_f1 = keep * plain.freq1 + alpha / 21.0
        expected_f2 = keep * plain.freq2 + alpha / 21.0**2

        # Same-column blocks need correction: those variables are not
        # independent of each other.
        for column in range(align.get_width()):
            block = slice(20 * column, 20 * (column + 1))
            # noinspection PyUnresolvedReferences
            expected_f2[block, block] = np.diag(expected_f1[block])

        expected_cmat = expected_f2 - np.outer(expected_f1, expected_f1)

        # noinspection PyTypeChecker
        self.assertTrue(np.allclose(expected_f1, regularized.freq1))
        # noinspection PyTypeChecker
        self.assertTrue(np.allclose(expected_f2, regularized.freq2))
        # noinspection PyTypeChecker
        self.assertTrue(np.allclose(expected_cmat, regularized.cmat))
Пример #5
0
 def test_with_truncate(self):
     """Check `align_to_sequence` with `truncate=True` on an RNA alignment.

     After the call, the alignment object itself is expected to equal
     the three-column alignment listed below.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import rna_alphabet
     from multicov.filtering import align_to_sequence

     rows_before = ['AA-U', 'ACGU', '--GG', 'A-GG', 'GGC-']
     rows_after = ['A-U', 'AGU', '-GG', 'AGG', 'GC-']

     align = Alignment(rows_before, rna_alphabet)
     # The return value is not needed; the alignment is checked directly.
     align_to_sequence(align, 'AGG', truncate=True)

     self.assertEqual(align, Alignment(rows_after, rna_alphabet))
Пример #6
0
 def test_replace_invalid_by_uppercase_then_leave(self):
     """Check `load_fasta` on the DNA fixture with the 'upper' policy.

     The loaded alignment must equal the expected all-uppercase rows
     (with the '.' character kept) and carry the expected names.
     """
     from multicov.align_io import load_fasta
     from multicov.alignment import Alignment
     from multicov.alphabet import dna_alphabet

     fasta_path = os.path.join('test_data', 'test_aln2.fasta')
     loaded = load_fasta(
         fasta_path, dna_alphabet, invalid_letter_policy='upper')

     reference = Alignment(['GATTACA', 'ACCA--T', 'G.C-A-C'], dna_alphabet)
     reference.annotations['name'] = ['one', 'sequence', 'one line']

     self.assertEqual(loaded, reference)
Пример #7
0
 def test_dna_unchanged_invalid(self):
     """Check `load_fasta` on the DNA fixture with the 'unchanged' policy.

     The loaded alignment must equal the expected rows, including the
     lowercase letters and '.' in the last row, and carry the expected
     names.
     """
     from multicov.align_io import load_fasta
     from multicov.alignment import Alignment
     from multicov.alphabet import dna_alphabet

     fasta_path = os.path.join('test_data', 'test_aln2.fasta')
     loaded = load_fasta(
         fasta_path, dna_alphabet, invalid_letter_policy='unchanged')

     reference = Alignment(['GATTACA', 'ACCA--T', 'G.c-a-c'], dna_alphabet)
     reference.annotations['name'] = ['one', 'sequence', 'one line']

     self.assertEqual(loaded, reference)
Пример #8
0
 def test_protein_unchanged_invalid(self):
     """Check `load_fasta` on the protein fixture with 'unchanged' policy.

     The loaded alignment must equal the expected rows, including the
     'X' at the start of the second row, and carry the expected names.
     """
     from multicov.align_io import load_fasta
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet

     fasta_path = os.path.join('test_data', 'test_aln1.fasta')
     loaded = load_fasta(
         fasta_path, protein_alphabet, invalid_letter_policy='unchanged')

     reference = Alignment(
         ['IVGGYTCQ', 'XVGGTEAQ', 'IGG-KDT-'], alphabet=protein_alphabet)
     reference.annotations['name'] = ['seq1', 'seq2', 'seq3']

     self.assertEqual(loaded, reference)
Пример #9
0
 def test_protein_keep_annot_ws(self):
     """Check `load_fasta` with `strip_ws_in_annot=False`.

     The names of the loaded protein alignment are expected to keep
     their surrounding whitespace ('seq1  ' and ' seq2').
     """
     from multicov.align_io import load_fasta
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet

     fasta_path = os.path.join('test_data', 'test_aln1.fasta')
     loaded = load_fasta(
         fasta_path, protein_alphabet, strip_ws_in_annot=False)

     reference = Alignment(
         ['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'], alphabet=protein_alphabet)
     reference.annotations['name'] = ['seq1  ', ' seq2', 'seq3']

     self.assertEqual(loaded, reference)
Пример #10
0
    def test_multi_alpha_single_idx_from_character_alignment(self):
        """Check `binary_index_map` for single columns of a mixed alignment.

        For a 3-column RNA alignment followed by a 2-column protein
        alignment, column 1 must map to (4, 8) and column 4 to (32, 52).
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet, rna_alphabet
        from multicov.binary import binary_index_map

        rna_part = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
        protein_part = Alignment(['DF', 'YA', '-C'],
                                 alphabet=protein_alphabet)
        combined = Alignment(rna_part).add(protein_part)

        rna_column_range = binary_index_map(combined, 1)
        self.assertSequenceEqual(rna_column_range, (4, 8))

        protein_column_range = binary_index_map(combined, 4)
        self.assertSequenceEqual(protein_column_range, (32, 52))
Пример #11
0
 def test_against_lucy(self):
     """Compare the covariance matrix against a stored MATLAB reference.

     The alignment data and the reference matrix ('DCAmat') are read
     from 'lucy_dca_pdz_small.mat'. Sequence weights are updated with
     0.7 and the statistics use a regularization amount of 0.5.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.statistics import Statistics
     from scipy.io import loadmat
     import os.path

     mat_path = os.path.join('test_data', 'lucy_dca_pdz_small.mat')
     reference = loadmat(mat_path, squeeze_me=True)

     # [()] unwraps the zero-dimensional array left by squeeze_me.
     raw_data = reference['alignment']['data'][()]
     align = Alignment(raw_data, protein_alphabet)
     align.update_sequence_weights(0.7)

     stats = Statistics(align, regularization_amount=0.5)
     matches = np.allclose(stats.cmat, reference['DCAmat'])
     self.assertTrue(matches)
Пример #12
0
    def test_returns_copy_even_when_unchanged(self):
        """Check that `filter_rows` returns a new object when nothing is cut.

        With a threshold of 0.9 all three rows are expected to survive,
        yet the result must not be the same object as the input.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet, dna_alphabet
        from multicov.filtering import filter_rows

        source = Alignment(
            ['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'], protein_alphabet)
        source.add(['ATACAT', 'GATACA', 'AA--GG'], dna_alphabet)

        filtered = filter_rows(source, 0.9)

        self.assertEqual(len(filtered), 3)
        self.assertIsNot(filtered, source)
Пример #13
0
 def test_load(self):
     """Check that `from_hdf` loads 'align1' from the HDF5 fixture.

     The loaded alignment must equal the expected protein alignment,
     including its 'seqw' annotation.
     """
     from multicov.align_io import from_hdf
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from pandas import HDFStore

     # The context manager closes the store even if from_hdf raises, so a
     # failing load does not leak the open file handle.
     with HDFStore(os.path.join('test_data', 'test_aln.h5'), 'r') as store:
         align = from_hdf(store, 'align1')

     expected = Alignment(['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'],
                          protein_alphabet)
     expected.annotations['seqw'] = [0.5, 1, 0.5]
     self.assertEqual(align, expected)
Пример #14
0
    def test_binalign_object_method(self):
        """Check `BinaryAlignment.index_map` for single column indices.

        For a 3-column RNA alignment followed by a 2-column protein
        alignment, column 2 must map to (8, 12) and column 3 to (12, 32).
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet, rna_alphabet
        from multicov.binary import BinaryAlignment

        rna_part = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
        protein_part = Alignment(['DF', 'YA', '-C'],
                                 alphabet=protein_alphabet)
        combined = Alignment(rna_part).add(protein_part)

        binary = BinaryAlignment.from_alignment(combined)

        last_rna_range = binary.index_map(2)
        self.assertSequenceEqual(last_rna_range, (8, 12))

        first_protein_range = binary.index_map(3)
        self.assertSequenceEqual(first_protein_range, (12, 32))
Пример #15
0
 def test_load_multi_alpha(self):
     """Check that `from_hdf` loads the multi-alphabet entry 'align2'.

     The loaded alignment must equal a protein alignment with a DNA
     alignment added to it.
     """
     from multicov.align_io import from_hdf
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet, dna_alphabet
     from pandas import HDFStore

     # The context manager closes the store even if from_hdf raises, so a
     # failing load does not leak the open file handle.
     with HDFStore(os.path.join('test_data', 'test_aln.h5'), 'r') as store:
         align = from_hdf(store, 'align2')

     expected = Alignment(['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'],
                          protein_alphabet)
     expected2 = Alignment(['AGCT', '-G-G', 'TA-T'], dna_alphabet)
     expected.add(expected2)
     self.assertEqual(align, expected)
Пример #16
0
 def test_mask_from_first_seq(self):
     """Check `load_fasta` with a `mask_fct` that rejects 'V' and 'G'.

     The mask function returns False wherever the sequence it receives
     holds 'V' or 'G'. The loaded alignment is expected to have five
     columns left.
     NOTE(review): presumably the mask is computed from the first
     sequence, as the test name suggests -- confirm against load_fasta.
     """
     from multicov.align_io import load_fasta
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from numpy import in1d

     def keep_unless_v_or_g(seq):
         # True for every position whose letter is neither 'V' nor 'G'.
         return ~in1d(list(seq), ['V', 'G'])

     fasta_path = os.path.join('test_data', 'test_aln1.fasta')
     loaded = load_fasta(fasta_path,
                         protein_alphabet,
                         invalid_letter_policy='unchanged',
                         mask_fct=keep_unless_v_or_g)

     reference = Alignment(
         ['IYTCQ', 'XTEAQ', 'IKDT-'], alphabet=protein_alphabet)
     reference.annotations['name'] = ['seq1', 'seq2', 'seq3']

     self.assertEqual(loaded, reference)
Пример #17
0
    def test_binalign_object_method_full_map(self):
        """Check `BinaryAlignment.index_map()` called without an index.

        For a 3-column RNA alignment followed by a 2-column protein
        alignment, the full map must list one [start, stop] pair per
        column: width 4 for RNA columns and 20 for protein columns.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet, rna_alphabet
        from multicov.binary import BinaryAlignment

        rna_part = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
        protein_part = Alignment(['DF', 'YA', '-C'],
                                 alphabet=protein_alphabet)
        combined = Alignment(rna_part).add(protein_part)

        binary = BinaryAlignment.from_alignment(combined)
        whole_map = binary.index_map()

        expected_map = [[0, 4], [4, 8], [8, 12], [12, 32], [32, 52]]
        self.assertTrue(np.array_equal(whole_map, expected_map))
Пример #18
0
    def test_include_gaps_from_binalign(self):
        """Check the full index map when built with `include_gaps=True`.

        With gaps included, each RNA column is expected to span 5 binary
        positions and each protein column 21.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet, rna_alphabet
        from multicov.binary import BinaryAlignment

        rna_part = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
        protein_part = Alignment(['DF', 'YA', '-C'],
                                 alphabet=protein_alphabet)
        combined = Alignment(rna_part).add(protein_part)

        binary = BinaryAlignment.from_alignment(combined, include_gaps=True)
        whole_map = binary.index_map()

        expected_map = [[0, 5], [5, 10], [10, 15], [15, 36], [36, 57]]
        self.assertTrue(np.array_equal(whole_map, expected_map))
Пример #19
0
    def test_copy_alpha_annots_refmap_by_ref_for_binalign(self):
        """Check that `Statistics` shares attributes with a binary alignment.

        The `alphabets`, `reference` and `annotations` attributes of the
        `Statistics` object must be the very same objects as those of the
        binary alignment it was built from.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet
        from multicov.statistics import Statistics

        sequences = [
            'WKHNA', 'KHRCD', 'LGVVG', 'LIGDD',
            'CMPRY', 'QWFWR', 'VTMPE', 'LNYIN',
            'WHV-E', 'PIWGG', 'PPCWV', 'E-MWR',
            'RFGKF', 'CGRCG', 'T-PMV', 'LNCPY',
        ]
        binary = Alignment(sequences, protein_alphabet).to_binary()

        stats = Statistics(binary)

        # Identity, not equality: the attributes are shared by reference.
        self.assertIs(stats.alphabets, binary.alphabets)
        self.assertIs(stats.reference, binary.reference)
        self.assertIs(stats.annotations, binary.annotations)
Пример #20
0
    def test_on_multi_alpha(self):
        """Check `filter_rows` on a protein + DNA alignment.

        With a threshold of 0.2 only the first two rows are expected to
        remain, in both the protein and the DNA part.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet, dna_alphabet
        from multicov.filtering import filter_rows

        source = Alignment(
            ['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'], protein_alphabet)
        source.add(['ATACAT', 'GATACA', 'AA--GG'], dna_alphabet)

        filtered = filter_rows(source, 0.2)

        kept_protein = Alignment(['IVGGYTCQ', '-VGGTEAQ'], protein_alphabet)
        reference = kept_protein.add(['ATACAT', 'GATACA'], dna_alphabet)

        self.assertEqual(filtered, reference)
Пример #21
0
    def test_on_protein(self):
        """Check `filter_rows` with `max_gaps` on a 16-row protein alignment.

        The filtered alignment must be shorter than the original, contain
        no row whose gap fraction exceeds the threshold, and have as many
        rows as the original has rows at or below the threshold.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet
        from multicov.filtering import filter_rows
        # Float literal keeps the threshold at 1/21 even where `/` performs
        # integer division (Python 2 without `from __future__ import division`).
        threshold = 1.0 / 21
        align = Alignment([
            'WKHNAYDMLSSDCQFESSHKHTQVCSAGMGYCAKPNNWGYW-LIVKMMW-CDYKQKLIYIAPLN',
            'KHRCDANC-MLAN-SVIKYTHSACALIWTWNS-KIIRYFFVGAWFKEHFDSVPTAQACVCDSTP',
            'LGVVGYYFKPCT-EVPSYSRFNVFHRIFPYLVYRVEE-NHTGHHVQ-KIVRNQYELRSIFDEHG',
            'LIGDDHRN-LALCPS-T-GTTCCNWKWRSEWTMHSDTNCNPVAE--SYSKRCNDIGYITWINYA',
            'CMPRYWYTYQYDCIFGWRFYSVYWPCLDDMFWQPYVDSMELF-NPMVATEWIMENCQGWG-N-K',
            'QWFWRARPFE--FSC-C-PGP-GWVNLIDWMSCNKAMETLMRPYCNPYLKIQLPRSKNLLDDDG',
            'VTMPEGHHCPAM-PLDLNGQR-KMWGSDFKKEDCKGYPEKFDCENLIDMDICLSLNTRPED-QR',
            'LNYINMHVD-IGP-PCPQYDL--KFKCMYW-GQIEDV-NMQ-WKK-RTMDAVEQIVSMYHMSVE',
            'WHV-EWKPVLC-PHWQFYM-VITEYVAMFQWCPPKGMASPKKGNLPRMFQSAKAIGAHRSDM-Y',
            'PIWGGFNFPWID-GSQRQQR-EVTTGCDDFEHKYNPYLVPG-WEFGKYSNCWT-RCWRVNHDTV',
            'PPCWVEAPYKPMGMWN-GRKV-NVAVWHHVIVL-DMYGLHLLRDWTMVKNAAHIFSHNMEMSNI',
            'E-MWRGLIWSKGAY-YQNDNGTFNWPKQKHP-ARCSF-PTVNKDQNPGP-MVQMREFKSQQGQQ',
            'RFGKFTCMGFRWKEYFTKQ-NPYKYRGIVHVKVQMIYSANGNLDWIDIPMIIRLKCPFGTRVTQ',
            'CGRCGSH-EWL-NIMRNCKFIFWWRPTNAAHIWCARHESPKAD-QIAMTYRML-LDAHIIIVR-',
            'T-PMVWRLVWYDHGCDPWMLIV-PIEPCVVKKPQYKDMERFSPDIKCHYLHDKDDGFWGSDKYI',
            'LNCPYADLDGL-NPQR-FVVS-RCMRDGFRAVVRVSPDDLS-MWCKAGA-NTTV-DNRH-IVQW'
        ], protein_alphabet)
        align_clean = filter_rows(align, max_gaps=threshold)

        # Per-row fraction of gap characters, before and after filtering.
        # noinspection PyTypeChecker
        gap_fraction = np.mean(align.data == '-', axis=1)
        # noinspection PyTypeChecker
        gap_fraction_clean = np.mean(align_clean.data == '-', axis=1)

        self.assertLess(len(align_clean), len(align))
        self.assertLessEqual(np.max(gap_fraction_clean), threshold)
        self.assertEqual(np.sum(gap_fraction <= threshold), len(align_clean))
Пример #22
0
    def test_search_dna(self):
        """Check that `search` returns index 2 for 'AAGG' in a DNA alignment.

        The expected row is stored as 'AA--GG', i.e. the query without
        the gap characters.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import dna_alphabet
        from multicov.filtering import search

        rows = ['ATACAT', 'GATACA', 'AA--GG']
        align = Alignment(rows, dna_alphabet)

        found_at = search(align, 'AAGG')
        self.assertEqual(found_at, 2)
Пример #23
0
 def test_get_str_goes_to_annotations(self):
     """Check that indexing a `BinaryAlignment` by string hits annotations.

     `bin_align['seqw']` must be the very same object as
     `bin_align.annotations['seqw']`.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import rna_alphabet
     from multicov.binary import BinaryAlignment

     source = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
     binary = BinaryAlignment.from_alignment(source)

     via_getitem = binary['seqw']
     via_annotations = binary.annotations['seqw']
     self.assertIs(via_getitem, via_annotations)
Пример #24
0
    def test_search_approx(self):
        """Check that `search` returns index 0 for a near match.

        The query 'IGGYTCQ' differs from the first row 'IVGGYTCQ' by one
        missing letter, and row 0 is the expected result.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet
        from multicov.filtering import search

        rows = ['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-']
        align = Alignment(rows, alphabet=protein_alphabet)

        found_at = search(align, 'IGGYTCQ')
        self.assertEqual(found_at, 0)
Пример #25
0
    def test_search_list(self):
        """Check that `search` accepts the query as a list of letters.

        The query ['I', 'G', 'G', 'K', 'D', 'T'] is expected to be found
        at row 2, stored as 'IGG-KDT-'.
        """
        from multicov.alignment import Alignment
        from multicov.alphabet import protein_alphabet
        from multicov.filtering import search

        rows = ['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-']
        align = Alignment(rows, alphabet=protein_alphabet)

        query = ['I', 'G', 'G', 'K', 'D', 'T']
        self.assertEqual(search(align, query), 2)
Пример #26
0
 def test_details_align_accuracy(self):
     """Check the 'align_accuracy' entry returned by `align_to_sequence`.

     For the given protein alignment and reference sequence, the
     reported accuracy is expected to be 5/6.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.filtering import align_to_sequence
     align = Alignment(['-AWGGH', 'D-GG-A', 'WWGYPD', 'W--IIK', '--FDGH'],
                       protein_alphabet)
     details = align_to_sequence(align, 'AWCWGYPPCY')
     # Float literal keeps the expected value at 5/6 even where `/`
     # performs integer division (Python 2 without the __future__ import).
     self.assertAlmostEqual(details['align_accuracy'], 5.0 / 6)
Пример #27
0
 def test_details_index(self):
     """Check the 'idx' entry returned by `align_to_sequence`.

     For the given RNA alignment and the reference 'GUAACCGUU', the
     reported index is expected to be 1.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import rna_alphabet
     from multicov.filtering import align_to_sequence

     rows = ['AG-U', 'ACGU', '--GG', 'A-GG', 'GGC-']
     align = Alignment(rows, rna_alphabet)

     result = align_to_sequence(align, 'GUAACCGUU')
     self.assertEqual(result['idx'], 1)
Пример #28
0
 def test_with_list_of_seqs(self):
     """Check that `MaxentModel.score` accepts a plain list of strings.

     Scoring a list of sequences must give the same energies as scoring
     an `Alignment` built from that list with the protein alphabet.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import protein_alphabet
     from multicov.binary import binary_index_map
     from multicov.statistics import Statistics, MaxentModel

     training_rows = [
         'WKHNAY', 'KHRCDA', 'LGVVGY', 'LIGDDH',
         'CMPRYW', 'QWFWRA', 'VTMPEG', 'LNYINM',
         'WHV-EW', 'PIWGGF', 'PPCWVE', 'E-MWRG',
         'RFGKFT', 'CGRCGS', 'T-PMVW', 'LNCPYA',
     ]
     training = Alignment(training_rows, protein_alphabet)
     model = MaxentModel(Statistics(training, regularization_amount=0.1))

     queries = ['WHVDYA', 'PP-FR-']
     from_strings = model.score(queries)
     from_alignment = model.score(Alignment(queries, protein_alphabet))

     self.assertTrue(np.allclose(from_strings, from_alignment))
Пример #29
0
 def test_rna_roundtrip(self):
     """Check that an RNA alignment survives a trip through binary form.

     Converting to a `BinaryAlignment` and back with `to_alignment` must
     give an alignment equal to the original.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import rna_alphabet
     from multicov.binary import BinaryAlignment

     original = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
     binary = BinaryAlignment.from_alignment(original)
     restored = binary.to_alignment()

     self.assertEqual(original, restored)
Пример #30
0
 def test_single_alpha_full_map(self):
     """Check `binary_index_map` on a single-alphabet binary alignment.

     For a 3-column RNA alignment the full map must be three consecutive
     [start, stop] pairs, each of width 4.
     """
     from multicov.alignment import Alignment
     from multicov.alphabet import rna_alphabet
     from multicov.binary import BinaryAlignment, binary_index_map

     source = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
     binary = BinaryAlignment.from_alignment(source)

     whole_map = binary_index_map(binary)
     expected_map = [[0, 4], [4, 8], [8, 12]]
     self.assertTrue(np.array_equal(whole_map, expected_map))