def test_multi_alpha_diagonalness_of_blockdiagonal_blocks(self):
    """Check that the diagonal blocks of the couplings are themselves diagonal.

    A protein + DNA + RNA alignment is assembled from the FASTA fixtures and
    a ``MaxentModel`` is built from its regularized statistics. For every
    index range yielded by ``binary_index_map(stats)``, the matching square
    block of ``maxent.couplings`` must have all off-diagonal entries smaller
    than 1e-10 in absolute value.
    """
    from os.path import join

    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
    from multicov.align_io import load_fasta
    from multicov.binary import binary_index_map
    from multicov.statistics import Statistics, MaxentModel

    protein_part = load_fasta(
        join('test_data', 'test_aln1.fasta'), protein_alphabet,
        invalid_letter_policy='gap')
    dna_part = load_fasta(
        join('test_data', 'test_aln2.fasta'), dna_alphabet,
        invalid_letter_policy='gap')
    rna_part = load_fasta(
        join('test_data', 'test_aln2.fasta'), rna_alphabet,
        invalid_letter_policy='gap')

    combined = Alignment(protein_part)
    combined.add(dna_part).add(rna_part)

    stats = Statistics(combined, regularization_amount=0.5)
    maxent = MaxentModel(stats)

    tolerance = 1e-10
    for index_range in binary_index_map(stats):
        block_slice = slice(index_range[0], index_range[1])
        block = maxent.couplings[block_slice, block_slice]
        # Subtracting the diagonal leaves only the off-diagonal entries.
        off_diagonal = block - np.diag(np.diag(block))
        self.assertLess(np.max(np.abs(off_diagonal)), tolerance)
def test_delayed_evaluation(self):
    """Check that ``Statistics`` reflects edits made to the alignment later.

    The ``Statistics`` object is created first; two rows of ``align.data``
    are then overwritten. The test verifies that the reference frequencies
    really changed, and that ``stats.freq1``, ``stats.freq2`` and
    ``stats.cmat`` agree with values computed from the modified alignment.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet
    from multicov.statistics import Statistics

    rows = [
        'WKHNA', 'KHRCD', 'LGVVG', 'LIGDD', 'CMPRY', 'QWFWR', 'VTMPE', 'LNYIN',
        'WHV-E', 'PIWGG', 'PPCWV', 'E-MWR', 'RFGKF', 'CGRCG', 'T-PMV', 'LNCPY'
    ]
    align = Alignment(rows, protein_alphabet)
    stats = Statistics(align)

    # Reference values for the alignment as it was when `stats` was created.
    old_expected_f1 = _slow_get_freq1(align)
    old_expected_f2 = _slow_get_freq2(align)
    old_expected_cmat = old_expected_f2 - np.outer(old_expected_f1,
                                                   old_expected_f1)

    # modify align, and test that the statistics are calculated for the
    # modified one
    align.data[1, :] = list('CMPRY')
    align.data[10, :] = list('KHRCD')

    expected_f1 = _slow_get_freq1(align)
    expected_f2 = _slow_get_freq2(align)
    expected_cmat = expected_f2 - np.outer(expected_f1, expected_f1)

    # Sanity check: the edit must have changed every reference quantity.
    changed_pairs = (
        (expected_f1, old_expected_f1),
        (expected_f2, old_expected_f2),
        (expected_cmat, old_expected_cmat),
    )
    for new_value, old_value in changed_pairs:
        self.assertFalse(np.allclose(new_value, old_value))

    self.assertTrue(np.allclose(stats.freq1, expected_f1))
    self.assertTrue(np.allclose(stats.freq2, expected_f2))
    self.assertTrue(np.allclose(stats.cmat, expected_cmat))
def test_multi_alpha_shape_and_symmetry_of_couplings(self):
    """Check the couplings of a DNA + protein + RNA model: symmetry and size.

    The couplings matrix must equal its transpose to within 1e-10, and both
    of its dimensions must be 4 per DNA/RNA column plus 20 per protein
    column.
    """
    from os.path import join

    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet, dna_alphabet, rna_alphabet
    from multicov.align_io import load_fasta
    from multicov.statistics import Statistics, MaxentModel

    dna_part = load_fasta(
        join('test_data', 'test_aln2.fasta'), dna_alphabet,
        invalid_letter_policy='gap')
    protein_part = load_fasta(
        join('test_data', 'test_aln1.fasta'), protein_alphabet,
        invalid_letter_policy='gap')
    rna_part = load_fasta(
        join('test_data', 'test_aln2.fasta'), rna_alphabet,
        invalid_letter_policy='uppergap')

    combined = Alignment(dna_part)
    combined.add(protein_part).add(rna_part)

    stats = Statistics(combined, regularization_amount=0.5)
    maxent = MaxentModel(stats)

    asymmetry = np.abs(maxent.couplings - maxent.couplings.T)
    self.assertLess(np.max(asymmetry), 1e-10)

    nucleotide_columns = dna_part.get_width() + rna_part.get_width()
    expected_size = 4 * nucleotide_columns + 20 * protein_part.get_width()
    self.assertSequenceEqual(np.shape(maxent.couplings),
                             [expected_size, expected_size])
def test_protein(self):
    """Check pseudocount regularization against values derived by hand.

    The expected frequencies are built from the unregularized statistics by
    mixing in a uniform contribution over 21 states with weight ``alpha``;
    the diagonal 20x20 blocks of the pairwise frequencies are then replaced
    by the diagonal of the single-site frequencies.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet
    from multicov.statistics import Statistics

    rows = [
        'WKHNA', 'KHRCD', 'LGVVG', 'LIGDD', 'CMPRY', 'QWFWR', 'VTMPE', 'LNYIN',
        'WHV-E', 'PIWGG', 'PPCWV', 'E-MWR', 'RFGKF', 'CGRCG', 'T-PMV', 'LNCPY'
    ]
    align = Alignment(rows, protein_alphabet)

    alpha = 0.3
    plain_stats = Statistics(align)
    regularized_stats = Statistics(align, regularization_amount=alpha,
                                   regularizer='pseudocount')

    data_weight = 1 - alpha
    expected_f1 = data_weight * plain_stats.freq1 + alpha / 21.0
    expected_f2 = data_weight * plain_stats.freq2 + alpha / 21.0**2

    # block-diagonal needs correction because those variables are not
    # independent
    for column in range(align.get_width()):
        block_start = 20 * column
        block = slice(block_start, block_start + 20)
        # noinspection PyUnresolvedReferences
        expected_f2[block, block] = np.diag(expected_f1[block])

    expected_cmat = expected_f2 - np.outer(expected_f1, expected_f1)

    # noinspection PyTypeChecker
    self.assertTrue(np.allclose(expected_f1, regularized_stats.freq1))
    # noinspection PyTypeChecker
    self.assertTrue(np.allclose(expected_f2, regularized_stats.freq2))
    # noinspection PyTypeChecker
    self.assertTrue(np.allclose(expected_cmat, regularized_stats.cmat))
def test_with_truncate(self):
    """Check ``align_to_sequence`` with ``truncate=True`` on an RNA alignment.

    After the call, the alignment itself is expected to equal a three-column
    alignment, i.e. the function is expected to modify its argument in place.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import rna_alphabet
    from multicov.filtering import align_to_sequence

    rna_align = Alignment(['AA-U', 'ACGU', '--GG', 'A-GG', 'GGC-'],
                          rna_alphabet)
    align_to_sequence(rna_align, 'AGG', truncate=True)

    truncated = Alignment(['A-U', 'AGU', '-GG', 'AGG', 'GC-'], rna_alphabet)
    self.assertEqual(rna_align, truncated)
def test_replace_invalid_by_uppercase_then_leave(self):
    """Check ``load_fasta`` on the DNA fixture with the ``'upper'`` policy.

    The expected alignment has only upper-case letters but still contains
    the ``'.'`` character in the last sequence.
    """
    from multicov.alignment import Alignment
    from multicov.align_io import load_fasta
    from multicov.alphabet import dna_alphabet

    fasta_path = os.path.join('test_data', 'test_aln2.fasta')
    loaded = load_fasta(fasta_path, dna_alphabet,
                        invalid_letter_policy='upper')

    reference = Alignment(['GATTACA', 'ACCA--T', 'G.C-A-C'], dna_alphabet)
    reference.annotations['name'] = ['one', 'sequence', 'one line']

    self.assertEqual(loaded, reference)
def test_dna_unchanged_invalid(self):
    """Check ``load_fasta`` on the DNA fixture with the ``'unchanged'`` policy.

    The expected alignment keeps the lower-case letters and the ``'.'``
    character of the last sequence as they are.
    """
    from multicov.alignment import Alignment
    from multicov.align_io import load_fasta
    from multicov.alphabet import dna_alphabet

    fasta_path = os.path.join('test_data', 'test_aln2.fasta')
    loaded = load_fasta(fasta_path, dna_alphabet,
                        invalid_letter_policy='unchanged')

    reference = Alignment(['GATTACA', 'ACCA--T', 'G.c-a-c'], dna_alphabet)
    reference.annotations['name'] = ['one', 'sequence', 'one line']

    self.assertEqual(loaded, reference)
def test_protein_unchanged_invalid(self):
    """Check ``load_fasta`` on the protein fixture with ``'unchanged'`` policy.

    The expected alignment retains the ``'X'`` at the start of the second
    sequence.
    """
    from multicov.alignment import Alignment
    from multicov.align_io import load_fasta
    from multicov.alphabet import protein_alphabet

    fasta_path = os.path.join('test_data', 'test_aln1.fasta')
    loaded = load_fasta(fasta_path, protein_alphabet,
                        invalid_letter_policy='unchanged')

    reference = Alignment(['IVGGYTCQ', 'XVGGTEAQ', 'IGG-KDT-'],
                          alphabet=protein_alphabet)
    reference.annotations['name'] = ['seq1', 'seq2', 'seq3']

    self.assertEqual(loaded, reference)
def test_protein_keep_annot_ws(self):
    """Check ``load_fasta`` with ``strip_ws_in_annot=False`` on the protein file.

    The expected ``'name'`` annotations keep their surrounding whitespace.
    """
    from multicov.alignment import Alignment
    from multicov.align_io import load_fasta
    from multicov.alphabet import protein_alphabet

    fasta_path = os.path.join('test_data', 'test_aln1.fasta')
    loaded = load_fasta(fasta_path, protein_alphabet, strip_ws_in_annot=False)

    reference = Alignment(['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'],
                          alphabet=protein_alphabet)
    reference.annotations['name'] = ['seq1 ', ' seq2', 'seq3']

    self.assertEqual(loaded, reference)
def test_multi_alpha_single_idx_from_character_alignment(self):
    """Check ``binary_index_map`` for single columns of an RNA + protein mix.

    Column 1 (RNA) is expected to map to ``(4, 8)`` and column 4 (protein)
    to ``(32, 52)``.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet, rna_alphabet
    from multicov.binary import binary_index_map

    rna_part = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
    protein_part = Alignment(['DF', 'YA', '-C'], alphabet=protein_alphabet)
    combined = Alignment(rna_part).add(protein_part)

    rna_column_range = binary_index_map(combined, 1)
    self.assertSequenceEqual(rna_column_range, (4, 8))

    protein_column_range = binary_index_map(combined, 4)
    self.assertSequenceEqual(protein_column_range, (32, 52))
def test_against_lucy(self):
    """Compare the covariance matrix with the one stored in a ``.mat`` file.

    The alignment data and the reference matrix (``'DCAmat'``) both come from
    ``test_data/lucy_dca_pdz_small.mat``. Sequence weights are updated with
    0.7 and the statistics use a regularization amount of 0.5.
    """
    import os.path

    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet
    from multicov.statistics import Statistics
    from scipy.io import loadmat

    mat_path = os.path.join('test_data', 'lucy_dca_pdz_small.mat')
    lucy = loadmat(mat_path, squeeze_me=True)

    # Indexing with () unwraps the value held in the squeezed struct field.
    raw_alignment = lucy['alignment']['data'][()]
    align = Alignment(raw_alignment, protein_alphabet)
    align.update_sequence_weights(0.7)

    stats = Statistics(align, regularization_amount=0.5)
    self.assertTrue(np.allclose(stats.cmat, lucy['DCAmat']))
def test_returns_copy_even_when_unchanged(self):
    """Check that ``filter_rows`` returns a new object when no row is dropped.

    With a threshold of 0.9 all three rows are expected to survive, yet the
    result must not be the very same object as the input.
    """
    from multicov.alignment import Alignment
    from multicov.filtering import filter_rows
    from multicov.alphabet import protein_alphabet, dna_alphabet

    source = Alignment(['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'], protein_alphabet)
    source.add(['ATACAT', 'GATACA', 'AA--GG'], dna_alphabet)

    filtered = filter_rows(source, 0.9)

    self.assertEqual(len(filtered), 3)
    self.assertIsNot(filtered, source)
def test_load(self):
    """Load a single-alphabet alignment from the HDF5 fixture.

    Reads the ``'align1'`` entry of ``test_data/test_aln.h5`` through
    ``from_hdf`` and compares it with a hand-built protein alignment that
    carries ``'seqw'`` annotations.
    """
    from multicov.align_io import from_hdf
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet
    from pandas import HDFStore

    # The context manager closes the store even when from_hdf raises, so a
    # failing load does not leave the HDF5 file handle open for later tests.
    with HDFStore(os.path.join('test_data', 'test_aln.h5'), 'r') as store:
        align = from_hdf(store, 'align1')

    expected = Alignment(['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'],
                         protein_alphabet)
    expected.annotations['seqw'] = [0.5, 1, 0.5]

    self.assertEqual(align, expected)
def test_binalign_object_method(self):
    """Check ``BinaryAlignment.index_map`` for single columns.

    For an RNA (3 columns) + protein (2 columns) alignment, column 2 is
    expected to map to ``(8, 12)`` and column 3 to ``(12, 32)``.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet, rna_alphabet
    from multicov.binary import BinaryAlignment

    rna_part = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
    protein_part = Alignment(['DF', 'YA', '-C'], alphabet=protein_alphabet)
    combined = Alignment(rna_part).add(protein_part)

    binary = BinaryAlignment.from_alignment(combined)

    last_rna_range = binary.index_map(2)
    self.assertSequenceEqual(last_rna_range, (8, 12))

    first_protein_range = binary.index_map(3)
    self.assertSequenceEqual(first_protein_range, (12, 32))
def test_load_multi_alpha(self):
    """Load a protein + DNA alignment from the HDF5 fixture.

    Reads the ``'align2'`` entry of ``test_data/test_aln.h5`` through
    ``from_hdf`` and compares it with a hand-built alignment made of a
    protein part followed by a DNA part.
    """
    from multicov.align_io import from_hdf
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet, dna_alphabet
    from pandas import HDFStore

    # The context manager closes the store even when from_hdf raises, so a
    # failing load does not leave the HDF5 file handle open for later tests.
    with HDFStore(os.path.join('test_data', 'test_aln.h5'), 'r') as store:
        align = from_hdf(store, 'align2')

    expected = Alignment(['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'],
                         protein_alphabet)
    expected2 = Alignment(['AGCT', '-G-G', 'TA-T'], dna_alphabet)
    expected.add(expected2)

    self.assertEqual(align, expected)
def test_mask_from_first_seq(self):
    """Check ``load_fasta`` with a ``mask_fct`` that rejects 'V' and 'G'.

    The expected alignment lacks columns 1-3, which are the positions where
    the first sequence of the fixture holds 'V' or 'G'.
    """
    from multicov.alignment import Alignment
    from multicov.align_io import load_fasta
    from multicov.alphabet import protein_alphabet
    from numpy import in1d

    def keep_position(s):
        # True wherever the letter is neither 'V' nor 'G'.
        return ~in1d(list(s), ['V', 'G'])

    fasta_path = os.path.join('test_data', 'test_aln1.fasta')
    loaded = load_fasta(fasta_path, protein_alphabet,
                        invalid_letter_policy='unchanged',
                        mask_fct=keep_position)

    reference = Alignment(['IYTCQ', 'XTEAQ', 'IKDT-'],
                          alphabet=protein_alphabet)
    reference.annotations['name'] = ['seq1', 'seq2', 'seq3']

    self.assertEqual(loaded, reference)
def test_binalign_object_method_full_map(self):
    """Check ``BinaryAlignment.index_map()`` called without an index.

    For an RNA (3 columns) + protein (2 columns) alignment the full map is
    expected to list ranges of width 4 for RNA and width 20 for protein.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet, rna_alphabet
    from multicov.binary import BinaryAlignment

    rna_part = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
    protein_part = Alignment(['DF', 'YA', '-C'], alphabet=protein_alphabet)
    combined = Alignment(rna_part).add(protein_part)

    binary = BinaryAlignment.from_alignment(combined)
    full_map = binary.index_map()

    expected_map = [[0, 4], [4, 8], [8, 12], [12, 32], [32, 52]]
    self.assertTrue(np.array_equal(full_map, expected_map))
def test_include_gaps_from_binalign(self):
    """Check the full index map of a binary alignment built with gaps.

    With ``include_gaps=True`` the expected ranges are one wider than without
    it: width 5 for each RNA column and width 21 for each protein column.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet, rna_alphabet
    from multicov.binary import BinaryAlignment

    rna_part = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
    protein_part = Alignment(['DF', 'YA', '-C'], alphabet=protein_alphabet)
    combined = Alignment(rna_part).add(protein_part)

    binary = BinaryAlignment.from_alignment(combined, include_gaps=True)
    full_map = binary.index_map()

    expected_map = [[0, 5], [5, 10], [10, 15], [15, 36], [36, 57]]
    self.assertTrue(np.array_equal(full_map, expected_map))
def test_copy_alpha_annots_refmap_by_ref_for_binalign(self):
    """Check that ``Statistics`` shares metadata with its binary alignment.

    The ``alphabets``, ``reference`` and ``annotations`` attributes of the
    statistics object must be the very same objects as those of the binary
    alignment it was built from.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet
    from multicov.statistics import Statistics

    rows = [
        'WKHNA', 'KHRCD', 'LGVVG', 'LIGDD', 'CMPRY', 'QWFWR', 'VTMPE', 'LNYIN',
        'WHV-E', 'PIWGG', 'PPCWV', 'E-MWR', 'RFGKF', 'CGRCG', 'T-PMV', 'LNCPY'
    ]
    binary = Alignment(rows, protein_alphabet).to_binary()
    stats = Statistics(binary)

    # Identity, not equality: the attributes must not have been copied.
    self.assertIs(stats.alphabets, binary.alphabets)
    self.assertIs(stats.reference, binary.reference)
    self.assertIs(stats.annotations, binary.annotations)
def test_on_multi_alpha(self):
    """Check ``filter_rows`` on a protein + DNA alignment.

    With a threshold of 0.2 the third row is expected to be removed from
    both the protein and the DNA part.
    """
    from multicov.alignment import Alignment
    from multicov.filtering import filter_rows
    from multicov.alphabet import protein_alphabet, dna_alphabet

    source = Alignment(['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'], protein_alphabet)
    source.add(['ATACAT', 'GATACA', 'AA--GG'], dna_alphabet)

    filtered = filter_rows(source, 0.2)

    reference = Alignment(['IVGGYTCQ', '-VGGTEAQ'], protein_alphabet).add(
        ['ATACAT', 'GATACA'], dna_alphabet)
    self.assertEqual(filtered, reference)
def test_on_protein(self):
    """Check ``filter_rows`` with ``max_gaps`` on a wide protein alignment.

    The filtered alignment must be shorter than the input, none of its rows
    may have a gap fraction above the threshold, and its length must equal
    the number of input rows whose gap fraction is within the threshold.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet
    from multicov.filtering import filter_rows

    threshold = 1 / 21
    rows = [
        'WKHNAYDMLSSDCQFESSHKHTQVCSAGMGYCAKPNNWGYW-LIVKMMW-CDYKQKLIYIAPLN',
        'KHRCDANC-MLAN-SVIKYTHSACALIWTWNS-KIIRYFFVGAWFKEHFDSVPTAQACVCDSTP',
        'LGVVGYYFKPCT-EVPSYSRFNVFHRIFPYLVYRVEE-NHTGHHVQ-KIVRNQYELRSIFDEHG',
        'LIGDDHRN-LALCPS-T-GTTCCNWKWRSEWTMHSDTNCNPVAE--SYSKRCNDIGYITWINYA',
        'CMPRYWYTYQYDCIFGWRFYSVYWPCLDDMFWQPYVDSMELF-NPMVATEWIMENCQGWG-N-K',
        'QWFWRARPFE--FSC-C-PGP-GWVNLIDWMSCNKAMETLMRPYCNPYLKIQLPRSKNLLDDDG',
        'VTMPEGHHCPAM-PLDLNGQR-KMWGSDFKKEDCKGYPEKFDCENLIDMDICLSLNTRPED-QR',
        'LNYINMHVD-IGP-PCPQYDL--KFKCMYW-GQIEDV-NMQ-WKK-RTMDAVEQIVSMYHMSVE',
        'WHV-EWKPVLC-PHWQFYM-VITEYVAMFQWCPPKGMASPKKGNLPRMFQSAKAIGAHRSDM-Y',
        'PIWGGFNFPWID-GSQRQQR-EVTTGCDDFEHKYNPYLVPG-WEFGKYSNCWT-RCWRVNHDTV',
        'PPCWVEAPYKPMGMWN-GRKV-NVAVWHHVIVL-DMYGLHLLRDWTMVKNAAHIFSHNMEMSNI',
        'E-MWRGLIWSKGAY-YQNDNGTFNWPKQKHP-ARCSF-PTVNKDQNPGP-MVQMREFKSQQGQQ',
        'RFGKFTCMGFRWKEYFTKQ-NPYKYRGIVHVKVQMIYSANGNLDWIDIPMIIRLKCPFGTRVTQ',
        'CGRCGSH-EWL-NIMRNCKFIFWWRPTNAAHIWCARHESPKAD-QIAMTYRML-LDAHIIIVR-',
        'T-PMVWRLVWYDHGCDPWMLIV-PIEPCVVKKPQYKDMERFSPDIKCHYLHDKDDGFWGSDKYI',
        'LNCPYADLDGL-NPQR-FVVS-RCMRDGFRAVVRVSPDDLS-MWCKAGA-NTTV-DNRH-IVQW'
    ]
    source = Alignment(rows, protein_alphabet)
    filtered = filter_rows(source, max_gaps=threshold)

    # Fraction of '-' characters in each row, before and after filtering.
    # noinspection PyTypeChecker
    gaps_before = np.mean(source.data == '-', axis=1)
    # noinspection PyTypeChecker
    gaps_after = np.mean(filtered.data == '-', axis=1)

    self.assertLess(len(filtered), len(source))
    self.assertLessEqual(np.max(gaps_after), threshold)
    self.assertEqual(np.sum(gaps_before <= threshold), len(filtered))
def test_search_dna(self):
    """Check that ``search`` returns row 2 for ``'AAGG'`` in a DNA alignment.

    Row 2 of the alignment is ``'AA--GG'``.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import dna_alphabet
    from multicov.filtering import search

    dna_align = Alignment(['ATACAT', 'GATACA', 'AA--GG'], dna_alphabet)
    found_row = search(dna_align, 'AAGG')

    self.assertEqual(found_row, 2)
def test_get_str_goes_to_annotations(self):
    """Check that indexing a binary alignment by string hits its annotations.

    ``bin_align['seqw']`` must be the very same object as
    ``bin_align.annotations['seqw']``.
    """
    from multicov.alignment import Alignment
    from multicov.binary import BinaryAlignment
    from multicov.alphabet import rna_alphabet

    rna_align = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
    binary = BinaryAlignment.from_alignment(rna_align)

    via_getitem = binary['seqw']
    self.assertIs(via_getitem, binary.annotations['seqw'])
def test_search_approx(self):
    """Check that ``search`` returns row 0 for a query missing one letter.

    The query ``'IGGYTCQ'`` is row 0 (``'IVGGYTCQ'``) without its 'V'.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet
    from multicov.filtering import search

    protein_align = Alignment(['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'],
                              alphabet=protein_alphabet)
    found_row = search(protein_align, 'IGGYTCQ')

    self.assertEqual(found_row, 0)
def test_search_list(self):
    """Check that ``search`` accepts the query as a list of letters.

    The letters spell ``'IGGKDT'`` and row 2 (``'IGG-KDT-'``) is expected.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet
    from multicov.filtering import search

    protein_align = Alignment(['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'],
                              alphabet=protein_alphabet)
    query_letters = ['I', 'G', 'G', 'K', 'D', 'T']
    found_row = search(protein_align, query_letters)

    self.assertEqual(found_row, 2)
def test_details_align_accuracy(self):
    """Check the ``'align_accuracy'`` entry returned by ``align_to_sequence``.

    For the given protein alignment and reference sequence the reported
    accuracy is expected to be ``5 / 6``.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet
    from multicov.filtering import align_to_sequence

    protein_align = Alignment(
        ['-AWGGH', 'D-GG-A', 'WWGYPD', 'W--IIK', '--FDGH'], protein_alphabet)
    details = align_to_sequence(protein_align, 'AWCWGYPPCY')

    accuracy = details['align_accuracy']
    self.assertAlmostEqual(accuracy, 5 / 6)
def test_details_index(self):
    """Check the ``'idx'`` entry returned by ``align_to_sequence``.

    For the given RNA alignment and reference sequence the reported index is
    expected to be 1.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import rna_alphabet
    from multicov.filtering import align_to_sequence

    rna_align = Alignment(['AG-U', 'ACGU', '--GG', 'A-GG', 'GGC-'],
                          rna_alphabet)
    details = align_to_sequence(rna_align, 'GUAACCGUU')

    matched_index = details['idx']
    self.assertEqual(matched_index, 1)
def test_with_list_of_seqs(self):
    """Check that ``MaxentModel.score`` accepts a plain list of sequences.

    Scoring a list of strings must give values close to those obtained by
    scoring an ``Alignment`` built from the same strings.
    """
    from multicov.alignment import Alignment
    from multicov.binary import binary_index_map
    from multicov.alphabet import protein_alphabet
    from multicov.statistics import Statistics, MaxentModel

    training_rows = [
        'WKHNAY', 'KHRCDA', 'LGVVGY', 'LIGDDH', 'CMPRYW', 'QWFWRA', 'VTMPEG',
        'LNYINM', 'WHV-EW', 'PIWGGF', 'PPCWVE', 'E-MWRG', 'RFGKFT', 'CGRCGS',
        'T-PMVW', 'LNCPYA'
    ]
    training_align = Alignment(training_rows, protein_alphabet)
    stats = Statistics(training_align, regularization_amount=0.1)
    maxent = MaxentModel(stats)

    seqs = ['WHVDYA', 'PP-FR-']
    energies_from_list = maxent.score(seqs)

    seqs_as_alignment = Alignment(seqs, protein_alphabet)
    energies_from_alignment = maxent.score(seqs_as_alignment)

    self.assertTrue(np.allclose(energies_from_list, energies_from_alignment))
def test_rna_roundtrip(self):
    """Check that an RNA alignment survives a trip through binary form.

    Converting with ``BinaryAlignment.from_alignment`` and back with
    ``to_alignment`` must give an alignment equal to the original.
    """
    from multicov.alignment import Alignment
    from multicov.binary import BinaryAlignment
    from multicov.alphabet import rna_alphabet

    original = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
    binary = BinaryAlignment.from_alignment(original)
    restored = binary.to_alignment()

    self.assertEqual(original, restored)
def test_single_alpha_full_map(self):
    """Check ``binary_index_map`` on a binary RNA alignment with no index.

    For three RNA columns the expected map is three consecutive ranges of
    width 4.
    """
    from multicov.alignment import Alignment
    from multicov.binary import BinaryAlignment, binary_index_map
    from multicov.alphabet import rna_alphabet

    rna_align = Alignment(['ACA', 'GUA', '-A-'], alphabet=rna_alphabet)
    binary = BinaryAlignment.from_alignment(rna_align)

    full_map = binary_index_map(binary)
    expected_map = [[0, 4], [4, 8], [8, 12]]
    self.assertTrue(np.array_equal(full_map, expected_map))