def test_on_protein(self):
    """Check ``filter_rows`` on a 16-sequence protein alignment.

    The filter is run with ``max_gaps`` set to ``1 / 21`` and the test
    verifies that:

    * at least one sequence was removed,
    * no remaining row has a ``'-'`` fraction (mean along axis 1 of
      ``data == '-'``) above the threshold, and
    * the number of surviving rows equals the number of rows in the
      input whose ``'-'`` fraction is at or below the threshold.
    """
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet
    from multicov.filtering import filter_rows

    max_gap_fraction = 1 / 21
    sequences = [
        'WKHNAYDMLSSDCQFESSHKHTQVCSAGMGYCAKPNNWGYW-LIVKMMW-CDYKQKLIYIAPLN',
        'KHRCDANC-MLAN-SVIKYTHSACALIWTWNS-KIIRYFFVGAWFKEHFDSVPTAQACVCDSTP',
        'LGVVGYYFKPCT-EVPSYSRFNVFHRIFPYLVYRVEE-NHTGHHVQ-KIVRNQYELRSIFDEHG',
        'LIGDDHRN-LALCPS-T-GTTCCNWKWRSEWTMHSDTNCNPVAE--SYSKRCNDIGYITWINYA',
        'CMPRYWYTYQYDCIFGWRFYSVYWPCLDDMFWQPYVDSMELF-NPMVATEWIMENCQGWG-N-K',
        'QWFWRARPFE--FSC-C-PGP-GWVNLIDWMSCNKAMETLMRPYCNPYLKIQLPRSKNLLDDDG',
        'VTMPEGHHCPAM-PLDLNGQR-KMWGSDFKKEDCKGYPEKFDCENLIDMDICLSLNTRPED-QR',
        'LNYINMHVD-IGP-PCPQYDL--KFKCMYW-GQIEDV-NMQ-WKK-RTMDAVEQIVSMYHMSVE',
        'WHV-EWKPVLC-PHWQFYM-VITEYVAMFQWCPPKGMASPKKGNLPRMFQSAKAIGAHRSDM-Y',
        'PIWGGFNFPWID-GSQRQQR-EVTTGCDDFEHKYNPYLVPG-WEFGKYSNCWT-RCWRVNHDTV',
        'PPCWVEAPYKPMGMWN-GRKV-NVAVWHHVIVL-DMYGLHLLRDWTMVKNAAHIFSHNMEMSNI',
        'E-MWRGLIWSKGAY-YQNDNGTFNWPKQKHP-ARCSF-PTVNKDQNPGP-MVQMREFKSQQGQQ',
        'RFGKFTCMGFRWKEYFTKQ-NPYKYRGIVHVKVQMIYSANGNLDWIDIPMIIRLKCPFGTRVTQ',
        'CGRCGSH-EWL-NIMRNCKFIFWWRPTNAAHIWCARHESPKAD-QIAMTYRML-LDAHIIIVR-',
        'T-PMVWRLVWYDHGCDPWMLIV-PIEPCVVKKPQYKDMERFSPDIKCHYLHDKDDGFWGSDKYI',
        'LNCPYADLDGL-NPQR-FVVS-RCMRDGFRAVVRVSPDDLS-MWCKAGA-NTTV-DNRH-IVQW',
    ]

    original = Alignment(sequences, protein_alphabet)
    filtered = filter_rows(original, max_gaps=max_gap_fraction)

    # Fraction of '-' entries per row, before and after filtering.
    # noinspection PyTypeChecker
    gaps_before = np.mean(original.data == '-', axis=1)
    # noinspection PyTypeChecker
    gaps_after = np.mean(filtered.data == '-', axis=1)

    # Something must have been removed ...
    self.assertLess(len(filtered), len(original))
    # ... everything that is left must respect the threshold ...
    self.assertLessEqual(np.max(gaps_after), max_gap_fraction)
    # ... and exactly the rows within the threshold must have been kept.
    self.assertEqual(
        np.sum(gaps_before <= max_gap_fraction),
        len(filtered),
    )
def test_returns_copy_even_when_unchanged(self):
    """Check that ``filter_rows`` returns a new object when nothing is removed.

    A three-row alignment combining a protein part and a DNA part is
    filtered with a threshold of ``0.9``. The test expects all three
    rows to remain and the result to be a different object from the input.
    """
    from multicov.alignment import Alignment
    from multicov.filtering import filter_rows
    from multicov.alphabet import protein_alphabet, dna_alphabet

    protein_rows = ['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-']
    dna_rows = ['ATACAT', 'GATACA', 'AA--GG']

    original = Alignment(protein_rows, protein_alphabet)
    original.add(dna_rows, dna_alphabet)

    filtered = filter_rows(original, 0.9)

    # No row dropped, yet the returned alignment is not the input itself.
    self.assertEqual(len(filtered), 3)
    self.assertIsNot(filtered, original)
def test_on_multi_alpha(self):
    """Check ``filter_rows`` on an alignment built from two alphabets.

    The input has three rows, each with a protein part and a DNA part.
    With a threshold of ``0.2`` the result is expected to equal an
    alignment holding only the first two rows.
    """
    from multicov.alignment import Alignment
    from multicov.filtering import filter_rows
    from multicov.alphabet import protein_alphabet, dna_alphabet

    # Input: three rows, protein part followed by DNA part.
    original = Alignment(
        ['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'],
        protein_alphabet,
    )
    original.add(['ATACAT', 'GATACA', 'AA--GG'], dna_alphabet)

    filtered = filter_rows(original, 0.2)

    # Reference: the same alignment without its last row. The value that
    # ``add`` returns is what gets compared, as in the chained form.
    expected_base = Alignment(['IVGGYTCQ', '-VGGTEAQ'], protein_alphabet)
    expected = expected_base.add(['ATACAT', 'GATACA'], dna_alphabet)

    self.assertEqual(filtered, expected)
def test_on_empty(self):
    """Check that filtering a default-constructed alignment changes nothing.

    ``filter_rows`` is called with its default threshold on
    ``Alignment()`` and the result must compare equal to another
    default-constructed ``Alignment``.
    """
    from multicov.alignment import Alignment
    from multicov.filtering import filter_rows

    reference = Alignment()
    filtered = filter_rows(Alignment())

    self.assertEqual(reference, filtered)