def test_should_single_point_crossover_work_properly_case_g(self, random_call):
    """
    Single-point crossover over four sequences (example of MSA in Ortuño's paper).

    Parents (cut marked with '|') and the children described by the paper:

    GKGD---PK|KP, GKGD-PK|KP   => GKGD---PK-KP, GKGD-PK--KP
    M------QD|RV, --M--QD|RV   => M------QD-RV, --M--QD--RV
    MKKLKKHPD|FP, MKKLKKHPD|FP => MKKLKKHPD-FP, MKKLKKHPDFP
    M--------|HI, ---M--H|I-   => M--------HI-, ---M--H---I
    """
    # setup
    first_parent_pairs = [
        ('seq1', 'GKGD---PKKP'),
        ('seq2', 'M------QDRV'),
        ('seq3', 'MKKLKKHPDFP'),
        ('seq4', 'M--------HI'),
    ]
    second_parent_pairs = [
        ('seq1', 'GKGD-PKKP'),
        ('seq2', '--M--QDRV'),
        ('seq3', 'MKKLKKHPDFP'),
        ('seq4', '---M--HI-'),
    ]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2', 'seq3', 'seq4']
    msa_problem.number_of_variables = 4
    first_parent = MSASolution(msa_problem, msa=first_parent_pairs)
    second_parent = MSASolution(msa_problem, msa=second_parent_pairs)
    operator = SPXMSA(probability=1.0, remove_gap_columns=False)

    # run: the injected random_call object is told to answer 8
    # (presumably the mocked random source for the cut column; the patch
    # decorator is not visible here -- confirm)
    random_call.return_value = 8
    offspring = operator.execute([first_parent, second_parent])

    # check
    expected_first_child = ["GKGD---PK-KP", "M------QD-RV", "MKKLKKHPD-FP", "M--------HI-"]
    expected_second_child = ["GKGD-PK--KP", "--M--QD--RV", "MKKLKKHPDFP", "---M--H---I"]
    self.assertEqual(expected_first_child,
                     offspring[0].decode_alignment_as_list_of_sequences())
    self.assertEqual(expected_second_child,
                     offspring[1].decode_alignment_as_list_of_sequences())
def test_should_single_point_crossover_work_properly_real_case(self):
    """
    Cross two four-sequence alignments through ``cross_parents`` directly
    and require both resulting children to be valid alignments.
    """
    # setup
    first_parent_pairs = [
        ('a', '----GKGDPKKPRGKMSSYAFFVQTSREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFEDMAKADKARYEREMKTYIPPK----------GE'),
        ('b', '-------MQDRVKRPMNAFIVWSRDQRRKMALENPRMRN--SEISKQLGYQWKMLTEAEKWPFFQEAQKLQAMHREKYPNYKYRP---RRKAKMLPK'),
        ('c', 'MKKLK---KHPDFPKKPLTPYFRFFMEKRAKYAKLHPEMSNLDLTKILSKKYKELPEKKKMKYIQDFQREKQEFERNLARFREDH---PDLIQNAKK'),
        ('d', '---------MHIKKPLNAFMLYMKEMRANVVAESTLKES--AAINQILGRRWHALSREEQAKYYELARKERQLHMQLYPGWSARDNYGKKKKRKREK')
    ]
    second_parent_pairs = [
        ('a', '----GKGDPKKPRGKMSSYAFFVQTSREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFEDMAKADKARYEREMKTYIPPK---GE-------'),
        ('b', '----M---QDRVKRPMNAFIVWSRDQRRKMALENPRMRN--SEISKQLGYQWKMLTEAEKWPFFQEAQKLQAMHREKYPNYKYRP---RRKAKMLPK'),
        ('c', 'MKKLK-KHPDFPKKPLTPYFRFFMEKRAKYAKLHPEMSN--LDLTKILSKKYKELPEKKKMKYIQDFQREKQEFERNLARFREDH---PDLIQNAKK'),
        ('d', '-------MH--IKKPLNAFMLYMKEMRANVVAESTLKES--AAINQILGRRWHALSREEQAKYYELARKERQLHMQLYPGWSARDNYGKKKKRKREK')
    ]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['a', 'b', 'c', 'd']
    msa_problem.number_of_variables = 4
    first_parent = MSASolution(msa_problem, msa=first_parent_pairs)
    second_parent = MSASolution(msa_problem, msa=second_parent_pairs)
    operator = SPXMSA(probability=1.0, remove_gap_columns=False)

    # run: cut column 10 plus one list of per-sequence positions per parent
    parents = [first_parent, second_parent]
    offspring = operator.cross_parents(10, parents, [10, 10, 10, 10], [10, 10, 8, 8])

    # check
    self.assertTrue(offspring[0].is_valid_msa())
    self.assertTrue(offspring[1].is_valid_msa())
def find_length_of_the_largest_sequence(self, solution: MSASolution):
    """
    Return the length of the longest sequence held by ``solution``.

    Sequence 0 is always measured first; the remaining indices below
    ``solution.number_of_variables`` are then measured in ascending order.

    :param solution: Alignment queried through ``get_length_of_sequence``.
    :return: Largest value reported by ``get_length_of_sequence``.
    """
    first_length = solution.get_length_of_sequence(0)
    remaining_lengths = (
        solution.get_length_of_sequence(index)
        for index in range(1, solution.number_of_variables)
    )
    return max([first_length, *remaining_lengths])
def test_should_return_number_of_gaps_of_one_sequences(self):
    """The sequence at index 0 ('AC---TGAC') must report three gaps."""
    # setup
    aligned_pairs = [
        ('seq1', 'AC---TGAC'),
        ('seq2', 'AT--CT--C'),
        ('seq3', 'AAC---TGC'),
    ]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2', 'seq3']
    msa_problem.number_of_variables = 3
    alignment = MSASolution(msa_problem, msa=aligned_pairs)

    # run
    gap_count = alignment.get_number_of_gaps_of_sequence_at_index(0)

    # check
    self.assertEqual(3, gap_count)
def test_should_return_original_alignment_size(self):
    """An alignment built from three 9-symbol rows must report length 9."""
    # setup
    aligned_pairs = [
        ('seq1', 'AC---TGAC'),
        ('seq2', 'AT--CT--C'),
        ('seq3', 'AAC---TGC'),
    ]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2', 'seq3']
    msa_problem.number_of_variables = 3
    alignment = MSASolution(msa_problem, msa=aligned_pairs)

    # run
    alignment_length = alignment.get_length_of_alignment()

    # check
    self.assertEqual(9, alignment_length)
def evaluate(self, solution: MSASolution) -> MSASolution:
    """
    Compute every score in ``self.score_list`` for ``solution``.

    Columns made only of gaps are removed from the solution first, then the
    alignment is decoded once and handed to each score. The i-th result is
    written to ``solution.objectives[i]``; results of scores that are not
    minimization scores are stored with their sign flipped.

    :param solution: Alignment to evaluate (modified in place).
    :return: The same ``solution`` object.
    """
    solution.remove_full_of_gaps_columns()
    aligned_sequences = solution.decode_alignment_as_list_of_sequences()

    for index, score in enumerate(self.score_list):
        value = score.compute(aligned_sequences)
        solution.objectives[index] = value if score.is_minimization() else -value

    return solution
def test_should_return_gap_columns(self):
    """Columns 0, 1 and 4 hold a gap in every row and must be reported."""
    # setup
    aligned_pairs = [
        ('seq1', '--AA-'),
        ('seq2', '--AA-'),
        ('seq3', '--AA-'),
    ]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2', 'seq3']
    msa_problem.number_of_variables = 3
    alignment = MSASolution(msa_problem, msa=aligned_pairs)

    # run
    gap_columns = alignment.get_gap_columns_from_alignment()

    # check
    self.assertEqual([0, 1, 4], gap_columns)
def test_should_return_original_sequences(self):
    """Decoding a freshly built alignment must give back the input rows."""
    # setup
    aligned_pairs = [
        ('seq1', 'AC---TGAC'),
        ('seq2', 'AT--CT--C'),
        ('seq3', 'AAC---TGC'),
    ]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2', 'seq3']
    msa_problem.number_of_variables = 3
    alignment = MSASolution(msa_problem, msa=aligned_pairs)

    # run
    decoded = alignment.decode_alignment_as_list_of_sequences()

    # check
    self.assertEqual(['AC---TGAC', 'AT--CT--C', 'AAC---TGC'], decoded)
def test_should_return_is_gap_column(self):
    """Column 3 is all gaps in the fixture; column 4 contains a 'C'."""
    # setup
    aligned_pairs = [
        ('seq1', 'AC---TGAC'),
        ('seq2', 'AT--CT--C'),
        ('seq3', 'AAC---TGC'),
    ]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2', 'seq3']
    msa_problem.number_of_variables = 3
    alignment = MSASolution(msa_problem, msa=aligned_pairs)

    # check
    column_three_is_gap = alignment.is_gap_column(3)
    self.assertTrue(column_three_is_gap)
    column_four_is_gap = alignment.is_gap_column(4)
    self.assertFalse(column_four_is_gap)
def test_should_single_point_crossover_work_properly_real_case(self, random_call):
    """
    Cross two eight-sequence alignments with ``SPXMSA.execute`` and require
    both children to be valid alignments.

    NOTE(review): ``random_call`` is presumably a mock injected by a patch
    decorator that is not visible in this chunk -- confirm.
    """
    # setup
    problem = MSA(score_list=[])
    problem.identifiers = ['1bbt_ac', '1al2_ad', '1b35_C', '1bbt_ab', '1mec_aa', '1bbt_aa', '1al2_ab', '1al2_ac']
    problem.number_of_variables = 8
    # first parent
    msa_1 = MSASolution(problem, msa=[
        ('1bbt_ac', '------GIFPVACSDGYGGLVTTDPKTAD---PVYGKVFNPPRNQLPGRFTNLLDVAEACP--------TFLRFEGGVPYVTTKTDSDRVLAQFDMSL----AAKHMSNTFLAG---------------------LAQYYTQYSGT-----INLHFMFTGPTDAKA-------RYMVAY----APPGMEPPKTPEAAAH---------------CIHAEWDTGLNSKF---------TFSIPYLSAADYT----YTASDVAETTNV--------QGWVCLFQ--------ITHGKADG-------DALVVLASAGKDF-----------------------ELRLPVDARAE----'),
        ('1al2_ad', '-------GLPVMNTPGSNQYLTADNFQSP---CALPEFDVTPPIDIPGEVKNMMELAEIDTMIPFDL--SATKKNTMEMYRVRLSDKPHTDDPILCLSLSPASDPRLSHTMLGE---------------------ILNYYTHWAGS-----LKFTFLFCGSMMATG-------KLLVSY----APPGADPPKKRKEAML---------------GTHVIWDIGLQSSC---------TMVVPWISNTT------YRQTIDDSFTE---------GGYISVFYQTRIV---VPLSTPRE-------MDILGFVSACNDF-----------------------SVRLLRDTTHIEQKA'),
        ('1b35_C', 'SKPTVQGKIGECKLRGQGRMANFDGMDMSHKMALSSTNEIETNEGLAGTSLDVMDLSRVLSIPNYWDRFTWKTSDVINTVLWDNYVSPFKVKPYSATI-----TDRFRCTHMGK---------------------VANAFTYWRGS-----MVYTFKFVKTQYHSG---RLRISFIPYYYNTTISTGTPDVSRTQKI---------------------VVDLRTSTAV---------SFTVPYIGSRPWLYCIRPESSWLSKDNTDGALMYNCVSGIVRVEVLNQLVAAQNVFSEIDVICEVNGGPDLEFAGPTCPRY----------VPYAGDFTLADTRKIEAERTQEYSNNED'),
        ('1bbt_ab', '-------LLEDRILTTRNGHTTSTTQSS----VGVTYGYATAEDFVSGPNTSGLETRVV----------QAERFFKTHLFDWVTSDSFGRCHLLELPT---------DHKGVYGS--------------------LTDSYAYMRNG-----WDVEVTAVGNQFNGG-------CLLVAM----VPELCSIQKRELYQLT--------------LFPHQFINPRTNMTA---------HITVPFVGVNR------YDQYKVHKP-----------WTLVVMVVAPLTV---NTEGAPQI-------KVYANIAPTNVHV-----------------------AGEFPSKE-------'),
        ('1mec_aa', '------------------GVENAEKGVTEN--TDATADFVAQPVYLPENQTKVAFFYDRSSPIGRFAVKSGSLESGFAPFSNKACPNSVILTPGPQFDPAYDQLRPQRLTEIWGNGNEETSEVFPLKTKQDYSFCLFSPFVYYKCD-----LEVTLSPHTSGAHGL---------LVRW----CPTGTPTKPTTQVLHEVSSLSEGRT------PQVYSAGPGTSNQI---------SFVVPYNSPLSVLPAVWYNGHKRFDNTGD--------LGIAPNSDFGTLF---FAGTKPDI-------KFTVYLRYKNMRVFCPRP--TVFFPWPT----SGDKIDMTPRAGVL-----'),
        ('1bbt_aa', '---------------------TTSAGESADPVTTTVENYGGETQIQRRQHTDVSFI--------------------MDRFVKVTPQNQINILDLMQVP---------SHTLVGG---------------------LLRASTYYFSD-----LEIAVK------HEG---------DLTW----VPNGAPEK---------------------------ALDNTTNPTAYHKAPLT--RLALPYTAPHRVLATV-YNGECRTLPTSFN-------YGAIKATRVTELL---YRMKRAETYCP----RPLLAIHPTEARH---------------------KQKIVAP----------'),
        ('1al2_ab', '------AATSRDALPNTEASGPTHSKEIP---ALTAVETGATNPLVPSDTVQTRHVVQH----------RSRSESSIESFFARGACVTIMTVDNPAST-----TNKDKLFAVWKITYKDTVQLRR----------KLEFFTYSRFD-----MELTFVVTANFTETNNGHALNQVYQIMY----IPPGAPVP----EKWD-----------------DYTWQTSSNPSIFYTYGTAPARISVPYVGISN-AYSHFYDGFSKVPLKDQSAALGDSLYGAASLNDFGILAVRVVNDHNPTKVT----SKIRVYLKPKHIRVWCPRPPRAVAYYGPGVDYKDGTLTPLSTKDLTTY----'),
        ('1al2_ac', '----EACGYSDRVLQLTLGNSTITTQEA----ANSVVAYGRWPEYLRDSEANPVDQPTEPDV-------AACRFYTLDTVSWTKESRGWWWKLPDALRDMGLFGQNMYYHYLGRSGYTVHVQCNASKFHQGALGVFAVPEMCLAGDSNTTTMHTSYQNANPGEKGG-------TFTGTF----TPDNNQTSPARRFCPVDYLLGNGTLLGNAFVFPHQIINLRTNNCA---------TLVLPYVNSLS------IDSMVKHNN-----------WGIAILPLAPLNF---ASESSPEI-------PITLTIAPMCCEF-------------------NGLRNITLPRLQ-------'),
    ])
    # second parent
    msa_2 = MSASolution(problem, msa=[
        ('1bbt_ac', '------GIFPVACSDGYGGLVTTDPKTAD---PVYGKVFNPPRNQLPGRFTNLLDVAEACP--------TFLRFEGGVPYVTTKTDSDRVLAQFDMSL----AAKHMSNTFLAG---------------------LAQYYTQYSGT-----INLHFMFTGPTDAKA-------RYMVAY----APPGMEPPKTPEAAAH---------------CIHAEWDTGLNSKF---------TFSIPYLSAADYT----YTASDVAETTNV--------QGWVCLFQ--------ITHGKADG-------DALVVLASAGKDF-----------------------ELRLPVDARAE----'),
        ('1al2_ad', '-------GLPVMNTPGSNQYLTADNFQSP---CALPEFDVTPPIDIPGEVKNMMELAEIDTMIPFDL--SATKKNTMEMYRVRLSDKPHTDDPILCLSLSPASDPRLSHTMLGE---------------------ILNYYTHWAGS-----LKFTFLFCGSMMATG-------KLLVSY----APPGADPPKKRKEAML---------------GTHVIWDIGLQSSC---------TMVVPWISNTT------YRQTIDDSFTE---------GGYISVFYQTRIV---VPLSTPRE-------MDILGFVSACNDF-----------------------SVRLLRDTTHIEQKA'),
        ('1b35_C', 'SKPTVQGKIGECKLRGQGRMANFDGMDMSHKMALSSTNEIETNEGLAGTSLDVMDLSRVLSIPNYWDRFTWKTSDVINTVLWDNYVSPFKVKPYSATI-----TDRFRCTHMGK---------------------VANAFTYWRGS-----MVYTFKFVKTQYHSG---RLRISFIPYYYNTTISTGTPDVSRTQKI---------------------VVDLRTSTAV---------SFTVPYIGSRPWLYCIRPESSWLSKDNTDGALMYNCVSGIVRVEVLNQLVAAQNVFSEIDVICEVNGGPDLEFAGPTCPRY----------VPYAGDFTLADTRKIEAERTQEYSNNED'),
        ('1bbt_ab', '-------LLEDRILTTRNGHTTSTTQSS----VGVTYGYATAEDFVSGPNTSGLETRVV----------QAERFFKTHLFDWVTSDSFGRCHLLELPT---------DHKGVYGS--------------------LTDSYAYMRNG-----WDVEVTAVGNQFNGG-------CLLVAM----VPELCSIQKRELYQLT--------------LFPHQFINPRTNMTA---------HITVPFVGVNR------YDQYKVHKP-----------WTLVVMVVAPLTV---NTEGAPQI-------KVYANIAPTNVHV-----------------------AGEFPSKE-------'),
        ('1mec_aa', '------------------GVENAEKGVTEN--TDATADFVAQPVYLPENQTKVAFFYDRSSPIGRFAVKSGSLESGFAPFSNKACPNSVILTPGPQFDPAYDQLRPQRLTEIWGNGNEETSEVFPLKTKQDYSFCLFSPFVYYKCD-----LEVTLSPHTSGAHGL---------LVRW----CPTGTPTKPTTQVLHEVSSLSEGRT------PQVYSAGPGTSNQI---------SFVVPYNSPLSVLPAVWYNGHKRFDNTGD--------LGIAPNSDFGTLF---FAGTKPDI-------KFTVYLRYKNMRVFCPRP--TVFFPWPT----SGDKIDMTPRAGVL-----'),
        ('1bbt_aa', '---------------------TTSAGESADPVTTTVENYGGETQIQRRQHTDVSFI--------------------MDRFVKVTPQNQINILDLMQVP---------SHTLVGG---------------------LLRASTYYFSD-----LEIAVK------HEG---------DLTW----VPNGAPEK---------------------------ALDNTTNPTAYHKAPLT--RLALPYTAPHRVLATV-YNGECRTLPTSFN-------YGAIKATRVTELL---YRMKRAETYCP----RPLLAIHPTEARH---------------------KQKIVAP----------'),
        ('1al2_ab', '------AATSRDALPNTEASGPTHSKEIP---ALTAVETGATNPLVPSDTVQTRHVVQH----------RSRSESSIESFFARGACVTIMTVDNPAST-----TNKDKLFAVWKITYKDTVQLRR----------KLEFFTYSRFD-----MELTFVVTANFTETNNGHALNQVYQIMY----IPPGAPVP----EKWD-----------------DYTWQTSSNPSIFYTYGTAPARISVPYVGISN-AYSHFYDGFSKVPLKDQSAALGDSLYGAASLNDFGILAVRVVNDHNPTKVT----SKIRVYLKPKHIRVWCPRPPRAVAYYGPGVDYKDGTLTPLSTKDLTTY----'),
        ('1al2_ac', '----EACGYSDRVLQLTLGNSTITTQEA----ANSVVAYGRWPEYLRDSEANPVDQPTEPDV-------AACRFYTLDTVSWTKESRGWWWKLPDALRDMGLFGQNMYYHYLGRSGYTVHVQCNASKFHQGALGVFAVPEMCLAGDSNTTTMHTSYQNANPGEKGG-------TFTGTF----TPDNNQTSPARRFCPVDYLLGNGTLLGNAFVFPHQIINLRTNNCA---------TLVLPYVNSLS------IDSMVKHNN-----------WGIAILPLAPLNF---ASESSPEI-------PITLTIAPMCCEF-------------------NGLRNITLPRLQ-------'),
    ])
    crossover = SPXMSA(probability=1.0, remove_gap_columns=False)
    # run: the injected random_call object is told to answer 176
    random_call.return_value = 176
    children = crossover.execute([msa_1, msa_2])
    # check: only validity is asserted, not the exact child sequences
    self.assertTrue(children[0].is_valid_msa())
    self.assertTrue(children[1].is_valid_msa())
def fill_sequences_with_gaps_to_reach_the_max_sequence_length(
        self, solution: MSASolution, max_length: int, cutting_points: list):
    """
    Pad every sequence shorter than ``max_length`` with gaps, in place.

    One gap is requested per missing position, and all gaps of a sequence
    are requested at the same ``gap_position``:

    * ``cutting_points[i] == -1``: ``length - 1``, where ``length`` is the
      sequence length measured once, before any gap is added;
    * any other value: ``cutting_points[i] + 1``.

    Sequences whose length already reaches ``max_length`` are skipped, and
    their entry in ``cutting_points`` is never read.

    :param solution: Alignment whose sequences are padded.
    :param max_length: Length every sequence should reach.
    :param cutting_points: One cut position per sequence; ``-1`` selects
        the ``length - 1`` rule above.
    """
    for seq_index in range(solution.number_of_variables):
        current_length = solution.get_length_of_sequence(seq_index)
        missing_gaps = max_length - current_length
        if missing_gaps <= 0:
            continue

        # The insertion position does not change while padding one
        # sequence, so it is resolved once instead of on every iteration.
        cutting_point = cutting_points[seq_index]
        if cutting_point == -1:
            gap_position = current_length - 1
        else:
            gap_position = cutting_point + 1

        for _ in range(missing_gaps):
            solution.add_gap_to_sequence_at_index(
                seq_index=seq_index, gap_position=gap_position)
def test_should_remove_gap_column(self):
    """Removing column 3 must leave the expected ``gaps_groups`` per row."""
    # setup
    aligned_pairs = [
        ('seq1', 'AC---TGAC'),
        ('seq2', 'AT--CT--C'),
        ('seq3', 'AAC---TGC'),
    ]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2', 'seq3']
    msa_problem.number_of_variables = 3
    alignment = MSASolution(msa_problem, msa=aligned_pairs)

    # run
    alignment.remove_gap_column(3)

    # check
    expected_groups = [[2, 2, 4, 4], [2, 2, 6, 7], [4, 5]]
    self.assertEqual(expected_groups, alignment.gaps_groups)
def test_should_remove_all_gap_columns_case_d(self):
    """
    Two rows sharing gap columns 2 and 3: only those columns disappear,
    the unshared trailing gaps stay.
    """
    # setup
    aligned_pairs = [('seq1', 'AB--CDE-'), ('seq2', 'AB--CD-E')]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2']
    msa_problem.number_of_variables = 2
    alignment = MSASolution(msa_problem, msa=aligned_pairs)

    # run
    alignment.remove_full_of_gaps_columns()

    # check
    decoded = alignment.decode_alignment_as_list_of_sequences()
    self.assertEqual(['ABCDE-', 'ABCD-E'], decoded)
def test_should_remove_all_gap_columns_case_b(self):
    """Three rows with two separate runs of all-gap columns."""
    # setup
    aligned_pairs = [
        ('seq1', 'AC--T--GC'),
        ('seq2', 'AC-----AC'),
        ('seq3', 'A---C--AC'),
    ]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2', 'seq3']
    msa_problem.number_of_variables = 3
    alignment = MSASolution(msa_problem, msa=aligned_pairs)

    # run
    alignment.remove_full_of_gaps_columns()

    # check
    decoded = alignment.decode_alignment_as_list_of_sequences()
    self.assertEqual(['ACTGC', 'AC-AC', 'A-CAC'], decoded)
def find_cutting_points_in_first_parent(self, solution: MSASolution, position: int) -> list:
    """
    Resolve the effective cutting point of every sequence for one column.

    When the symbol of a sequence at ``position`` is a gap, the cutting
    point is whatever ``get_next_char_position_after_gap`` reports for that
    sequence; otherwise it is ``position`` itself.

    :param solution: Alignment to inspect.
    :param position: Column chosen for the cut.
    :return: One cutting point per sequence, in sequence order.
    """
    return [
        solution.get_next_char_position_after_gap(seq_index, position)
        if solution.is_gap_char_at_sequence(seq_index, position)
        else position
        for seq_index in range(solution.number_of_variables)
    ]
def test_should_return_length_of_gaps_groups(self):
    """``get_length_of_gaps`` must count the gaps of each row of two alignments."""
    # setup: a three-row alignment ...
    three_row_problem = MSA(score_list=[])
    three_row_problem.identifiers = ['seq1', 'seq2', 'seq3']
    three_row_problem.number_of_variables = 3
    three_rows = MSASolution(three_row_problem, msa=[
        ('seq1', 'AC---TGAC'),
        ('seq2', 'AT--CT--C'),
        ('seq3', 'AAC---TGC'),
    ])

    # ... and a four-row alignment built on its own problem instance
    four_row_problem = MSA(score_list=[])
    four_row_problem.identifiers = ['seq1', 'seq2', 'seq3', 'seq4']
    four_row_problem.number_of_variables = 4
    four_rows = MSASolution(four_row_problem, msa=[
        ('seq1', 'GKGD---PKKP'),
        ('seq2', 'M------QDRV'),
        ('seq3', 'MKKLKKHPDFP'),
        ('seq4', 'M--------HI-'),
    ])

    # check: (expected gap count, alignment, row index)
    expectations = [
        (3, three_rows, 0),
        (4, three_rows, 1),
        (3, three_rows, 2),
        (3, four_rows, 0),
        (6, four_rows, 1),
        (0, four_rows, 2),
        (9, four_rows, 3),
    ]
    for expected_gaps, alignment, row in expectations:
        self.assertEqual(expected_gaps, alignment.get_length_of_gaps(row))
def do_mutation(self, solution: MSASolution) -> MSASolution:
    """
    With probability ``self.probability``, swap the spans of two adjacent
    gap groups in every sequence that has at least two groups.

    Each ``solution.gaps_groups[i]`` is indexed here as a flat list of
    ``start, end`` pairs. For a randomly chosen pair and the pair right
    after it, the end of the first and the start of the second are moved by
    the same signed amount, so the two ``end - start`` spans trade places
    while the outer boundaries stay fixed.

    Afterwards, all-gap columns are removed when
    ``self.remove_full_of_gap_columns`` is set, and the result is validated.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source; confirm that cleanup and validation belong
    inside the probability branch, after the loop.

    :param solution: Alignment to mutate in place.
    :return: The same ``solution`` object.
    :raises Exception: If ``solution.is_valid_msa()`` is false afterwards.
    """
    if not (random.random() <= self.probability):
        return solution

    for seq_index in range(solution.number_of_variables):
        gaps_group = solution.gaps_groups[seq_index]
        if len(gaps_group) < 4:
            continue

        # Even index of a (start, end) pair that has a successor pair.
        first = random.randrange(0, len(gaps_group) - 2, 2)
        first_span = gaps_group[first + 1] - gaps_group[first]
        second_span = gaps_group[first + 3] - gaps_group[first + 2]

        # delta > 0: the second group is larger, so the first one grows;
        # delta < 0: the second group is shorter, so the first one shrinks.
        delta = second_span - first_span
        if delta != 0:
            gaps_group[first + 1] += delta
            gaps_group[first + 2] += delta

    if self.remove_full_of_gap_columns:
        solution.remove_full_of_gaps_columns()

    # Sanity check: alignment is valid (same length for all sequences)
    if not solution.is_valid_msa():
        raise Exception("Mutated solution is not valid! {0}".format(
            solution.decode_alignment_as_list_of_pairs()))

    return solution
def test_should_single_point_crossover_work_properly_case_j(self, random_call):
    """
    Two-sequence crossover with ``remove_gap_columns=True`` and the injected
    ``random_call`` answering 2.
    """
    # setup
    first_parent_pairs = [('seq1', 'MIKMIM-IK'), ('seq2', 'A-B-CDEF-')]
    second_parent_pairs = [('seq1', '--MIKMIMIK'), ('seq2', 'ABC-D-E-F-')]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2']
    msa_problem.number_of_variables = 2
    first_parent = MSASolution(msa_problem, msa=first_parent_pairs)
    second_parent = MSASolution(msa_problem, msa=second_parent_pairs)
    operator = SPXMSA(probability=1.0, remove_gap_columns=True)

    # run
    random_call.return_value = 2
    offspring = operator.execute([first_parent, second_parent])

    # check
    expected_first_child = ["MIK--MIMIK", "A-BCD-E-F-"]
    expected_second_child = ["--MIKMIM-IK", "AB----CDEF-"]
    self.assertEqual(expected_first_child,
                     offspring[0].decode_alignment_as_list_of_sequences())
    self.assertEqual(expected_second_child,
                     offspring[1].decode_alignment_as_list_of_sequences())
def test_should_the_solution_remain_unchanged_if_the_probability_is_zero(self):
    """With ``probability=0.0`` the offspring must decode to the parents' pairs."""
    # setup
    first_parent_pairs = [('seq1', 'ACTC'), ('seq2', 'A-TC'), ('seq3', 'A--C')]
    second_parent_pairs = [('seq1', 'CT-G'), ('seq2', '-T-G'), ('seq3', '-ATG')]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2', 'seq3']
    msa_problem.number_of_variables = 3
    first_parent = MSASolution(msa_problem, msa=first_parent_pairs)
    second_parent = MSASolution(msa_problem, msa=second_parent_pairs)
    operator = SPXMSA(probability=0.0, remove_gap_columns=False)

    # run
    children = operator.execute([first_parent, second_parent])

    # check
    self.assertEqual([('seq1', 'ACTC'), ('seq2', 'A-TC'), ('seq3', 'A--C')],
                     children[0].decode_alignment_as_list_of_pairs())
    self.assertEqual([('seq1', 'CT-G'), ('seq2', '-T-G'), ('seq3', '-ATG')],
                     children[1].decode_alignment_as_list_of_pairs())
def test_should_single_point_crossover_work_properly_case_c(self, random_call):
    """
    Single-sequence crossover, cut marked with '|':

    A|B-CD-EF, ---A|BCD-EF => ABCD-EF, ---AB-CD-EF
    """
    # setup
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1']
    msa_problem.number_of_variables = 1
    first_parent = MSASolution(msa_problem, msa=[('seq1', 'AB-CD-EF')])
    second_parent = MSASolution(msa_problem, msa=[('seq1', '---ABCD-EF')])
    operator = SPXMSA(probability=1.0, remove_gap_columns=False)

    # run
    random_call.return_value = 0
    offspring = operator.execute([first_parent, second_parent])

    # check
    self.assertEqual(["ABCD-EF"],
                     offspring[0].decode_alignment_as_list_of_sequences())
    self.assertEqual(["---AB-CD-EF"],
                     offspring[1].decode_alignment_as_list_of_sequences())
def test_should_single_point_crossover_work_properly_case_h(self, random_call):
    """
    MSA with no crossover in the first sequence, cut marked with '|':

    -----------|-M, --M|------ => ------------M------, --M
    """
    # setup
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1']
    msa_problem.number_of_variables = 1
    first_parent = MSASolution(msa_problem, msa=[('seq1', '------------M')])
    second_parent = MSASolution(msa_problem, msa=[('seq1', '--M------')])
    operator = SPXMSA(probability=1.0, remove_gap_columns=False)

    # run
    random_call.return_value = 10
    offspring = operator.execute([first_parent, second_parent])

    # check
    self.assertEqual(["------------M------"],
                     offspring[0].decode_alignment_as_list_of_sequences())
    self.assertEqual(["--M"],
                     offspring[1].decode_alignment_as_list_of_sequences())
def test_should_get_original_char_position_in_aligned_sequence(self):
    """
    ``get_original_char_position_in_aligned_sequence`` must return the
    expected value for every (row, position) pair listed below.
    """
    # setup
    aligned_pairs = [('seq1', '-ABC'), ('seq2', 'ABCD'), ('seq3', '--AB')]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2', 'seq3']
    msa_problem.number_of_variables = 3
    alignment = MSASolution(msa_problem, msa=aligned_pairs)

    # check: (expected result, seq_index, position)
    expectations = [
        (1, 0, 0),
        (2, 0, 1),
        (3, 0, 2),
        (0, 1, 0),
        (1, 1, 1),
        (2, 1, 2),
        (2, 2, 0),
        (3, 2, 1),
    ]
    for expected, seq_index, position in expectations:
        self.assertEqual(
            expected,
            alignment.get_original_char_position_in_aligned_sequence(
                seq_index=seq_index, position=position))
def test_should_single_point_crossover_work_properly_case_f(self, random_call):
    """
    Two-sequence crossover, cut marked with '|':

    GKGD---P|KK, GKGD-P|KK  => GKGD---PKK, GKGD-P-KK
    M------Q|DR-, --M--Q|DR => M------QDR, --M--QDR-
    """
    # setup
    first_parent_pairs = [('seq1', 'GKGD---PKK'), ('seq2', 'M------QDR-')]
    second_parent_pairs = [('seq1', 'GKGD-PKK'), ('seq2', '--M--QDR')]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2']
    msa_problem.number_of_variables = 2
    first_parent = MSASolution(msa_problem, msa=first_parent_pairs)
    second_parent = MSASolution(msa_problem, msa=second_parent_pairs)
    operator = SPXMSA(probability=1.0, remove_gap_columns=False)

    # run
    random_call.return_value = 7
    offspring = operator.execute([first_parent, second_parent])

    # check
    self.assertEqual(["GKGD---PKK", "M------QDR"],
                     offspring[0].decode_alignment_as_list_of_sequences())
    self.assertEqual(["GKGD-P-KK", "--M--QDR-"],
                     offspring[1].decode_alignment_as_list_of_sequences())
def test_should_single_point_crossover_work_properly_case_a_with_remove_gap_columns(self, random_call):
    """
    Run the same parents through two operators that differ only in
    ``remove_gap_columns`` and compare the decoded children of each.
    """
    # setup
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1']
    msa_problem.number_of_variables = 1
    first_parent = MSASolution(msa_problem, msa=[('seq1', 'AB--CD-E')])
    second_parent = MSASolution(msa_problem, msa=[('seq1', 'AB--CDE-')])
    keep_gap_columns = SPXMSA(probability=1.0, remove_gap_columns=False)
    drop_gap_columns = SPXMSA(probability=1.0, remove_gap_columns=True)

    # run: the operator that keeps gap columns goes first
    random_call.return_value = 4
    kept = keep_gap_columns.execute([first_parent, second_parent])
    dropped = drop_gap_columns.execute([first_parent, second_parent])

    # check
    self.assertEqual(["AB--CDE-"], kept[0].decode_alignment_as_list_of_sequences())
    self.assertEqual(["AB--CD-E"], kept[1].decode_alignment_as_list_of_sequences())
    self.assertEqual(["ABCDE"], dropped[0].decode_alignment_as_list_of_sequences())
    self.assertEqual(["ABCDE"], dropped[1].decode_alignment_as_list_of_sequences())
def __find_original_positions_in_aligned_sequences(
        self, solution: MSASolution, column_positions_in_first_parent: list):
    """
    Translate one position per sequence through
    ``solution.get_original_char_position_in_aligned_sequence``.

    :param solution: Alignment that performs the translation.
    :param column_positions_in_first_parent: One position per sequence,
        indexed by sequence number.
    :return: The translated positions, in sequence order.
    """
    return [
        solution.get_original_char_position_in_aligned_sequence(
            seq_index, column_positions_in_first_parent[seq_index])
        for seq_index in range(solution.number_of_variables)
    ]
def do_mutation(self, solution: MSASolution) -> MSASolution:
    """
    With probability ``self.probability``, fuse two adjacent gap groups of
    one randomly chosen sequence.

    ``solution.gaps_groups[seq]`` is indexed here as a flat list of
    ``start, end`` pairs. For a randomly chosen pair and the pair right
    after it, the end of the first is increased by ``end - start + 1`` of
    the second, and the second pair is deleted. The sequence needs at least
    two pairs for this to happen.

    Afterwards ``merge_gaps_groups`` is called, all-gap columns are removed
    when ``self.remove_full_of_gap_columns`` is set, and the result is
    validated.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source; confirm that the merge, cleanup and
    validation steps run on every mutation attempt, not only when a pair
    was fused.

    :param solution: Alignment to mutate in place.
    :return: The same ``solution`` object.
    :raises Exception: If ``solution.is_valid_msa()`` is false afterwards.
    """
    if not (random.random() <= self.probability):
        return solution

    if solution.number_of_variables >= 1:
        seq = random.randint(0, solution.number_of_variables - 1)
    else:
        seq = 0

    gaps_group = solution.gaps_groups[seq]
    if len(gaps_group) >= 4:
        # Even index of a (start, end) pair that has a successor pair.
        first = random.randrange(0, len(gaps_group) - 2, 2)
        to_add = gaps_group[first + 3] - gaps_group[first + 2] + 1
        gaps_group[first + 1] += to_add
        # Drop the (start, end) entries of the absorbed group.
        del gaps_group[first + 2:first + 4]

    solution.merge_gaps_groups()

    if self.remove_full_of_gap_columns:
        solution.remove_full_of_gaps_columns()

    # Sanity check: alignment is valid (same length for all sequences)
    if not solution.is_valid_msa():
        raise Exception("Mutated solution is not valid! {0}".format(
            solution.decode_alignment_as_list_of_pairs()))

    return solution
def do_mutation(self, solution: MSASolution) -> MSASolution:
    """
    With probability ``self.probability``, shift one gap group of every
    sequence by one position, left or right at random.

    ``solution.gaps_groups[seq]`` is indexed here as a flat list of
    ``start, end`` pairs; both entries of the chosen pair move by the same
    step. Only sequences with at least two pairs are touched, and the
    ``randrange`` bounds never select the last pair.

    Afterwards ``merge_gaps_groups`` is called, all-gap columns are removed
    when ``self.remove_full_of_gap_columns`` is set, and the result is
    validated.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed source; confirm that ``merge_gaps_groups`` runs
    once after the loop rather than once per sequence.

    :param solution: Alignment to mutate in place.
    :return: The same ``solution`` object.
    :raises Exception: If ``solution.is_valid_msa()`` is false afterwards.
    """
    if not (random.random() <= self.probability):
        return solution

    # Every sequence is visited; one group per eligible sequence is shifted.
    for seq_index in range(solution.number_of_variables):
        groups = solution.gaps_groups[seq_index]
        if len(groups) < 4:
            continue

        chosen = random.randrange(0, len(groups) - 2, 2)
        # A draw of 0 shifts left (-1), a draw of 1 shifts right (+1).
        step = (-1, 1)[random.randint(0, 1)]
        for offset in (0, 1):
            groups[chosen + offset] += step

    solution.merge_gaps_groups()

    if self.remove_full_of_gap_columns:
        solution.remove_full_of_gaps_columns()

    # Sanity check: alignment is valid (same length for all sequences)
    if not solution.is_valid_msa():
        raise Exception("Mutated solution is not valid! {0}".format(
            solution.decode_alignment_as_list_of_pairs()))

    return solution
def test_should_find_max_sequence_length(self):
    """
    ``find_length_of_the_largest_sequence`` must return 8, the length of
    the longest row ('AAAAAAAC') in the fixture.
    """
    # setup
    problem = MSA(score_list=[])
    problem.identifiers = ['a', 'b', 'c']
    problem.number_of_variables = 3
    msa = MSASolution(problem, msa=[('a', 'AAC'), ('b', 'AAAAAAAC'), ('c', 'C')])
    crossover = SPXMSA(probability=1.0, remove_gap_columns=False)

    # run (the result is not named ``max`` so the built-in stays usable)
    max_length = crossover.find_length_of_the_largest_sequence(msa)

    # check
    self.assertEqual(8, max_length)
def test_should_fill_sequences_with_gaps_to_reach_the_max_sequence_length(self):
    """
    Pad three two-row alignments with different target lengths and cutting
    points, then compare the decoded rows.
    """
    # setup
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['a', 'b']
    msa_problem.number_of_variables = 2
    no_cuts = MSASolution(msa_problem, msa=[('a', '-----GE'), ('b', 'KWPFFQEAQK')])
    cut_in_second_row = MSASolution(msa_problem, msa=[('a', '-----GE'), ('b', 'KWPFFQEAQK')])
    short_rows = MSASolution(msa_problem, msa=[('a', '-'), ('b', 'ABC')])
    operator = SPXMSA(probability=1.0, remove_gap_columns=False)

    # run
    pad = operator.fill_sequences_with_gaps_to_reach_the_max_sequence_length
    pad(no_cuts, 10, [-1, -1])
    pad(cut_in_second_row, 10, [-1, 5])
    pad(short_rows, 5, [-1, 1])

    # check
    self.assertEqual(["-----G---E", "KWPFFQEAQK"],
                     no_cuts.decode_alignment_as_list_of_sequences())
    self.assertEqual(["-----G---E", "KWPFFQEAQK"],
                     cut_in_second_row.decode_alignment_as_list_of_sequences())
    self.assertEqual(["-----", "AB--C"],
                     short_rows.decode_alignment_as_list_of_sequences())
def test_should_find_original_positions_in_solution_with_gaps(self):
    """
    For column 5 of a two-row alignment,
    ``find_original_positions_in_original_sequences`` must return 3 for the
    first row and 1 for the second.
    """
    # setup
    aligned_pairs = [('seq1', 'BC-D-E---'), ('seq2', '--C--E---')]
    msa_problem = MSA(score_list=[])
    msa_problem.identifiers = ['seq1', 'seq2']
    msa_problem.number_of_variables = 2
    alignment = MSASolution(msa_problem, msa=aligned_pairs)
    operator = SPXMSA(probability=1.0, remove_gap_columns=False)

    # run
    original_positions = operator.find_original_positions_in_original_sequences(alignment, 5)

    # check
    self.assertEqual(3, original_positions[0])
    self.assertEqual(1, original_positions[1])