def test_local_pairwise_align_protein(self):
    """Check alignment output and argument validation for protein input.

    One fixed pair of sequences is aligned under two gap-penalty settings
    and once more with ``id`` metadata attached. Afterwards, ``TabularMSA``
    and integer arguments are expected to be rejected with ``TypeError``.
    """
    strict_gaps = dict(gap_open_penalty=10., gap_extend_penalty=5.)
    lenient_gaps = dict(gap_open_penalty=5., gap_extend_penalty=0.5)

    # Expensive gaps.
    msa, score, coords = local_pairwise_align_protein(
        Protein("HEAGAWGHEE"), Protein("PAWHEAE"), **strict_gaps)
    self.assertEqual(
        msa, TabularMSA([Protein("AWGHE"), Protein("AW-HE")]))
    self.assertEqual(score, 26.0)
    self.assertEqual(coords, [(4, 8), (1, 4)])

    # Cheaper gaps.
    msa, score, coords = local_pairwise_align_protein(
        Protein("HEAGAWGHEE"), Protein("PAWHEAE"), **lenient_gaps)
    self.assertEqual(
        msa, TabularMSA([Protein("AWGHE-E"), Protein("AW-HEAE")]))
    self.assertEqual(score, 32.0)
    self.assertEqual(coords, [(4, 9), (1, 6)])

    # Protein sequences with metadata; the expected MSA below is built
    # from sequences that carry no metadata.
    msa, score, coords = local_pairwise_align_protein(
        Protein("HEAGAWGHEE", metadata={'id': "s1"}),
        Protein("PAWHEAE", metadata={'id': "s2"}),
        **strict_gaps)
    self.assertEqual(
        msa, TabularMSA([Protein("AWGHE"), Protein("AW-HE")]))
    self.assertEqual(score, 26.0)
    self.assertEqual(coords, [(4, 8), (1, 4)])

    # Fails when either input is passed as a TabularMSA. The arguments
    # are built outside the ``with`` so only the call itself is guarded.
    msa_first = TabularMSA([Protein("HEAGAWGHEE", metadata={'id': "s1"})])
    seq_second = Protein("PAWHEAE", metadata={'id': "s2"})
    with self.assertRaises(TypeError):
        local_pairwise_align_protein(msa_first, seq_second, **strict_gaps)

    seq_first = Protein("HEAGAWGHEE", metadata={'id': "s1"})
    msa_second = TabularMSA([Protein("PAWHEAE", metadata={'id': "s2"})])
    with self.assertRaises(TypeError):
        local_pairwise_align_protein(seq_first, msa_second, **strict_gaps)

    # TypeError on invalid input
    valid = Protein("HEAGAWGHEE")
    with self.assertRaises(TypeError):
        local_pairwise_align_protein(42, valid)

    valid = Protein("HEAGAWGHEE")
    with self.assertRaises(TypeError):
        local_pairwise_align_protein(valid, 42)
def test_local_pairwise_align_protein(self):
    """Check local protein alignment through the ``Alignment`` result API.

    Covers plain-string input under two gap-penalty settings, ``Protein``
    input carrying ``id`` metadata, rejection of ``Alignment`` and integer
    arguments, and the ids reported when none were supplied.
    """
    heavy = {'gap_open_penalty': 10., 'gap_extend_penalty': 5.}
    light = {'gap_open_penalty': 5., 'gap_extend_penalty': 0.5}

    def check(aln, rows, score, spans, ids):
        # Same five assertions, in the same order, for every scenario.
        self.assertEqual(str(aln[0]), rows[0])
        self.assertEqual(str(aln[1]), rows[1])
        self.assertEqual(aln.score(), score)
        self.assertEqual(aln.start_end_positions(), spans)
        self.assertEqual(aln.ids(), ids)

    # str input, expensive gaps
    check(local_pairwise_align_protein("HEAGAWGHEE", "PAWHEAE", **heavy),
          ("AWGHE", "AW-HE"), 26.0, [(4, 8), (1, 4)], list('01'))

    # str input, cheaper gaps
    check(local_pairwise_align_protein("HEAGAWGHEE", "PAWHEAE", **light),
          ("AWGHE-E", "AW-HEAE"), 32.0, [(4, 9), (1, 6)], list('01'))

    # Protein (rather than str) as input
    check(
        local_pairwise_align_protein(
            Protein("HEAGAWGHEE", metadata={'id': "s1"}),
            Protein("PAWHEAE", metadata={'id': "s2"}),
            **heavy),
        ("AWGHE", "AW-HE"), 26.0, [(4, 8), (1, 4)], ["s1", "s2"])

    # Fails when either input is passed as an Alignment
    wrapped = Alignment([Protein("HEAGAWGHEE", metadata={'id': "s1"})])
    bare = Protein("PAWHEAE", metadata={'id': "s2"})
    with self.assertRaises(TypeError):
        local_pairwise_align_protein(wrapped, bare, **heavy)

    bare = Protein("HEAGAWGHEE", metadata={'id': "s1"})
    wrapped = Alignment([Protein("PAWHEAE", metadata={'id': "s2"})])
    with self.assertRaises(TypeError):
        local_pairwise_align_protein(bare, wrapped, **heavy)

    # ids are provided if they're not passed in
    unlabelled = local_pairwise_align_protein(
        Protein("HEAGAWGHEE"), Protein("PAWHEAE"), **heavy)
    self.assertEqual(unlabelled.ids(), list('01'))

    # TypeError on invalid input
    with self.assertRaises(TypeError):
        local_pairwise_align_protein(42, "HEAGAWGHEE")
    with self.assertRaises(TypeError):
        local_pairwise_align_protein("HEAGAWGHEE", 42)
def test_local_pairwise_align_protein(self):
    """Check local protein alignment, including metadata on the result.

    The returned ``(msa, score, start_end)`` triple is compared against
    expected values for two gap-penalty settings and for input sequences
    carrying ``id`` metadata (which the expected MSA also carries).
    ``TabularMSA`` and integer arguments must raise ``TypeError``.
    """
    align = local_pairwise_align_protein

    def verify(outcome, want_msa, want_score, want_spans):
        # ``outcome`` is the 3-tuple returned by the aligner.
        got_msa, got_score, got_spans = outcome
        self.assertEqual(got_msa, want_msa)
        self.assertEqual(got_score, want_score)
        self.assertEqual(got_spans, want_spans)

    verify(
        align(Protein("HEAGAWGHEE"), Protein("PAWHEAE"),
              gap_open_penalty=10., gap_extend_penalty=5.),
        TabularMSA([Protein("AWGHE"), Protein("AW-HE")]),
        26.0,
        [(4, 8), (1, 4)],
    )

    verify(
        align(Protein("HEAGAWGHEE"), Protein("PAWHEAE"),
              gap_open_penalty=5., gap_extend_penalty=0.5),
        TabularMSA([Protein("AWGHE-E"), Protein("AW-HEAE")]),
        32.0,
        [(4, 9), (1, 6)],
    )

    # Protein sequences with metadata
    verify(
        align(Protein("HEAGAWGHEE", metadata={'id': "s1"}),
              Protein("PAWHEAE", metadata={'id': "s2"}),
              gap_open_penalty=10., gap_extend_penalty=5.),
        TabularMSA([Protein("AWGHE", metadata={'id': "s1"}),
                    Protein("AW-HE", metadata={'id': "s2"})]),
        26.0,
        [(4, 8), (1, 4)],
    )

    # Fails when either input is passed as a TabularMSA
    rejected_pairs = (
        (TabularMSA([Protein("HEAGAWGHEE", metadata={'id': "s1"})]),
         Protein("PAWHEAE", metadata={'id': "s2"})),
        (Protein("HEAGAWGHEE", metadata={'id': "s1"}),
         TabularMSA([Protein("PAWHEAE", metadata={'id': "s2"})])),
    )
    for left, right in rejected_pairs:
        self.assertRaises(TypeError, align, left, right,
                          gap_open_penalty=10., gap_extend_penalty=5.)

    # TypeError on invalid input
    self.assertRaises(TypeError, align, 42, Protein("HEAGAWGHEE"))
    self.assertRaises(TypeError, align, Protein("HEAGAWGHEE"), 42)
def calculate_sim(target_protein):
    """Build all-vs-all local alignment scores and a normalised similarity.

    Parameters
    ----------
    target_protein
        Object whose ``seq`` attribute provides ``tolist()`` returning the
        sequences to compare (presumably a pandas DataFrame with a ``seq``
        column -- confirm against the caller).

    Returns
    -------
    sim_matrix : numpy.ndarray
        Square array where ``sim_matrix[i, j]`` is element ``[1]`` of the
        result of ``local_pairwise_align_protein`` for sequences ``i`` and
        ``j`` (the alignment score).
    sim_value : numpy.ndarray
        Square array with
        ``(sim_matrix[i, j] + sim_matrix[j, i])
        / (sim_matrix[i, i] + sim_matrix[j, j])`` at ``[i, j]``.

    Notes
    -----
    Progress and intermediate results are printed to stdout. If two
    self-alignment scores sum to zero, the corresponding entries of
    ``sim_value`` are ``nan``/``inf`` (NumPy division semantics); no
    exception is raised.
    """
    protein_list = target_protein.seq.tolist()
    protein_num = len(protein_list)
    sim_matrix = np.zeros(shape=[protein_num, protein_num])
    print(f'==Start== with protein : {protein_num}')

    # Wrap every sequence once; previously this was redone inside the
    # inner loop, i.e. protein_num ** 2 constructions instead of protein_num.
    proteins = [Protein(raw_seq) for raw_seq in protein_list]

    for i, row_protein in enumerate(proteins):
        for j, col_protein in enumerate(proteins):
            protein_similarity = local_pairwise_align_protein(
                seq1=row_protein,
                seq2=col_protein,
            )
            print(protein_similarity)
            sim_matrix[i, j] = protein_similarity[1]
    print(sim_matrix)

    # Vectorised form of the element-wise normalisation: the numerator is
    # symmetric by construction and the denominator is the outer sum of
    # the self-alignment scores on the diagonal.
    self_scores = np.diag(sim_matrix)
    sim_value = (sim_matrix + sim_matrix.T) / (
        self_scores[:, None] + self_scores[None, :])
    print(sim_value)
    return sim_matrix, sim_value
def align(seq1, seq2, go, ge):
    """Locally align two protein sequences and return the alignment score.

    ``seq1`` and ``seq2`` are passed to ``read_seq``; each of the two values
    it returns is wrapped in a ``Protein`` constructed with
    ``lowercase=True`` and the pair is aligned with
    ``local_pairwise_align_protein``.

    Parameters
    ----------
    seq1, seq2
        Inputs forwarded unchanged to ``read_seq``.
    go
        Value used as ``gap_open_penalty``.
    ge
        Value used as ``gap_extend_penalty``.

    Returns
    -------
    The score element of the alignment result. It is also printed to
    stdout, preceded by a blank line.

    Notes
    -----
    ``substitution_matrix=None`` is passed explicitly, so the choice of
    matrix is left to ``local_pairwise_align_protein``; no custom score
    matrix is loaded here.
    """
    first_raw, second_raw = read_seq(seq1, seq2)
    wrapped = [Protein(raw, lowercase=True) for raw in (first_raw, second_raw)]

    # Only the score is needed; the MSA and coordinates are discarded.
    _, score, _ = local_pairwise_align_protein(
        *wrapped,
        gap_open_penalty=go,
        gap_extend_penalty=ge,
        substitution_matrix=None,
    )

    print("\nScore:", score)
    return score
def test_local_pairwise_align_protein(self):
    """Check local protein alignment using positional sequence ids.

    Aligns one fixed pair of sequences as plain strings (two gap-penalty
    settings) and as ``Protein`` objects whose second positional argument
    is the id, then checks that ``Alignment`` and integer arguments raise
    ``TypeError`` and which ids are reported when none are given.
    """
    fn = local_pairwise_align_protein

    # --- str input, gap open 10 / extend 5 ---
    aligned = fn("HEAGAWGHEE", "PAWHEAE",
                 gap_open_penalty=10., gap_extend_penalty=5.)
    for idx, text in enumerate(("AWGHE", "AW-HE")):
        self.assertEqual(str(aligned[idx]), text)
    self.assertEqual(aligned.score(), 26.0)
    self.assertEqual(aligned.start_end_positions(), [(4, 8), (1, 4)])
    self.assertEqual(aligned.ids(), list('01'))

    # --- str input, gap open 5 / extend 0.5 ---
    aligned = fn("HEAGAWGHEE", "PAWHEAE",
                 gap_open_penalty=5., gap_extend_penalty=0.5)
    for idx, text in enumerate(("AWGHE-E", "AW-HEAE")):
        self.assertEqual(str(aligned[idx]), text)
    self.assertEqual(aligned.score(), 32.0)
    self.assertEqual(aligned.start_end_positions(), [(4, 9), (1, 6)])
    self.assertEqual(aligned.ids(), list('01'))

    # --- Protein (rather than str) as input ---
    aligned = fn(Protein("HEAGAWGHEE", "s1"), Protein("PAWHEAE", "s2"),
                 gap_open_penalty=10., gap_extend_penalty=5.)
    for idx, text in enumerate(("AWGHE", "AW-HE")):
        self.assertEqual(str(aligned[idx]), text)
    self.assertEqual(aligned.score(), 26.0)
    self.assertEqual(aligned.start_end_positions(), [(4, 8), (1, 4)])
    self.assertEqual(aligned.ids(), ["s1", "s2"])

    # Fails when either input is passed as an Alignment
    self.assertRaises(
        TypeError, fn,
        Alignment([Protein("HEAGAWGHEE", "s1")]),
        Protein("PAWHEAE", "s2"),
        gap_open_penalty=10., gap_extend_penalty=5.)
    self.assertRaises(
        TypeError, fn,
        Protein("HEAGAWGHEE", "s1"),
        Alignment([Protein("PAWHEAE", "s2")]),
        gap_open_penalty=10., gap_extend_penalty=5.)

    # ids are provided if they're not passed in
    aligned = fn(Protein("HEAGAWGHEE"), Protein("PAWHEAE"),
                 gap_open_penalty=10., gap_extend_penalty=5.)
    self.assertEqual(aligned.ids(), list('01'))

    # TypeError on invalid input
    self.assertRaises(TypeError, fn, 42, "HEAGAWGHEE")
    self.assertRaises(TypeError, fn, "HEAGAWGHEE", 42)
def seq_score(s1, s2):
    """Return the local protein alignment score of ``s1`` against ``s2``.

    The alignment is run with a gap-open penalty of 11 and a gap-extend
    penalty of 1. The aligned sequences and the start/end positions that
    come back alongside the score are discarded.
    """
    penalties = {"gap_open_penalty": 11, "gap_extend_penalty": 1}
    _aligned, score, _positions = local_pairwise_align_protein(
        s1, s2, **penalties)
    return score
# Load the protein table from the last path in ``data_check``.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# confirm this file always comes from a trusted source.
with open(data_check[-1], 'rb') as f:
    protein_data = pickle.load(f)

from skbio.alignment import local_pairwise_align_protein
from skbio import Protein
from skbio.alignment import local_pairwise_align

# %%
# All-vs-all local alignment over the first 10 rows of ``protein_data``.
seq_list = protein_data.iloc[:10].seq.tolist()
protein_num = len(seq_list)
similarity_matrix = np.zeros(shape=[protein_num, protein_num])

# Wrap every sequence once instead of re-creating Protein objects on each
# inner iteration (protein_num ** 2 constructions previously).
proteins = [Protein(raw_seq) for raw_seq in seq_list]

for i in range(protein_num):
    for j in range(protein_num):
        protein_similarity = local_pairwise_align_protein(
            seq1=proteins[i],
            seq2=proteins[j],
        )
        print(protein_similarity)
        # Element [1] of the result is stored as the pair's score.
        similarity_matrix[i, j] = protein_similarity[1]
print(similarity_matrix)

# Normalise: (S[i, j] + S[j, i]) / (S[i, i] + S[j, j]). The expression is
# symmetric in i and j and both cells are written together, so visiting
# only j >= i fills the whole matrix with the same values as a full sweep.
similarity_value = np.zeros(shape=similarity_matrix.shape)
for i in range(protein_num):
    for j in range(i, protein_num):
        value = (similarity_matrix[i, j] + similarity_matrix[j, i]) / (
            similarity_matrix[i, i] + similarity_matrix[j, j])
        similarity_value[i, j] = value
        similarity_value[j, i] = value