def exercise_align():
    """Check ``exact_match_selections()`` of ``align`` on short string pairs.

    Every case aligns two strings with the default ``align`` settings and
    compares the two returned selections with the expected index lists.
    An expectation of ``None`` means both selections must be empty.
    """
    cases = (
        ("EASYA", "AETSYT", [0, 2, 3], [1, 3, 4]),
        ("AAAGGTT", "AAATT", [0, 1, 2, 5, 6], [0, 1, 2, 3, 4]),
        ("AESD", "AEDK", [0, 1], [0, 1]),
        ("EASY", "YSAE", None, None),
        ("EASY", "KMT", None, None),
        ("EASY", "KMST", [2], [2]),
        ("EASY", "KMMST", [2], [3]),
        ("EASY", "EATY", [0, 1, 3], [0, 1, 3]),
        ("EASIEST", "EATY", [0, 1], [0, 1]),
        ("EEEEEASIEST", "EEEATYRRIESQQEIES",
         [0, 1, 2, 7, 8, 9], [0, 1, 2, 8, 9, 10]),
    )
    for first, second, expected_i, expected_j in cases:
        i_seqs, j_seqs = align(
            first, second).extract_alignment().exact_match_selections()
        if expected_i is None:
            # Nothing is expected to match exactly for this pair.
            assert i_seqs.size() == 0 and j_seqs.size() == 0
        else:
            assert i_seqs == flex.size_t(expected_i) and \
                j_seqs == flex.size_t(expected_j)
def exercise_align():
    """Exercise ``align`` on small inputs and verify the exact-match indices.

    For each pair of strings the alignment is extracted and the selections
    from ``exact_match_selections()`` are compared with hard-coded
    expectations (or required to be empty).
    """
    def selections(first, second):
        # Align with default settings and hand back both match selections.
        return align(first, second).extract_alignment().exact_match_selections()

    sel_a, sel_b = selections("EASYA", "AETSYT")
    assert sel_a == flex.size_t([0, 2, 3]) and sel_b == flex.size_t([1, 3, 4])

    sel_a, sel_b = selections("AAAGGTT", "AAATT")
    assert sel_a == flex.size_t([0, 1, 2, 5, 6]) and \
        sel_b == flex.size_t([0, 1, 2, 3, 4])

    sel_a, sel_b = selections("AESD", "AEDK")
    assert sel_a == flex.size_t([0, 1]) and sel_b == flex.size_t([0, 1])

    # Reversed and unrelated sequences: no exact matches at all.
    sel_a, sel_b = selections("EASY", "YSAE")
    assert sel_a.size() == 0 and sel_b.size() == 0

    sel_a, sel_b = selections("EASY", "KMT")
    assert sel_a.size() == 0 and sel_b.size() == 0

    sel_a, sel_b = selections("EASY", "KMST")
    assert sel_a == flex.size_t([2]) and sel_b == flex.size_t([2])

    sel_a, sel_b = selections("EASY", "KMMST")
    assert sel_a == flex.size_t([2]) and sel_b == flex.size_t([3])

    sel_a, sel_b = selections("EASY", "EATY")
    assert sel_a == flex.size_t([0, 1, 3]) and sel_b == flex.size_t([0, 1, 3])

    sel_a, sel_b = selections("EASIEST", "EATY")
    assert sel_a == flex.size_t([0, 1]) and sel_b == flex.size_t([0, 1])

    sel_a, sel_b = selections("EEEEEASIEST", "EEEATYRRIESQQEIES")
    assert sel_a == flex.size_t([0, 1, 2, 7, 8, 9]) and \
        sel_b == flex.size_t([0, 1, 2, 8, 9, 10])
def mmtbx_res_alignment(seq_a, seq_b, min_percent=0.85, atomnames=False):
    """Align two residue-name sequences and return the matching indices.

    Parameters:
      seq_a, seq_b: sequences to compare. Unless ``atomnames`` is true, each
        element is stripped and translated to a one-letter code (unknown
        names become 'X') before alignment.
      min_percent: lower bound for the value returned by
        ``calculate_sequence_identity()``; below it the chains are treated
        as too different.
      atomnames: when true, the inputs are handed to ``align`` unchanged.

    Returns:
      A tuple ``(sel_a, sel_b, similarity)``:
        * ``([], [], 0)`` when either input is empty;
        * two ``list(range(len(seq_a)))`` and ``1.0`` when the inputs are equal;
        * two empty ``flex.size_t`` arrays and ``0`` when the identity is
          below ``min_percent``;
        * otherwise the result of ``exact_match_selections()`` plus the
          identity value.
    """
    len_a, len_b = len(seq_a), len(seq_b)
    # Shortcuts for the obvious cases: no alignment machinery needed.
    if len_a == 0 or len_b == 0:
        return [], [], 0
    if seq_a == seq_b:
        return list(range(len_a)), list(range(len_a)), 1.0

    if atomnames:
        norm_seq_a, norm_seq_b = seq_a, seq_b
    else:
        from iotbx.pdb.amino_acid_codes import one_letter_given_three_letter, \
            one_letter_given_three_letter_modified_aa
        code_table = one_letter_given_three_letter.copy()
        code_table.update(one_letter_given_three_letter_modified_aa)
        # Extra entries for the nucleotide names listed here.
        code_table.update({
            "A": "A",
            "C": "C",
            "G": "G",
            "U": "U",
            "DA": "A",
            "DC": "C",
            "DG": "G",
            "DT": "T"
        })
        norm_seq_a = "".join(
            code_table.get(name.strip(), 'X') for name in seq_a)
        norm_seq_b = "".join(
            code_table.get(name.strip(), 'X') for name in seq_b)

    from mmtbx.alignment import align
    alignment = align(
        norm_seq_a,
        norm_seq_b,
        gap_opening_penalty=1,  # default (per the original author's note)
        gap_extension_penalty=0.5,  # default is 1 (per the original note)
        similarity_function="identity").extract_alignment()
    sim1 = alignment.calculate_sequence_identity()
    al_a, al_b = alignment.exact_match_selections()
    if sim1 < min_percent:
        # Chains are too different: report no matches.
        return flex.size_t([]), flex.size_t([]), 0
    return al_a, al_b, sim1
def mmtbx_res_alignment(seq_a, seq_b, min_percent=0.85, atomnames=False):
    """Align two residue-name sequences and return the matching indices.

    Parameters:
      seq_a, seq_b: sequences to compare. Unless ``atomnames`` is true, each
        element is looked up verbatim in a one-letter-code table (unknown
        names become 'X') before alignment.
      min_percent: lower bound for the value returned by
        ``calculate_sequence_identity()``; below it the chains are treated
        as too different.
      atomnames: when true, the inputs are handed to ``align`` unchanged.

    Returns:
      A tuple ``(sel_a, sel_b, similarity)``:
        * ``([], [], 0)`` when either input is empty;
        * two index lists ``[0, ..., len(seq_a) - 1]`` and ``1.0`` when the
          inputs are equal (real lists, not ``range`` objects, so the result
          has the same type on Python 2 and Python 3);
        * two empty ``flex.size_t`` arrays and ``0`` when the identity is
          below ``min_percent``;
        * otherwise the result of ``exact_match_selections()`` plus the
          identity value.
    """
    # Check for the basic cases (shortcut for obvious cases)
    a = len(seq_a)
    b = len(seq_b)
    if (a == 0) or (b == 0):
        return [], [], 0
    if seq_a == seq_b:
        return list(range(a)), list(range(a)), 1.0

    norm_seq_a = seq_a
    norm_seq_b = seq_b
    if not atomnames:
        from iotbx.pdb.amino_acid_codes import one_letter_given_three_letter, \
            one_letter_given_three_letter_modified_aa
        merged_one_given_three = one_letter_given_three_letter.copy()
        merged_one_given_three.update(one_letter_given_three_letter_modified_aa)
        # NOTE(review): names are matched verbatim (no stripping), so the
        # padding of these keys must match the names passed in -- confirm.
        merged_one_given_three.update({
            " A": "A",
            " C": "C",
            " G": "G",
            " U": "U",
            " DA": "A",
            " DC": "C",
            " DG": "G",
            " DT": "T"})
        # join() instead of repeated += keeps this linear in sequence length.
        norm_seq_a = "".join(
            merged_one_given_three.get(l, 'X') for l in seq_a)
        norm_seq_b = "".join(
            merged_one_given_three.get(l, 'X') for l in seq_b)

    from mmtbx.alignment import align
    obj = align(
        norm_seq_a,
        norm_seq_b,
        gap_opening_penalty=1,  # default
        gap_extension_penalty=0.5,  # default is 1
        similarity_function="identity")
    alignment = obj.extract_alignment()
    sim1 = alignment.calculate_sequence_identity()
    al_a, al_b = alignment.exact_match_selections()
    if sim1 < min_percent:
        # chains are too different, return empty arrays
        return flex.size_t([]), flex.size_t([]), 0
    return al_a, al_b, sim1
def exercise_align_mask():
    """Exercise the ``masking_a`` argument of ``align``.

    For every position ``i`` of sequence ``A`` an alignment against ``B`` is
    run with a mask that is 10 everywhere except for a 1 at position ``i``.
    The score and match codes of each run are printed, and the collected
    scores are compared with the expected list.

    Uses ``print()`` calls and ``range`` so the function is valid on both
    Python 2 and Python 3.
    """
    B = "GCGAGATAAAGGGACCCATAAA" + "TGTCG" + "TAGCATCGGGCTAATAGATAAGACACA"
    A = "TGTCG" + "AGCATCGGGCTAATAGATAAGACACA"
    scores = []
    for i in range(len(A)):
        masking_a = len(A) * [10]
        masking_a[i] = 1
        obj = align(A, B, masking_a=masking_a)
        print("score=%.1f" % obj.score())
        alignment = obj.extract_alignment()
        print(alignment.match_codes)
        scores.append(obj.score())
    print(scores)
    assert scores == [
        2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 5.0, 4.0, 3.0, 2.0,
        2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
        2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
        2.0]
    print("OK")
def exercise_2():
    """Run ``align`` with several scoring set-ups and check the results.

    Covers a small default alignment, the 1rra/1bli pair with the "dayhoff"
    and "blosum50" similarity functions in both "global" and "local" style
    (checking the sequence identity each time), ``pretty_print``, and an
    identity alignment taken from PDB ID 2dex.
    """
    def report(aligner, middle_row):
        # Print score, match codes and the aligned rows with the requested
        # match row in between; return the extracted alignment.
        print("score=%.1f" % aligner.score())
        result = aligner.extract_alignment()
        print(result.match_codes)
        print(result.a)
        print(middle_row(result))
        print(result.b)
        return result

    aligner = align("AAAGGTT", "AAATT")
    aligner.show_matrices()
    report(aligner, lambda al: al.identity_matches())

    seq_1rra = "AESSADKFKRQHMDTEGPSKSSPTYCNQMMKRQGMTKGSCKPVNTFVHEPLEDVQAICSQGQVTCKNGRNNCHKSSSTLRITDCRLKGSSKYPNCDYTTTDSQKHIIIACDGNPYVPVHFDASV"
    seq_1bli = "DNSRYTHFLTQHYDAKPQGRDDRYCESIMRRRGLTSPCKDINTFIHGNKRSIKAICENKNGNPHRENLRISKSSFQVTTCKLHGGSPWPPCQYRATAGFRNVVVACENGLPVHLDQSIFRRP"

    # 1rra vs. 1bli (second sequence given in lower case)
    aligner = align(
        seq_1rra,
        seq_1bli.lower(),
        gap_opening_penalty=150,
        gap_extension_penalty=20,
        similarity_function="dayhoff",
        style="global")
    print("\n1rra vs. 1bli; GLOBAL allignment; mdm78")
    result = report(aligner, lambda al: al.dayhoff_matches())
    assert approx_equal(result.calculate_sequence_identity(), 0.330645)

    # 1rra vs. 1bli
    aligner = align(
        seq_1rra,
        seq_1bli,
        gap_opening_penalty=150,
        gap_extension_penalty=20,
        similarity_function="dayhoff",
        style="local")
    print("\n1rra vs. 1bli; LOCAL allignment; mdm78")
    result = report(aligner, lambda al: al.dayhoff_matches())
    assert approx_equal(result.calculate_sequence_identity(), 0.341880)

    # 1rra vs. 1bli
    aligner = align(
        seq_1rra,
        seq_1bli,
        gap_opening_penalty=10,
        gap_extension_penalty=2,
        similarity_function="blosum50",
        style="global")
    print("\n1rra vs. 1bli; GLOBAL allignment; blosum50")
    result = report(aligner, lambda al: al.matches())
    assert approx_equal(result.calculate_sequence_identity(), 0.362903)

    # 1rra vs. 1bli
    aligner = align(
        seq_1rra,
        seq_1bli,
        gap_opening_penalty=10,
        gap_extension_penalty=2,
        similarity_function="blosum50",
        style="local")
    print("\n1rra vs. 1bli; LOCAL allignment; blosum50")
    result = report(
        aligner,
        lambda al: al.matches(
            similarity_function=blosum50, is_similar_threshold=0))
    assert approx_equal(result.calculate_sequence_identity(), 0.368852)

    print()
    result.pretty_print(
        matches=None,
        out=None,
        block_size=50,
        n_block=1,
        top_name="1rra",
        bottom_name="1bli",
        comment="""pretty_print is pretty pretty""")

    # example from PDB ID 2dex
    seq_short = "GTLIRVTPEQPTHAVCVLGTLTQLDICSSAPXXXTSFSINASPGVVVDI"
    seq_long = "GPLGSPEFMAQGTLIRVTPEQPTHAVCVLGTLTQLDICSSAPEDCTSFSINASPGVVVDI"
    result = align(
        seq_short, seq_long, similarity_function="identity").extract_alignment()
    assert result.match_codes == 'iiiiiiiiiiimmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm'
    assert result.a == '-----------GTLIRVTPEQPTHAVCVLGTLTQLDICSSAPXXXTSFSINASPGVVVDI'
    assert result.b == 'GPLGSPEFMAQGTLIRVTPEQPTHAVCVLGTLTQLDICSSAPEDCTSFSINASPGVVVDI'
    print("OK")  # necessary for auto_build checking
def __init__(self,
             pdb_hierarchy,
             sequences,
             params=None,
             log=None,
             nproc=Auto,
             include_secondary_structure=False,
             extract_coordinates=False,
             extract_residue_groups=False,
             minimum_identity=0,
             custom_residues=None):
    """Align every polymer chain of a model against the supplied sequences.

    Parameters:
      pdb_hierarchy: model hierarchy; a deep copy is made and alternate
        conformations are removed from the copy before use.
      sequences: non-empty collection of objects with a non-empty
        ``sequence`` attribute.
      params: extracted from ``master_phil`` when None.
      log: output stream for messages; defaults to ``sys.stdout``.
      nproc: passed to ``easy_mp.pool_map`` as ``processes``.
      include_secondary_structure: when true, helix/sheet selections are
        computed and a secondary-structure string is attached to protein
        chains.
      extract_coordinates: when true, ``extract_coordinates`` is called on
        each chain after its alignment has been set.
      extract_residue_groups: when true, ``extract_residue_groups`` is
        called on each chain after its alignment has been set.
      minimum_identity: stored as ``self.minimum_identity``.
      custom_residues: stored as ``self.custom_residues``; None (the
        default) yields a fresh empty list for each instance.

    Raises:
      Sorry: for multi-model hierarchies, or when no polymer chain is found.
      Exception: whatever ``set_alignment`` raises is reported to ``log``
        and re-raised unchanged.
    """
    #
    # XXX This is to stop the assertion crash that checks the length of the
    # XXX provided sequence against the length of the sequence from the model
    # XXX (which can differ due to the presence of altlocs!).
    # XXX Also this assumes altlocs within residue groups have the same resname.
    # XXX And of course making a copy is a bad idea, obviously!
    # XXX No test.
    #
    pdb_hierarchy = pdb_hierarchy.deep_copy()
    pdb_hierarchy.atoms().reset_i_seq()
    pdb_hierarchy.remove_alt_confs(always_keep_one_conformer=True)
    #
    assert (len(sequences) > 0)
    for seq_object in sequences:
        assert (seq_object.sequence != "")
    if log is None:
        log = sys.stdout
    if params is None:
        params = master_phil.extract()
    self.n_protein = 0
    self.n_rna_dna = 0
    self.n_other = 0
    self.chains = []
    self.minimum_identity = minimum_identity
    self.sequences = sequences
    # A None default avoids sharing one mutable list between instances.
    self.custom_residues = [] if custom_residues is None else custom_residues

    # Map each sequence to the first earlier sequence with identical content
    # (None when there is no earlier duplicate).
    self.sequence_mappings = [None] * len(sequences)
    for i_seq in range(1, len(sequences)):
        seq_obj1 = sequences[i_seq]
        for j_seq in range(i_seq):
            if seq_obj1.sequence == sequences[j_seq].sequence:
                self.sequence_mappings[i_seq] = j_seq
                break

    if len(pdb_hierarchy.models()) > 1:
        raise Sorry("Multi-model PDB files not supported.")

    helix_selection = sheet_selection = None
    if include_secondary_structure:
        import mmtbx.secondary_structure
        ssm = mmtbx.secondary_structure.manager(
            pdb_hierarchy=pdb_hierarchy,
            sec_str_from_pdb_file=None)
        helix_selection = ssm.helix_selection()
        sheet_selection = ssm.beta_selection()

    pdb_chains = []
    chain_seq = {}
    for pdb_chain in pdb_hierarchy.models()[0].chains():
        unk = UNK_AA
        chain_id = pdb_chain.id
        main_conf = pdb_chain.conformers()[0]
        if main_conf.is_na():
            self.n_rna_dna += 1
            unk = UNK_NA
            chain_type = NUCLEIC_ACID
        elif main_conf.is_protein():
            self.n_protein += 1
            chain_type = PROTEIN
        else:
            self.n_other += 1
            print("Skipping non-polymer chain '%s'" % chain_id, file=log)
            continue
        pad = True
        pad_at_start = False
        seq = pdb_chain.as_padded_sequence(
            substitute_unknown=unk, pad=pad, pad_at_start=pad_at_start)
        chain_seq[chain_id] = seq
        resids = pdb_chain.get_residue_ids(pad=pad, pad_at_start=pad_at_start)
        resnames = pdb_chain.get_residue_names_padded(
            pad=pad, pad_at_start=pad_at_start)
        assert (len(seq) == len(resids) == len(resnames))
        sec_str = None
        if (helix_selection is not None) and (main_conf.is_protein()):
            sec_str = main_conf.as_sec_str_sequence(
                helix_selection, sheet_selection,
                pad=pad, pad_at_start=pad_at_start)
            assert (len(sec_str) == len(seq))
        c = chain(chain_id=chain_id,
                  sequence=seq,
                  resids=resids,
                  chain_type=chain_type,
                  sec_str=sec_str,
                  resnames=resnames)
        self.chains.append(c)
        pdb_chains.append(pdb_chain)

    if len(self.chains) == 0:
        raise Sorry("Could not find any polymer chains to align.")

    # Developer switch: set to True to align serially instead of through
    # easy_mp.pool_map.
    debug = False
    if debug:
        alignments_and_names = []
        for i in range(len(self.chains)):
            alignments_and_names.append(self.align_chain(i))
    else:
        alignments_and_names = easy_mp.pool_map(
            fixed_func=self.align_chain,
            args=range(len(self.chains)),
            processes=nproc)
    assert (len(alignments_and_names) == len(self.chains) == len(pdb_chains))

    for i, c in enumerate(self.chains):
        alignment, seq_name, seq_id = alignments_and_names[i]
        # if no alignment was found, just use the sequence from the model
        if alignment is None:
            alignment = align(chain_seq[c.chain_id],
                              chain_seq[c.chain_id]).extract_alignment()
            seq_name = 'model'
            seq_id = 0
        pdb_chain = pdb_chains[i]
        try:
            c.set_alignment(alignment, seq_name, seq_id)
        except Exception as e:
            # Report which chain failed (and why) before propagating.
            print("Error processing chain %s" % c.chain_id, file=log)
            print(e, file=log)
            raise
        else:
            if extract_coordinates:
                c.extract_coordinates(pdb_chain)
            if extract_residue_groups:
                c.extract_residue_groups(pdb_chain)
    self.sequences = None