def test_include_hoh():
    """
    Check selection strings built with and without water in chains_info.

    With ``exclude_water=True`` the first seven atoms are expected to be
    described by an explicit residue range; with ``exclude_water=False`` the
    first twelve atoms are expected to collapse to a bare chain selection.
    """
    pdb_inp = pdb.hierarchy.input(pdb_string=test_pdb_4)
    chains_info = get_chains_info(pdb_inp.hierarchy, exclude_water=True)
    isel1 = flex.size_t(range(7))
    sel_str1 = selection_string_from_selection(
        pdb_inp, isel1, chains_info=chains_info)
    s = "(chain 'A' and resid 151:157)"
    assert sel_str1 == s, sel_str1
    # The expected string must select the same number of atoms back.
    cache = pdb_inp.hierarchy.atom_selection_cache().selection
    sel = cache(s).iselection()
    assert sel.size() == 7, sel.size()
    #
    chains_info = get_chains_info(pdb_inp.hierarchy, exclude_water=False)
    isel1 = flex.size_t(range(12))
    sel_str1 = selection_string_from_selection(
        pdb_inp, isel1, chains_info=chains_info)
    # Report the string that was actually produced when the check fails.
    assert sel_str1 == "chain 'A'", sel_str1
def test_selection_with_alternative_conformers():
    """
    Verify the per-residue bookkeeping of chain 'D' in ``test_pdb_5``.

    Checks the ``no_altloc`` flags of the chain and that the flattened
    ``atom_selection`` equals atom indices 0..22 followed by 27..41.
    """
    pdb_inp = pdb.hierarchy.input(pdb_string=test_pdb_5)
    chains_info = get_chains_info(pdb_inp.hierarchy)
    ch_D = chains_info['D']
    # test conditions verification
    assert ch_D.no_altloc == [True, True, True, False, True, True]
    select_all = sorted([x for xi in ch_D.atom_selection for x in xi])
    # list(...) keeps the concatenation valid where range() is not a list.
    test_list = list(range(23)) + list(range(27, 42))
    assert select_all == test_list
def test_selection_with_alternative_conformers():
    """
    Verify the per-residue bookkeeping of chain 'D' in ``test_pdb_5``.

    The flattened ``atom_selection`` of the chain must equal the atoms
    selected by "not altloc B" on the same hierarchy.
    """
    hierarchy = iotbx.pdb.input(
        lines=test_pdb_5,
        source_info=None).construct_hierarchy(sort_atoms=True)
    chain_d = get_chains_info(hierarchy)['D']
    # test conditions verification
    expected_flags = [True, True, True, False, True, True]
    assert chain_d.no_altloc == expected_flags
    # Flatten the per-residue index lists into one sorted list.
    flattened = []
    for residue_indices in chain_d.atom_selection:
        flattened.extend(residue_indices)
    flattened.sort()
    selection_cache = hierarchy.atom_selection_cache()
    expected = list(selection_cache.iselection("not altloc B"))
    assert flattened == expected, "%s" % flattened
def update_str_selections_if_needed(self, hierarchy, asc=None,
                                    chains_info=None):
    """
    Fill in missing selection strings of the groups held by this container.

    For every group whose ``master_str_selection`` is None the string is
    generated from ``master_iselection``; for such groups every copy whose
    ``str_selection`` is None gets a string generated from its ``iselection``.

    Args:
      hierarchy: hierarchy passed on to selection_string_from_selection
      asc: atom selection cache; built from hierarchy when None
      chains_info: result of get_chains_info; computed from hierarchy
        when None
    """
    from mmtbx.ncs.ncs_search import get_chains_info
    from iotbx.pdb.atom_selection import selection_string_from_selection
    if asc is None:
        asc = hierarchy.atom_selection_cache()
    if chains_info is None:
        chains_info = get_chains_info(hierarchy)

    def _to_string(iselection):
        # One place for the conversion shared by masters and copies.
        return selection_string_from_selection(
            hierarchy, iselection, chains_info, asc)

    for group in self:
        if group.master_str_selection is not None:
            continue
        group.master_str_selection = _to_string(group.master_iselection)
        for ncs_copy in group.copies:
            if ncs_copy.str_selection is None:
                ncs_copy.str_selection = _to_string(ncs_copy.iselection)
def selection_string_from_selection(pdb_h,
                                    selection,
                                    chains_info=None,
                                    atom_selection_cache=None):
    """
    Convert a selection array to a selection string.

    The function tries to minimise the selection string as much as possible,
    using chain names, residue ranges and, when there is no other option,
    atom names within residues.

    Limitations:
      Residues flagged as having alternate locations (``no_altloc[i]`` is
      False in chains_info) are skipped when the string is built, so a
      selection that includes them is not reproduced by the returned string.
      The final consistency assertion that used to catch this has been
      disabled (see the note before ``return``), so this is no longer
      detected here. The function was intended for NCS search only.

    Args:
      pdb_h : iotbx.pdb.hierarchy
      selection (flex.bool or flex.size_t)
      chains_info : object containing
        chains (str): chain IDs OR selections string
        res_name (list of str): list of residues names
        resid (list of str): list of residues sequence number, resid
        atom_names (list of list of str): list of atoms in residues
        atom_selection (list of list of list of int): the location of atoms
          in ph
        chains_atom_number (list of int): list of number of atoms in each
          chain
        Computed with get_chains_info(pdb_h) when not supplied.
      atom_selection_cache : not used by the current code; it was only
        needed by the disabled final check and is kept in the signature.

    Returns:
      sel_str (str): atom selection string

    Raises:
      Sorry: when the selection contains no atoms
    """
    # A boolean mask is converted to indices first, so the emptiness check
    # below counts selected atoms rather than the mask length.
    if isinstance(selection, flex.bool):
        selection = selection.iselection(True)
    if selection.size() == 0:
        raise Sorry('Empty atom selection')
    # pdb_hierarchy_inp is a hierarchy
    selection_set = set(selection)
    sel_list = []
    # pdb_h.select(selection).write_pdb_file("selected_in.pdb")
    # using chains_info to improve performance
    if not chains_info:
        chains_info = get_chains_info(pdb_h)
    # Chains are processed in sorted id order.
    chain_ids = sorted(chains_info)
    for ch_id in chain_ids:
        # Pre-filter via chain_is_needed (helper defined outside this block)
        # before building the per-chain set.
        if not chain_is_needed(selection, chains_info[ch_id].atom_selection):
            continue
        # this "unfolds" the atom_selection array which is [[],[],[],[]...]
        # into a set
        a_sel = {x for xi in chains_info[ch_id].atom_selection for x in xi}
        test_set = a_sel.intersection(selection_set)
        if not test_set:
            continue
        ch_sel = "chain '%s'" % convert_wildcards_in_chain_id(ch_id)
        # Chain should be present, so do all the work.
        # if there is water in chain, specify residues numbers
        water_present = (len(a_sel) != chains_info[ch_id].chains_atom_number)
        complete_ch_not_present = (test_set != a_sel) or water_present
        if bool(chains_info[ch_id].no_altloc):
            no_altloc = chains_info[ch_id].no_altloc
            no_altloc_present = no_altloc.count(False) > 0
        else:
            no_altloc_present = False
        # exclude residues with alternative locations
        complete_ch_not_present |= no_altloc_present
        res_sel = []
        if complete_ch_not_present:
            # collect continuous ranges of residues when possible
            res_len = len(chains_info[ch_id].resid)
            # State carried from residue to residue while growing a range:
            #   prev_/cur_all_atoms_present - whether the previous / current
            #     selected residue had all of its atoms selected
            #   atoms_for_dumping - atom names of the last partially selected
            #     residue in the current range
            #   atoms_in_current_range - union of atom names seen in the
            #     current range
            prev_all_atoms_present = None
            cur_all_atoms_present = None
            atoms_for_dumping = []
            previous_res_selected_atom_names = []
            # Seed the state from the first residue of the chain.
            a_sel = set(chains_info[ch_id].atom_selection[0])
            cur_res_selected_atom_names = get_atom_names_from_test_set(
                a_sel.intersection(selection_set), a_sel,
                chains_info[ch_id].atom_names[0])
            atoms_in_current_range = cur_res_selected_atom_names
            sequence_was_broken = False
            first_resid = chains_info[ch_id].resid[0]
            last_resid = None
            for i in xrange(res_len):
                cur_resid = chains_info[ch_id].resid[i]
                # test that all atoms in residue are included in selection
                a_sel = set(chains_info[ch_id].atom_selection[i])
                test_set = a_sel.intersection(selection_set)
                if len(test_set) == 0:
                    # None of residue's atoms are selected
                    sequence_was_broken = True
                    continue
                if no_altloc_present and not no_altloc[i]:
                    # Residue with alternate locations: skipped, and the
                    # running range is broken.
                    sequence_was_broken = True
                    continue
                all_atoms_present = (test_set == a_sel)
                if prev_all_atoms_present is None:
                    prev_all_atoms_present = cur_all_atoms_present
                else:
                    prev_all_atoms_present = cur_all_atoms_present and prev_all_atoms_present
                cur_all_atoms_present = all_atoms_present
                previous_res_selected_atom_names = cur_res_selected_atom_names
                cur_res_selected_atom_names = get_atom_names_from_test_set(
                    test_set, a_sel, chains_info[ch_id].atom_names[i])
                cur_resid = chains_info[ch_id].resid[i]
                # new range is needed when previous selection doesn't match
                # current selection.
                continue_range = False
                continue_range = ((cur_all_atoms_present and prev_all_atoms_present) or (len(
                    set(cur_res_selected_atom_names) ^ set(atoms_in_current_range)) == 0))
                # A residue flagged in gap_residue never continues a range.
                continue_range &= not chains_info[ch_id].gap_residue[i]
                # residues are consecutive
                continue_range = continue_range and not sequence_was_broken
                if len(atoms_for_dumping) > 0:
                    continue_range = continue_range and (len(
                        set(atoms_for_dumping) ^ set(cur_res_selected_atom_names)) == 0)
                # The break flag has been consumed for this residue.
                sequence_was_broken = False
                if continue_range:
                    # continue range
                    last_resid = cur_resid
                    atoms_in_current_range = list(
                        set(atoms_in_current_range) | set(cur_res_selected_atom_names))
                    if not cur_all_atoms_present:
                        atoms_for_dumping = cur_res_selected_atom_names
                else:
                    # dump previous range, start new one
                    if len(atoms_for_dumping) > 0:
                        atoms_sel = get_atom_str(
                            previous_res_selected_atom_names)
                    else:
                        atoms_sel = "" if prev_all_atoms_present else get_atom_str(
                            previous_res_selected_atom_names)
                    if prev_all_atoms_present is None:
                        atoms_sel = "" if cur_all_atoms_present else get_atom_str(
                            cur_res_selected_atom_names)
                    res_sel = update_res_sel(res_sel=res_sel,
                                             first_resid=first_resid,
                                             last_resid=last_resid,
                                             atoms_selection=atoms_sel)
                    first_resid = cur_resid
                    last_resid = cur_resid
                    atoms_in_current_range = cur_res_selected_atom_names
                    if not cur_all_atoms_present:
                        atoms_for_dumping = cur_res_selected_atom_names
                    else:
                        atoms_for_dumping = []
                    prev_all_atoms_present = None
            # Dump the last range, still open when the loop ends.
            atoms_sel = "" if prev_all_atoms_present else get_atom_str(
                previous_res_selected_atom_names)
            if prev_all_atoms_present or prev_all_atoms_present is None:
                atoms_sel = "" if cur_all_atoms_present else get_atom_str(
                    cur_res_selected_atom_names)
            # When the last range spans the chain from its first to its last
            # resid, update_res_sel is told to omit the residue numbers.
            omit_resids = (first_resid == chains_info[ch_id].resid[0] and last_resid == chains_info[ch_id].resid[-1])
            res_sel = update_res_sel(res_sel, first_resid, last_resid, atoms_sel,
                                     omit_resids)
        s = get_clean_selection_string(ch_sel, res_sel)
        sel_list.append(s)
    # add parentheses when the selection is more than just a chain
    # (judged by the string being longer than 10 characters)
    s_l = []
    sel_list.sort()
    for s in sel_list:
        if len(s) > 10:
            s = '(' + s + ')'
        s_l.append(s)
    sel_str = ' or '.join(s_l)
    # A consistency check used to live here: the resulting string was
    # evaluated with the atom selection cache and the number of atoms it
    # selected was asserted to equal len(selection). It could take up to ~90%
    # of the runtime of this function and was removed on Feb 7, 2018. The
    # atom_selection_cache parameter existed only for that check.
    return sel_str
def __init__(
        self,
        hierarchy=None,
        # XXX warning, ncs_phil_groups can be changed inside...
        ncs_phil_groups=None,
        params=None,
        log=None,
):
    """
    Build the NCS group object from user definitions and/or a hierarchy.

    TODO:
    1. Transfer get_ncs_info_as_spec() to ncs/ncs.py:ncs

    Order of implementation:
    1) ncs_phil_groups - user-supplied definitions are validated with
       validate_ncs_phil_groups
    2) hierarchy only - when validation returns None, the NCS relations are
       built from the hierarchy with build_ncs_obj_from_pdb_asu

    When a hierarchy is given, its atom i_seqs are reset in place, a deep
    copy is stored as original_hierarchy, and a truncated hierarchy (atoms
    not matching params.exclude_selection, or all atoms when that is None) is
    used for everything that follows. If the truncated hierarchy contains no
    atoms, construction stops early and no search is done.

    A warning is printed to the log when no NCS groups were found, and any
    collected messages are printed afterwards.

    Args:
    -----
      hierarchy: hierarchy to search; may be None
      ncs_phil_groups: iotbx.phil.parse(ncs_group_phil_str).extract().ncs_group
      params: search parameters; defaults to
        input.get_default_params().ncs_search when None.
        NOTE(review): the settings below look like fields of params rather
        than constructor arguments - confirm against the phil definition.
          chain_max_rmsd (float): limit of rms difference between chains to
            be considered as copies
          min_percent (float): Threshold for similarity between chains
            similarity define as:
            (number of matching res) / (number of res in longer chain)
          chain_similarity_threshold (float): min similarity between
            matching chains
          residue_match_radius (float): max allow distance difference between
            pairs of matching atoms of two residues
      log: output stream for warnings and messages; sys.stdout when None
    """
    self.number_of_ncs_groups = 0  # consider removing/replacing with function
    self.ncs_restraints_group_list = class_ncs_restraints_group_list()
    # keep hierarchy for writing (To have a source of atoms labels)
    self.hierarchy = hierarchy
    # residues common to NCS copies. Used for .spec representation
    self.common_res_dict = {}
    # Collect messages, recommendation and errors
    self.messages = ''  # Not used outside...
    # Hierarchy-derived state; stays None when no hierarchy is supplied.
    self.old_i_seqs = None
    self.original_hierarchy = None
    self.truncated_hierarchy = None
    self.truncated_h_asc = None
    self.chains_info = None
    # NOTE(review): 'extension' is not read anywhere in this method.
    extension = ''
    # set search parameters
    self.params = params
    if self.params is None:
        self.params = input.get_default_params().ncs_search
    #
    if log is None:
        self.log = sys.stdout
    else:
        self.log = log
    if hierarchy:
        # Reset i_seqs on the caller's hierarchy before copying, so the copy
        # and the selections below share one numbering.
        hierarchy.atoms().reset_i_seq()
        self.original_hierarchy = hierarchy.deep_copy()
        self.original_hierarchy.reset_atom_i_seqs()
        if self.params.exclude_selection is not None:
            # Keep only atoms that do NOT match the exclusion selection.
            cache = hierarchy.atom_selection_cache()
            sel = cache.selection("not (%s)" % self.params.exclude_selection)
            self.truncated_hierarchy = hierarchy.select(sel)
        else:
            # this could be to save iseqs but I'm not sure
            self.truncated_hierarchy = hierarchy.select(
                flex.size_t_range(hierarchy.atoms_size()))
        # Remember the i_seqs as extracted from the selected atoms before
        # renumbering the truncated hierarchy.
        self.old_i_seqs = self.truncated_hierarchy.atoms().extract_i_seq()
        self.truncated_hierarchy.reset_atom_i_seqs()
        self.truncated_h_asc = self.truncated_hierarchy.atom_selection_cache(
        )
        self.chains_info = ncs_search.get_chains_info(
            self.truncated_hierarchy)
        # Nothing left after truncation: leave the object empty.
        if self.truncated_hierarchy.atoms_size() == 0:
            return
    #
    # Without a hierarchy, validation is still called with pdb_h=None and
    # asc=None.
    validated_ncs_phil_groups = None
    validated_ncs_phil_groups = self.validate_ncs_phil_groups(
        pdb_h=self.truncated_hierarchy,
        ncs_phil_groups=ncs_phil_groups,
        asc=self.truncated_h_asc)
    if validated_ncs_phil_groups is None:
        # Last chance, building from hierarchy
        self.build_ncs_obj_from_pdb_asu(pdb_h=self.truncated_hierarchy,
                                        asc=self.truncated_h_asc)
    # error handling
    if self.ncs_restraints_group_list.get_n_groups() == 0:
        print >> self.log, '========== WARNING! ============\n'
        print >> self.log, ' No NCS relation were found !!!\n'
        print >> self.log, '================================\n'
    if self.messages != '':
        print >> self.log, self.messages
def selection_string_from_selection(pdb_hierarchy_inp,
                                    selection,
                                    chains_info=None):
    """
    Convert a selection array to a selection string.

    The function tries to minimise the selection string as much as possible,
    using chain names, residue ranges and, when there is no other option,
    atom names within residues.

    !!! If selection contains alternative conformations, the assertion at the
    end will fail. This is to prevent using this function with such
    selections. This limits its application to NCS search only and at the
    same time asserts that found NCS groups don't contain alternative
    conformations.

    Limitations:
      When pdb_hierarchy_inp contains multiple conformations, selection must
      not include residues with alternate locations

    Args:
      pdb_hierarchy_inp : iotbx.pdb.hierarchy.input (or iotbx.pdb.hierarchy);
        an object with a ``hierarchy`` attribute is unwrapped to it
      selection (flex.bool or flex.size_t)
      chains_info : object containing
        chains (str): chain IDs OR selections string
        res_name (list of str): list of residues names
        resid (list of str): list of residues sequence number, resid
        atom_names (list of list of str): list of atoms in residues
        atom_selection (list of list of list of int): the location of atoms
          in ph
        chains_atom_number (list of int): list of number of atoms in each
          chain
        Computed with get_chains_info(..., exclude_water=False) when not
        supplied.

    Returns:
      sel_str (str): atom selection string

    Raises:
      Sorry: when the selection contains no atoms
      AssertionError: when the resulting string selects a different number
        of atoms than the input selection
    """
    # Convert a boolean mask to indices BEFORE the emptiness check: the size
    # of a mask is its length, so an all-False mask would otherwise pass.
    if isinstance(selection, flex.bool):
        selection = selection.iselection(True)
    if selection.size() == 0:
        raise Sorry('Empty atom selection')
    # Accept either a pdb input object or the hierarchy itself.
    if hasattr(pdb_hierarchy_inp, "hierarchy"):
        pdb_hierarchy_inp = pdb_hierarchy_inp.hierarchy
    # pdb_hierarchy_inp is a hierarchy
    selection_set = set(selection)
    sel_list = []
    # using chains_info to improve performance
    if not chains_info:
        chains_info = get_chains_info(pdb_hierarchy_inp, exclude_water=False)
    chain_ids = sorted(chains_info)
    for ch_id in chain_ids:
        # this "unfolds" the atom_selection array which is [[],[],[],[]...]
        # into a set
        a_sel = {x for xi in chains_info[ch_id].atom_selection for x in xi}
        ch_sel = "chain '%s'" % convert_wildcards_in_chain_id(ch_id)
        test_set = a_sel.intersection(selection_set)
        if not test_set:
            continue
        # if there is water in chain, specify residues numbers
        water_present = (len(a_sel) != chains_info[ch_id].chains_atom_number)
        complete_ch_not_present = (test_set != a_sel) or water_present
        if bool(chains_info[ch_id].no_altloc):
            no_altloc = chains_info[ch_id].no_altloc
            no_altloc_present = no_altloc.count(False) > 0
        else:
            no_altloc_present = False
        # exclude residues with alternative locations
        complete_ch_not_present |= no_altloc_present
        res_sel = []
        if complete_ch_not_present:
            # collect continuous ranges of residues when possible
            res_len = len(chains_info[ch_id].resid)
            # State carried from residue to residue while growing a range.
            prev_all_atoms_present = None
            cur_all_atoms_present = None
            atoms_for_dumping = []
            previous_res_selected_atom_names = []
            # Seed the state from the first residue of the chain.
            a_sel = set(chains_info[ch_id].atom_selection[0])
            cur_res_selected_atom_names = get_atom_names_from_test_set(
                a_sel.intersection(selection_set),
                a_sel,
                chains_info[ch_id].atom_names[0])
            atoms_in_current_range = cur_res_selected_atom_names
            sequence_was_broken = False
            first_resid = chains_info[ch_id].resid[0]
            last_resid = None
            for i in xrange(res_len):
                cur_resid = chains_info[ch_id].resid[i]
                # test that all atoms in residue are included in selection
                a_sel = set(chains_info[ch_id].atom_selection[i])
                test_set = a_sel.intersection(selection_set)
                if len(test_set) == 0:
                    # None of residue's atoms are selected
                    sequence_was_broken = True
                    continue
                if no_altloc_present and not no_altloc[i]:
                    # Residue with alternate locations: skipped, and the
                    # running range is broken.
                    sequence_was_broken = True
                    continue
                all_atoms_present = (test_set == a_sel)
                prev_all_atoms_present = cur_all_atoms_present
                cur_all_atoms_present = all_atoms_present
                previous_res_selected_atom_names = cur_res_selected_atom_names
                cur_res_selected_atom_names = get_atom_names_from_test_set(
                    test_set, a_sel, chains_info[ch_id].atom_names[i])
                cur_resid = chains_info[ch_id].resid[i]
                # new range is needed when previous selection doesn't match
                # current selection.
                continue_range = (
                    (cur_all_atoms_present and prev_all_atoms_present)
                    or (len(set(cur_res_selected_atom_names)
                            ^ set(atoms_in_current_range)) == 0))
                # residues are consecutive
                continue_range = continue_range and not sequence_was_broken
                if len(atoms_for_dumping) > 0:
                    continue_range = continue_range and (
                        len(set(atoms_for_dumping)
                            ^ set(cur_res_selected_atom_names)) == 0)
                # The break flag has been consumed for this residue.
                sequence_was_broken = False
                if continue_range:
                    # continue range
                    last_resid = cur_resid
                    atoms_in_current_range = list(
                        set(atoms_in_current_range)
                        | set(cur_res_selected_atom_names))
                    if not cur_all_atoms_present:
                        atoms_for_dumping = cur_res_selected_atom_names
                else:
                    # dump previous range, start new one
                    if len(atoms_for_dumping) > 0:
                        atoms_sel = get_atom_str(
                            previous_res_selected_atom_names)
                    else:
                        atoms_sel = "" if prev_all_atoms_present else \
                            get_atom_str(previous_res_selected_atom_names)
                    if prev_all_atoms_present is None:
                        atoms_sel = "" if cur_all_atoms_present else \
                            get_atom_str(cur_res_selected_atom_names)
                    res_sel = update_res_sel(
                        res_sel=res_sel,
                        first_resid=first_resid,
                        last_resid=last_resid,
                        atoms_selection=atoms_sel)
                    first_resid = cur_resid
                    last_resid = cur_resid
                    atoms_in_current_range = cur_res_selected_atom_names
                    if not cur_all_atoms_present:
                        atoms_for_dumping = cur_res_selected_atom_names
                    else:
                        atoms_for_dumping = []
                    prev_all_atoms_present = None
            # Dump the last range, still open when the loop ends.
            atoms_sel = "" if prev_all_atoms_present else \
                get_atom_str(previous_res_selected_atom_names)
            if prev_all_atoms_present or prev_all_atoms_present is None:
                atoms_sel = "" if cur_all_atoms_present else \
                    get_atom_str(cur_res_selected_atom_names)
            # When the last range spans the chain from its first to its last
            # resid, update_res_sel is told to omit the residue numbers.
            omit_resids = (first_resid == chains_info[ch_id].resid[0]
                           and last_resid == chains_info[ch_id].resid[-1])
            res_sel = update_res_sel(
                res_sel, first_resid, last_resid, atoms_sel, omit_resids)
        s = get_clean_selection_string(ch_sel, res_sel)
        sel_list.append(s)
    # add parentheses when the selection is more than just a chain
    # (judged by the string being longer than 10 characters)
    s_l = []
    sel_list.sort()
    for s in sel_list:
        if len(s) > 10:
            s = '(' + s + ')'
        s_l.append(s)
    sel_str = ' or '.join(s_l)
    # This check could take up to ~90% of runtime of this function...
    # Nevertheless, this helps to spot bugs early. So this should remain
    # here, let's say for a year. If no bugs discovered, this could be removed.
    # Current removal date: Jan 22, 2017
    isel = pdb_hierarchy_inp.atom_selection_cache().iselection(sel_str)
    assert len(isel) == len(selection), ""+\
        "%d (result) != %d (input): conversion to string selects different number of atoms!.\n" \
        % (len(isel), len(selection)) +\
        "String lead to error: '%s'" % sel_str
    return sel_str
def solvent_to_nearest_chain(self, model, solvent_model=None):
    """
    Add solvent to nearest macromolecule chain

    Each atom of the solvent model is matched to its nearest non-solvent
    atom. A copy of the solvent atom's residue group is appended to the
    working model as a new chain carrying the id of that nearest atom's
    chain.

    Parameters
    ----------
    model : mmtbx.model.model.manager, a model possibly containing solvent
    solvent_model : mmtbx.model.model.manager, a model containing solvent to
        add. When None, the solvent is taken from ``model`` (selected with
        ``self.solvent_sel_str``) and removed from the working model first.

    Returns
    -------
    new_model : mmtbx.model.model.manager, a new model with the solvent
        added to the nearest macromolecule chain
    """
    # Work on a copy so the caller's model is left untouched.
    model = model.deep_copy()
    chain_info = get_chains_info(model.get_hierarchy())
    is_solvent = model.selection(self.solvent_sel_str)
    non_solvent_model = model.select(~is_solvent)
    if solvent_model is None:
        solvent_model = model.select(is_solvent)
        model = non_solvent_model

    solvent_xyz = solvent_model.get_sites_cart().as_numpy_array()
    nonsolvent_xyz = non_solvent_model.get_sites_cart().as_numpy_array()
    # Chain object of every non-solvent atom, indexed like nonsolvent_xyz.
    non_solvent_chains = []
    for macro_atom in non_solvent_model.get_hierarchy().atoms():
        non_solvent_chains.append(macro_atom.parent().parent().parent())

    # Nearest non-solvent atom for each solvent atom.
    tree = KDTree(nonsolvent_xyz)
    distances, nearest = tree.query(solvent_xyz, k=1)
    distances, nearest = distances[:, 0], nearest[:, 0]

    # New atoms are numbered after the highest serial already in the model.
    highest_serial = 0
    for existing_atom in model.get_hierarchy().atoms():
        highest_serial = max(highest_serial, existing_atom.serial_as_int())
    next_serial = highest_serial + 1

    solvent_atoms = solvent_model.get_hierarchy().atoms()
    for index, solvent_atom in enumerate(solvent_atoms):
        target_chain = non_solvent_chains[nearest[index]]
        source_rg = solvent_atom.parent().parent()
        copied_ag = iotbx.pdb.hierarchy.atom_group(
            altloc="", resname=source_rg.unique_resnames()[0])
        for source_atom in source_rg.atoms():
            copied_ag.append_atom(atom=source_atom.detached_copy())
        # Residue numbers continue after the chain's resid_max; the offset
        # grows by one for every solvent atom processed.
        resseq = chain_info[target_chain.id].resid_max + (index + 1)
        copied_rg = iotbx.pdb.hierarchy.residue_group(
            resseq=iotbx.pdb.resseq_encode(value=resseq), icode=" ")
        copied_rg.append_atom_group(atom_group=copied_ag)
        for copied_atom in copied_rg.atoms():
            copied_atom.serial = next_serial
            next_serial += 1
        added_chain = iotbx.pdb.hierarchy.chain(id=target_chain.id)
        added_chain.append_residue_group(residue_group=copied_rg)
        model.get_hierarchy().only_model().append_chain(added_chain)

    new_model = mmtbx.model.manager(
        model_input=None,
        pdb_hierarchy=model.get_hierarchy(),
        crystal_symmetry=model.crystal_symmetry(),
    )
    new_model.get_hierarchy().atoms().reset_i_seq()
    return new_model