def topOccupancy(PDB):
    """
    Write PDB files containing only the highest-occupancy atoms of a model.

    The occupancy threshold starts at 0.1 and is raised in steps of 0.02
    until no more than two atoms lie above it. After every step that leaves
    fewer than six atoms, those atoms are written to ``<count>_.pdb`` in the
    current working directory (a later step with the same atom count
    overwrites the earlier file).

    Parameters
    ----------
    PDB: str
        path to the input pdb file

    Returns
    -------
    None
    """
    from iotbx.pdb import hierarchy

    occ = 0.1
    obj_pdb = hierarchy.input(PDB).construct_hierarchy()
    # The hierarchy is never modified below, so a single selection cache
    # can serve every threshold query.
    sel_cache = obj_pdb.atom_selection_cache()

    def atoms_above(threshold):
        # Selection string is kept exactly as the original built it.
        return sel_cache.iselection("occupancy>" + str(threshold) + " ")

    selected_atoms = atoms_above(occ)
    counter = len(selected_atoms)
    if counter == 0:
        print("occupancy is lower than 0.1")
        return

    while counter > 2:
        occ = occ + 0.02
        selected_atoms = atoms_above(occ)
        counter = len(selected_atoms)
        if counter < 6:
            obj_pdb.select(selected_atoms).write_pdb_file(
                file_name=str(counter) + "_.pdb")
def get_lig(pdb):
    """
    List the ligand residues present in a pdb file.

    Residues named LIG, UNL or DRG are looked up one name at a time.

    Parameters
    ----------
    pdb: str
        path to pdb file

    Returns
    -------
    lig_pos: list
        one ``(chain id, residue name, residue number as str)`` tuple
        per matching atom group
    """
    structure = hierarchy.input(file_name=pdb)
    cache = structure.hierarchy.atom_selection_cache()

    lig_pos = []
    for name in ("LIG", "UNL", "DRG"):
        mask = cache.selection("resname {}".format(name))
        matched = structure.hierarchy.select(mask)
        # Nothing with this residue name: try the next one
        if matched.models_size() == 0:
            continue
        for chain in matched.only_model().chains():
            for rg in chain.residue_groups():
                lig_pos.extend(
                    (chain.id, ag.resname, str(int(rg.resseq)))
                    for ag in rg.atom_groups()
                )
    return lig_pos
def map_sites_to_asu(spacegroup, pdb_in, pdb_out, invert=False):
    '''Map sites to asu of input spacegroup (as sites from shelxd claim
    P1 in CRYST1 record) inverting if necessary. N.B. if inverting sites
    also need to invert spacegroup.

    spacegroup: space group symbol, looked up with space_group_symbols
    pdb_in:     path of the pdb file holding the sites
    pdb_out:    path the remapped sites are written to
    invert:     when true, the structure's hand is changed before mapping
    '''
    from cctbx.crystal import symmetry
    from iotbx.pdb import hierarchy
    from scitbx.array_family import flex

    # space_group / space_group_symbols are expected at module scope
    # (presumably from cctbx.sgtbx -- confirm against the file's imports).
    sg = space_group(space_group_symbols(spacegroup).hall())

    coords = hierarchy.input(file_name=pdb_in)
    uc = coords.input.crystal_symmetry().unit_cell()
    # Keep the cell from the file but impose the requested space group.
    cs2 = symmetry(unit_cell=uc, space_group=sg)
    xs = coords.xray_structure_simple().customized_copy(crystal_symmetry=cs2)

    if invert:
        xs = xs.change_hand()

    am = xs.crystal_symmetry().asu_mappings(0.0)
    am.process_sites_cart(xs.sites_cart())

    mapped_sites = flex.vec3_double()
    for m in am.mappings():
        mapped_sites.append(m[0].mapped_site())
    xs.set_sites_cart(mapped_sites)

    # Context manager so the handle is flushed and closed deterministically.
    with open(pdb_out, 'w') as out_file:
        out_file.write(xs.as_pdb_file())
def set_b_factor_pdb(row, rerun=False):
    """
    Write a copy of the refined pdb with per-site B factors applied.

    Work is done only when the output file is missing and the refined pdb
    exists, or when ``rerun`` is set. The output file is written only if
    ``row["sites"]`` is non-empty.

    Parameters
    ----------
    row: mapping
        must provide "refine_pdb", "site_b_factor_path", "sites" and, for
        every site, its allocated-residue column and matching "lig" column
    rerun: bool
        force regeneration even if the output already exists

    Raises
    ------
    ValueError
        if a site has more than one allocated ligand residue
    """
    pdb_in_path = row["refine_pdb"]
    pdb_out_path = row["site_b_factor_path"]

    output_missing = not os.path.exists(pdb_out_path)
    if not ((output_missing and os.path.exists(pdb_in_path)) or rerun):
        return

    pdb_in = hierarchy.input(file_name=pdb_in_path)
    sites = row["sites"]

    # Each site is (allocated column name, B factor to apply)
    for site in sites:
        allocated_col = site[0]
        sel = residue_list_to_selection(row[allocated_col])

        lig_col = allocated_col.replace("allocated", "lig")
        if len(row[lig_col]) > 1:
            raise ValueError("More than one allocated residue")
        ligand = list(row[lig_col])[0]
        chain = ligand[0]
        lig_num = ligand[1]

        # Extend the residue selection with the ligand itself
        lig_sel = "(chain " + chain + " and resid " + str(int(lig_num)) + ")"
        sel = sel + " or " + lig_sel

        pdb_in = set_b_factor(pdb_in=pdb_in, sel=sel, b_fac=site[1])

    if len(sites) != 0:
        symmetry = pdb_in.input.crystal_symmetry()
        with open(pdb_out_path, "w") as out_pdb_file:
            out_pdb_file.write(
                pdb_in.hierarchy.as_pdb_string(crystal_symmetry=symmetry)
            )
def get_occ_b(pdb, chain, resid, altloc=""):
    """
    Get occupancy and b factor of a single residue

    Statistics are pooled over every atom matched by the selection.

    Parameters
    ----------
    pdb: str
        path to pdb file
    chain: str
        chain of interest
    resid: str
        residue of interest
    altloc: str
        altloc of interest; when empty (the default) the selection is not
        restricted by altloc

    Returns
    -------
    mean_occ: float
        mean occupancy of residue
    mean_b: float
        mean b factor of residue
    std_b: float
        standard deviation of b factor of residue
    """
    # read into iotbx.hierarchy
    pdb_in = hierarchy.input(file_name=pdb)
    # read into iotbx.selection cache
    sel_cache = pdb_in.hierarchy.atom_selection_cache()

    # An empty altloc must not leave a dangling "altloc" keyword at the end
    # of the selection string.
    if altloc == "":
        selection_string = "chain {} resid {}".format(chain, resid)
    else:
        selection_string = "chain {} resid {} altloc {}".format(
            chain, resid, altloc)

    # Select that residue from main hierarchy
    hier = pdb_in.hierarchy.select(sel_cache.selection(selection_string))

    # Gather B factor and occupancy atom by atom. The loop variable is not
    # called `chain`, so the argument of that name is left untouched.
    b = []
    occ = []
    for selected_chain in hier.only_model().chains():
        for residue_group in selected_chain.residue_groups():
            for atom_group in residue_group.atom_groups():
                for atom in atom_group.atoms():
                    b.append(atom.b)
                    occ.append(atom.occ)

    mean_occ = np.mean(occ)
    mean_b = np.mean(b)
    std_b = np.std(b)

    return mean_occ, mean_b, std_b
def write_minima_pdb(input_pdb, output_pdb, csv_name, params):
    """
    Write pdb from the minima in exhaustive search

    Atoms belonging to a ground state get occupancy
    ``(1 - min_occ) / num_altlocs``; atoms belonging to a bound state get
    ``min_occ / num_altlocs``. Both get the B factor converted from the
    minimum u_iso. Bound states are applied after ground states, so an atom
    present in both ends up with the bound-state values.

    Parameters
    ----------
    input_pdb: str
        path to input pdb to take structure from
    output_pdb: str
        path to write structure to
    csv_name: str
        path to exhaustive search csv
    params:
        parameters passed through to get_bound_ground_states

    Returns
    -------
    None
    """
    min_occ, min_u_iso, _ = get_minimum_fofc(csv_name)
    bound_states, ground_states = get_bound_ground_states(input_pdb, params)

    pdb_inp = iotbx.pdb.input(input_pdb)
    hier = pdb_inp.construct_hierarchy()

    # Same value for every atom: convert once instead of once per atom.
    min_b = u_iso_to_b_fac(min_u_iso)

    for chain in hier.only_model().chains():
        for residue_group in chain.residue_groups():
            for atom_group in residue_group.atom_groups():
                for atom in atom_group.atoms():
                    # Each state is (boolean selection, number of altlocs)
                    for ground_state in ground_states:
                        if ground_state[0][atom.i_seq]:
                            atom.set_occ((1 - min_occ) / ground_state[1])
                            atom.set_b(min_b)
                    for bound_state in bound_states:
                        if bound_state[0][atom.i_seq]:
                            atom.set_occ(min_occ / bound_state[1])
                            atom.set_b(min_b)

    # Reuse the symmetry of the input already parsed above rather than
    # reading the file a second time.
    with open(output_pdb, "w") as f:
        f.write(
            hier.as_pdb_string(crystal_symmetry=pdb_inp.crystal_symmetry()))
def residues_near_ligs(pdb, cutoff):
    """
    Find the residues lying within a distance cutoff of any LIG residue.

    Parameters
    ----------
    pdb: str, path
        path to pdb file
    cutoff: float
        angstrom cutoff for distance

    Returns
    -------
    ag_set: set or None
        None when the pdb file does not exist, otherwise a set of tuples
        (ligand chain, ligand resseq,
         neighbour chain, neighbour resname, neighbour resseq)
    """
    if not os.path.exists(pdb):
        return None

    prot_i = hierarchy.input(pdb)
    prot_h = prot_i.hierarchy

    # Partition atom groups into ligand / everything else
    ligands = [grp for grp in prot_h.atom_groups() if grp.resname == "LIG"]
    others = [grp for grp in prot_h.atom_groups() if grp.resname != "LIG"]

    ag_set = set()
    for lig in ligands:
        lig_residue = lig.parent()
        lig_chain = lig_residue.parent().id
        lig_resseq = lig_residue.resseq
        for neighbour in others:
            neighbour_chain = neighbour.parent().parent().id
            close = is_within(
                cutoff,
                neighbour.atoms().extract_xyz(),
                lig.atoms().extract_xyz(),
            )
            if not close:
                continue
            ag_set.add(
                (
                    lig_chain,
                    lig_resseq,
                    neighbour_chain,
                    neighbour.resname,
                    neighbour.parent().resseq,
                )
            )
    return ag_set
def residue_select_hierarchy_from_pdb(pdb_path, residues_select, invert_selection=False):
    """
    Produce hierarchy selection object based on supplied residue list

    Parameters
    ----------
    pdb_path: str
        path to pdb file from which atoms are taken
    residues_select: list
        list of residues in format [[chain,resid],[chain,resid]]:
        [['A', '24'], ['A', '25']]
    invert_selection: bool
        Flag to invert selection to residues that are not
        in residues_select

    Returns
    -------
    new_atoms_hier
        hierarchy holding only the selected atoms
    """
    # read in PDB file from which atoms are to be taken from
    pdb_in = hierarchy.input(file_name=pdb_path)
    sel_cache = pdb_in.hierarchy.atom_selection_cache()

    # One parenthesised clause per residue; the joining keyword is
    # surrounded by spaces so it stays a separate token.
    selection_string = " or ".join(
        "(resid {} and chain {})".format(residue[1], residue[0])
        for residue in residues_select
    )

    # Used to select all atoms but residues_select
    if invert_selection:
        selection_string = "not ({})".format(selection_string)

    new_atoms_sel = sel_cache.selection(selection_string)
    return pdb_in.hierarchy.select(new_atoms_sel)
def exercise_misc():
    """
    Regression check for adptbx.intersection on two pairs of alternate
    water positions.

    Skipped silently when the iotbx module is not available. Expected
    values are the ones asserted by the original test.
    """
    import libtbx.load_env
    if not libtbx.env.has_module("iotbx"):
        return
    from iotbx.pdb import hierarchy

    # Pair 1: 0.5 A apart, mean displacement = 0.4 A
    # Pair 2: 1.5 A apart, mean displacement = 0.6 A
    # PDB records are fixed-column, so the alignment below is significant.
    pdb_in = hierarchy.input(pdb_string="""\
CRYST1   10.000   11.000   12.000  70.00  80.00  90.00 P 1
HETATM    1  O  AHOH A   1       4.000   5.000   3.000  1.00 12.63           O
HETATM    2  O  BHOH A   1       4.500   5.000   3.000  1.00 12.63           O
HETATM    3  O  AHOH A   2       7.000   1.000   6.000  1.00 28.42           O
HETATM    4  O  BHOH A   2       8.500   1.000   6.000  1.00 28.42           O
END""")
    xrs = pdb_in.input.xray_structure_simple()
    sc = xrs.scatterers()

    # Isotropic intersection of the first pair
    delta12 = adptbx.intersection(
        u_1=sc[0].u_iso,
        u_2=sc[1].u_iso,
        site_1=sc[0].site,
        site_2=sc[1].site,
        unit_cell=xrs.unit_cell())

    # The same pair after conversion to anisotropic must agree
    xrs.convert_to_anisotropic()
    delta12_aniso = adptbx.intersection(
        u_1=sc[0].u_star,
        u_2=sc[1].u_star,
        site_1=sc[0].site,
        site_2=sc[1].site,
        unit_cell=xrs.unit_cell())
    # XXX on certain platforms the floating-point precision fails us
    assert approx_equal(delta12_aniso, delta12, eps=0.0000000000001)
    assert approx_equal(delta12, 0.2999, eps=0.0001)

    # Second pair: the asserted value is negative
    delta34 = adptbx.intersection(
        u_1=sc[2].u_star,
        u_2=sc[3].u_star,
        site_1=sc[2].site,
        site_2=sc[3].site,
        unit_cell=xrs.unit_cell())
    assert approx_equal(delta34, -0.300094, eps=0.000001)

    # The xray-structure convenience method must give the identical value
    delta34b = xrs.intersection_of_scatterers(2, 3)
    assert (delta34b == delta34)
def read_occupancy_b(pdb_path, selection):
    """
    Tabulate occupancy and B factor for the atoms matched by a selection.

    Parameters
    ----------
    pdb_path: str
        path to pdb file
    selection: str
        atom selection string applied to the file's hierarchy

    Returns
    -------
    pandas.DataFrame or None
        None when the file does not exist, otherwise one row per atom with
        columns Residue, resseq, altloc, Atom, Occupancy, B_factor
    """
    if not os.path.exists(pdb_path):
        return None

    pdb_in = hierarchy.input(file_name=pdb_path)
    cache = pdb_in.hierarchy.atom_selection_cache()
    picked = pdb_in.hierarchy.select(cache.selection(selection))

    # Walk model -> chain -> residue group -> atom group -> atom
    records = [
        [ag.resname, rg.resseq, ag.altloc, atom.name, atom.occ, atom.b]
        for model in picked.models()
        for chain in model.chains()
        for rg in chain.residue_groups()
        for ag in rg.atom_groups()
        for atom in ag.atoms()
    ]

    column_names = [
        "Residue",
        "resseq",
        "altloc",
        "Atom",
        "Occupancy",
        "B_factor",
    ]
    return pd.DataFrame(records, columns=column_names)
def get_occupancy_groups(pdb, params):
    """
    Calculate occupancy groups given pdb file path.

    Wrapper of giant.structure.restraints.occupancy:
    overlapping_occupancy_groups(), that generates hierarchy
    from pdb file path

    Parameters
    ----------
    pdb: str
        path to pdb file
    params:
        parameter object; ``params.select`` must provide resnames
        (comma separated string), group_dist, overlap_dist,
        complete_groups, exclude_altlocs (comma separated string,
        may be empty) and verbose

    Returns
    -------
    occupancy_groups
        whatever overlapping_occupancy_groups() returns
    """
    logging.info("Gathering occupancy group information from PDB: %s", pdb)
    # print() does not interpolate %-style arguments the way logging does,
    # so the path has to be formatted into the message explicitly.
    print("Gathering occupancy group information from PDB: {}".format(pdb))

    pdb_in = hierarchy.input(pdb)

    resnames = params.select.resnames.split(",")
    logging.info("Looking for ligands with resname {!s}".format(
        " or ".join(resnames)))

    # An empty/unset exclude_altlocs means "exclude nothing"
    if params.select.exclude_altlocs:
        exclude_altlocs = params.select.exclude_altlocs.split(",")
    else:
        exclude_altlocs = []

    occupancy_groups = overlapping_occupancy_groups(
        hierarchy=pdb_in.hierarchy,
        resnames=resnames,
        group_dist=params.select.group_dist,
        overlap_dist=params.select.overlap_dist,
        complete_groups=params.select.complete_groups,
        exclude_altlocs=exclude_altlocs,
        verbose=params.select.verbose,
    )

    return occupancy_groups
def sortOccupancy(PDB):
    '''
    Write PDB files for the atoms whose occupancy is greater than 0.5.

    The atoms are sorted by descending occupancy and written to:
      topOcc_.pdb            all sorted atoms
      topOcc_<i>_.pdb        one file per atom, i counted from 0
      combination<n>_.pdb    one file per pair drawn from the top 5 atoms,
                             n counted from 1
    Nothing is written unless at least two atoms pass the threshold.
    '''
    from iotbx.pdb import hierarchy
    import itertools

    pdb_in = hierarchy.input(PDB)
    obj_pdb = pdb_in.construct_hierarchy()
    selected_atoms = obj_pdb.atom_selection_cache().iselection("occupancy>0.5")

    if len(selected_atoms) <= 1:
        print("occupancy is lower than 0.5")
        return

    # Fetch the atom array once instead of once per selected index
    all_atoms = obj_pdb.atoms()
    sorted_atoms = sorted(
        (all_atoms[i_seq] for i_seq in selected_atoms),
        key=lambda thisatom: thisatom.occ,
        reverse=True)
    symmetry = pdb_in.input.crystal_symmetry()

    atoms2pdb(sorted_atoms).write_pdb_file(file_name="topOcc_.pdb")

    # One PDB per atom of topOcc_.pdb
    for index, atom in enumerate(sorted_atoms):
        atoms2pdb([atom]).write_pdb_file(
            file_name="topOcc_" + str(index) + "_.pdb",
            crystal_symmetry=symmetry,
            append_end=True)

    # One PDB per pair taken from the five highest-occupancy atoms
    pairs = itertools.combinations(sorted_atoms[0:5], 2)
    for counter, pair in enumerate(pairs, 1):
        atoms2pdb(pair).write_pdb_file(
            file_name="combination" + str(counter) + "_.pdb",
            crystal_symmetry=symmetry,
            append_end=True)
def exercise_misc():
    """
    Check adptbx.intersection against reference values for two pairs of
    alternate-conformer waters.

    Returns without testing when iotbx is not installed. The reference
    numbers are those asserted by the original test.
    """
    import libtbx.load_env

    if not libtbx.env.has_module("iotbx"):
        return
    from iotbx.pdb import hierarchy

    # Pair 1: 0.5 A apart, mean displacement = 0.4 A
    # Pair 2: 1.5 A apart, mean displacement = 0.6 A
    # The records must stay column-aligned: PDB is a fixed-width format.
    pdb_in = hierarchy.input(
        pdb_string="""\
CRYST1   10.000   11.000   12.000  70.00  80.00  90.00 P 1
HETATM    1  O  AHOH A   1       4.000   5.000   3.000  1.00 12.63           O
HETATM    2  O  BHOH A   1       4.500   5.000   3.000  1.00 12.63           O
HETATM    3  O  AHOH A   2       7.000   1.000   6.000  1.00 28.42           O
HETATM    4  O  BHOH A   2       8.500   1.000   6.000  1.00 28.42           O
END"""
    )
    xrs = pdb_in.input.xray_structure_simple()
    sc = xrs.scatterers()

    # First pair, isotropic displacement parameters
    delta12 = adptbx.intersection(
        u_1=sc[0].u_iso,
        u_2=sc[1].u_iso,
        site_1=sc[0].site,
        site_2=sc[1].site,
        unit_cell=xrs.unit_cell(),
    )

    # First pair again, after switching the structure to anisotropic
    xrs.convert_to_anisotropic()
    delta12_aniso = adptbx.intersection(
        u_1=sc[0].u_star,
        u_2=sc[1].u_star,
        site_1=sc[0].site,
        site_2=sc[1].site,
        unit_cell=xrs.unit_cell(),
    )
    # XXX on certain platforms the floating-point precision fails us
    assert approx_equal(delta12_aniso, delta12, eps=0.0000000000001)
    assert approx_equal(delta12, 0.2999, eps=0.0001)

    # Second pair: the asserted value is negative
    delta34 = adptbx.intersection(
        u_1=sc[2].u_star,
        u_2=sc[3].u_star,
        site_1=sc[2].site,
        site_2=sc[3].site,
        unit_cell=xrs.unit_cell(),
    )
    assert approx_equal(delta34, -0.300094, eps=0.000001)

    # Structure-level shortcut has to reproduce the value exactly
    delta34b = xrs.intersection_of_scatterers(2, 3)
    assert delta34b == delta34
def read_ligand_occupancy_b(pdb_path, params):
    """
    Extract occupancy and B factor of ligand of interest
    from one PDB file into a dataframe

    Parameters
    ----------
    pdb_path: str
        path to pdb file
    params:
        parameters forwarded to get_pandda_or_any_lig_chain

    Returns
    -------
    pandas.DataFrame or None
        None when no ligand chain is found, otherwise one row per atom
        of that chain with columns Atom, Occupancy, B_factor
    """
    # Get ligand chain that is associated with Event
    pandda_lig_chain = get_pandda_or_any_lig_chain(pdb_path, params)

    # This should be the case when the dataset
    # has not passed through pandda.export
    if pandda_lig_chain is None:
        return None

    # Read in single PDB file
    print(pdb_path)
    pdb_in = hierarchy.input(file_name=pdb_path)
    sel_cache = pdb_in.hierarchy.atom_selection_cache()
    lig_sel = sel_cache.selection("chain {}".format(pandda_lig_chain))
    lig_hierarchy = pdb_in.hierarchy.select(lig_sel)

    # The placeholder is required, otherwise the chain id is dropped
    print("Pandda_lig_chain: {}".format(pandda_lig_chain))
    print_hier_atoms(lig_hierarchy)

    # Get occupancy & B factor of ligand
    lig_occ_b = []
    for model in lig_hierarchy.models():
        for chain in model.chains():
            for rg in chain.residue_groups():
                for ag in rg.atom_groups():
                    for atom in ag.atoms():
                        lig_occ_b.append([atom.name, atom.occ, atom.b])

    occ_b_df = pd.DataFrame(lig_occ_b,
                            columns=["Atom", "Occupancy", "B_factor"])
    return occ_b_df
from iotbx.pdb import hierarchy

# Read a model and print coordinates and B factor of every atom.
pdb_in = hierarchy.input(file_name="6f0o.pdb")
pdb_atoms = pdb_in.hierarchy.atoms()
for i in pdb_atoms:
    # Parenthesised single-argument form runs on Python 2 and Python 3
    print(i.xyz)
    print(i.b)

xray_structure = pdb_in.input.xray_structure_simple()

# The same boolean selection is applied to three views of the model:
# the atom array, the xray structure and the hierarchy.
sel_cache = pdb_in.hierarchy.atom_selection_cache()
c_alpha_sel = sel_cache.selection("name ca")  # XXX not case sensitive!
c_alpha_atoms = pdb_atoms.select(c_alpha_sel)
c_alpha_xray_structure = xray_structure.select(c_alpha_sel)
c_alpha_hierarchy = pdb_in.hierarchy.select(c_alpha_sel)
# for residue_group in chain.residue_groups(): # print(int(residue_group.resid()), # int(residue_group.resseq), # chain.id, # copy_chain.id) # print() # copy_chain.remove_residue_group(int(residue_group.resid())) for residue_group in chain.residue_groups(): if int(residue_group.resseq) < min(loop_resid): new_chain.append_residue_group(residue_group.detached_copy()) for residue_group in chain.residue_groups(): if int(residue_group.resseq) in loop_resid: new_chain.append_residue_group(residue_group.detached_copy()) for residue_group in chain.residue_groups(): if int(residue_group.resseq) > max(loop_resid): new_chain.append_residue_group(residue_group.detached_copy()) multiple_loop_hier_copy.only_model().append_chain(new_chain) multiple_loop_hier_copy.reset_i_seq_if_necessary() base_pdb_in = hierarchy.input(base_pdb) f = open(os.path.join(path,"alt_multiple_loop.pdb"), "w+") f.write(multiple_loop_hier_copy.as_pdb_string( atoms_reset_serial_first_value = 1, crystal_symmetry=base_pdb_in.input.crystal_symmetry())) f.close()
def update_from_pdb(pdb_df):
    """
    Find occupancy and B factors of LIG residues for each pdb in a DataFrame

    Carries out cctbx.iotbx dependent searching of pdb file.
    Rows whose pdb contains no LIG residue are dropped.

    Parameters
    ----------
    pdb_df: pandas.DataFrame
        each row must provide pdb_latest (path to pdb file) together with
        the columns used for grouping below: crystal_name, mtz_latest
        and refine_log

    Returns
    -------
    pandas.DataFrame:
        one row per (resseq, crystal_name, pdb_latest, mtz_latest,
        refine_log, chain) with occupancy summed over altlocs and
        B_mean / B_std averaged over altlocs
    """
    # loop over rows/ residues
    rows = []
    for _, row in pdb_df.iterrows():
        # read into iotbx.hierarchy
        pdb_in = hierarchy.input(file_name=row.pdb_latest)
        # read into iotbx.selection cache
        sel_cache = pdb_in.hierarchy.atom_selection_cache()
        print(row.pdb_latest)

        # Select the ligand from main hierarchy
        hier = pdb_in.hierarchy.select(sel_cache.selection("resname LIG"))

        # No ligand in this file: skip it. When several models are
        # present only the first one is used.
        if hier.models_size() == 0:
            continue
        model = hier.models()[0]

        for chain in model.chains():
            for residue_group in chain.residue_groups():
                for atom_group in residue_group.atom_groups():
                    # Get B factor / occupancy information on the residue
                    # by looking at individual atoms
                    atoms = atom_group.atoms()
                    b = [atom.b for atom in atoms]
                    occ = [atom.occ for atom in atoms]

                    # copy the row so that the append doesn't
                    # end up appending a series of pointers
                    # to the same object
                    copy_row = row.copy(deep=True)
                    copy_row["chain"] = chain.id
                    copy_row["resseq"] = residue_group.resseq
                    copy_row["altloc"] = atom_group.altloc
                    copy_row["occupancy"] = np.mean(occ)
                    copy_row["B_mean"] = np.mean(b)
                    copy_row["B_std"] = np.std(b)
                    rows.append(copy_row)

    # Append rows
    pdb_df = pd.concat(rows, axis=1).T

    # As series are single datatype, one should not work row by row:
    # it causes the whole dataframe to be of object datatype.
    # This is a poor quality fix for working row by row
    for column in ("occupancy", "B_mean", "B_std"):
        pdb_df[column] = pdb_df[column].astype(float)

    # Aggregation can combine rows with different methods.
    # here we sum occupancy across altloc
    # and average the other quantities for the resseq
    pdb_df = pdb_df.groupby(
        [
            "resseq",
            "crystal_name",
            "pdb_latest",
            "mtz_latest",
            "refine_log",
            "chain"
        ],
        as_index=False,
    ).agg({
        "occupancy": "sum",
        "B_mean": "mean",
        "B_std": "mean"
    })

    return pdb_df
except ValueError: template = "data.lat" else: template = args.pop(idx).split("=")[1] # Unit cell file try: idx = [(a.find("cell") == 0 or a.find("cell_file") == 0) for a in args].index(True) except ValueError: cell_file = "cell" else: cell_file = args.pop(idx).split("=")[1] pdb_in = hierarchy.input(file_name=pdb_file) xrs = pdb_in.input.xray_structure_simple() if (bfacs == "zero"): xrs.convert_to_isotropic() xrs.set_b_iso(0.0) if (bfacs == "iso"): xrs.convert_to_isotropic() fcalc = xrs.structure_factors(d_min=1.0).f_calc() fc_square = fcalc.as_intensity_array() fc_square_p1 = fc_square.expand_to_p1() f = open("tmp.hkl", 'w')
avg = sum(times) / (5 + runs) stdev = np.std(times) print(name + "\t" + str(avg) + "\t" + str(stdev) + "\t" + str(5 + runs)) def time_function_multiple(fn, subjects, global_name): for (name, subject) in subjects: time_function(fn, subject, global_name + "\t" + name) names = [ ("small", "example-pdbs/1ubq.pdb"), ("medium", "example-pdbs/1yyf.pdb"), ("big", "example-pdbs/pTLS-6484.pdb"), ] proteins = [ ("small", Hierarchy.input(file_name="example-pdbs/1ubq.pdb")), ("medium", Hierarchy.input(file_name="example-pdbs/1yyf.pdb")), ("big", Hierarchy.input(file_name="example-pdbs/pTLS-6484.pdb")), ] time_function_multiple(open_pdb, names, "open") time_function_multiple(transformation, proteins, "transformation") time_function_multiple(remove, proteins, "remove") time_function_multiple(iteration, proteins, "iteration") time_function_multiple(iteration_build_in, proteins, "iteration_build_in") time_function_multiple(renumber, proteins, "renumber") time_function_multiple(clone, proteins, "clone") time_function_multiple(save, proteins, "save")
def update_from_pdb(pdb_df):
    """
    Find residue name, B factors given DataFrame with
    chain, residue id and altloc

    Carries out cctbx.iotbx dependent searching of pdb file.
    Rows whose selection matches nothing in the pdb are dropped.

    Parameters
    ----------
    pdb_df: pandas.DataFrame
        must provide pdb_latest (path to pdb file; only the first unique
        value is read). Rows providing chain, resid and alte are selected
        by those; rows lacking any of them fall back to "resname LIG"

    Returns
    -------
    pandas.DataFrame:
        the surviving rows with resname, B_mean and B_std added
    """
    # Load pdb path from DataFrame
    # need to select first unique value as there will be duplicates
    # of name for every residue
    pdb = pdb_df.pdb_latest.unique()[0]

    # read into iotbx.hierarchy
    pdb_in = hierarchy.input(file_name=pdb)
    # read into iotbx.selection cache
    sel_cache = pdb_in.hierarchy.atom_selection_cache()

    # loop over rows/ residues
    rows = []
    for _, row in pdb_df.iterrows():
        try:
            # Get selection object which corresponds to supplied chain
            # residue id and altloc.
            # Type conversion in resid needed otherwise nothing is selected
            sel = sel_cache.selection(
                "chain {} and resid {} and altloc {}".format(
                    row.chain, str(int(row.resid)), row.alte))
        except AttributeError:
            # Use ligand LIG instead of chain resid and alte
            # This doesn't work at the next step, a large number
            # are being dropped under "Likely dummy atoms"
            sel = sel_cache.selection("resname LIG")

        # Select that residue from main hierarchy
        hier = pdb_in.hierarchy.select(sel)

        # Nothing selected: drop the row. When several models are
        # present only the first one is used.
        if hier.models_size() == 0:
            continue
        model = hier.models()[0]

        resnames = []
        for chain in model.chains():
            for residue_group in chain.residue_groups():
                for atom_group in residue_group.atom_groups():
                    resnames.append(atom_group.resname)
                    # Get B factor information on residue by looking
                    # at individual atoms
                    b = [atom.b for atom in atom_group.atoms()]
                    mean_b = np.mean(b)
                    std_b = np.std(b)

        # Append information to row.
        # NOTE(review): when a selection matches several atom groups the
        # name comes from the first and the B statistics from the last;
        # the nesting of the collapsed original could not be confirmed.
        row["resname"] = resnames[0]
        row["B_mean"] = mean_b
        row["B_std"] = std_b
        rows.append(row)

    # Append rows
    pdb_df = pd.concat(rows, axis=1)

    # Transpose to get in same orientation as input
    return pdb_df.T
def copy_b(pdb, ref_pdb, out_pdb, chain, resid, altloc=""):
    """
    Copy b factor of a single residue to another pdb file

    B factors are matched by atom name: every selected atom of ``pdb``
    receives the B factor of the same-named atom in ``ref_pdb``.

    Parameters
    ----------
    pdb: str
        path to pdb file whose B factors are overwritten
    ref_pdb: str
        path to reference pdb file the B factors are taken from
    out_pdb: str
        path the modified copy of ``pdb`` is written to; missing parent
        directories are created
    chain: str
        chain of interest
    resid: str
        residue of interest
    altloc: str
        altloc of interest; empty means no altloc restriction

    Returns
    -------
    None

    Raises
    ------
    KeyError
        if a selected atom of ``pdb`` has no same-named atom in the
        reference selection
    """
    # Same residue is selected in both files
    if altloc == "":
        selection_string = "chain {} resid {}".format(chain, resid)
    else:
        selection_string = "chain {} resid {} altloc {}".format(
            chain, resid, altloc)

    def select_residue(pdb_input):
        # Apply the shared selection string to one parsed pdb
        cache = pdb_input.hierarchy.atom_selection_cache()
        return pdb_input.hierarchy.select(cache.selection(selection_string))

    # Collect reference B factors keyed by atom name
    ref_pdb_in = hierarchy.input(file_name=ref_pdb)
    ref_hier = select_residue(ref_pdb_in)
    ref_lig = {}
    for ref_chain in ref_hier.only_model().chains():
        for residue_group in ref_chain.residue_groups():
            for atom_group in residue_group.atom_groups():
                for atom in atom_group.atoms():
                    ref_lig[atom.name] = atom.b

    # Overwrite the B factors of the matching residue in the target.
    # The full hierarchy is what gets written out below, as in the
    # original code.
    pdb_in = hierarchy.input(file_name=pdb)
    hier = select_residue(pdb_in)
    for current_chain in hier.only_model().chains():
        for residue_group in current_chain.residue_groups():
            for atom_group in residue_group.atom_groups():
                for atom in atom_group.atoms():
                    atom.b = ref_lig[atom.name]

    # A bare file name has an empty dirname, which makedirs rejects
    out_dir = os.path.dirname(out_pdb)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with open(out_pdb, "w") as out:
        out.write(
            pdb_in.hierarchy.as_pdb_string(
                crystal_symmetry=pdb_in.input.crystal_symmetry()))
def copy_atoms(copy_params):
    """
    Copy atoms from one pdb file to many, then refine.

    Copy dimple pdb, mtz and cif with cys bond
    Copy ligand atoms from existing coordinates
    Run giant.merge_conformations to generate a multi state model
    Copy link records suitable for both conformers of the ligand
    Run quick refine to generate refined ligand

    Parameters
    ----------
    copy_params:
        parameter object with ``input``, ``output`` and ``settings``
        scopes, read as follows:
        input: base_pdb, atoms_new, atoms_remove, xtal_list, prefix,
        start_xtal_number, end_xtal_number, path, pdb_style, mtz_style,
        cif, link_record_list, extra_params
        output: out_dir, pdb, refine_pdb, multi_state_model_pdb
        settings: overwrite, program, ccp4_path, param_file, qsub

    Returns
    -------
    None

    Notes
    -----
    The working directory is changed to each crystal's output directory,
    and external programs are launched through the shell.
    """
    out_dir = copy_params.output.out_dir

    # generate output directory if it doesn't exist
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # read in PDB file from which atoms are to be taken from (ground structure)
    pdb_in = hierarchy.input(file_name=copy_params.input.base_pdb)
    sel_cache = pdb_in.hierarchy.atom_selection_cache()

    # produce a hierarchy with atoms to be copied
    selection_string = " or ".join(
        "(resid {} and chain {})".format(atom_new[1], atom_new[0])
        for atom_new in copy_params.input.atoms_new
    )
    new_atoms_hier = pdb_in.hierarchy.select(
        sel_cache.selection(selection_string))

    # Produce a selection string to determine which atoms are removed
    not_selection_string = None
    if copy_params.input.atoms_remove is not None:
        remove_string = " or ".join(
            "(resid {} and chain {})".format(atom_remove[1], atom_remove[0])
            for atom_remove in copy_params.input.atoms_remove
        )
        not_selection_string = "not ({})".format(remove_string)

    # Define xtals to loop over. Work on a copy so that the list held by
    # copy_params does not grow every time this function is called.
    xtals = list(copy_params.input.xtal_list or [])
    for num in range(
        copy_params.input.start_xtal_number,
        copy_params.input.end_xtal_number + 1
    ):
        xtals.append(copy_params.input.prefix + "{0:0>4}".format(num))

    # Loop over all xtals
    for xtal_name in xtals:
        xtal_out_dir = os.path.join(out_dir, xtal_name)
        input_pdb = os.path.join(
            copy_params.input.path, xtal_name, copy_params.input.pdb_style)
        input_mtz = os.path.join(
            copy_params.input.path, xtal_name, copy_params.input.mtz_style)
        copied_pdb = os.path.join(xtal_out_dir, copy_params.input.pdb_style)
        copied_mtz = os.path.join(xtal_out_dir, copy_params.input.mtz_style)
        changed_pdb = os.path.join(xtal_out_dir, copy_params.output.pdb)
        multi_state_pdb = os.path.join(
            xtal_out_dir, copy_params.output.multi_state_model_pdb)

        # For quick rerun
        already_refined = os.path.exists(
            os.path.join(xtal_out_dir, copy_params.output.refine_pdb))
        if already_refined and not copy_params.settings.overwrite:
            print("Skipping {}, as attempted".format(xtal_name))
            continue

        # Run only if sufficient input data
        if not os.path.exists(input_pdb):
            print("pdb does not exist: {}".format(input_pdb))
            continue

        print("Trying to run {}".format(xtal_name))

        pdb_in_refine = hierarchy.input(file_name=input_pdb)
        acceptor_hierarchy = pdb_in_refine.construct_hierarchy()

        # remove atoms from xtal
        if not_selection_string is not None:
            refine_sel_cache = pdb_in_refine.hierarchy.atom_selection_cache()
            remove_atoms_sel = refine_sel_cache.selection(not_selection_string)
            working_hier = acceptor_hierarchy.select(remove_atoms_sel)
        else:
            working_hier = acceptor_hierarchy

        # Add atoms from base_pdb
        acceptor_hier = transfer_residue_groups_from_other(
            working_hier, new_atoms_hier, in_place=False, verbose=False
        )

        # Generate output xtal directories
        if not os.path.exists(xtal_out_dir):
            os.mkdir(xtal_out_dir)

        # Write output pdb with changed atoms
        with open(changed_pdb, "w+") as f:
            f.write(
                acceptor_hier.as_pdb_string(
                    crystal_symmetry=pdb_in_refine.input.crystal_symmetry()
                )
            )

        # NOTE(review): after this chdir any relative out_dir / input path
        # resolves against the crystal directory -- confirm callers pass
        # absolute paths.
        os.chdir(xtal_out_dir)

        # Copy the input pdb to output directory
        os.system("cp {} {}".format(input_pdb, copied_pdb))

        # Copy the input cif to output_directory
        os.system(
            "cp {} {}".format(
                copy_params.input.cif,
                os.path.join(
                    xtal_out_dir, os.path.basename(copy_params.input.cif)),
            )
        )

        # Copy the input mtz to output directory
        os.system("cp -rL {} {}".format(input_mtz, copied_mtz))

        # Run giant.merge_conformations
        os.system(
            "giant.merge_conformations major={} minor={}".format(
                copied_pdb, changed_pdb)
        )

        # Add link record strings into multimodel pdb file, prior to refinement
        if copy_params.input.link_record_list is not None:
            with open(multi_state_pdb, "r") as original:
                multi_model = original.read()
            with open(multi_state_pdb, "w") as modified:
                for link_record in copy_params.input.link_record_list:
                    modified.write(link_record)
                modified.write(multi_model)

        # Add extra params
        if copy_params.input.extra_params is not None:
            params_name = "multi-state-restraints.{}.params".format(
                copy_params.settings.program)
            with open(params_name, "a+") as param_file:
                # Rewind so read() returns the existing content; in append
                # mode writes still land at the end of the file.
                param_file.seek(0)
                if copy_params.input.extra_params not in param_file.read():
                    param_file.write(copy_params.input.extra_params)

        if copy_params.settings.program == "phenix":
            cmds = "module load phenix\n"
        elif copy_params.settings.program == "buster":
            cmds = "module load buster\n"
        else:
            cmds = "\n"

        cmds += "source {}\n".format(copy_params.settings.ccp4_path)

        # Run giant.quick_refine
        cmds += "giant.quick_refine {} {} {} params={} program={}\n".format(
            multi_state_pdb,
            copied_mtz,
            os.path.join(xtal_out_dir, copy_params.input.cif),
            os.path.join(xtal_out_dir, copy_params.settings.param_file),
            copy_params.settings.program,
        )
        cmds += "giant.split_conformations refine.pdb"

        if copy_params.settings.qsub:
            # Submit through the queue: write the commands to a script first
            script_path = os.path.join(
                xtal_out_dir, "{}_quick_refine.sh".format(xtal_name))
            with open(script_path, "w") as f:
                f.write(cmds)
            os.system("qsub {}".format(script_path))
        else:
            os.system(cmds)
from scitbx.array_family import flex

if __name__ == "__main__":
    """ Copy a water atom into the centroid of ligand. """

    # Parse the paths to the bound, ground and output pdb files
    parser = argparse.ArgumentParser("copy water atom to ligand centroid")
    parser.add_argument("--bound_pdb")
    parser.add_argument("--ground_pdb")
    parser.add_argument("--output_pdb")
    param = parser.parse_args()

    # Get centroid of ligand (resname LIG) from bound pdb:
    # the mean of the ligand atoms' coordinates
    bound_pdb_in = hierarchy.input(file_name=param.bound_pdb)
    bound_sel_cache = bound_pdb_in.hierarchy.atom_selection_cache()
    selection_string = "resname LIG"
    lig_sel = bound_sel_cache.selection(selection_string)
    lig_hier = bound_pdb_in.hierarchy.select(lig_sel)
    lig_centroid = lig_hier.atoms().extract_xyz().mean()

    # Read in ground state pdb
    ground_pdb_in = hierarchy.input(file_name=param.ground_pdb)
    ground_sel_cache = ground_pdb_in.hierarchy.atom_selection_cache()

    # Get water selection and the residue number of the last water atom.
    # NOTE(review): the waters are selected from the *bound* structure and
    # ground_sel_cache is not used in the visible code -- confirm whether
    # the ground-state structure was intended here.
    wat_sel = bound_sel_cache.selection("water")
    wat_hier = bound_pdb_in.hierarchy.select(wat_sel)
    wat_resseq = wat_hier.atoms()[-1].parent().parent().resseq
def open_pdb(filename):
    """Parse a PDB file and return the resulting input object.

    The original implementation called ``Hierarchy.input`` but discarded
    the result, so callers always received ``None``. The parsed object is
    now returned; callers that ignored the return value are unaffected.

    Args:
        filename: Path of the PDB file, forwarded as ``file_name``.

    Returns:
        Whatever ``Hierarchy.input(file_name=filename)`` returns.
    """
    return Hierarchy.input(file_name=filename)
def run(self, args, command_name, out=sys.stdout):
  """Score each model of an ensemble PDB file and write the results.

  Processes the command-line arguments, sets up ``self.log`` (stdout plus
  a ``<pdb name>_pensemble.log`` file, with ``sys.stderr`` redirected to
  it) and then follows one of two paths:

  * ``assign_sigma_from_map`` with ``ensemble_sigma_map_input`` set: the
    first Miller array of the given MTZ file is passed to
    ``get_map_sigma`` for every model and the result is written to
    ``<pdb name>_vs_<map name>_pensemble.pdb``.
  * otherwise: per-model Fc map coefficients and their average are
    computed through one ``fmodel`` object, each model is scored by the
    ratio of the two ``get_map_sigma`` results (stored as occupancies)
    and written to ``<pdb name>_pensemble.pdb``; optionally the models
    are re-ranked by their summed negative log probability and written
    to ``nll_ordered_<pdb name>_pensemble.pdb``.

  NOTE(review): this block was reconstructed from a source whose
  indentation was lost; nesting marked "TODO confirm" below should be
  checked against the original file.

  Args:
    args: Command-line arguments; exactly one PDB file is accepted.
    command_name: Program name used in the usage text and log header.
    out: Not referenced in this method body.

  Raises:
    Sorry: if the number of PDB files is not one, if
      ``fobs_vs_fcalc_post_nll`` is set without a reflection file, if
      the map input is not an MTZ file, or if neither reflection data
      nor ``fcalc_high_resolution`` is available.
  """
  # command_line and cif_file are assigned but not read again below.
  command_line = (iotbx_option_parser(
    usage="%s [options]" % command_name,
    description='Example: %s data.mtz data.mtz ref_model.pdb'%command_name)
    .option(None, "--show_defaults",
      action="store_true",
      help="Show list of parameters.")
    ).process(args=args)

  cif_file = None
  processed_args = utils.process_command_line_args(
    args = args,
    log = sys.stdout,
    master_params = master_phil)
  params = processed_args.params
  # Fall back to the master parameters when none were processed.
  if(params is None): params = master_phil
  self.params = params.extract().ensemble_probability
  pdb_file_names = processed_args.pdb_file_names
  if len(pdb_file_names) != 1 :
    raise Sorry("Only one PDB structure may be used")
  pdb_file = file_reader.any_file(pdb_file_names[0])

  # Logging: stdout plus a buffer that is swapped for the log file below.
  self.log = multi_out()
  self.log.register(label="stdout", file_object=sys.stdout)
  self.log.register(
    label="log_buffer",
    file_object=StringIO(),
    atexit_send_to=None)
  sys.stderr = self.log
  # The log file is named after the PDB file and is not closed in this method.
  log_file = open(pdb_file_names[0].split('/')[-1].replace('.pdb','') + '_pensemble.log', "w")

  self.log.replace_stringio(
    old_label="log_buffer",
    new_label="log",
    new_file_object=log_file)
  utils.print_header(command_name, out = self.log)
  params.show(out = self.log)

  # Reflection data are optional unless fobs_vs_fcalc_post_nll is requested.
  f_obs = None
  r_free_flags = None
  reflection_files = processed_args.reflection_files

  if self.params.fobs_vs_fcalc_post_nll:
    if len(reflection_files) == 0:
      raise Sorry("Fobs from input MTZ required for fobs_vs_fcalc_post_nll")

  if len(reflection_files) > 0:
    crystal_symmetry = processed_args.crystal_symmetry
    print >> self.log, 'Reflection file : ', processed_args.reflection_file_names[0]
    utils.print_header("Model and data statistics", out = self.log)
    rfs = reflection_file_server(
      crystal_symmetry = crystal_symmetry,
      reflection_files = processed_args.reflection_files,
      log = self.log)

    parameters = utils.data_and_flags_master_params().extract()
    determine_data_and_flags_result = utils.determine_data_and_flags(
      reflection_file_server = rfs,
      parameters = parameters,
      data_parameter_scope = "refinement.input.xray_data",
      flags_parameter_scope = "refinement.input.xray_data.r_free_flags",
      data_description = "X-ray data",
      keep_going = True,
      log = self.log)
    f_obs = determine_data_and_flags_result.f_obs
    # number_of_reflections and test_flag_value are not read again below.
    number_of_reflections = f_obs.indices().size()
    r_free_flags = determine_data_and_flags_result.r_free_flags
    test_flag_value = determine_data_and_flags_result.test_flag_value
    # Without free flags, use an all-False flag array of matching size.
    if(r_free_flags is None):
      r_free_flags=f_obs.array(data=flex.bool(f_obs.data().size(), False))

  # process PDB
  pdb_file.assert_file_type("pdb")
  # One hierarchy for the whole file, one xray structure per model.
  pdb_in = hierarchy.input(file_name=pdb_file.file_name)
  ens_pdb_hierarchy = pdb_in.construct_hierarchy()
  ens_pdb_hierarchy.atoms().reset_i_seq()
  ens_pdb_xrs_s = pdb_in.input.xray_structures_simple()
  number_structures = len(ens_pdb_xrs_s)
  print >> self.log, 'Number of structure in ensemble : ', number_structures

  # Calculate sigmas from input map only
  if self.params.assign_sigma_from_map and self.params.ensemble_sigma_map_input is not None:
    # process MTZ
    input_file = file_reader.any_file(self.params.ensemble_sigma_map_input)
    if input_file.file_type == "hkl" :
      if input_file.file_object.file_type() != "ccp4_mtz" :
        raise Sorry("Only MTZ format accepted for map input")
      else:
        mtz_file = input_file
    else:
      raise Sorry("Only MTZ format accepted for map input")
    miller_arrays = mtz_file.file_server.miller_arrays
    # Only the first Miller array of the file is used.
    map_coeffs_1 = miller_arrays[0]
    #
    xrs_list = []
    for n, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
      # get sigma levels from ensemble fc for each structure
      xrs = get_map_sigma(ens_pdb_hierarchy = ens_pdb_hierarchy,
                          ens_pdb_xrs = ens_pdb_xrs,
                          map_coeffs_1 = map_coeffs_1,
                          residue_detail = self.params.residue_detail,
                          ignore_hd = self.params.ignore_hd,
                          log = self.log)
      xrs_list.append(xrs)
    # write ensemble pdb file, occupancies as sigma level
    filename = pdb_file_names[0].split('/')[-1].replace('.pdb','') + '_vs_' + self.params.ensemble_sigma_map_input.replace('.mtz','') + '_pensemble.pdb'
    write_ensemble_pdb(filename = filename,
                       xrs_list = xrs_list,
                       ens_pdb_hierarchy = ens_pdb_hierarchy
                       )

  # Do full analysis vs Fobs
  else:
    model_map_coeffs = []
    fmodel = None
    # Get <fcalc>: accumulate Fcalc over all models through one fmodel object.
    for model, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
      ens_pdb_xrs.set_occupancies(1.0)
      if model == 0:
        # If mtz not supplied get fobs from xray structure...
        # Use input Fobs for scoring against nll
        if self.params.fobs_vs_fcalc_post_nll:
          dummy_fobs = f_obs
        else:
          if f_obs == None:
            if self.params.fcalc_high_resolution == None:
              raise Sorry("Please supply high resolution limit or input mtz file.")
            dummy_dmin = self.params.fcalc_high_resolution
            dummy_dmax = self.params.fcalc_low_resolution
          else:
            print >> self.log, 'Supplied mtz used to determine high and low resolution cuttoffs'
            dummy_dmax, dummy_dmin = f_obs.d_max_min()
          # Amplitudes calculated from the first model stand in for Fobs;
          # only dummy_dmin is used, dummy_dmax is not read again.
          dummy_fobs = abs(ens_pdb_xrs.structure_factors(d_min = dummy_dmin).f_calc())
          dummy_fobs.set_observation_type_xray_amplitude()
          # If mtz supplied, free flags are overwritten to prevent array size error
          r_free_flags = dummy_fobs.array(data=flex.bool(dummy_fobs.data().size(),False))
        #
        fmodel = utils.fmodel_simple(
          scattering_table = "wk1995",
          xray_structures = [ens_pdb_xrs],
          f_obs = dummy_fobs,
          target_name = 'ls',
          bulk_solvent_and_scaling = False,
          r_free_flags = r_free_flags
          )
        # Zero-valued array with the Fcalc indices, used later as a template.
        f_calc_ave = fmodel.f_calc().array(data = fmodel.f_calc().data()*0).deep_copy()
        # XXX Important to ensure scale is identical for each model and <model>
        # NOTE(review): this assigns an attribute rather than calling a method.
        fmodel.set_scale_switch = 1.0
        f_calc_ave_total = fmodel.f_calc().data().deep_copy()
      else:
        fmodel.update_xray_structure(xray_structure = ens_pdb_xrs,
                                     update_f_calc = True,
                                     update_f_mask = False)
        f_calc_ave_total += fmodel.f_calc().data().deep_copy()
      print >> self.log, 'Model :', model+1
      print >> self.log, "\nStructure vs real Fobs (no bulk solvent or scaling)"
      print >> self.log, 'Rwork : %5.4f '%fmodel.r_work()
      print >> self.log, 'Rfree : %5.4f '%fmodel.r_free()
      print >> self.log, 'K1 : %5.4f '%fmodel.scale_k1()
      # Keep this model's Fc map coefficients for the second pass below.
      fcalc_edm = fmodel.electron_density_map()
      fcalc_map_coeffs = fcalc_edm.map_coefficients(map_type = 'Fc')
      fcalc_mtz_dataset = fcalc_map_coeffs.as_mtz_dataset(column_root_label ='Fc')
      if self.params.output_model_and_model_ave_mtz:
        fcalc_mtz_dataset.mtz_object().write(file_name = str(model+1)+"_Fc.mtz")
      model_map_coeffs.append(fcalc_map_coeffs.deep_copy())

    # Replace Fcalc with the average over all models.
    fmodel.update(f_calc = f_calc_ave.array(f_calc_ave_total / number_structures))
    print >> self.log, "\nEnsemble vs real Fobs (no bulk solvent or scaling)"
    print >> self.log, 'Rwork : %5.4f '%fmodel.r_work()
    print >> self.log, 'Rfree : %5.4f '%fmodel.r_free()
    print >> self.log, 'K1 : %5.4f '%fmodel.scale_k1()

    # Get <Fcalc> map
    fcalc_ave_edm = fmodel.electron_density_map()
    fcalc_ave_map_coeffs = fcalc_ave_edm.map_coefficients(map_type = 'Fc').deep_copy()
    fcalc_ave_mtz_dataset = fcalc_ave_map_coeffs.as_mtz_dataset(column_root_label ='Fc')
    if self.params.output_model_and_model_ave_mtz:
      fcalc_ave_mtz_dataset.mtz_object().write(file_name = "aveFc.mtz")
    # From here on fcalc_ave_map_coeffs holds the FFT map, not coefficients.
    fcalc_ave_map_coeffs = fcalc_ave_map_coeffs.fft_map()
    fcalc_ave_map_coeffs.apply_volume_scaling()
    fcalc_ave_map_data = fcalc_ave_map_coeffs.real_map_unpadded()
    fcalc_ave_map_stats = maptbx.statistics(fcalc_ave_map_data)

    print >> self.log, "<Fcalc> Map Stats :"
    fcalc_ave_map_stats.show_summary(f = self.log)
    # offset is not read again below.
    offset = fcalc_ave_map_stats.min()
    model_neg_ll = []

    number_previous_scatters = 0

    # Run through structure list again and get probability
    xrs_list = []
    for model, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
      if self.params.verbose:
        print >> self.log, '\n\nModel : ', model+1
      # Get model atom sigmas vs Fcalc
      fcalc_map = model_map_coeffs[model].fft_map()
      fcalc_map.apply_volume_scaling()
      fcalc_map_data = fcalc_map.real_map_unpadded()
      fcalc_map_stats = maptbx.statistics(fcalc_map_data)
      # NOTE(review): show_summary is assumed to belong to the verbose
      # block together with its header line -- TODO confirm.
      if self.params.verbose:
        print >> self.log, "Fcalc map stats :"
        fcalc_map_stats.show_summary(f = self.log)
      xrs = get_map_sigma(ens_pdb_hierarchy = ens_pdb_hierarchy,
                          ens_pdb_xrs = ens_pdb_xrs,
                          fft_map_1 = fcalc_map,
                          model_i = model,
                          residue_detail = self.params.residue_detail,
                          ignore_hd = self.params.ignore_hd,
                          number_previous_scatters = number_previous_scatters,
                          log = self.log)
      fcalc_sigmas = xrs.scatterers().extract_occupancies()
      del fcalc_map
      # Get model atom sigmas vs <Fcalc>
      xrs = get_map_sigma(ens_pdb_hierarchy = ens_pdb_hierarchy,
                          ens_pdb_xrs = ens_pdb_xrs,
                          fft_map_1 = fcalc_ave_map_coeffs,
                          model_i = model,
                          residue_detail = self.params.residue_detail,
                          ignore_hd = self.params.ignore_hd,
                          number_previous_scatters = number_previous_scatters,
                          log = self.log)

      ### For testing other residue averaging options
      #print xrs.residue_selections

      fcalc_ave_sigmas = xrs.scatterers().extract_occupancies()
      # Probability of model given <model>
      prob = fcalc_ave_sigmas / fcalc_sigmas
      # XXX debug option
      if False:
        for n,p in enumerate(prob):
          print >> self.log, ' {0:5d} {1:5.3f}'.format(n,p)
      # Set probability between 0 and 1: values <= 0 become 0.001 and
      # values > 1 become 1.0.
      # XXX Make Histogram / more stats
      prob_lss_zero = flex.bool(prob <= 0)
      prob_grt_one = flex.bool(prob > 1)
      prob.set_selected(prob_lss_zero, 0.001)
      prob.set_selected(prob_grt_one, 1.0)
      xrs.set_occupancies(prob)
      xrs_list.append(xrs)
      # Summed negative log probability, paired with the model index so the
      # list can be sorted later.
      sum_neg_ll = sum(-flex.log(prob))
      model_neg_ll.append((sum_neg_ll, model))
      if self.params.verbose:
        print >> self.log, 'Model probability stats :'
        print >> self.log, prob.min_max_mean().show()
        print >> self.log, ' Count < 0.0 : ', prob_lss_zero.count(True)
        print >> self.log, ' Count > 1.0 : ', prob_grt_one.count(True)

      # For averaging by residue
      number_previous_scatters += ens_pdb_xrs.sites_cart().size()

    # write ensemble pdb file, occupancies as sigma level
    write_ensemble_pdb(filename = pdb_file_names[0].split('/')[-1].replace('.pdb','') + '_pensemble.pdb',
                       xrs_list = xrs_list,
                       ens_pdb_hierarchy = ens_pdb_hierarchy
                       )

    # XXX Test ordering models by nll
    # XXX Test removing nth percentile atoms
    if self.params.sort_ensemble_by_nll_score or self.params.fobs_vs_fcalc_post_nll:
      for percentile in [1.0,0.975,0.95,0.9,0.8,0.6,0.2]:
        model_neg_ll = sorted(model_neg_ll)
        f_calc_ave_total_reordered = None
        print_list = []
        for i_neg_ll in model_neg_ll:
          xrs = xrs_list[i_neg_ll[1]]
          nll_occ = xrs.scatterers().extract_occupancies()

          # Set q=0 nth percentile atoms: atoms whose value is below the
          # cutoff get occupancy 0.0, all others 1.0.
          sorted_nll_occ = sorted(nll_occ, reverse=True)
          number_atoms = len(sorted_nll_occ)
          percentile_prob_cutoff = sorted_nll_occ[int(number_atoms * percentile)-1]
          cutoff_selections = flex.bool(nll_occ < percentile_prob_cutoff)
          cutoff_nll_occ = flex.double(nll_occ.size(), 1.0).set_selected(cutoff_selections, 0.0)
          #XXX Debug
          if False:
            print '\nDebug'
            for x in xrange(len(cutoff_selections)):
              print cutoff_selections[x], nll_occ[x], cutoff_nll_occ[x]
            print percentile
            print percentile_prob_cutoff
            print cutoff_selections.count(True)
            print cutoff_selections.size()
            print cutoff_nll_occ.count(0.0)
            print 'Count q = 1 : ', cutoff_nll_occ.count(1.0)
            print 'Count scatterers size : ', cutoff_nll_occ.size()

          xrs.set_occupancies(cutoff_nll_occ)
          fmodel.update_xray_structure(xray_structure = xrs,
                                       update_f_calc = True,
                                       update_f_mask = True)

          # Running sums of Fcalc and Fmask over the models processed so far.
          if f_calc_ave_total_reordered == None:
            f_calc_ave_total_reordered = fmodel.f_calc().data().deep_copy()
            f_mask_ave_total_reordered = fmodel.f_masks()[0].data().deep_copy()
            cntr = 1
          else:
            f_calc_ave_total_reordered += fmodel.f_calc().data().deep_copy()
            f_mask_ave_total_reordered += fmodel.f_masks()[0].data().deep_copy()
            cntr+=1
          fmodel.update(f_calc = f_calc_ave.array(f_calc_ave_total_reordered / cntr).deep_copy(),
                        f_mask = f_calc_ave.array(f_mask_ave_total_reordered / cntr).deep_copy()
                        )

          # Update solvent and scale
          # XXX Will need to apply_back_trace on latest version
          fmodel.set_scale_switch = 0
          fmodel.update_all_scales()

          # Reset occ for output
          xrs.set_occupancies(nll_occ)
          # k1 updated vs Fobs
          # Rows are collected only when fobs_vs_fcalc_post_nll is set, so
          # print_list (and the sorted list built from it) stays empty
          # otherwise.
          if self.params.fobs_vs_fcalc_post_nll:
            print_list.append([cntr, i_neg_ll[0], i_neg_ll[1], fmodel.r_work(), fmodel.r_free()])

        # Order models by nll and print summary
        print >> self.log, '\nModels ranked by nll <Fcalc> R-factors recalculated'
        print >> self.log, 'Percentile cutoff : {0:5.3f}'.format(percentile)
        xrs_list_sorted_nll = []
        print >> self.log, ' | NLL <Rw> <Rf> Ens Model'
        for info in print_list:
          print >> self.log, ' {0:4d} | {1:8.1f} {2:8.4f} {3:8.4f} {4:12d}'.format(
            info[0],
            info[1],
            info[3],
            info[4],
            info[2]+1,
            )
          xrs_list_sorted_nll.append(xrs_list[info[2]])

      # Output nll ordered ensemble
      # NOTE(review): placed after the percentile loop, so the file is
      # written once from the last iteration's list -- TODO confirm.
      write_ensemble_pdb(filename = 'nll_ordered_' + pdb_file_names[0].split('/')[-1].replace('.pdb','') + '_pensemble.pdb',
                         xrs_list = xrs_list_sorted_nll,
                         ens_pdb_hierarchy = ens_pdb_hierarchy
                         )
def run(self, args, command_name, out=sys.stdout):
    """Score each model of an ensemble PDB file and write the results.

    Processes the command-line arguments, sets up ``self.log`` (stdout
    plus a ``<pdb name>_pensemble.log`` file, with ``sys.stderr``
    redirected to it) and then follows one of two paths:

    * ``assign_sigma_from_map`` with ``ensemble_sigma_map_input`` set: the
      first Miller array of the given MTZ file is passed to
      ``get_map_sigma`` for every model and the result is written to
      ``<pdb name>_vs_<map name>_pensemble.pdb``.
    * otherwise: per-model Fc map coefficients and their average are
      computed through one ``fmodel`` object, each model is scored by the
      ratio of the two ``get_map_sigma`` results (stored as occupancies)
      and written to ``<pdb name>_pensemble.pdb``; optionally the models
      are re-ranked by their summed negative log probability and written
      to ``nll_ordered_<pdb name>_pensemble.pdb``.

    NOTE(review): this block was reconstructed from a source whose
    indentation was lost; nesting marked "TODO confirm" below should be
    checked against the original file.

    Args:
        args: Command-line arguments; exactly one PDB file is accepted.
        command_name: Program name used in the usage text and log header.
        out: Not referenced in this method body.

    Raises:
        Sorry: if the number of PDB files is not one, if
            ``fobs_vs_fcalc_post_nll`` is set without a reflection file,
            if the map input is not an MTZ file, or if neither reflection
            data nor ``fcalc_high_resolution`` is available.
    """
    # command_line and cif_file are assigned but not read again below.
    command_line = (iotbx_option_parser(
        usage="%s [options]" % command_name,
        description='Example: %s data.mtz data.mtz ref_model.pdb' %
        command_name).option(
            None,
            "--show_defaults",
            action="store_true",
            help="Show list of parameters.")).process(args=args)
    cif_file = None
    processed_args = utils.process_command_line_args(
        args=args, log=sys.stdout, master_params=master_phil)
    params = processed_args.params
    # Fall back to the master parameters when none were processed.
    if (params is None): params = master_phil
    self.params = params.extract().ensemble_probability
    pdb_file_names = processed_args.pdb_file_names
    if len(pdb_file_names) != 1:
        raise Sorry("Only one PDB structure may be used")
    pdb_file = file_reader.any_file(pdb_file_names[0])

    # Logging: stdout plus a buffer that is swapped for the log file below.
    self.log = multi_out()
    self.log.register(label="stdout", file_object=sys.stdout)
    self.log.register(label="log_buffer",
                      file_object=StringIO(),
                      atexit_send_to=None)
    sys.stderr = self.log
    # The log file is named after the PDB file and is not closed in this
    # method.
    log_file = open(
        pdb_file_names[0].split('/')[-1].replace('.pdb', '') +
        '_pensemble.log', "w")
    self.log.replace_stringio(old_label="log_buffer",
                              new_label="log",
                              new_file_object=log_file)
    utils.print_header(command_name, out=self.log)
    params.show(out=self.log)

    # Reflection data are optional unless fobs_vs_fcalc_post_nll is requested.
    f_obs = None
    r_free_flags = None
    reflection_files = processed_args.reflection_files
    if self.params.fobs_vs_fcalc_post_nll:
        if len(reflection_files) == 0:
            raise Sorry(
                "Fobs from input MTZ required for fobs_vs_fcalc_post_nll")
    if len(reflection_files) > 0:
        crystal_symmetry = processed_args.crystal_symmetry
        print('Reflection file : ',
              processed_args.reflection_file_names[0],
              file=self.log)
        utils.print_header("Model and data statistics", out=self.log)
        rfs = reflection_file_server(
            crystal_symmetry=crystal_symmetry,
            reflection_files=processed_args.reflection_files,
            log=self.log)
        parameters = extract_xtal_data.data_and_flags_master_params(
        ).extract()
        determine_data_and_flags_result = extract_xtal_data.run(
            reflection_file_server=rfs,
            parameters=parameters,
            data_parameter_scope="refinement.input.xray_data",
            flags_parameter_scope="refinement.input.xray_data.r_free_flags",
            data_description="X-ray data",
            keep_going=True,
            log=self.log)
        f_obs = determine_data_and_flags_result.f_obs
        # number_of_reflections and test_flag_value are not read again below.
        number_of_reflections = f_obs.indices().size()
        r_free_flags = determine_data_and_flags_result.r_free_flags
        test_flag_value = determine_data_and_flags_result.test_flag_value
        # Without free flags, use an all-False flag array of matching size.
        if (r_free_flags is None):
            r_free_flags = f_obs.array(
                data=flex.bool(f_obs.data().size(), False))

    # process PDB
    pdb_file.assert_file_type("pdb")
    # One hierarchy for the whole file, one xray structure per model.
    pdb_in = hierarchy.input(file_name=pdb_file.file_name)
    ens_pdb_hierarchy = pdb_in.construct_hierarchy()
    ens_pdb_hierarchy.atoms().reset_i_seq()
    ens_pdb_xrs_s = pdb_in.input.xray_structures_simple()
    number_structures = len(ens_pdb_xrs_s)
    print('Number of structure in ensemble : ',
          number_structures,
          file=self.log)

    # Calculate sigmas from input map only
    if self.params.assign_sigma_from_map and self.params.ensemble_sigma_map_input is not None:
        # process MTZ
        input_file = file_reader.any_file(
            self.params.ensemble_sigma_map_input)
        if input_file.file_type == "hkl":
            if input_file.file_object.file_type() != "ccp4_mtz":
                raise Sorry("Only MTZ format accepted for map input")
            else:
                mtz_file = input_file
        else:
            raise Sorry("Only MTZ format accepted for map input")
        miller_arrays = mtz_file.file_server.miller_arrays
        # Only the first Miller array of the file is used.
        map_coeffs_1 = miller_arrays[0]
        #
        xrs_list = []
        for n, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
            # get sigma levels from ensemble fc for each structure
            xrs = get_map_sigma(ens_pdb_hierarchy=ens_pdb_hierarchy,
                                ens_pdb_xrs=ens_pdb_xrs,
                                map_coeffs_1=map_coeffs_1,
                                residue_detail=self.params.residue_detail,
                                ignore_hd=self.params.ignore_hd,
                                log=self.log)
            xrs_list.append(xrs)
        # write ensemble pdb file, occupancies as sigma level
        filename = pdb_file_names[0].split('/')[-1].replace(
            '.pdb',
            '') + '_vs_' + self.params.ensemble_sigma_map_input.replace(
                '.mtz', '') + '_pensemble.pdb'
        write_ensemble_pdb(filename=filename,
                           xrs_list=xrs_list,
                           ens_pdb_hierarchy=ens_pdb_hierarchy)

    # Do full analysis vs Fobs
    else:
        model_map_coeffs = []
        fmodel = None
        # Get <fcalc>: accumulate Fcalc over all models through one fmodel
        # object.
        for model, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
            ens_pdb_xrs.set_occupancies(1.0)
            if model == 0:
                # If mtz not supplied get fobs from xray structure...
                # Use input Fobs for scoring against nll
                if self.params.fobs_vs_fcalc_post_nll:
                    dummy_fobs = f_obs
                else:
                    if f_obs == None:
                        if self.params.fcalc_high_resolution == None:
                            raise Sorry(
                                "Please supply high resolution limit or input mtz file."
                            )
                        dummy_dmin = self.params.fcalc_high_resolution
                        dummy_dmax = self.params.fcalc_low_resolution
                    else:
                        print(
                            'Supplied mtz used to determine high and low resolution cuttoffs',
                            file=self.log)
                        dummy_dmax, dummy_dmin = f_obs.d_max_min()
                    # Amplitudes calculated from the first model stand in
                    # for Fobs; only dummy_dmin is used, dummy_dmax is not
                    # read again.
                    dummy_fobs = abs(
                        ens_pdb_xrs.structure_factors(
                            d_min=dummy_dmin).f_calc())
                    dummy_fobs.set_observation_type_xray_amplitude()
                    # If mtz supplied, free flags are overwritten to prevent array size error
                    r_free_flags = dummy_fobs.array(
                        data=flex.bool(dummy_fobs.data().size(), False))
                #
                fmodel = utils.fmodel_simple(
                    scattering_table="wk1995",
                    xray_structures=[ens_pdb_xrs],
                    f_obs=dummy_fobs,
                    target_name='ls',
                    bulk_solvent_and_scaling=False,
                    r_free_flags=r_free_flags)
                # Zero-valued array with the Fcalc indices, used later as a
                # template.
                f_calc_ave = fmodel.f_calc().array(
                    data=fmodel.f_calc().data() * 0).deep_copy()
                # XXX Important to ensure scale is identical for each model and <model>
                # NOTE(review): this assigns an attribute rather than
                # calling a method.
                fmodel.set_scale_switch = 1.0
                f_calc_ave_total = fmodel.f_calc().data().deep_copy()
            else:
                fmodel.update_xray_structure(xray_structure=ens_pdb_xrs,
                                             update_f_calc=True,
                                             update_f_mask=False)
                f_calc_ave_total += fmodel.f_calc().data().deep_copy()
            print('Model :', model + 1, file=self.log)
            print("\nStructure vs real Fobs (no bulk solvent or scaling)",
                  file=self.log)
            print('Rwork : %5.4f ' % fmodel.r_work(), file=self.log)
            print('Rfree : %5.4f ' % fmodel.r_free(), file=self.log)
            print('K1 : %5.4f ' % fmodel.scale_k1(), file=self.log)
            # Keep this model's Fc map coefficients for the second pass below.
            fcalc_edm = fmodel.electron_density_map()
            fcalc_map_coeffs = fcalc_edm.map_coefficients(map_type='Fc')
            fcalc_mtz_dataset = fcalc_map_coeffs.as_mtz_dataset(
                column_root_label='Fc')
            if self.params.output_model_and_model_ave_mtz:
                fcalc_mtz_dataset.mtz_object().write(
                    file_name=str(model + 1) + "_Fc.mtz")
            model_map_coeffs.append(fcalc_map_coeffs.deep_copy())

        # Replace Fcalc with the average over all models.
        fmodel.update(f_calc=f_calc_ave.array(f_calc_ave_total /
                                              number_structures))
        print("\nEnsemble vs real Fobs (no bulk solvent or scaling)",
              file=self.log)
        print('Rwork : %5.4f ' % fmodel.r_work(), file=self.log)
        print('Rfree : %5.4f ' % fmodel.r_free(), file=self.log)
        print('K1 : %5.4f ' % fmodel.scale_k1(), file=self.log)

        # Get <Fcalc> map
        fcalc_ave_edm = fmodel.electron_density_map()
        fcalc_ave_map_coeffs = fcalc_ave_edm.map_coefficients(
            map_type='Fc').deep_copy()
        fcalc_ave_mtz_dataset = fcalc_ave_map_coeffs.as_mtz_dataset(
            column_root_label='Fc')
        if self.params.output_model_and_model_ave_mtz:
            fcalc_ave_mtz_dataset.mtz_object().write(file_name="aveFc.mtz")
        # From here on fcalc_ave_map_coeffs holds the FFT map, not
        # coefficients.
        fcalc_ave_map_coeffs = fcalc_ave_map_coeffs.fft_map()
        fcalc_ave_map_coeffs.apply_volume_scaling()
        fcalc_ave_map_data = fcalc_ave_map_coeffs.real_map_unpadded()
        fcalc_ave_map_stats = maptbx.statistics(fcalc_ave_map_data)
        print("<Fcalc> Map Stats :", file=self.log)
        fcalc_ave_map_stats.show_summary(f=self.log)
        # offset is not read again below.
        offset = fcalc_ave_map_stats.min()
        model_neg_ll = []
        number_previous_scatters = 0

        # Run through structure list again and get probability
        xrs_list = []
        for model, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
            if self.params.verbose:
                print('\n\nModel : ', model + 1, file=self.log)
            # Get model atom sigmas vs Fcalc
            fcalc_map = model_map_coeffs[model].fft_map()
            fcalc_map.apply_volume_scaling()
            fcalc_map_data = fcalc_map.real_map_unpadded()
            fcalc_map_stats = maptbx.statistics(fcalc_map_data)
            # NOTE(review): show_summary is assumed to belong to the verbose
            # block together with its header line -- TODO confirm.
            if self.params.verbose:
                print("Fcalc map stats :", file=self.log)
                fcalc_map_stats.show_summary(f=self.log)
            xrs = get_map_sigma(
                ens_pdb_hierarchy=ens_pdb_hierarchy,
                ens_pdb_xrs=ens_pdb_xrs,
                fft_map_1=fcalc_map,
                model_i=model,
                residue_detail=self.params.residue_detail,
                ignore_hd=self.params.ignore_hd,
                number_previous_scatters=number_previous_scatters,
                log=self.log)
            fcalc_sigmas = xrs.scatterers().extract_occupancies()
            del fcalc_map
            # Get model atom sigmas vs <Fcalc>
            xrs = get_map_sigma(
                ens_pdb_hierarchy=ens_pdb_hierarchy,
                ens_pdb_xrs=ens_pdb_xrs,
                fft_map_1=fcalc_ave_map_coeffs,
                model_i=model,
                residue_detail=self.params.residue_detail,
                ignore_hd=self.params.ignore_hd,
                number_previous_scatters=number_previous_scatters,
                log=self.log)
            ### For testing other residue averaging options
            #print xrs.residue_selections
            fcalc_ave_sigmas = xrs.scatterers().extract_occupancies()
            # Probability of model given <model>
            prob = fcalc_ave_sigmas / fcalc_sigmas
            # XXX debug option
            if False:
                for n, p in enumerate(prob):
                    print(' {0:5d} {1:5.3f}'.format(n, p), file=self.log)
            # Set probability between 0 and 1: values <= 0 become 0.001 and
            # values > 1 become 1.0.
            # XXX Make Histogram / more stats
            prob_lss_zero = flex.bool(prob <= 0)
            prob_grt_one = flex.bool(prob > 1)
            prob.set_selected(prob_lss_zero, 0.001)
            prob.set_selected(prob_grt_one, 1.0)
            xrs.set_occupancies(prob)
            xrs_list.append(xrs)
            # Summed negative log probability, paired with the model index
            # so the list can be sorted later.
            sum_neg_ll = sum(-flex.log(prob))
            model_neg_ll.append((sum_neg_ll, model))
            if self.params.verbose:
                print('Model probability stats :', file=self.log)
                print(prob.min_max_mean().show(), file=self.log)
                print(' Count < 0.0 : ',
                      prob_lss_zero.count(True),
                      file=self.log)
                print(' Count > 1.0 : ',
                      prob_grt_one.count(True),
                      file=self.log)
            # For averaging by residue
            number_previous_scatters += ens_pdb_xrs.sites_cart().size()

        # write ensemble pdb file, occupancies as sigma level
        write_ensemble_pdb(
            filename=pdb_file_names[0].split('/')[-1].replace('.pdb', '') +
            '_pensemble.pdb',
            xrs_list=xrs_list,
            ens_pdb_hierarchy=ens_pdb_hierarchy)

        # XXX Test ordering models by nll
        # XXX Test removing nth percentile atoms
        if self.params.sort_ensemble_by_nll_score or self.params.fobs_vs_fcalc_post_nll:
            for percentile in [1.0, 0.975, 0.95, 0.9, 0.8, 0.6, 0.2]:
                model_neg_ll = sorted(model_neg_ll)
                f_calc_ave_total_reordered = None
                print_list = []
                for i_neg_ll in model_neg_ll:
                    xrs = xrs_list[i_neg_ll[1]]
                    nll_occ = xrs.scatterers().extract_occupancies()
                    # Set q=0 nth percentile atoms: atoms whose value is
                    # below the cutoff get occupancy 0.0, all others 1.0.
                    sorted_nll_occ = sorted(nll_occ, reverse=True)
                    number_atoms = len(sorted_nll_occ)
                    percentile_prob_cutoff = sorted_nll_occ[
                        int(number_atoms * percentile) - 1]
                    cutoff_selections = flex.bool(
                        nll_occ < percentile_prob_cutoff)
                    cutoff_nll_occ = flex.double(nll_occ.size(),
                                                 1.0).set_selected(
                                                     cutoff_selections, 0.0)
                    #XXX Debug
                    if False:
                        print('\nDebug')
                        for x in range(len(cutoff_selections)):
                            print(cutoff_selections[x], nll_occ[x],
                                  cutoff_nll_occ[x])
                        print(percentile)
                        print(percentile_prob_cutoff)
                        print(cutoff_selections.count(True))
                        print(cutoff_selections.size())
                        print(cutoff_nll_occ.count(0.0))
                        print('Count q = 1 : ', cutoff_nll_occ.count(1.0))
                        print('Count scatterers size : ',
                              cutoff_nll_occ.size())
                    xrs.set_occupancies(cutoff_nll_occ)
                    fmodel.update_xray_structure(xray_structure=xrs,
                                                 update_f_calc=True,
                                                 update_f_mask=True)
                    # Running sums of Fcalc and Fmask over the models
                    # processed so far.
                    if f_calc_ave_total_reordered == None:
                        f_calc_ave_total_reordered = fmodel.f_calc().data(
                        ).deep_copy()
                        f_mask_ave_total_reordered = fmodel.f_masks(
                        )[0].data().deep_copy()
                        cntr = 1
                    else:
                        f_calc_ave_total_reordered += fmodel.f_calc().data(
                        ).deep_copy()
                        f_mask_ave_total_reordered += fmodel.f_masks(
                        )[0].data().deep_copy()
                        cntr += 1
                    fmodel.update(
                        f_calc=f_calc_ave.array(
                            f_calc_ave_total_reordered / cntr).deep_copy(),
                        f_mask=f_calc_ave.array(
                            f_mask_ave_total_reordered / cntr).deep_copy())
                    # Update solvent and scale
                    # XXX Will need to apply_back_trace on latest version
                    fmodel.set_scale_switch = 0
                    fmodel.update_all_scales()
                    # Reset occ for output
                    xrs.set_occupancies(nll_occ)
                    # k1 updated vs Fobs
                    # Rows are collected only when fobs_vs_fcalc_post_nll
                    # is set, so print_list (and the sorted list built from
                    # it) stays empty otherwise.
                    if self.params.fobs_vs_fcalc_post_nll:
                        print_list.append([
                            cntr, i_neg_ll[0], i_neg_ll[1],
                            fmodel.r_work(),
                            fmodel.r_free()
                        ])

                # Order models by nll and print summary
                print(
                    '\nModels ranked by nll <Fcalc> R-factors recalculated',
                    file=self.log)
                print('Percentile cutoff : {0:5.3f}'.format(percentile),
                      file=self.log)
                xrs_list_sorted_nll = []
                print(' | NLL <Rw> <Rf> Ens Model', file=self.log)
                for info in print_list:
                    print(' {0:4d} | {1:8.1f} {2:8.4f} {3:8.4f} {4:12d}'.
                          format(
                              info[0],
                              info[1],
                              info[3],
                              info[4],
                              info[2] + 1,
                          ),
                          file=self.log)
                    xrs_list_sorted_nll.append(xrs_list[info[2]])

            # Output nll ordered ensemble
            # NOTE(review): placed after the percentile loop, so the file is
            # written once from the last iteration's list -- TODO confirm.
            write_ensemble_pdb(
                filename='nll_ordered_' +
                pdb_file_names[0].split('/')[-1].replace('.pdb', '') +
                '_pensemble.pdb',
                xrs_list=xrs_list_sorted_nll,
                ens_pdb_hierarchy=ens_pdb_hierarchy)
# Initialize MPI if mpi_enabled(): from mpi4py import MPI mpi_comm = MPI.COMM_WORLD mpi_rank = mpi_comm.Get_rank() mpi_size = mpi_comm.Get_size() else: mpi_comm = None mpi_rank = 0 mpi_size = 1 # read .pdb file. It's used as a template, so don't sort it. if mpi_rank == 0: pdb_in = hierarchy.input(file_name=top_file, sort_atoms=False) # MEW use cctbx.xray.structure.customized_copy() here to change the unit cell and space group as needed symm = pdb_in.input.crystal_symmetry() if unit_cell_str is None: unit_cell = symm.unit_cell() else: unit_cell = unit_cell_str if space_group_str is None: space_group_info = symm.space_group_info() else: space_group_info = cctbx.sgtbx.space_group_info( symbol=space_group_str) xrs = pdb_in.input.xray_structure_simple( crystal_symmetry=crystal.symmetry(