def test_convex_hull_exhaustive_search(self):
    """Run exhaustive() and check the minimum reported in the output csv.

    The csv is written to "test.csv" in the configured output directory.
    The minimum returned by get_minimum_fofc is expected at an occupancy
    of 0.6 and a u_iso of 0.35.
    """
    csv_path = os.path.join(self.params.output.out_dir, "test.csv")
    self.params.exhaustive.output.csv_name = csv_path

    exhaustive(self.params)

    # Read the path back from params, exactly as the search was configured.
    minimum = get_minimum_fofc(self.params.exhaustive.output.csv_name)
    occ_at_min, u_iso_at_min, _ = minimum

    self.assertAlmostEqual(0.6, occ_at_min)
    self.assertAlmostEqual(0.35, u_iso_at_min)
def test_multiple_exhaustive_search(self):
    """Run multiple_exhaustive() with near-default parameters.

    Only the output csv name is changed from the defaults. The minimum
    returned by get_minimum_fofc is expected at an occupancy of 0.6 and a
    u_iso of 0.33.
    """
    csv_path = os.path.join(self.params.output.out_dir, "test.csv")
    self.params.exhaustive.output.csv_name = csv_path

    multiple_exhaustive(self.params)

    # Read the path back from params, exactly as the search was configured.
    minimum = get_minimum_fofc(self.params.exhaustive.output.csv_name)
    occ_at_min, u_iso_at_min, _ = minimum

    self.assertAlmostEqual(0.6, occ_at_min)
    self.assertAlmostEqual(0.33, u_iso_at_min)
def test_exhaustive_search_multiprocessing(self):
    """Run exhaustive() with four processes and a step of 0.2.

    The minimum returned by get_minimum_fofc is expected at an occupancy
    of 0.6 and a u_iso of 0.4.
    """
    search = self.params.exhaustive

    self.params.settings.processes = 4
    search.output.csv_name = os.path.join(self.params.output.out_dir,
                                          "test.csv")
    search.options.step = 0.2

    exhaustive(self.params)

    # Read the path back from params, exactly as the search was configured.
    minimum = get_minimum_fofc(self.params.exhaustive.output.csv_name)
    occ_at_min, u_iso_at_min, _ = minimum

    self.assertAlmostEqual(0.6, occ_at_min)
    self.assertAlmostEqual(0.4, u_iso_at_min)
def write_minima_pdb(input_pdb, output_pdb, csv_name, params):
    """Write a pdb with occupancy and B factor set from the search minimum.

    The minimum occupancy and u_iso are read with get_minimum_fofc.
    Atoms selected by a ground state get occupancy
    (1 - minimum occupancy) / number of altlocs; atoms selected by a
    bound state get minimum occupancy / number of altlocs. In both cases
    the B factor is set from the minimum u_iso via u_iso_to_b_fac.

    Parameters
    ----------
    input_pdb: str
        path to input pdb to take structure from
    output_pdb: str
        path to write structure to
    csv_name: str
        path to exhaustive search csv
    params:
        parameter object, passed on to get_bound_ground_states

    Returns
    -------
    None
    """
    occ_at_min, u_iso_at_min, _ = get_minimum_fofc(csv_name)
    bound_states, ground_states = get_bound_ground_states(input_pdb, params)

    pdb_inp = iotbx.pdb.input(input_pdb)
    pdb_hierarchy = pdb_inp.construct_hierarchy()

    # Lazily walk every atom of the single model, in hierarchy order.
    all_atoms = (
        atom
        for chain in pdb_hierarchy.only_model().chains()
        for residue_group in chain.residue_groups()
        for atom_group in residue_group.atom_groups()
        for atom in atom_group.atoms()
    )

    for atom in all_atoms:
        # Each state is indexed as (selection, number of altlocs).
        for state in ground_states:
            altloc_count = state[1]
            if state[0][atom.i_seq]:
                atom.occ = (1 - occ_at_min) / altloc_count
                atom.b = u_iso_to_b_fac(u_iso_at_min)

        for state in bound_states:
            altloc_count = state[1]
            if state[0][atom.i_seq]:
                atom.set_occ(occ_at_min / altloc_count)
                atom.set_b(u_iso_to_b_fac(u_iso_at_min))

    with open(output_pdb, "w") as out_file:
        # The crystal symmetry is taken from a fresh read of the input pdb.
        symmetry = hierarchy.input(input_pdb).crystal_symmetry()
        out_file.write(pdb_hierarchy.as_pdb_string(crystal_symmetry=symmetry))
def plot_fofc_occ(start_occ, end_occ, step, dataset_prefix, set_b):
    """Plot mean |Fo-Fc| at each simulated occupancy against the minimum.

    For every occupancy from start_occ to end_occ in increments of step,
    the csv named "occ_<occupancy>_b_<set_b>_u_iso" (dots replaced by
    underscores) is read twice: once for its minimum (get_minimum_fofc)
    and once for the value at the simulated occupancy and set B factor
    (get_fofc_from_csv). Both point sets are plotted, each pair is joined
    with connectpoints, and the figure is saved as
    "<dataset_prefix>-delta_fofc_occ.png".
    """
    simulated_occs = []
    simulated_fofcs = []
    minima_occs = []
    minima_fofcs = []

    b_label = str(set_b).replace(".", "_")

    # end_occ + step / 5 keeps end_occ itself inside the half-open range.
    for lig_occupancy in np.arange(start_occ, end_occ + (step / 5), step):
        occ_label = str(lig_occupancy).replace(".", "_")
        csv_name = "occ_{}_b_{}_u_iso".format(occ_label, b_label)

        occ_at_min, _, fofc_at_min = get_minimum_fofc(csv_name)
        fofc_at_simulated = get_fofc_from_csv(
            csv_name, lig_occupancy, round_step(b_to_u_iso(set_b)), step)

        simulated_fofcs.append(fofc_at_simulated)
        simulated_occs.append(lig_occupancy)
        minima_fofcs.append(fofc_at_min)
        minima_occs.append(occ_at_min)

    fig, ax = plt.subplots()
    minima_line, = ax.plot(minima_occs, minima_fofcs, "k+")
    simulated_line, = ax.plot(simulated_occs, simulated_fofcs, "ro")

    # Join each simulated point to the minimum found for the same csv.
    for point_index in np.arange(0, len(simulated_occs)):
        connectpoints(simulated_occs, simulated_fofcs, minima_occs,
                      minima_fofcs, point_index)

    legend_handles = (minima_line, simulated_line)
    legend_labels = ("Minima of mean |Fo-Fc|",
                     "Mean |Fo-Fc| at simulated occupancy")
    ax.legend(
        legend_handles,
        legend_labels,
        prop={"size": 8},
        numpoints=1,
        bbox_to_anchor=(1, 1),
        bbox_transform=plt.gcf().transFigure,
    )

    ax.set_xlabel("Occupancy")
    ax.set_ylabel("Mean |Fo-Fc|")

    title = "{}: Delta mean|Fo-Fc| and Delta Occupancy".format(dataset_prefix)
    plt.title(title, fontsize=10)
    plt.savefig("{}-delta_fofc_occ.png".format(dataset_prefix))
# Build occ_df for this crystal. The source of the values depends on whether
# the program folder name contains "exhaustive".
if not "exhaustive" in program_folder:
    # Not an exhaustive search folder: read the ligand from the pdb, or skip
    # this iteration when the pdb does not exist.
    if os.path.exists(pdb):
        occ_df = read_ligand(pdb_path=pdb)
    else:
        continue
else:
    # Exhaustive search folder: take the minimum from the search csv, or
    # skip this iteration when the csv does not exist.
    csv = os.path.join(out_root, program_folder, xtal, "exhaustive_search.csv")
    if not os.path.exists(csv):
        continue
    occ, u_iso, fo_fc = get_minimum_fofc(csv_name=csv)
    # Single-row frame; u_iso is converted to a B factor for the column.
    exhaustive_data = {"Occupancy": occ, "B_factor":u_iso_to_b_fac(u_iso), "min_fo_fc": fo_fc}
    occ_df = pd.DataFrame(data=exhaustive_data, index=[0])

# Dealing with two copies of ligand: for OX210 the rows are duplicated and
# labelled resid 1 and resid 2; every other compound gets resid 1 only.
# NOTE(review): assumed to apply to both branches above — confirm nesting.
if compound == "OX210":
    occ_df_2 = occ_df.copy(deep=True)
    occ_df['resid'] = 1
    occ_df_2['resid'] = 2
    occ_df = pd.concat([occ_df, occ_df_2])
else:
    occ_df['resid'] = 1
def plot_DCP2B():
    """Plot edstats pairplots and write summary csvs for DCP2B.

    Steps, all using hard coded paths:

    1. Merge the edstats csv with the exhaustive search minima csv on
       the dataset name.
    2. Query the soakDB sqlite database for the crystal name and compound
       code of crystals with refinement outcome 4, 5 or 6, and merge the
       compound code into the summary.
    3. Write a pairplot for all compounds and one for FMOPL000435a
       (each only if the png does not already exist), and an occupancy
       histogram for FMOPL000435a.
    4. Write the summary csvs, plus a csv of min/max RSCC and occupancy
       for every compound that has more than one refined hit.

    Notes
    -------------
    Place holder to be better functionalised when a more generalised
    case is apparent.
    """
    out_dir = "/dls/science/groups/i04-1/elliot-dev/Work/exhaustive_search_data/DCP2B_18_09_20_exhaus"
    es_minima_csv = os.path.join(out_dir, "es_minima.csv")
    edstats_csv = os.path.join(out_dir, "edstats.csv")
    database_path = (
        "/dls/labxchem/data/2016/lb13385-64/processing/database/soakDBDataFile.sqlite"
    )

    edstats_df = pd.read_csv(edstats_csv)
    es_minima_df = pd.read_csv(
        es_minima_csv, names=["Dataset", "ES_occ", "es_bfac", "min_fofc"])
    summary_df = pd.merge(edstats_df, es_minima_df, on="Dataset")

    # The outcome list is a fixed constant, so formatting it into the query
    # is safe; do not reuse this pattern with external input.
    refinement_outcomes = "'4 - CompChem ready', '5 - Deposition ready','6 - Deposited'"
    query = (
        "SELECT CrystalName, CompoundCode, RefinementResolution "
        "FROM mainTable WHERE RefinementOutcome in ({})"
        " AND (RefinementPDB_latest AND RefinementMTZ_latest) IS NOT NULL"
    ).format(refinement_outcomes)

    print(database_path)
    conn = sqlite3.connect(database_path)
    try:
        cur = conn.cursor()
        cur.execute(query)
        refinement_xtals = cur.fetchall()
        cur.close()
    finally:
        # Close the connection itself, not just the cursor.
        conn.close()

    summary_df = summary_df.rename(columns={"Dataset": "CrystalName"})

    # Map each crystal to its compound code (resolution is not used here).
    compounds = {
        xtal_name: compound_code
        for xtal_name, compound_code, _ in refinement_xtals
    }
    comp_df = pd.DataFrame(list(compounds.items()),
                           columns=["CrystalName", "compound_code"])
    summary_df = pd.merge(summary_df, comp_df, on="CrystalName")

    FMOPL000435a_df = summary_df[summary_df["compound_code"] ==
                                 "FMOPL000435a"]

    # Pairplots are slow to draw, so existing images are not regenerated.
    pairplot_png = os.path.join(out_dir, "pairplot.png")
    if not os.path.exists(pairplot_png):
        pairplot = labelled_pairplot(summary_df, hue_column="compound_code")
        pairplot.fig.savefig(pairplot_png, dpi=300)

    compound_pairplot_png = os.path.join(out_dir, "FMOPL000435a_pairplot.png")
    if not os.path.exists(compound_pairplot_png):
        FMOPL000435a_pairplot = labelled_pairplot(FMOPL000435a_df)
        FMOPL000435a_pairplot.fig.savefig(compound_pairplot_png, dpi=300)

    # Column names expected by the histogram plotting function.
    FMOPL000435a_df = FMOPL000435a_df.rename(index=str,
                                             columns={
                                                 "ES_occ": "es_occupancy",
                                                 "Occupancy": "occupancy"
                                             })

    params = master_phil.extract()
    params.output.out_dir = out_dir

    occupancy_histogram_with_exhaustive_search(FMOPL000435a_df,
                                               protein_name="DCP2B",
                                               compound="FMOPL000435a",
                                               params=params)

    summary_df.to_csv(os.path.join(out_dir, "DCP2B_edstats_summary.csv"))
    FMOPL000435a_df.to_csv(
        os.path.join(out_dir, "FMOPL000435a_edstats_summary.csv"))

    # One summary row per compound that has more than one refined hit.
    summary_duplicate_df_list = []
    for duplicate_compound, duplicate_df in summary_df.groupby(
            "compound_code"):
        if len(duplicate_df) < 2:
            continue
        summary = {
            "compound": [duplicate_compound],
            "number refined hits": [len(duplicate_df.index)],
            "RSCC min": [duplicate_df["RSCC"].min()],
            "RSCC max": [duplicate_df["RSCC"].max()],
            "Occ refined min": [duplicate_df["Occupancy"].min()],
            "Occ refined max": [duplicate_df["Occupancy"].max()],
            "Occ ES min": [duplicate_df["ES_occ"].min()],
            "Occ ES max": [duplicate_df["ES_occ"].max()],
        }
        summary_duplicate_df_list.append(pd.DataFrame(data=summary))

    # pd.concat raises ValueError on an empty list, so only write the csv
    # when at least one compound has duplicate hits.
    if summary_duplicate_df_list:
        pd.concat(summary_duplicate_df_list).to_csv(
            os.path.join(out_dir, "DCP2B_edstats_duplicates.csv"))
    else:
        print("No compounds with more than one refined hit")
from exhaustive.utils.utils import get_minimum_fofc

# Root directory holding one sub-folder per exhaustive search run.
exh_b_fix = "/dls/science/groups/i04-1/elliot-dev/Work/NUDT7A_mass_spec_refinements/copy_atoms/exhaustive_b_fix/2019-06-17/"

# Full paths of every sub-directory directly under the root.
folders = [
    os.path.join(exh_b_fix, d) for d in os.listdir(exh_b_fix)
    if os.path.isdir(os.path.join(exh_b_fix, d))
]

# Print the search minimum for every folder that has a results csv.
for folder in folders:
    # folder is already a full path, so the root is not joined again.
    exh_csv = os.path.join(folder, "exhaustive_search.csv")
    if os.path.exists(exh_csv):
        print(get_minimum_fofc(csv_name=exh_csv))
def process_exhaustive_search(compound_codes,
                              initial_model_dir,
                              in_dir,
                              out_dir,
                              protein_name,
                              preferred_cif=None):
    """Post-process exhaustive search results for each compound.

    For every dataset of every compound this:

    1. splits the refined pdb into ground/bound state conformations, if
       the ground state pdb does not exist yet;
    2. writes a pdb at the exhaustive search minimum, if missing;
    3. refines that pdb with giant.quick_refine, if the refined pdb is
       missing;
    4. scores the refined minimum against the initial model, if the
       plot folder is missing;
    5. records the occupancy and B factor at the minimum.

    Per compound, the recorded minima are merged with the refined
    occupancies and plotted (histogram, scatter, and edstats plots when
    edstats scores are available).

    Parameters
    ----------
    compound_codes: iterable of str
        compound folder names to process
    initial_model_dir: str
        directory holding one folder per dataset with the initial
        refine.pdb / refine.mtz and cif
    in_dir: str
        directory holding <compound>/<dataset>/refine.pdb
    out_dir: str
        directory holding the exhaustive search output per
        compound/dataset; plots are written here
    protein_name: str
        protein name; datasets are matched on protein_name + "-x"
    preferred_cif: str, optional
        passed on to get_cif_file_from_dataset

    Returns
    -------
    None
    """
    # NOTE(review): `params` is not an argument of this function; it is
    # read (and its output.out_dir mutated) from the enclosing module scope.
    protein_prefix = protein_name + "-x"

    for compound in compound_codes:
        compound_in_dir = os.path.join(in_dir, compound)
        compound_out_dir = os.path.join(out_dir, compound)
        es_occ_b = []

        for dataset in datasets_from_compound(
                protein_prefix, compound_folder=compound_in_dir):

            # Define paths
            dataset_in_dir = os.path.join(compound_in_dir, dataset)
            dataset_out_dir = os.path.join(compound_out_dir, dataset)
            dataset_dir = os.path.join(initial_model_dir, dataset)

            es_csv = os.path.join(
                dataset_out_dir, dataset + "_exhaustive_search_occ_u_iso.csv")
            refine_pdb = os.path.join(dataset_in_dir, "refine.pdb")
            es_pdb = os.path.join(dataset_out_dir, "es_minima.pdb")
            es_refine_pdb = os.path.join(dataset_out_dir,
                                         "exhaustive_search0001",
                                         "es_refine.pdb")
            es_refine_mtz = os.path.join(dataset_out_dir,
                                         "exhaustive_search0001",
                                         "es_refine.mtz")
            input_mtz = os.path.join(dataset_dir, "refine.mtz")
            input_pdb = os.path.join(dataset_dir, "refine.pdb")

            print(input_mtz)
            print(input_pdb)
            print("---------------------")

            # Generate split conformations
            ground_state_pdb = os.path.join(dataset_in_dir,
                                            "refine.split.ground-state.pdb")
            if not os.path.exists(ground_state_pdb):
                split_params = split_phil.extract()
                split_params.input.pdb = [refine_pdb]
                split_params.output.log = "cat.log"

                print(split_params.input.pdb)
                print(split_phil.format(python_object=split_params).as_str())

                try:
                    split_conformations(split_params)
                except IOError:
                    print("Split confs: Issue in parsing:{}".format(dataset))
                    continue

            # Write Minima pdb
            if not os.path.exists(es_pdb):
                try:
                    write_minima_pdb(input_pdb=refine_pdb,
                                     output_pdb=es_pdb,
                                     csv_name=es_csv,
                                     params=params)
                except IOError:
                    print("Issue in parsing:{}".format(dataset))
                    continue

            input_cif = get_cif_file_from_dataset(dataset_dir, preferred_cif)

            # Refinement of minima pdb
            # TODO remove explicit call to 0001
            if not os.path.exists(es_refine_pdb):
                try:
                    # quick_refine writes relative to the working directory.
                    os.chdir(dataset_out_dir)
                    # NOTE(review): paths are interpolated into a shell
                    # command unquoted; paths with spaces will break it.
                    os.system("giant.quick_refine input.pdb={} "
                              "input.mtz={} input.cif={} "
                              "dir_prefix='exhaustive_search' "
                              "output.out_prefix='es_refine' ".format(
                                  es_pdb, input_mtz, input_cif))
                except (IOError, OSError):
                    # os.chdir raises OSError, which is distinct from
                    # IOError on Python 2.
                    print("Skipping crystal")
                    continue

            es_minima_plot_folder = os.path.join(dataset_out_dir, "Plots")

            # Plotting spider plots of minima pdb
            if not os.path.exists(es_minima_plot_folder):
                score_params = score_phil.extract()
                score_params.input.pdb1 = input_pdb
                score_params.input.mtz1 = input_mtz
                score_params.input.pdb2 = es_refine_pdb
                score_params.input.mtz2 = es_refine_mtz
                score_params.output.out_dir = es_minima_plot_folder
                try:
                    score_model(score_params)
                except Exception:
                    # Scoring is best effort; a failure must not stop the
                    # minima being recorded below. Exception (not a bare
                    # except) lets KeyboardInterrupt/SystemExit propagate.
                    print("skipping edstats for {}".format(dataset))

            # Minima occupancy and B factor for histogram/ scatter summary
            try:
                occ, u_iso, _ = get_minimum_fofc(es_csv)
            except IOError:
                print("Issue in parsing:{}".format(dataset))
                continue

            es_occ_b.append([dataset, occ, u_iso_to_b_fac(u_iso)])

        # NOTE(review): placed after the dataset loop (once per compound);
        # confirm against the intended nesting.
        print(es_occ_b)

        es_occ_df = pd.DataFrame(
            data=es_occ_b, columns=['dataset', 'es_occupancy', 'es_b_fac'])

        refine_occ_df = get_occ_b(refinement_dir=compound_in_dir,
                                  lig_chain="B",
                                  pdb_name="refine.split.bound-state.pdb")

        # Outer merge keeps datasets present in only one of the two sources.
        occ_df = pd.merge(es_occ_df, refine_occ_df, on='dataset', how='outer')

        params.output.out_dir = compound_out_dir

        occupancy_histogram_with_exhaustive_search(occ_df,
                                                   protein_name=protein_name,
                                                   compound=compound,
                                                   params=params)

        occupancy_b_factor_scatter_plot(occ_df,
                                        protein_name=protein_name,
                                        compound=compound,
                                        params=params)

        edstats_df = collate_edstats_scores(protein_prefix=protein_prefix,
                                            compound_folder=compound_out_dir)

        if edstats_df is not None:
            plot_edstats_across_soaks(edstats_df=edstats_df,
                                      compound_folder=compound_out_dir,
                                      compound=compound,
                                      protein_name=protein_name,
                                      title_suffix="Exhaustive minima")
# Occupancy and B factor of the ligand (chain E) from the refined bound state.
refine_occ_df = get_occ_b(refinement_dir=covalent_dir,
                          lig_chain="E",
                          pdb_name="refine.split.bound-state.pdb")

# Each "<dataset>_LIG_CYS" folder holds a "<dataset>" sub-folder containing
# the search csv. The suffix is removed by slicing: str.rstrip() strips a
# *set of characters*, so it would also eat trailing "_", "L", "I", "G",
# "C", "Y" or "S" characters that belong to the dataset name.
lig_cys_suffix = "_LIG_CYS"
exhaustive_search_csvs = []
for xtal_dir in os.listdir(covalent_dir):
    if not (os.path.isdir(os.path.join(covalent_dir, xtal_dir))
            and xtal_dir.endswith("LIG_CYS")):
        continue
    if xtal_dir.endswith(lig_cys_suffix):
        dataset_name = xtal_dir[:-len(lig_cys_suffix)]
    else:
        dataset_name = xtal_dir
    exhaustive_search_csvs.append(
        os.path.join(covalent_dir, xtal_dir, dataset_name,
                     "exhaustive_search.csv"))

# Occupancy and B factor at the exhaustive search minimum, per dataset.
es_occ_b = []
for csv in exhaustive_search_csvs:
    if os.path.exists(csv):
        occ, u_iso, _ = get_minimum_fofc(csv)
        # The csv's parent folder is already the suffix-free dataset name.
        es_occ_b.append([
            os.path.basename(os.path.dirname(csv)), occ,
            u_iso_to_b_fac(u_iso)
        ])

es_occ_df = pd.DataFrame(data=es_occ_b,
                         columns=['dataset', 'es_occupancy', 'es_b_fac'])

# Outer merge keeps datasets present in only one of the two sources.
occ_df = pd.merge(es_occ_df, refine_occ_df, on='dataset', how='outer')

occupancy_histogram_with_exhaustive_search(occ_df,
                                           protein_name="NUDT7A",
                                           compound="NUOOA000181a",
                                           params=params)