def split(self,
          dataset,
          seed=None,
          frac_train=.8,
          frac_valid=.1,
          frac_test=.1,
          log_every_n=None):
    """Split dataset indices into train/valid/test using MaxMin diversity picking.

    The test set is picked first with ``MaxMinPicker.LazyPick`` on Dice
    distances between 1024-bit radius-2 Morgan fingerprints built from
    ``dataset.ids`` (treated as SMILES). The validation set is whatever a
    second pick adds on top of the test picks; everything else is training.

    Args:
        dataset: object supporting ``len()`` and exposing ``ids``.
        seed: seed for NumPy and the picker; drawn at random when ``None``.
        frac_train: fraction of datapoints for training.
        frac_valid: fraction of datapoints for validation.
        frac_test: fraction of datapoints for testing.
        log_every_n: accepted for interface compatibility; not used here.

    Returns:
        Three sorted lists of integer indices: (train, valid, test).
    """
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    if seed is None:
        seed = random.randint(0, 2**30)
    np.random.seed(seed)

    # Sizes: validation and test are derived from the two integer cutoffs.
    n_total = len(dataset)
    first_cut = int(frac_train * n_total)
    second_cut = int((frac_train + frac_valid) * n_total)
    n_valid = second_cut - first_cut
    n_test = n_total - second_cut

    molecules = [Chem.MolFromSmiles(smiles) for smiles in dataset.ids]
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024)
        for mol in molecules
    ]

    def distance(i, j):
        return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

    picker = MaxMinPicker()
    test_picks = picker.LazyPick(distFunc=distance,
                                 poolSize=n_total,
                                 pickSize=n_test,
                                 seed=seed)
    # Second pass is seeded with the test picks, so the extra picks form
    # the validation set.
    valid_test_picks = picker.LazyPick(distFunc=distance,
                                       poolSize=n_total,
                                       pickSize=n_valid + n_test,
                                       firstPicks=test_picks,
                                       seed=seed)

    everything = set(range(n_total))
    test_ids = set(test_picks)
    valid_ids = set(valid_test_picks) - test_ids
    train_ids = everything - test_ids - valid_ids

    # The three partitions must be pairwise disjoint and cover every index.
    assert not (test_ids & valid_ids)
    assert not (test_ids & train_ids)
    assert not (valid_ids & train_ids)
    assert (valid_ids | train_ids | test_ids) == everything

    return sorted(train_ids), sorted(valid_ids), sorted(test_ids)
def diverse_mols_indexes(mol_list, n_pick, radius=4, seed=42):
    """Return indexes of ``n_pick`` diverse molecules from ``mol_list``.

    Molecules are compared through the Dice distance (1 - Dice similarity)
    of their Morgan fingerprints of the given ``radius``; the selection is
    made by ``MaxMinPicker.LazyPick`` with the given ``seed``.
    """
    fingerprints = [GetMorganFingerprint(m, radius) for m in mol_list]

    def dice_distance(a, b):
        similarity = DataStructs.DiceSimilarity(fingerprints[a],
                                                fingerprints[b])
        return 1 - similarity

    return MaxMinPicker().LazyPick(dice_distance,
                                   len(fingerprints),
                                   n_pick,
                                   seed=seed)
def pick_diverse(
    mols: List[Chem.rdchem.Mol],
    npick: int,
    initial_picks: List[int] = None,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of diverse molecules based on their fingerprints.

    Args:
        mols: a list of molecules.
        npick: number of elements to pick from ``mols``, including the
            preselection.
        initial_picks: indexes of molecules that must be part of the picked
            set. Defaults to None (no preselection).
        feature_fn: function taking a ``Chem.rdchem.Mol`` and returning its
            features. Defaults to ``dm.to_fp`` with ``as_array=False``.
        dist_fn: function taking two indexes ``(i, j)`` and returning the
            distance between the corresponding molecules. Defaults to
            ``1 - TanimotoSimilarity`` computed on the features.
        seed: seed for reproducibility.
        n_jobs: number of jobs used to compute the features. Leave at 1 for
            no parallelization; None uses all available cores.

    Returns:
        picked_inds: array of indexes of the picked molecules.
        mols: the picked molecules.
    """
    featurizer = feature_fn
    if featurizer is None:
        featurizer = functools.partial(dm.to_fp, as_array=False)

    # Features are always computed, even when a custom dist_fn is supplied.
    features = dm.parallelized(featurizer, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    distance = distij if dist_fn is None else dist_fn
    preselected = initial_picks if initial_picks is not None else []

    selection = MaxMinPicker().LazyPick(
        distance,
        len(mols),
        npick,
        firstPicks=preselected,
        seed=seed,
    )
    picked_inds = np.array(selection)
    picked_mols = [mols[idx] for idx in picked_inds]
    return picked_inds, picked_mols
def query(self, n):
    """Select ``n`` unlabeled pool indices by MaxMin picking on embeddings.

    The unlabeled indices are optionally subsampled without replacement to
    ``self.args.data_pool`` entries, embedded via ``self.get_embedding``,
    and ``n`` of them are chosen with ``MaxMinPicker.LazyPick``. The
    distance between two embeddings is the sum over components of
    ``sqrt(square(difference))``.

    Returns:
        The entries of the unlabeled index array at the picked positions.
    """
    unlabeled = np.arange(self.n_pool)[~self.idxs_lb]

    pool_cap = self.args.data_pool
    if pool_cap is not None:
        unlabeled = np.random.choice(unlabeled, pool_cap, replace=False)

    embedding = self.get_embedding(MoleculeDataset(self.data[unlabeled]))

    def distij(i, j, data=embedding):
        delta = np.array(data[i]) - np.array(data[j])
        return sum(np.sqrt(np.square(delta)))

    chosen = MaxMinPicker().LazyPick(distij, embedding.shape[0], n)
    return unlabeled[chosen]
def SelectMoleculesUsingMaxMin(Mols, MolsFingerprints):
    """Select diverse molecules using MaxMin methodology.

    The number of molecules to pick and the similarity function are read
    from OptionsInfo ("NumMols" and "SimilarityFunction"); the distance
    between two molecules is one minus their fingerprint similarity.
    Returns the list of picked molecules taken from Mols.
    """
    MiscUtil.PrintInfo(
        "\nSelecting diverse molecules using MaxMin methodology and %s similarity metric..."
        % OptionsInfo["SimilarityMetric"])

    PoolSize = len(MolsFingerprints)
    PickSize = OptionsInfo["NumMols"]
    SimilarityFunction = OptionsInfo["SimilarityFunction"]

    def PairwiseDistance(i, j):
        return 1 - SimilarityFunction(MolsFingerprints[i],
                                      MolsFingerprints[j])

    PickedIndices = MaxMinPicker().LazyPick(PairwiseDistance, PoolSize,
                                            PickSize)

    return [Mols[Index] for Index in PickedIndices]
def pick_subset(mols, num=5, radius=3, seed=-1):
    """
    Pick a disparate subset of molecules using Morgan Fingerprints.

    https://towardsdatascience.com/a-practical-introduction-to-the-use-of-molecular-fingerprints-in-drug-discovery-7f15021be2b1

    :param mols: an iterable of molecules
    :param num: number of molecules to pick
    :param radius: radius passed to GetMorganFingerprint
    :param seed: seed forwarded to MaxMinPicker.LazyPick
    :return: list of integer locations of the subset of molecules
    """
    fingerprints = [GetMorganFingerprint(m, radius) for m in mols]

    def dice_distance(a, b, fps=fingerprints):
        # Distance is one minus the Dice similarity of the two fingerprints.
        return 1 - DataStructs.DiceSimilarity(fps[a], fps[b])

    picker = MaxMinPicker()
    selection = picker.LazyPick(dice_distance,
                                len(fingerprints),
                                num,
                                seed=seed)
    return list(selection)
def diversity_pick(self, n, firstpicks=[]):
    """
    Pick a diverse subset of the molecule set using the RDKit MaxMinPicker.

    Optionally, a list (or set) of names of already chosen molecules can be
    given; after cleaning with clean_name they are handed to the picker as
    its first picks. Returns the names of the molecules the picker selects.
    """
    assert type(firstpicks) in [
        list, set
    ], 'Error, the input firstpicks must be a list or set'
    assert all(
        type(name) is str for name in firstpicks
    ), 'Error, each item in the input list firstpicks must be a string'
    assert type(n) is int, 'Error, the input n must be an integer'

    firstpicks = [clean_name(name) for name in firstpicks]
    known_names = self.input_names
    assert all(
        name in known_names for name in firstpicks
    ), 'Error, not all firstpicks are part of the molecule list'
    assert n < len(known_names) - len(
        firstpicks
    ), 'Error, you have specified an n that is greater or equal to the available molecule number'

    # positions of the already picked molecules in the name list
    seed_indices = [known_names.index(name) for name in firstpicks]

    # pairwise distances: each row holds the distances from fingerprint
    # `row` to all fingerprints before it, appended row after row
    score = self.metrics[self.metric]
    fingerprints = self.fingerprint_data
    pool_size = len(fingerprints)
    distances = []
    for row in range(1, pool_size):
        distances.extend(
            score(fingerprints[row], fingerprints[:row], returnDistance=True))

    # make the selection (the picker returns indices)
    picked = MaxMinPicker().Pick(np.array(distances), pool_size, n,
                                 seed_indices)
    return [known_names[idx] for idx in picked]
#This will hold a series of dicts of metrics that we then build into a dataframe metric_dict_list = [] #using longest most imformantive embeddings classifier_dict = { 'SVM': train_SVM, 'RF': train_RF, 'LGBM': train_LGBM, 'DNN': train_DNN, 'GCNN_pytorch': train_PyTorchGCNN } #model_list = ['GCNN_pytorch','SVM','RF','LGBM','DNN'] model_list = ['RF'] num_models = len(model_list) mmp = MaxMinPicker() #define how we select after inital training run selection_type = 'Diverse' #define size of iter after first 10% train relative to that trainsize iterRel2Start = 0.5 end_iter = 1 + (4 / iterRel2Start) for AID in AID_list: for model_type in ['SVM']: if 'win' in sys.platform: AID_path = os.path.join(r'C:\Users\gdrei\Dropbox\UCL\Thesis\Data', AID) else: AID_path = os.path.join('/home/gabriel/Dropbox/UCL/Thesis/Data/', AID) save_path = AID_path + '/' + AID + 'graph_processed.pkl' pickle_off = open(save_path, 'rb')
classifier_dict = { 'SVM': train_SVM, 'RF': train_RF, 'LGBM': train_LGBM, 'DNN': train_DNN, 'GCNN_pytorch': train_PyTorchGCNN, 'random': train_random_classifier } model_list = ['GCNN_pytorch', 'SVM', 'RF', 'LGBM', 'DNN', 'random'] #model_list = ['RF'] #model_list = ['GCNN_pytorch'] #model_list = ['SVM'] num_models = len(model_list) mmp = MaxMinPicker() def getRandomIterInds(firstPicksList, fplist, bottom_to_select): full_list_index = np.arange(len(fplist)) unselected_inds = list(set(full_list_index) - set(firstPicksList)) random_selection = np.random.choice(unselected_inds, bottom_to_select, replace=False) start_indexs = np.concatenate((firstPicksList, random_selection), axis=0) return start_indexs def getNextIterInds(firstPicksList, fplist, bottom_to_select): diverse_picks = mmp.LazyBitVectorPick( fplist, len(fplist),
print(novel / len(smiles_list)) ## Diversity sampling from rdkit import Chem from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint from rdkit import DataStructs from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker ms = [Chem.MolFromSmiles(s) for s in smiles_list] start = time() fps = [GetMorganFingerprint(x, 3) for x in ms] nfps = len(fps) end = time() print(f'Time for {nfps} fingerprints: ', end - start) def distij(i, j, fps=fps): return 1 - DataStructs.DiceSimilarity(fps[i], fps[j]) picker = MaxMinPicker() start = time() pickIndices = picker.LazyPick(distij, nfps, 1000, seed=23) end = time() idces = list(pickIndices) print('Time for picker: ', end - start) m_selected = [ms[i] for i in idces] img = Draw.MolsToGridImage(m_selected) img img.save('diverse.png')
def pick_centroids(
    mols: List[Chem.rdchem.Mol],
    npick: int = 0,
    initial_picks: List[int] = None,
    threshold: float = 0.5,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    method: str = "sphere",
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of `npick` centroids from a list of molecules.

    Args:
        mols: a list of molecules.
        npick: Number of elements to pick from mols, including the
            preselection.
        initial_picks: Indexes of molecules that should be in the set of
            picked molecules. Defaults to None. Ignored (with a warning) by
            the hierarchical clustering methods.
        threshold: Minimum distance between centroids for the `maxmin` and
            sphere exclusion (`sphere`) methods.
        feature_fn (callable, optional): A feature function that takes a
            Chem.rdchem.Mol object and returns molecular features. By
            default, `dm.to_fp()` with `as_array=False` is used.
        dist_fn: A function that takes two indexes (i, j) and returns the
            distance between them. You might use partial to set the
            fingerprints as input. By default, `1 - TanimotoSimilarity` of
            the features is used. It is honoured by every picking method.
        seed: seed for reproducibility (only forwarded to the `maxmin`
            picker).
        method: Picking method to use. One of `sphere`, `maxmin` or any
            supported rdkit hierarchical clustering method such as
            `centroid`, `clink`, `upgma`.
        n_jobs: Number of jobs used to compute the features. Leave at 1 for
            no parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: indexes of the molecules that have been selected as
            centroids
        mols: molecules that have been picked

    Raises:
        ValueError: if `method` is unknown, or if a hierarchical clustering
            method is requested with a falsy `npick`.
    """
    n_mols = len(mols)

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    initial_picks = [] if initial_picks is None else initial_picks

    if method == "maxmin":
        picker = MaxMinPicker()
        picked_inds, _ = picker.LazyPickWithThreshold(
            dist_fn,
            n_mols,
            pickSize=npick,
            threshold=threshold,
            firstPicks=initial_picks,
            seed=seed,
        )
    elif method == "sphere":
        picker = LeaderPicker()
        picked_inds = picker.LazyPick(
            dist_fn,
            n_mols,
            threshold=threshold,
            pickSize=npick,
            firstPicks=initial_picks,
        )
    elif method.upper() in ClusterMethod.names and npick:
        if initial_picks:
            logger.warning(
                "Initial picks is not supported by hierarchical clustering. You pick has been discarded."
            )

        # Lower-triangle (i > j) index pairs, row by row, evaluated with the
        # same distance function as the other methods. Previously the
        # default `distij` was always used here, so a caller-supplied
        # `dist_fn` was silently ignored for hierarchical clustering.
        index_pairs = list(zip(*np.tril_indices(n_mols, k=-1)))
        dist_mat = dm.parallelized(dist_fn, index_pairs, arg_type="args")
        dist_mat = np.asarray(dist_mat)

        picker = HierarchicalClusterPicker(ClusterMethod.names[method.upper()])
        picked_inds = picker.Pick(dist_mat, n_mols, npick)
    else:
        raise ValueError(
            f"Picking method {method} with {npick} elements to pick is not supported."
        )

    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]
    return picked_inds, picked_mols
reverse=True) #print top ten matches [ print(Chem.MolToSmiles(m), m.GetProp('GENERIC_NAME'), sim) for m, sim in sorted_m__qs[:10] ] ### simple diversity picking # define our way of getting distances between fps under given indices def fp_distance(i, j, fingerprints=[fp for m, fp in molecule__fp]): return 1 - DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j]) # instantiate the RDKit diversity picker picker = MaxMinPicker() # run the picker with the fp_distance function, get 10 diverse pick indices pickIndices = picker.LazyPick(fp_distance, len(molecule__fp), 10, seed=42) # grab the Molecule instances that have those picked indices picks = [molecule__fp[i][0] for i in pickIndices] [print(Chem.MolToSmiles(m), m.GetProp('GENERIC_NAME')) for m in picks] ## now inverse it for similarity picking # redefine the function def fp_distance(i, j, fingerprints=[fp for m, fp in molecule__fp]): return DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j]) # run the picker again & print the picked molecules. # They should be noticeably more similar than those in previous set
def execute(self):
    """Run chunked MaxMin diversity picking over the input SMILES file.

    For each of ``ZincPicker.num_iterations`` iterations, a window of
    ``ZincPicker.pool_size`` consecutive entries is read from the SMILES
    supplier (entries the supplier returns as ``None`` are skipped), Morgan
    fingerprints of radius 3 are computed, ``ZincPicker.pick_size``
    molecules are picked with ``MaxMinPicker.LazyPick`` (seed 23, distance
    given by ``self.calculate_dice_similarity_distance``) and the picks are
    written to ``ZincPicker.output_file_path``.

    The output writer is always closed, also when an iteration fails, so
    that everything written so far reaches the file.
    """
    print()
    print("Loading input file with path: " + ZincPicker.input_file_path)
    zinc_for_sale_mol_supplier = Chem.SmilesMolSupplier(
        ZincPicker.input_file_path)
    # Number of molecules actually written, summed over all iterations.
    num_picked = 0
    print("Output file path: " + ZincPicker.output_file_path)
    writer = Chem.SmilesWriter(ZincPicker.output_file_path)
    lower_index = 0
    upper_index = ZincPicker.pool_size
    print("Entering picking iterations...")
    print()
    try:
        for y in range(ZincPicker.num_iterations):
            print("Number of iteration: ", y)
            print("Lower index: ", lower_index)
            print("Upper index: ", upper_index)
            print("Loading molecules now...")
            # Entries the supplier could not parse come back as None and
            # are left out of the pool.
            molecules = [
                mol
                for mol in (zinc_for_sale_mol_supplier[x]
                            for x in range(lower_index, upper_index))
                if mol is not None
            ]
            print("Number of molecules loaded: ", len(molecules))
            print("Calculating fingerprints...")
            # radius 3; stored on self because the distance callback reads
            # the fingerprints from there
            self.fingerprint_list = [
                GetMorganFingerprint(x, 3) for x in molecules
            ]
            nfps = len(self.fingerprint_list)
            print("Number of fingerprints: ", nfps)
            print("Now min-max picking ", ZincPicker.pick_size,
                  " out of the finger print list...")
            picker = MaxMinPicker()
            pickIndices = picker.LazyPick(
                self.calculate_dice_similarity_distance,
                nfps,
                ZincPicker.pick_size,
                seed=23)
            print("Finished picking, writing to file...")
            for z in pickIndices:
                writer.write(molecules[z])
                num_picked += 1
            # clear memory before loading the next chunk
            molecules = []
            self.fingerprint_list = []
            # raise indices
            lower_index = lower_index + ZincPicker.pool_size
            upper_index = upper_index + ZincPicker.pool_size
            print("Finished this iteration, entering the next...")
            print()
    finally:
        # Release the file handle and flush pending output.
        writer.close()
    print("Execution successful.")
    # Report the real number of written molecules: unparsable entries
    # shrink the pool, not the number of picks, so they must not be
    # subtracted from the pick count.
    print("Picked ", num_picked, " out of ",
          ZincPicker.num_iterations * ZincPicker.pool_size,
          " molecules in ", ZincPicker.num_iterations,
          " iterations, while picking ", ZincPicker.pick_size,
          " in each iteration.")
# main(repitition_number): iterative screening experiment logged to comet_ml.
#
# What the visible code does, in order:
#   * creates a comet_ml Experiment and performs its imports inside the function;
#   * for every assay id in AID_list, unpickles '<AID>graph_processed.pkl' into
#     activity_table and takes the 'bit_MFP' column as the fingerprint list;
#   * picks a diverse starting set of 10% of the library with
#     MaxMinPicker.LazyBitVectorPick;
#   * runs up to end_iter (10) iterations; selection_type is 'Diverse' for
#     iter_num < 5 and 'Random' afterwards;
#   * per model in model_list (the second assignment wins, so only 'RF'):
#     scales the data, optionally oversamples with RandomOverSampler, trains,
#     records metrics through calc_and_save_metrics, then adds the top 4% of
#     predictions and hands a 1% exploration budget to getNextIterInds (MaxMin)
#     or getRandomIterInds (np.random.choice), run through joblib Parallel;
#   * pickles the metrics DataFrame to multi_dump_path, whose file name
#     contains repitition_number, and ends the experiment.
#
# NOTE(review): the source text of this function has lost its line breaks and
# indentation. It is left byte-identical below because the block nesting
# (loop bodies, if/else pairing) cannot be reconstructed with confidence.
# NOTE(review): a comet_ml API key is hard-coded in the Experiment(...) call;
# it should be treated as leaked, rotated, and read from the environment.
# NOTE(review): `range(num_models*)` is a syntax error, and `rep_number` and
# `start_index_meta_dict` are not defined anywhere in the visible code (the
# parameter is spelled `repitition_number`) - confirm the intended names.
# NOTE(review): `tf.logging` is presumably the TensorFlow 1.x logging API -
# confirm the TensorFlow version this is meant to run on.
def main(repitition_number): '''import''' from comet_ml import Experiment exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="iter_baseline", workspace="gdreiman1", disabled = False ) exp.log_code = True exp.log_other('Hypothesis','''Taking 5% batches, 80% of batch is the top ranked compounds, remaining 20% is diverse selection for first 5 iterations, then reverts to random sampling''') import pickle, sys, os from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker import numpy as np from Iterative_help_funcs import get_Scaled_Data,train_SVM,train_DNN,train_RF,train_LGBM,calc_and_save_metrics,train_PyTorchGCNN from imblearn.over_sampling import RandomOverSampler #choosing a 3:1 Inactive to Active ratio ros = RandomOverSampler(sampling_strategy= 0.33) import pandas as pd from joblib import Parallel, delayed from joblib.externals.loky import set_loky_pickler from joblib import parallel_backend import tensorflow as tf tf.logging.set_verbosity(tf.logging.ERROR) #%% '''Load data''' AID_list =['AID_1345083','AID_624255','AID_449739','AID_995','AID_938','AID_628','AID_596','AID_893','AID_894'] #AID_list =['AID_893','AID_894'] #This will hold a series of dicts of metrics that we then build into a dataframe metric_dict_list = [] multi_dump_path = os.path.join('/home/gabriel/Dropbox/UCL/Thesis/Data/', 'ranked_diverse_run'+str(repitition_number)+'.pkl') exp.log_other('Metrics Dict Path',multi_dump_path) #using longest most imformantive embeddings classifier_dict = {'SVM': train_SVM, 'RF': train_RF, 'LGBM':train_LGBM,'DNN':train_DNN,'GCNN_pytorch':train_PyTorchGCNN} model_list = ['GCNN_pytorch','SVM','RF','LGBM','DNN'] model_list = ['RF'] num_models = len(model_list) mmp = MaxMinPicker() #define how we select after inital training run selection_type = 'Diverse' #define size of iter after first 10% train relative to that trainsize iterRel2Start = 0.5 end_iter = 10 for AID in AID_list: if 'win' in sys.platform: AID_path =
os.path.join(r'C:\Users\gdrei\Dropbox\UCL\Thesis\Data', AID) else: AID_path = os.path.join('/home/gabriel/Dropbox/UCL/Thesis/Data/', AID) save_path = AID_path+ '/' + AID +'graph_processed.pkl' pickle_off = open(save_path,'rb') activity_table=pickle.load(pickle_off) pickle_off.close() '''Pick diverse starting point of 10% of library''' fplist = [x for x in activity_table['bit_MFP']] '''start_indexs holds the indexes of molecules already scanned at the start of each iteration. So for the first iter it hold the diversity selection. For the second, it holds both the diversity selection and the molecules screened based on the results of the first training iteration etc''' start_index_meta_list = [] for i in range(rep_number): start_indexs = np.array(mmp.LazyBitVectorPick(fplist,len(fplist),int(len(fplist)/10))) '''store in a list that will vary as each model makes its predictions''' start_ind_list=[start_indexs for i in range(num_models)] index_name = "starting_sample_"+str(i) start_index_meta_dict[index_name] = start_ind_list diverse_size_list = [0 for i in range(num_models*)] fp_metalist = [fplist for i in range(num_models)] library_size = len(fplist) iter_num = 0 while iter_num < end_iter: print("Beginning Iteration ",iter_num) if iter_num < 5: selection_type = 'Diverse' else: selection_type = 'Random' '''run thru models and get their preds for this iter''' for list_idx,[model_type,start_indexs] in enumerate(zip(model_list,start_ind_list)): '''Get data for the starting molecules, it will be graphs for GCNN, else the MFP_MolChars''' test_index = list(set(activity_table.index)-set(start_indexs)) #check that we haven't exceeded 50% of library if len(test_index) > int(0.5 *library_size): if model_type == 'GCNN_pytorch': embedding_type = 'Graph' X_train,X_test,y_train,y_test = get_Scaled_Data(start_indexs,test_index,activity_table,True,embedding_type) else: embedding_type = 'MFPMolChars' X_train,X_test,y_train,y_test =
get_Scaled_Data(start_indexs,test_index,activity_table,True,embedding_type) #oversample to 2:1 Inactive to Active '''what happens when ratio is better than ros??? Bad boy errors!! Now need another check to get out of our pkl with over enrichment (hahaha)''' if (len(y_train)/sum(y_train))<0.25: #need to ros iff ratio is less than 3:1 if model_type == 'GCNN_pytorch': '''ros doesn't like the data apparently''' over_X_train,over_y_train = ros.fit_resample(np.arange(len(X_train)).reshape((-1,1)),y_train) over_X_train = over_X_train.reshape(-1) over_X_train = [X_train[i] for i in over_X_train] else: over_X_train,over_y_train = ros.fit_resample(X_train,y_train) else: #just use current enriched sample over_X_train,over_y_train = X_train,y_train '''Inital train run''' train_and_predict_model = classifier_dict[model_type] #have this split here so that I can deal w fact that DNNs #are now returning the history if model_type =='DNN' or model_type=='GCNN_pytorch': train_predicted_probs,test_predicted_probs,base_test_predicted_probs,hist = train_and_predict_model(over_X_train,X_test,over_y_train,y_test,X_train) else: train_predicted_probs,test_predicted_probs,base_test_predicted_probs = train_and_predict_model(over_X_train,X_test,over_y_train,y_test,X_train) hist = None metric_dict_list = calc_and_save_metrics(y_test,test_predicted_probs,model_type, embedding_type,AID,metric_dict_list,iter_num,'test',hist) metric_dict_list = calc_and_save_metrics(over_y_train,train_predicted_probs,model_type, embedding_type,AID,metric_dict_list,iter_num,'train') metric_dict_list = calc_and_save_metrics(y_train,base_test_predicted_probs,model_type, embedding_type,AID,metric_dict_list,iter_num,'base_train') '''Now select next 5% section''' '''Put labels and preds in df, sort them. 
take top 80% of tier size of the top predictions then do a diverse selection or random selection for remaining 20% more''' preds_df = pd.DataFrame({'activity_table_index':np.array(test_index),'prob_active':np.array(test_predicted_probs)},columns= np.array(['activity_table_index','prob_active'])) preds_df.sort_values('prob_active',ascending=False,inplace=True,axis=0) next_inds=[] top_to_select = int(len(activity_table)*0.04) explore_select = int(len(activity_table)*0.01) next_inds=next_inds+preds_df.head(top_to_select)['activity_table_index'].tolist() firstPicksList = next_inds+(start_indexs.tolist()) start_ind_list[list_idx] = firstPicksList diverse_size_list[list_idx] = explore_select def getRandomIterInds(firstPicksList,fplist,bottom_to_select): full_list_index = np.arange(len(fplist)) unselected_inds = list(set(full_list_index) - set(firstPicksList)) random_selection = np.random.choice(unselected_inds,bottom_to_select,replace=False) start_indexs = np.concatenate((firstPicksList,random_selection),axis=0) return start_indexs def getNextIterInds(firstPicksList,fplist,bottom_to_select): diverse_picks = mmp.LazyBitVectorPick(fplist,len(fplist),len(firstPicksList)+bottom_to_select,firstPicksList) start_indexs = np.array(diverse_picks) return start_indexs with parallel_backend('multiprocessing'): if selection_type == 'Diverse': start_ind_list = Parallel(n_jobs=5)(delayed(getNextIterInds)(firstPicksList=i, fplist=j,bottom_to_select=k) for i,j,k in zip(start_ind_list,fp_metalist,diverse_size_list)) elif selection_type == 'Random': start_ind_list = Parallel(n_jobs=5)(delayed(getRandomIterInds)(firstPicksList=i, fplist=j,bottom_to_select=k) for i,j,k in zip(start_ind_list,fp_metalist,diverse_size_list)) metrics_df = pd.DataFrame(metric_dict_list) metrics_df.to_pickle(multi_dump_path) iter_num +=1 exp.end()
def main(prior_name, name, max_samples, diversity_picker, oracle, w_min):
    """Sample from the search model and dump the samples for docker and trainer.

    Args:
        prior_name: name handed to ``model_from_json`` to build both the
            prior model and the search model.
        name: experiment name; weights are read from, and dumps written to,
            ``<script_dir>/results/<name>``.
        max_samples: maximum number of samples requested from
            ``get_samples``.
        diversity_picker: if strictly between 0 and the number of available
            samples (itself capped by ``max_samples``), the samples are
            reduced to this many with the RDKit ``MaxMinPicker`` on Dice
            distances between radius-3 Morgan fingerprints.
        oracle: name of the oracle. It currently has no effect because the
            memoization branch is disabled (see below).
        w_min: forwarded to ``get_samples``.

    Side effects:
        Writes ``docker_samples.p`` (the samples) and ``samples.p`` (the
        ``(samples, weights)`` tuple) into the results directory.
    """
    prior_model = model_from_json(prior_name)

    # We start by creating another prior instance, then replace it with the
    # actual weights
    search_model = model_from_json(prior_name)
    results_dir = os.path.join(script_dir, 'results', name)
    model_weights_path = os.path.join(results_dir, 'weights.pth')
    search_model.load(model_weights_path)

    samples, weights = get_samples(prior_model, search_model, max=max_samples, w_min=w_min)

    # Subsample with the rdkit picker only when fewer picks than available
    # samples are requested. The pool size must be the real number of
    # samples: get_samples may return fewer than max_samples, and passing
    # max_samples as the pool size then makes the picker index past the
    # end of the fingerprint list.
    n_samples = len(samples)
    if 0 < diversity_picker < min(max_samples, n_samples):
        # NOTE(review): assumes every sample is a SMILES string that RDKit
        # can parse - confirm against get_samples.
        mols = [Chem.MolFromSmiles(s) for s in samples]
        fps = [GetMorganFingerprint(x, 3) for x in mols]
        picker = MaxMinPicker()

        def distij(i, j, fps=fps):
            return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

        pickIndices = picker.LazyPick(distij, n_samples, diversity_picker)
        idces = list(pickIndices)
        samples = [samples[i] for i in idces]
        weights = [weights[i] for i in idces]

    docker_dump_path = os.path.join(results_dir, 'docker_samples.p')
    trainer_dump_path = os.path.join(results_dir, 'samples.p')

    # Since we don't maintain a dict for qed, we just give everything to the
    # docker. The `or True` keeps the memoization branch below switched off.
    if oracle != 'docking' or True:
        with open(docker_dump_path, 'wb') as dump_file:
            pickle.dump(samples, dump_file)

        # Dump for the trainer
        with open(trainer_dump_path, 'wb') as dump_file:
            pickle.dump((samples, weights), dump_file)
    else:
        # Memoization, we split the list into already docked ones and dump a
        # simili-docking csv
        whole_path = os.path.join(script_dir, '..', 'data', 'drd3_scores.pickle')
        # NOTE: pickle.load must only ever be used on trusted, local files.
        with open(whole_path, 'rb') as scores_file:
            docking_whole_results = pickle.load(scores_file)

        filtered_smiles = list()
        already_smiles = list()
        already_scores = list()
        for smile in samples:
            if smile in docking_whole_results:
                already_smiles.append(smile)
                already_scores.append(docking_whole_results[smile])
            else:
                filtered_smiles.append(smile)

        # Dump simili-docking
        dump_path = os.path.join(results_dir, 'docking_small_results', 'simili.csv')
        df = pd.DataFrame.from_dict({
            'smile': already_smiles,
            'score': already_scores
        })
        df.to_csv(dump_path)

        # Dump for the docker
        with open(docker_dump_path, 'wb') as dump_file:
            pickle.dump(filtered_smiles, dump_file)

        # Dump for the trainer
        with open(trainer_dump_path, 'wb') as dump_file:
            pickle.dump((samples, weights), dump_file)