def get_reward_MO(predictor, smile):
    """
    Computes the two rewards (KOR affinity and synthetic accessibility)
    attributed to a generated molecule.

    Parameters
    ----------
    predictor: object of the predictive model; its `predict` method is called
               with a one-element list holding the SMILES string
    smile: SMILES string of the generated molecule

    Returns
    -------
    Tuple (reward_kor, reward_sas). `reward_kor` is exp(pred / 4 - 1) applied
    to the output of `predictor.predict`; `reward_sas` is exp(-SAS + 3) for
    the first score returned by `SAscore`.
    """
    # Synthetic accessibility: SAscore is fed a one-element list holding the
    # molecule object built from the SMILES string
    molecule = Chem.MolFromSmiles(smile)
    sa_score = SAscore([molecule])[0]
    reward_sas = np.exp(-sa_score + 3)

    # pIC50 for KOR: the predictor is fed the raw SMILES string
    prediction = predictor.predict([smile])
    reward_kor = np.exp(prediction / 4 - 1)

    return reward_kor, reward_sas
def get_reward(predictor, smile, property_identifier):
    """
    Computes the numerical reward of a generated molecule for one property.

    Parameters
    ----------
    predictor: object of the predictive model; its `predict` method is called
               with a one-element list holding the SMILES string
    smile: SMILES string of the generated molecule
    property_identifier: String identifying the property. 'sas' selects the
                         synthetic accessibility score; any other value
                         selects the predictor

    Returns
    -------
    For 'sas', the first score returned by `SAscore` (no transformation).
    Otherwise exp(pred / 4 - 1) applied to the predictor output.
    """
    if property_identifier == 'sas':
        # SAscore works on molecule objects, not on SMILES strings
        molecule = Chem.MolFromSmiles(smile)
        return SAscore([molecule])[0]

    prediction = predictor.predict([smile])
    return np.exp(prediction / 4 - 1)
def generate2file(predictor, generator, configReinforce, n2generate,
                  original_model):
    """
    Function that generates a specified number of SMILES strings and predicts
    its SA score and, when a predictor is given, the predictor's property.
    The SMILES and respective predictions are saved to a folder called
    "Generated".

    Parameters
    ----------
    predictor: Predictor model object, to predict the desired property. When
               it is None only the SA score is written
    generator: Generator model object, to generate new SMILES strings
    configReinforce: Configuration file
    n2generate: Integer specifying the number of SMILES to generate
    original_model: Truthy when the G_0 (original) generator was used, falsy
                    for the G_optimized (biased) one. Only affects the name
                    of the output file

    Returns
    -------
    This function doesn't return anything, but saves a comma-separated file
    with the generated SMILES and the respective predicted properties:
    "Generated/smiles_prop_<model>.smi" (SMILES, property, SA score) or
    "Generated/smiles_sas_<model>.smi" (SMILES, SA score).
    """
    model_type = 'original' if original_model else 'biased'

    generated = []
    pbar = tqdm(range(n2generate))
    for _ in pbar:
        pbar.set_description("Generating molecules...")
        predictSMILES = PredictSMILES(generator, None, False, 0,
                                      configReinforce)
        generated.append(predictSMILES.sample())

    sanitized, _ = canonical_smiles(generated,
                                    sanitize=True,
                                    throw_warning=False)

    # np.unique returns the sorted unique strings. The previous code dropped
    # the first one unconditionally, which discards a valid molecule whenever
    # no generation failed. Filter the placeholders instead.
    # NOTE(review): assumes canonical_smiles marks invalid molecules with a
    # string of length <= 1 (e.g. '') - confirm against its implementation.
    unique_smiles = [smi for smi in np.unique(sanitized) if len(smi) > 1]

    mol_list = smiles2mol(unique_smiles)
    prediction_sas = SAscore(mol_list)

    if predictor is not None:
        prediction_prop = predictor.predict(unique_smiles)
        with open("Generated/smiles_prop_" + model_type + ".smi", 'w') as f:
            for i, smi in enumerate(unique_smiles):
                data = str(smi) + "," + str(prediction_prop[i]) + "," + str(
                    prediction_sas[i])
                f.write("%s\n" % data)
    else:
        with open("Generated/smiles_sas_" + model_type + ".smi", 'w') as f:
            for i, smi in enumerate(unique_smiles):
                data = str(smi) + "," + str(prediction_sas[i])
                f.write("%s\n" % data)
def generate2file(predictor, generator, configReinforce, n2generate,
                  original_model):
    """
    Function that generates new SMILES strings and predicts some properties.
    The SMILES and predictions are saved to files.

    Parameters
    ----------
    predictor: Predictor model. When it is None only the SA score is written
    generator: Generator model
    configReinforce: Configuration file
    n2generate: Number of SMILES strings to generate
    original_model: Boolean that indicates if we use the Original or the
                    Biased Generator. Only affects the output file name

    Returns
    -------
    Nothing. Saves a comma-separated file with the newly generated SMILES:
    "Generated/generated_prop_<model>.smi" (SMILES, property, SA score) or
    "Generated/generated_sas<model>.smi" (SMILES, SA score).
    """
    model_type = 'original' if original_model else 'biased'

    generated = []
    pbar = tqdm(range(n2generate))
    for _ in pbar:
        pbar.set_description("Generating molecules...")
        predictSMILES = PredictSMILES(generator, None, False, 0,
                                      configReinforce)
        generated.append(predictSMILES.sample())

    sanitized, _ = canonical_smiles(generated,
                                    sanitize=True,
                                    throw_warning=False)

    # np.unique returns the sorted unique strings. Dropping the first one
    # unconditionally (as done before) loses a valid molecule when every
    # generated SMILES is valid, so the placeholders are filtered instead.
    # NOTE(review): assumes canonical_smiles marks invalid molecules with a
    # string of length <= 1 (e.g. '') - confirm against its implementation.
    unique_smiles = [smi for smi in np.unique(sanitized) if len(smi) > 1]

    mol_list = smiles2mol(unique_smiles)
    prediction_sas = SAscore(mol_list)

    if predictor is not None:
        prediction_prop = predictor.predict(unique_smiles)
        with open("Generated/generated_prop_" + model_type + ".smi",
                  'w') as f:
            for i, smi in enumerate(unique_smiles):
                data = str(smi) + "," + str(prediction_prop[i]) + "," + str(
                    prediction_sas[i])
                f.write("%s\n" % data)
    else:
        # NOTE(review): unlike "generated_prop_", this prefix has no trailing
        # underscore. Kept as is because readers of the file may depend on
        # the exact name - confirm before normalizing.
        with open("Generated/generated_sas" + model_type + ".smi", 'w') as f:
            for i, smi in enumerate(unique_smiles):
                data = str(smi) + "," + str(prediction_sas[i])
                f.write("%s\n" % data)
def properties_violin(filepaths, labels, pred_type):
    """
    Builds a long-format table with the property values of several SMILES
    files, one (set, property, value) triple per row.

    Parameters
    ----------
    filepaths: List with the paths of the comma-separated files. The first
               column of every row is the SMILES string
    labels: List with the name of the set stored in each file (same order as
            `filepaths`)
    pred_type: String. With 'pIC50' the second column is read as the property
               value and, for every file but the first, the third column is
               read as the SA score. With any other value the SA score is
               computed from the SMILES string

    Returns
    -------
    pandas DataFrame with the columns 'Sets', 'Property' and 'Value'. The QED
    is computed for every row. Molecules that cannot be processed are
    reported on stdout and contribute no QED (nor computed SA score) entry.
    """
    properties = []
    for i, fname in enumerate(filepaths):
        label = labels[i]
        # newline='' is the mode recommended by the csv module
        with open(fname, 'r', newline='') as f:
            for row in csv.reader(f):
                if not row:
                    # Blank line (e.g. trailing newline): nothing to parse
                    continue
                smiles = row[0]

                if pred_type == 'pIC50':
                    properties.append(
                        [label, 'IC50 for KOR', float(row[1])])
                    # NOTE(review): the first file is presumably the
                    # reference set without an SA score column - confirm
                    if i != 0:
                        properties.append(
                            [label, 'SA score', float(row[2])])
                    try:
                        mol = Chem.MolFromSmiles(smiles)
                        properties.append([label, 'QED', QED.qed(mol)])
                    except Exception:
                        print("Non-Canonical SMILES: " + smiles)
                else:
                    try:
                        mole = smiles2mol(smiles)
                        prediction_sas = SAscore(mole)
                        properties.append(
                            [label, 'SA score',
                             float(prediction_sas[0])])
                        mol = Chem.MolFromSmiles(smiles)
                        properties.append([label, 'QED', QED.qed(mol)])
                    except Exception:
                        print("Non-Canonical SMILES: " + smiles)

    return pd.DataFrame(properties, columns=['Sets', 'Property', 'Value'])
def get_reward_MO(predictor_kor, smile, uniq, memory_smiles):
    """
    This function takes the predictor model and the SMILES string to return
    the numerical rewards of every objective being optimized.

    Parameters
    ----------
    predictor_kor: object of the predictive model; its `predict` method is
                   called with a one-element list holding the SMILES string
                   and the first element of the result is used
    smile: SMILES string of the generated molecule
    uniq: currently unused (the uniqueness reward is disabled)
    memory_smiles: collection of previously generated SMILES. The diversity
                   against it is only evaluated when it holds more than 30
                   elements

    Returns
    -------
    List with five rewards, in this order:
    [KOR, QED, SA score, logP, diversity]
        KOR: exp(pred / 4 - 1)
        QED: exp(qed / 4)
        SA score: exp(-sas / 5 + 1)
        logP: 1 when -1 < logP < 3, otherwise 0
        diversity: 0.01 when the external diversity is below 0.75,
                   otherwise 1
    """
    rewards = []

    # pIC50 for KOR prediction
    pred = predictor_kor.predict([smile])
    reward_kor = np.exp(pred / 4 - 1)
    rewards.append(reward_kor[0])

    # QED property
    qed_values = qed_calculator(smiles2mol(smile))
    rewards.append(np.exp(qed_values[0] / 4))

    # The SA score and logP rewards share one parsed molecule
    mol = Chem.MolFromSmiles(smile)

    # SAScore property
    sas_list = SAscore([mol])
    rewards.append(np.exp(-sas_list[0] / 5 + 1))

    # logP property
    log_p = Descriptors.MolLogP(mol)
    rewards.append(1 if -1 < log_p < 3 else 0)

    # Diversity against the memory; below the size threshold the molecule is
    # considered diverse
    diversity = 1
    if len(memory_smiles) > 30:
        diversity = external_diversity(smile, memory_smiles)

    if diversity < 0.75:
        rew_div = 0.01
        # The backslash is escaped so the printed text stays the same while
        # avoiding the invalid "\A" escape sequence
        print("\\Alert: Similar compounds")
    else:
        rew_div = 1
    rewards.append(rew_div)

    return rewards
def property_checker(self, n_to_generate):
    """
    Function to generate molecules with the best fine-tuned generator and to
    save them together with their properties.

    Parameters:
    -----------
    n_to_generate: Integer that indicates the number of molecules to generate

    Returns
    -------
    Nothing. It saves the file `configReinforce.file_path_generated` + ".smi"
    with a short summary (number of unique molecules, percentage of valid
    molecules, internal diversity) followed by one line per unique molecule:
    SMILES, pIC50, Active, BBB, MW, logP, SAS, QED.
    """
    # The assignment of `sample` was commented out while the name was still
    # passed to PredictSMILES, which raises NameError on the first iteration.
    # NOTE(review): True is the value of the commented-out line - confirm.
    sample = True

    self.generator.model.load_weights(
        self.configReinforce.model_name_biased + "_" +
        self.scalarization_mode + "_" + self.best_model + ".h5")
    print("....................................")
    print("updated model load_weights is DONE!")

    generated = []
    pbar = tqdm(range(n_to_generate))
    for _ in pbar:
        pbar.set_description("Generating molecules...")
        predictSMILES = PredictSMILES(self.generator, None, False,
                                      self.threshold_greedy,
                                      self.configReinforce, sample)
        generated.append(predictSMILES.sample())

    sanitized, valid = canonical_smiles(generated,
                                        sanitize=True,
                                        throw_warning=False)

    # Keep only the strings longer than one character
    # (presumably the valid molecules - TODO confirm) and remove duplicates
    san_with_repeated = [smi for smi in sanitized if len(smi) > 1]
    unique_smiles = list(set(san_with_repeated))

    vld = (valid / n_to_generate) * 100

    prediction_a2d = self.predictor_a2d.predict(unique_smiles, "a2d")
    prediction_bbb = self.predictor_bbb.predict(unique_smiles, "bbb")

    # Compute the internal diversity
    div = diversity(unique_smiles)

    with open(self.configReinforce.file_path_generated + ".smi", 'w') as f:
        f.write("Number of molecules: %s\n" % str(len(unique_smiles)))
        f.write("Percentage of valid molecules: %s\n" % str(vld))
        f.write("Internal Tanimoto similarity: %s\n\n" % str(div))
        f.write("SMILES, pIC50, Active, BBB, MW, logP, SAS, QED\n")

        for i, smi in enumerate(unique_smiles):
            mol = Chem.MolFromSmiles(smi)
            prediction_sas = SAscore(smiles2mol(smi))

            # Binary flags derived from the predictions
            active = "1" if prediction_a2d[i] > 6.5 else "0"
            permeable = "1" if prediction_bbb[i] > 0.98 else "0"

            q = QED.qed(mol)
            mw, logP = Descriptors.MolWt(mol), Crippen.MolLogP(mol)

            fields = [
                str(smi),
                str(np.round(prediction_a2d[i], 2)),
                active,
                permeable,
                str(np.round(mw, 2)),
                str(np.round(logP, 2)),
                str(np.round(prediction_sas[0], 2)),
                str(np.round(q, 2)),
            ]
            f.write("%s\n" % " ,".join(fields))
def compare_models(self, n_to_generate, individual_plot):
    """
    Function to generate molecules with both models (original and
    fine-tuned) and to compare the distributions of the optimized property.

    Parameters:
    -----------
    n_to_generate: Integer that indicates the number of molecules to generate
    individual_plot: Boolean that indicates if we want to represent the
                     property distribution of the pre-trained model.

    Returns
    -------
    Tuple (dif, div, valid, percentage_unq_b, perc_desirable):
        dif, valid: values returned by `plot_hist_both`
        div: value returned by `diversity` for the unique molecules of the
             fine-tuned model
        percentage_unq_b: percentage of unique molecules among the kept
                          molecules of the fine-tuned model
        perc_desirable: number of predictions >= 6.5 divided by the number of
                        kept (repeated included) molecules

    Raises
    ------
    ValueError if `self.property_identifier` is not one of 'kor', 'a2d',
    'qed' or 'sas'.
    """

    def sample_molecules():
        # Samples n_to_generate SMILES with the weights currently loaded and
        # returns the output of canonical_smiles
        generated = []
        pbar = tqdm(range(n_to_generate))
        for _ in pbar:
            pbar.set_description("Generating molecules...")
            predictSMILES = PredictSMILES(self.generator, None, False,
                                          self.threshold_greedy,
                                          self.configReinforce)
            generated.append(predictSMILES.sample())
        return canonical_smiles(generated,
                                sanitize=False,
                                throw_warning=False)

    def predict_property(smiles_list):
        # Evaluates the property selected by self.property_identifier
        if self.property_identifier in ('kor', 'a2d'):
            return self.predictor.predict(smiles_list)
        if self.property_identifier == 'qed':
            return qed_calculator(smiles2mol(smiles_list))
        if self.property_identifier == 'sas':
            # SAscore works on molecule objects. The SMILES strings were
            # being passed while the converted list was left unused
            return SAscore(smiles2mol(smiles_list))
        raise ValueError("Unsupported property identifier: %s" %
                         self.property_identifier)

    # Original (unbiased) generator
    self.generator.model.load_weights(
        self.configReinforce.model_name_unbiased)
    print("\n --------- Original model LOADED! ---------")

    sanitized_unb, valid_unb = sample_molecules()

    # Drop the placeholders instead of blindly discarding the first sorted
    # string, which loses a valid molecule when no generation failed.
    # NOTE(review): assumes invalid molecules are marked with a string of
    # length <= 1, as in the biased branch below - confirm.
    unique_smiles_unb = [
        smi for smi in np.unique(sanitized_unb) if len(smi) > 1
    ]
    prediction_unb = predict_property(unique_smiles_unb)

    if individual_plot:
        plot_hist(prediction_unb, n_to_generate, valid_unb,
                  self.property_identifier)

    # Load Biased Generator Model
    self.generator.model.load_weights(
        self.configReinforce.model_name_biased + ".h5")
    print("\n --------- Updated model LOADED! ---------")

    sanitized_b, valid_b = sample_molecules()

    san_with_repeated_b = [smi for smi in sanitized_b if len(smi) > 1]
    # The unique molecules are taken from the filtered list so the
    # placeholder of the invalid molecules is neither predicted nor counted
    unique_smiles_b = list(set(san_with_repeated_b))

    percentage_unq_b = (len(unique_smiles_b) /
                        len(san_with_repeated_b)) * 100

    prediction_b = predict_property(unique_smiles_b)

    dif, valid = plot_hist_both(prediction_unb, prediction_b, n_to_generate,
                                valid_unb, valid_b,
                                self.property_identifier)

    div = diversity(unique_smiles_b)

    desirable = sum(1 for pred in prediction_b if pred >= 6.5)
    perc_desirable = desirable / len(san_with_repeated_b)

    return dif, div, valid, percentage_unq_b, perc_desirable
def test_generator(self, n_to_generate, iteration, original_model):
    """
    Function to generate molecules with the specified generator model.

    Parameters:
    -----------
    n_to_generate: Integer that indicates the number of molecules to generate
    iteration: Integer that indicates the current iteration. It will be used
               to build the filename of the generated molecules
    original_model: Boolean that specifies generator model. If it is 'True'
                    we load the original model, otherwise, we load the
                    fine-tuned model

    Returns
    -------
    Tuple (generated, prediction, percentage_valid, percentage_unq, div,
    perc_desirable):
        generated: list with every sampled SMILES string
        prediction: property values of the kept molecules
        percentage_valid: valid / n_to_generate * 100
        percentage_unq: percentage of unique molecules among the kept ones
        div: value returned by `diversity` for the unique molecules
        perc_desirable: fraction of predictions >= 6.5
    It also plots the property distribution and saves one file containing
    the kept SMILES strings and their predictions.

    Raises
    ------
    ValueError if `self.property_identifier` is not one of 'kor', 'a2d',
    'sas' or 'qed'.
    """
    # Fail before loading weights or sampling: otherwise `prediction` would
    # only be found undefined after the whole generation step
    if self.property_identifier not in ('kor', 'a2d', 'sas', 'qed'):
        raise ValueError("Unsupported property identifier: %s" %
                         self.property_identifier)

    if original_model:
        self.generator.model.load_weights(
            self.configReinforce.model_name_unbiased)
        print("....................................")
        print("original model load_weights is DONE!")
    else:
        self.generator.model.load_weights(
            self.configReinforce.model_name_biased + ".h5")
        print("....................................")
        print("updated model load_weights is DONE!")

    generated = []
    pbar = tqdm(range(n_to_generate))
    for _ in pbar:
        pbar.set_description("Generating molecules...")
        predictSMILES = PredictSMILES(self.generator, None, False,
                                      self.threshold_greedy,
                                      self.configReinforce)
        generated.append(predictSMILES.sample())

    sanitized, valid = canonical_smiles(generated,
                                        sanitize=True,
                                        throw_warning=False)

    # Keep only the strings longer than one character
    # (presumably the valid molecules - TODO confirm)
    san_with_repeated = [smi for smi in sanitized if len(smi) > 1]
    unique_smiles = list(set(san_with_repeated))

    percentage_unq = (len(unique_smiles) / len(san_with_repeated)) * 100
    percentage_valid = (valid / n_to_generate) * 100

    if self.property_identifier in ('kor', 'a2d'):
        prediction = self.predictor.predict(san_with_repeated)
    elif self.property_identifier == 'sas':
        prediction = SAscore(smiles2mol(san_with_repeated))
    else:  # 'qed', guaranteed by the check at the top
        prediction = qed_calculator(smiles2mol(san_with_repeated))

    plot_hist(prediction, n_to_generate, valid, self.property_identifier)

    output_path = (self.configReinforce.file_path_generated + '_' +
                   str(len(san_with_repeated)) + '_iter' + str(iteration) +
                   ".smi")
    with open(output_path, 'w') as f:
        for i, smi in enumerate(san_with_repeated):
            data = str(smi) + " ," + str(prediction[i])
            f.write("%s\n" % data)

    # Compute the internal diversity
    div = diversity(unique_smiles)

    desirable = sum(1 for pred in prediction if pred >= 6.5)
    perc_desirable = desirable / len(san_with_repeated)

    return (generated, prediction, percentage_valid, percentage_unq, div,
            perc_desirable)
def compare_models(self, n_to_generate, individual_plot):
    """
    Function to generate molecules with both models (original and
    fine-tuned) and to plot the distributions of the optimized property.

    Parameters:
    -----------
    n_to_generate: Integer that indicates the number of molecules to generate
    individual_plot: Boolean that indicates if we want to represent the
                     property distribution of the pre-trained model.

    Returns
    -------
    Nothing. The distributions originated by the original and fine-tuned
    models are drawn through `plot_hist_both` (and `plot_hist` when
    `individual_plot` is set).
    """

    def generate_and_predict():
        # Samples n_to_generate SMILES with the weights currently loaded and
        # returns (predictions of the unique molecules, valid counter)
        generated = []
        pbar = tqdm(range(n_to_generate))
        for _ in pbar:
            pbar.set_description("Generating molecules...")
            predictSMILES = PredictSMILES(self.generator, None, False,
                                          self.threshold_greedy,
                                          self.configReinforce)
            generated.append(predictSMILES.sample())

        sanitized, valid = canonical_smiles(generated,
                                            sanitize=False,
                                            throw_warning=False)

        # Drop the placeholders instead of blindly discarding the first
        # sorted string, which loses a valid molecule when nothing failed.
        # NOTE(review): assumes invalid molecules are marked with a string
        # of length <= 1 (e.g. '') - confirm against canonical_smiles.
        unique_smiles = [smi for smi in np.unique(sanitized) if len(smi) > 1]

        if self.property_identifier != 'sas':
            prediction = self.predictor.predict(unique_smiles)
        else:
            # NOTE(review): SAscore is fed molecule objects built with
            # smiles2mol at the other call sites of this file, so the SMILES
            # strings are converted here as well - confirm.
            prediction = SAscore(smiles2mol(unique_smiles))
        return prediction, valid

    # Original (unbiased) generator
    self.generator.model.load_weights(
        self.configReinforce.model_name_unbiased)
    print("\n --------- Original model LOADED! ---------")

    prediction_unb, valid_unb = generate_and_predict()

    if individual_plot:
        plot_hist(prediction_unb, n_to_generate, valid_unb,
                  self.property_identifier)

    # Load Biased Generator Model
    self.generator.model.load_weights(
        self.configReinforce.model_name_biased + ".h5")
    print("\n --------- Updated model LOADED! ---------")

    prediction_b, valid_b = generate_and_predict()

    plot_hist_both(prediction_unb, prediction_b, n_to_generate, valid_unb,
                   valid_b, self.property_identifier)