def tanimoto_sml(queryism, targetism):
    """Return the Tanimoto similarity between two molecules given as SMILES
    strings, or None if either SMILES fails to parse."""
    # cast to str to avoid Unicode issues
    querymol = MolFromSmiles(str(queryism))
    targetmol = MolFromSmiles(str(targetism))
    if querymol and targetmol:
        queryfp = RDKFingerprint(querymol)
        targetfp = RDKFingerprint(targetmol)
        return TanimotoSimilarity(queryfp, targetfp)
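
# A minimal usage sketch for tanimoto_sml, assuming the module-level imports
# (MolFromSmiles and RDKFingerprint from rdkit.Chem, TanimotoSimilarity from
# rdkit.DataStructs) are in scope. The SMILES are arbitrary examples.
def _demo_tanimoto_sml():
    sim = tanimoto_sml("CCO", "CCN")  # ethanol vs. ethylamine
    print("Tanimoto similarity:", sim)  # float in [0, 1]; None if a SMILES fails to parse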
def similarity_search(fps_db, smile):
    """Return the mean Tanimoto similarity of `smile` against a fingerprint database."""
    fps_test = RDKFingerprint(MolFromSmiles(smile))
    ts = []
    for s_top in fps_db:
        ts.append(DataStructs.FingerprintSimilarity(s_top, fps_test))
    ts = np.array(ts)
    return ts.mean()  # use ts.max() for nearest-neighbour similarity instead
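
# Hedged sketch of similarity_search: build a tiny fingerprint database from
# arbitrary example SMILES and query it. Assumes the same RDKit imports as above.
def _demo_similarity_search():
    db_smiles = ["c1ccccc1", "CCO", "CC(=O)O"]  # benzene, ethanol, acetic acid
    fps_db = [RDKFingerprint(MolFromSmiles(s)) for s in db_smiles]
    mean_sim = similarity_search(fps_db, "CCCO")  # mean Tanimoto similarity to the database
    print(mean_sim)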
def _set_target_fps(self, pickaxe: Pickaxe):
    for smiles in pickaxe.target_smiles:
        mol = MolFromSmiles(smiles)
        if self.fingerprint_method == "Morgan":
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, **self.fingerprint_args)
        else:
            fp = RDKFingerprint(mol)
        self.target_fps.append(fp)
def fingerprint(self, m, fpsize=1024, bitsperhash=2, tgtDensity=0.3):
    """Compute the bit fingerprint of molecule m."""
    from rdkit.Chem import RDKFingerprint
    return RDKFingerprint(m, minPath=1, maxPath=7, fpSize=fpsize,
                          nBitsPerHash=bitsperhash, tgtDensity=tgtDensity,
                          minSize=fpsize)
def calculate_fp(mol, method='maccs', n_bits=2048):
    """Calculate a molecular fingerprint for an RDKit Mol object, given the
    method and the number of bits."""
    if method == 'maccs':
        return MACCSkeys.GenMACCSKeys(mol)
    if method == 'ecfp4':
        return GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits, useFeatures=False)
    if method == 'ecfp6':
        return GetMorganFingerprintAsBitVect(mol, 3, nBits=n_bits, useFeatures=False)
    if method == 'torsion':
        return GetHashedTopologicalTorsionFingerprintAsBitVect(mol, nBits=n_bits)
    if method == 'rdk5':
        # note: rdk5 uses a fixed 1024-bit size rather than n_bits
        return RDKFingerprint(mol, maxPath=5, fpSize=1024, nBitsPerHash=2)
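
# Illustrative loop over every method calculate_fp supports; "CCO" is an
# arbitrary example SMILES, and rdkit.Chem is assumed to be imported as Chem.
def _demo_calculate_fp():
    mol = Chem.MolFromSmiles("CCO")
    for method in ("maccs", "ecfp4", "ecfp6", "torsion", "rdk5"):
        fp = calculate_fp(mol, method=method)
        print(method, fp.GetNumBits())  # 167 for MACCS, n_bits for the ECFP variants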
def transform(self):
    super().transform()
    fts = []
    self.mol_names = []
    for mol in self.structures:
        fp = RDKFingerprint(mol)
        # ConvertToNumpyArray resizes arr to the fingerprint length
        arr = np.zeros((0,), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp, arr)
        fts.append(arr)
        self.mol_names.append(mol.GetProp("_Name"))  # collect one name per molecule
    self.features = np.array(fts)
    self.columns = [str(i) for i in range(self.features.shape[1])]
    return self.features
def rdk_molstring(molecule, fptype):
    """
    Make a molstring (numpy bit array) from the RDKit fingerprint.

    :param molecule: molecule object
    :param fptype: type, radius, and size of the fingerprint
    :type fptype: dict
    :return: molstring for the RDKit fingerprint
    """
    arr = np.zeros((1,), dtype=int)
    DataStructs.ConvertToNumpyArray(
        RDKFingerprint(molecule, fpSize=fptype['Size']), arr)
    return arr
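
# Sketch of rdk_molstring in use; the {'Size': ...} key follows the fptype
# lookup above, phenol is an arbitrary example molecule, and MolFromSmiles is
# assumed to be imported from rdkit.Chem.
def _demo_rdk_molstring():
    mol = MolFromSmiles("c1ccccc1O")
    arr = rdk_molstring(mol, {'Size': 2048})
    print(arr.shape, arr.sum())  # (2048,) and the number of set bits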
def ensemble_test(self, submission, data_list, reversed_token_map, transform):
    """
    Ensemble test function.

    :param submission: submission file
    :param data_list: list of test data paths
    :param reversed_token_map: converts predictions to a readable format
    :param transform: normalize function
    """
    # load the .yaml file that contains information about each model
    with open('model/prediction_models.yaml') as f:
        p_configs = yaml.load(f, Loader=yaml.FullLoader)

    predictors = []
    for conf in p_configs.values():
        predictors.append(
            Predict.remote(conf, self._device, self._gpu_non_block,
                           self._decode_length, self._model_load_path))

    loop = asyncio.get_event_loop()

    async def process_async_calculate_similarity(combination_of_smiles,
                                                 combination_index):
        return {
            idx: await self.async_fps(comb[0], comb[1])
            for comb, idx in zip(combination_of_smiles, combination_index)
        }

    def ray_prediction(imgs):
        return ray.get([p.decode.remote(imgs) for p in predictors])

    conf_len = len(p_configs)  # number of models to use
    fault_counter = 0
    sequence = None
    model_contribution = np.zeros(conf_len)

    for i, dat in enumerate(data_list):
        imgs = Image.open(self._test_file_path + dat)
        imgs = self.png_to_tensor(imgs)
        imgs = transform(imgs).pin_memory().cuda()

        # predict a SMILES sequence from each predictor
        preds_raw = ray_prediction(imgs)
        preds = []
        for p in preds_raw:
            # predicted sequence token values
            SMILES_predicted_sequence = list(
                torch.argmax(p.detach().cpu(), -1).numpy())[0]
            # convert the token values to a readable SMILES string
            decoded_sequences = decode_predicted_sequences(
                SMILES_predicted_sequence, reversed_token_map)
            preds.append(decoded_sequences)
        del preds_raw

        # fault check: does each prediction satisfy the SMILES format?
        ms = {}
        for idx, p in enumerate(preds):
            m = MolFromSmiles(p)
            if m is not None:
                ms.update({idx: m})

        if len(ms) == 0:
            # no decoded sequence matches the SMILES format
            print('decode fail')
            fault_counter += 1
            sequence = preds[0]
        elif len(ms) == 1:
            # exactly one decoded sequence matches the SMILES format
            sequence = preds[list(ms.keys())[0]]
        else:
            # two or more decoded sequences match the SMILES format:
            # ensemble the results
            ms_to_fingerprint = [RDKFingerprint(x) for x in ms.values()]
            combination_of_smiles = list(combinations(ms_to_fingerprint, 2))
            ms_to_index = list(ms)
            combination_index = list(combinations(ms_to_index, 2))

            # calculate pairwise similarity scores
            smiles_dict = loop.run_until_complete(
                process_async_calculate_similarity(combination_of_smiles,
                                                   combination_index))
            # sort the pairs by similarity score
            smiles_dict = sorted(smiles_dict.items(), key=lambda x: x[1],
                                 reverse=True)

            if smiles_dict[0][1] == 1.0:
                # a similarity score of 1.0 means the top pair agrees,
                # so we assume those predictions are correct
                sequence = preds[smiles_dict[0][0][0]]
            else:
                score_board = np.zeros(conf_len)
                for rank, (idx, value) in enumerate(smiles_dict):
                    score_board[list(idx)] += conf_len - rank
                # choose the index with the highest score
                pick = int(np.argmax(score_board))
                sequence = preds[pick]
                model_contribution[pick] += 1  # log which model was used

        print('{} sequence: {}'.format(i, sequence))
        submission.loc[submission['file_name'] == dat, 'SMILES'] = sequence
        del preds

    loop.close()
    print('total fault:', fault_counter)
    print('model contribution:', model_contribution)
    return submission
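
# Stand-alone toy of the score-board voting inside ensemble_test: pairs of
# predictions are ranked by fingerprint similarity, and both members of a pair
# earn points by rank. Pure Python/numpy, no ray or asyncio needed; the pair
# similarities below are made-up numbers.
def _demo_score_board():
    from itertools import combinations  # mirrors the import ensemble_test relies on
    sims = {(0, 1): 0.9, (0, 2): 0.4, (1, 2): 0.5}  # hypothetical pairwise similarities
    ranked = sorted(sims.items(), key=lambda kv: kv[1], reverse=True)
    conf_len = 3
    score_board = np.zeros(conf_len)
    for rank, (pair, _) in enumerate(ranked):
        score_board[list(pair)] += conf_len - rank
    print(int(np.argmax(score_board)))  # index of the consensus prediction (here: 1)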
def rdk_fp(self):
    """
    Reads the csv file, generates RDKit fingerprints (2048 bits), and returns
    the features, with encoded labels, as a numpy array.

    Parameters
    ----------
    self.csv_path : str
        Path to a csv file with compounds as SMILES strings.

    Returns
    -------
    np.array
        Fingerprint features plus encoded labels.
    """
    df = pd.read_csv(self.csv_path)
    smiles_list = df['Smiles'].tolist()
    fingerprints = []
    not_found = []

    for i in tqdm(range(len(smiles_list))):
        try:
            mol = Chem.MolFromSmiles(smiles_list[i])
            fp = RDKFingerprint(mol, nBitsPerHash=1)
            # np.frombuffer replaces the deprecated np.fromstring
            bits_array = (np.frombuffer(fp.ToBitString().encode(), 'u1')
                          - ord('0'))
            fingerprints.append(bits_array)
        except Exception:
            fingerprints.append(np.nan)
            not_found.append(i)

    df.drop(not_found, axis=0, inplace=True)
    print('Number of FPs not found: {}'.format(len(not_found)))
    df.reset_index(drop=True, inplace=True)

    labelencoder = LabelEncoder()
    Y = labelencoder.fit_transform(df['Label'].values)
    Y = Y.reshape(Y.shape[0], 1)
    print('Output shape: {}'.format(Y.shape))

    fp_array = np.asarray(fingerprints, dtype=object)
    X = np.delete(fp_array, not_found, axis=0)
    X = np.vstack(X).astype(np.float32)
    print('Input shape: {}'.format(X.shape))

    final_array = np.concatenate((X, Y), axis=1)

    # Remove rows from final_array where duplicate FPs are present
    final_array_slice = final_array[:, 0:(final_array.shape[1] - 1)]
    _, unq_row_indices = np.unique(final_array_slice, return_index=True, axis=0)
    final_array_unique = final_array[unq_row_indices]
    print('Number of Duplicate FPs: {}'.format(
        final_array.shape[0] - final_array_unique.shape[0]))

    print('Final Numpy array shape: {}'.format(final_array_unique.shape))
    print('Type of final array: {}'.format(type(final_array_unique)))
    return np.asarray(final_array_unique, dtype=np.float32)
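
# Sketch of the ToBitString-to-array trick used above (np.frombuffer replaces
# the deprecated np.fromstring); "CCO" is an arbitrary molecule.
def _demo_bits_array():
    fp = RDKFingerprint(Chem.MolFromSmiles("CCO"), nBitsPerHash=1)
    arr = np.frombuffer(fp.ToBitString().encode(), dtype='u1') - ord('0')
    print(arr.shape, int(arr.sum()))  # (2048,) by default, plus the on-bit count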
def fingerprint(fingerprint_method, keyword_dict, smi):
    """Fingerprint a SMILES string with either the Morgan or the RDKit method."""
    mol = AllChem.MolFromSmiles(smi)
    if fingerprint_method == "Morgan":
        return AllChem.GetMorganFingerprintAsBitVect(mol, **keyword_dict)
    else:
        return RDKFingerprint(mol)
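
# Hedged example of the fingerprint dispatcher; the Morgan kwargs here
# (radius, nBits) are illustrative values, not project settings.
def _demo_fingerprint():
    fp = fingerprint("Morgan", {"radius": 2, "nBits": 2048}, "CCO")
    print(fp.GetNumOnBits())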
def main_bo(vocab_path, model_path, save_dir, descriptor_path, sampling=60,
            iterations=2, epochs=2, hidden_size=450, latent_size=56,
            depthT=20, depthG=3, random_seed=1, pIC50_weight=0, QED_weight=0,
            logP_weight=1, SA_weight=1, cycle_weight=1, sim_weight=0):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    with open(vocab_path) as f:
        vocab = [x.strip("\r\n ") for x in f]
    vocab = Vocab(vocab)

    model = JTNNVAE(vocab, hidden_size, latent_size, depthT, depthG)
    model.load_state_dict(torch.load(model_path))
    model = model.cuda()

    if sim_weight != 0:
        df_100 = pd.read_csv(
            '../data/covid/MPro_6wqf_A_ProteaseData_smiles_top100.csv')
        ms_db = [MolFromSmiles(x) for x in df_100['SMILES'].tolist()]
        fps_db = [RDKFingerprint(x) for x in ms_db]

    # We load the random seed
    np.random.seed(random_seed)

    # Paths of the descriptor files
    latent_feature = os.path.join(descriptor_path, 'latent_features.txt')
    target = os.path.join(descriptor_path, 'targets.txt')
    logp_value = os.path.join(descriptor_path, 'logP_values.txt')
    QED_value = os.path.join(descriptor_path, 'QED_values.txt')
    pIC50_value = os.path.join(descriptor_path, 'pIC50_values.txt')
    sa_score = os.path.join(descriptor_path, 'SA_scores.txt')
    cycle_score = os.path.join(descriptor_path, 'cycle_scores.txt')
    sim_score = os.path.join(descriptor_path, 'sim_values.txt')

    # We load the data (y is negated!)
    X = np.loadtxt(latent_feature)
    y = -np.loadtxt(target)
    y = y.reshape((-1, 1))

    n = X.shape[0]
    permutation = np.random.choice(n, n, replace=False)
    # plain int replaces the removed np.int alias
    X_train = X[permutation, :][0:int(np.round(0.9 * n)), :]
    X_test = X[permutation, :][int(np.round(0.9 * n)):, :]
    y_train = y[permutation][0:int(np.round(0.9 * n))]
    y_test = y[permutation][int(np.round(0.9 * n)):]

    np.random.seed(random_seed)

    pIC50_values = np.loadtxt(pIC50_value)
    QED_values = np.loadtxt(QED_value)
    logP_values = np.loadtxt(logp_value)
    SA_scores = np.loadtxt(sa_score)
    cycle_scores = np.loadtxt(cycle_score)
    sim_values = np.loadtxt(sim_score)

    iteration = 0
    while iteration < iterations:
        # We fit the GP
        np.random.seed(iteration * random_seed)
        M = 500
        sgp = SparseGP(X_train, 0 * X_train, y_train, M)
        # TODO: test hyperparameters
        sgp.train_via_ADAM(X_train, 0 * X_train, y_train, X_test,
                           X_test * 0, y_test, minibatch_size=10 * M,
                           max_iterations=5, learning_rate=0.001)

        pred, uncert = sgp.predict(X_test, 0 * X_test)
        error = np.sqrt(np.mean((pred - y_test) ** 2))
        testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
        print('Test RMSE: ', error)
        print('Test ll: ', testll)

        pred, uncert = sgp.predict(X_train, 0 * X_train)
        error = np.sqrt(np.mean((pred - y_train) ** 2))
        trainll = np.mean(
            sps.norm.logpdf(pred - y_train, scale=np.sqrt(uncert)))
        print('Train RMSE: ', error)
        print('Train ll: ', trainll)

        # We pick the next `sampling` inputs
        next_inputs = sgp.batched_greedy_ei(sampling, np.min(X_train, 0),
                                            np.max(X_train, 0))
        # joblib.dump(next_inputs, './next_inputs.pkl')
        # next_inputs = joblib.load('./next_inputs.pkl')
        valid_smiles = []
        new_features = []
        for i in tqdm(range(sampling)):
            all_vec = next_inputs[i].reshape((1, -1))
            tree_vec, mol_vec = np.hsplit(all_vec, 2)
            tree_vec = create_var(torch.from_numpy(tree_vec).float())
            mol_vec = create_var(torch.from_numpy(mol_vec).float())
            tree_vecs, _ = model.rsample(tree_vec, model.T_mean, model.T_var)
            mol_vecs, _ = model.rsample(mol_vec, model.G_mean, model.G_var)
            s = model.decode(tree_vecs, mol_vecs, prob_decode=False)
            if s is not None:
                valid_smiles.append(s)
                new_features.append(all_vec)
        print(len(valid_smiles), "molecules are found")
        # keep only the features of the successfully decoded molecules so that
        # new_features stays aligned with scores (overwriting it with
        # next_inputs would misalign X_train and y_train)
        new_features = np.vstack(new_features)
        save_object(valid_smiles,
                    os.path.join(save_dir, "valid_smiles{}.pkl".format(iteration)))

        scores = []
        if pIC50_weight != 0:
            current_pIC50 = calculate_pIC50(valid_smiles)
            # build one normalized value per valid SMILES
            current_pIC50_normalized = [
                (current_pIC50[i] - np.mean(pIC50_values)) / np.std(pIC50_values)
                for i in range(len(valid_smiles))
            ]
        else:
            current_pIC50_normalized = [0] * len(valid_smiles)

        for i in range(len(valid_smiles)):
            if sim_weight != 0:
                current_sim_value = similarity_search(fps_db, valid_smiles[i])
                current_sim_value_normalized = (
                    current_sim_value - np.mean(sim_values)) / np.std(sim_values)
            else:
                current_sim_value_normalized = 0

            mol = MolFromSmiles(valid_smiles[i])
            current_QED_value = QED.qed(mol)
            current_log_P_value = Descriptors.MolLogP(mol)
            current_SA_score = -sascorer.calculateScore(mol)

            cycle_list = nx.cycle_basis(
                nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
            if len(cycle_list) == 0:
                cycle_length = 0
            else:
                cycle_length = max([len(j) for j in cycle_list])
            if cycle_length <= 6:
                cycle_length = 0
            else:
                cycle_length = cycle_length - 6
            current_cycle_score = -cycle_length

            current_SA_score_normalized = (
                current_SA_score - np.mean(SA_scores)) / np.std(SA_scores)
            current_QED_value_normalized = (
                current_QED_value - np.mean(QED_values)) / np.std(QED_values)
            current_log_P_value_normalized = (
                current_log_P_value - np.mean(logP_values)) / np.std(logP_values)
            current_cycle_score_normalized = (
                current_cycle_score - np.mean(cycle_scores)) / np.std(cycle_scores)

            score = (SA_weight * current_SA_score_normalized +
                     QED_weight * current_QED_value_normalized +
                     logP_weight * current_log_P_value_normalized +
                     cycle_weight * current_cycle_score_normalized +
                     pIC50_weight * current_pIC50_normalized[i] +
                     sim_weight * current_sim_value_normalized)
            scores.append(-score)  # the target is always negated

        print(valid_smiles)
        print(scores)
        save_object(scores,
                    os.path.join(save_dir, "scores{}.pkl".format(iteration)))

        if len(new_features) > 0:
            X_train = np.concatenate([X_train, new_features], 0)
            y_train = np.concatenate([y_train, np.array(scores)[:, None]], 0)

        iteration += 1
def rdkit_fingerprint(mol, **kwargs):
    """Return the indices of the on bits of the RDKit fingerprint as a list."""
    return list(RDKFingerprint(mol, **kwargs).GetOnBits())
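
# Usage sketch: the on-bit indices come back as a plain Python list; fpSize is
# an ordinary RDKFingerprint keyword, "CCO" is an arbitrary molecule, and
# MolFromSmiles is assumed to be imported from rdkit.Chem.
def _demo_rdkit_fingerprint():
    bits = rdkit_fingerprint(MolFromSmiles("CCO"), fpSize=1024)
    print(len(bits), bits[:5])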
def scorer(smiles, pIC50_weight, QED_weight, logP_weight, SA_weight,
           cycle_weight, sim_weight):
    smiles_rdkit = []
    for i in range(len(smiles)):
        smiles_rdkit.append(
            MolToSmiles(MolFromSmiles(smiles[i]), isomericSmiles=True))

    # calculate the IC50 of the training set using the MPNN:
    # IC50_scores = calculateScore(smiles_rdkit)
    # read in the IC50 of the training set from the database instead
    IC50_scores = np.loadtxt('../data/covid/ic50-fulltrain.txt')
    IC50_scores = list(IC50_scores)
    IC50_scores_normalized = (
        np.array(IC50_scores) - np.mean(IC50_scores)) / np.std(IC50_scores)

    if sim_weight != 0:
        # df_100 = list of molecules to match by similarity
        df_100 = pd.read_csv(
            '../data/covid/MPro_6wqf_A_ProteaseData_smiles_top100.csv')
        ms_db = [MolFromSmiles(x) for x in df_100['SMILES'].tolist()]
        fps_db = [RDKFingerprint(x) for x in ms_db]
        sim_values = []
        for i in range(len(smiles)):
            sim_values.append(similarity_search(fps_db, smiles_rdkit[i]))
        sim_values_normalized = (
            np.array(sim_values) - np.mean(sim_values)) / np.std(sim_values)
    else:
        sim_values = [0] * len(smiles)
        sim_values_normalized = np.zeros(len(smiles))

    logP_values = []
    for i in range(len(smiles)):
        logP_values.append(Descriptors.MolLogP(MolFromSmiles(smiles_rdkit[i])))

    qed_values = []
    for i in range(len(smiles)):
        qed_values.append(QED.qed(MolFromSmiles(smiles_rdkit[i])))

    SA_scores = []
    for i in range(len(smiles)):
        SA_scores.append(
            -sascorer.calculateScore(MolFromSmiles(smiles_rdkit[i])))

    cycle_scores = []
    for i in range(len(smiles)):
        cycle_list = nx.cycle_basis(
            nx.Graph(rdmolops.GetAdjacencyMatrix(MolFromSmiles(smiles_rdkit[i]))))
        if len(cycle_list) == 0:
            cycle_length = 0
        else:
            cycle_length = max([len(j) for j in cycle_list])
        if cycle_length <= 6:
            cycle_length = 0
        else:
            cycle_length = cycle_length - 6
        cycle_scores.append(-cycle_length)

    SA_scores_normalized = (
        np.array(SA_scores) - np.mean(SA_scores)) / np.std(SA_scores)
    qed_values_normalized = (
        np.array(qed_values) - np.mean(qed_values)) / np.std(qed_values)
    cycle_scores_normalized = (
        np.array(cycle_scores) - np.mean(cycle_scores)) / np.std(cycle_scores)
    logP_values_normalized = (
        np.array(logP_values) - np.mean(logP_values)) / np.std(logP_values)

    targets = (pIC50_weight * IC50_scores_normalized +
               logP_weight * logP_values_normalized +
               SA_weight * SA_scores_normalized +
               QED_weight * qed_values_normalized +
               cycle_weight * cycle_scores_normalized +
               sim_weight * sim_values_normalized)

    return (IC50_scores, qed_values, logP_values, SA_scores, cycle_scores,
            sim_values, targets)
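
# Minimal numeric sketch of the z-score + weighted-sum scheme scorer uses,
# with toy values instead of project data: each property is normalized across
# the batch, then combined with its weight.
def _demo_weighted_target():
    logP = np.array([1.2, 3.4, 2.2])
    sa = np.array([-2.1, -3.0, -2.5])  # SA scores enter negated, as in scorer
    logP_n = (logP - logP.mean()) / logP.std()
    sa_n = (sa - sa.mean()) / sa.std()
    targets = 1.0 * logP_n + 1.0 * sa_n  # logP_weight = SA_weight = 1
    print(targets)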