def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}):
    """Convert a chemical depiction into one or more other depictions.

    :param idepic: depiction to convert, str
    :param itype: type of ``idepic``, either 'smiles' or 'inchi', str
    :param otype: iterable of output types to generate; each item must be
        'smiles', 'inchi' or 'inchikey'. The default set is only iterated,
        never mutated.
    :return odepic: generated depictions, {"otype1": "odepic1", ..}
    :raises NotImplementedError: if ``itype`` or an item of ``otype`` is
        not one of the supported types
    :raises self.DepictionError: if no molecule could be built from
        ``idepic``
    """
    # Import
    if itype == 'smiles':
        rdmol = MolFromSmiles(idepic, sanitize=True)
    elif itype == 'inchi':
        rdmol = MolFromInchi(idepic, sanitize=True)
    else:
        raise NotImplementedError(
            '"{}" is not a valid input type'.format(itype))
    if rdmol is None:  # Check import: the parser produced no molecule
        raise self.DepictionError(
            'Import error from depiction "{}" of type "{}"'.format(
                idepic, itype))

    # Export
    odepic = dict()
    for item in otype:
        if item == 'smiles':
            # MolToSmiles is tricky, one may want to check the possible
            # options.
            odepic[item] = MolToSmiles(rdmol)
        elif item == 'inchi':
            odepic[item] = MolToInchi(rdmol)
        elif item == 'inchikey':
            odepic[item] = MolToInchiKey(rdmol)
        else:
            # Report the offending item rather than the whole collection
            # of requested output types.
            raise NotImplementedError(
                '"{}" is not a valid output type'.format(item))
    return odepic
def load_data(self, preprocess=False, stereochem=1., augment=1):
    """Load the SMILES dataset and prepare the padded training sequences.

    Reads ``self.dataset``, optionally preprocesses and augments the SMILES,
    stores the InChIKey of every loaded molecule, pads all sequences to a
    common length, draws a random validation/training index split and
    finally builds the tokenizer.

    :param preprocess: if true, run ``preprocess_smiles`` on the loaded SMILES
    :param stereochem: value forwarded to ``preprocess_smiles``
    :param augment: if greater than 1, replace ``self.molecules`` with the
        output of ``randomize_smileslist(..., num=augment)``
    """
    smiles_list = read_smiles_file(self.dataset)
    if preprocess:
        smiles_list = preprocess_smiles(smiles_list, stereochem)

    self.molecules = smiles_list
    self.smiles = smiles_list
    self.inchi = [MolToInchiKey(MolFromSmiles(smi)) for smi in smiles_list]
    print("%i molecules loaded from %s..." % (len(self.molecules), self.dataset))

    # Two extra positions are reserved for the "^" and "$" markers that are
    # wrapped around every sequence before padding.
    longest = max(len(smi) for smi in self.molecules)
    self.maxlen = longest + 2
    print("Maximal sequence length: %i" % (self.maxlen - 2))

    if augment > 1:
        print("augmenting SMILES %i-fold..." % augment)
        randomized = randomize_smileslist(self.molecules, num=augment)
        print("%i SMILES strings generated for %i molecules" %
              (len(randomized), len(self.molecules)))
        # Keep the un-augmented SMILES available under self.smiles.
        self.smiles = self.molecules
        self.molecules = randomized
        # NOTE(review): self.maxlen was computed before augmentation and is
        # not updated here - confirm pad_seqs copes with longer strings.

    wrapped = ["^%s$" % smi for smi in self.molecules]
    self.padded = pad_seqs(wrapped, ' ', given_len=self.maxlen)
    self.n_mols = len(self.molecules)

    # Random permutation of all indices; the first
    # int(self.validation * n_mols) entries become the validation set.
    shuffled = np.random.choice(range(self.n_mols), self.n_mols, replace=False)
    n_val = int(self.validation * self.n_mols)
    self.val_mols, self.train_mols = np.split(shuffled, [n_val])
    print("Using %i examples for training and %i for valdiation" %
          (len(self.train_mols), len(self.val_mols)))

    self.build_tokenizer()
def convert_depiction(idepic, itype='smiles', otype={'inchikey'}):
    """Convert chemical depiction to other types of depictions

    :param idepic: string depiction to be converted, str
    :param itype: type of depiction provided as input ('smiles' or
        'inchi'), str
    :param otype: types of depiction to be generated, {"", "", ..}; each
        item must be 'smiles', 'inchi' or 'inchikey'. The default set is
        only iterated, never mutated.
    :return odepic: generated depictions, {"otype1": "odepic1", ..}
    :raises NotImplementedError: if ``itype`` or an item of ``otype`` is
        not one of the supported types
    :raises Exception: if no molecule could be built from ``idepic``

    Usage example:
    - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
    - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3',
      itype='inchi', otype={'inchi', 'smiles', 'inchikey'})
    """
    # Import
    if itype == 'smiles':
        rdmol = MolFromSmiles(idepic, sanitize=True)
    elif itype == 'inchi':
        rdmol = MolFromInchi(idepic, sanitize=True)
    else:
        raise NotImplementedError('"{}" is not a valid input type'.format(itype))
    if rdmol is None:  # Check import: the parser produced no molecule
        raise Exception(
            'Import error from depiction "{}" of type "{}"'.format(idepic, itype))

    # Export
    odepic = dict()
    for item in otype:
        if item == 'smiles':
            # MolToSmiles is tricky, one may want to check the possible
            # options.
            odepic[item] = MolToSmiles(rdmol)
        elif item == 'inchi':
            odepic[item] = MolToInchi(rdmol)
        elif item == 'inchikey':
            odepic[item] = MolToInchiKey(rdmol)
        else:
            # Report the offending item rather than the whole collection
            # of requested output types.
            raise NotImplementedError('"{}" is not a valid output type'.format(item))
    return odepic
def analyze(self, smiles: List[str], only_drugs=True) -> pd.DataFrame:
    """Build a table of descriptors and model scores for the given SMILES.

    For every SMILES string the InChIKey, exact molecular weight, logP and
    hydrogen-bond donor/acceptor counts are computed, and the safety,
    feasibility and BBBP models are evaluated on the preprocessed features.

    :param smiles: SMILES strings to analyze
    :param only_drugs: if true, keep only rows that satisfy the
        Lipinski-style thresholds and whose safety and feasibility scores
        exceed 0.75; the index of the result is then reset
    :return: one row per (retained) molecule
    :raises ValueError: if a SMILES string cannot be parsed into a molecule
    """
    features = self.preprocessor.transform(smiles)

    # RDKit molecular properties, collected column-wise.
    props = {"key": [], "weight": [], "logp": [], "hdonors": [], "hacceptors": []}
    for smi in smiles:
        mol = MolFromSmiles(smi)
        if not mol:
            raise ValueError("Malformed molecule passed in to analyze")
        props["key"].append(MolToInchiKey(mol))
        props["weight"].append(ExactMolWt(mol))
        props["logp"].append(MolLogP(mol))
        props["hdonors"].append(NumHDonors(mol))
        props["hacceptors"].append(NumHAcceptors(mol))

    # Model scores
    safety = self.safety.predict(features)
    feasibility = self.feasibility.predict(features)
    bbbp = self.bbbp.predict_proba(features)

    table = pd.DataFrame(
        {
            "key": props["key"],
            "smiles": smiles,
            "weight": props["weight"],
            "logp": props["logp"],
            "hdonors": props["hdonors"],
            "hacceptors": props["hacceptors"],
            "safety": safety,
            "feasibility": feasibility,
            # Second entry of every predict_proba row.
            "bbbp": [row[1] for row in bbbp],
        }
    )

    if only_drugs:
        # Lipinski's rules
        lipinski = (
            (table.weight < 500)
            & (table.hdonors <= 5)
            & (table.hacceptors <= 10)
            & (table.logp <= 5)
        )
        # Drop compounds scored as too toxic or infeasible
        acceptable = (table.safety > 0.75) & (table.feasibility > 0.75)
        table = table[lipinski & acceptable].reset_index(drop=True)

    return table
def standardize_chemical(rdmol, add_hs=True, rm_stereo=True, heavy=False):
    """Standardize a chemical with the tunable ``Standardizer`` sequence.

    :param rdmol: RDKit mol object
    :param add_hs: append Hs, bool (default: True)
    :param rm_stereo: remove stereo, bool (default: True)
    :param heavy: perform the in-depth standardization, which additionally
        removes isotopes, neutralises charges early and keeps only the
        biggest fragment (default: False)

    :returns rdmol: RDKit mol object
    :raises Exception: any error raised during standardization is logged
        as a warning and re-raised
    """
    try:
        in_depth = bool(heavy)
        # The simple and heavy parameter sets differ only in the three
        # switches driven by `in_depth`; every other entry is shared.
        params = {
            'OP_REMOVE_ISOTOPE': in_depth,
            'OP_NEUTRALISE_CHARGE': in_depth,
            'OP_REMOVE_STEREO': rm_stereo,
            'OP_COMMUTE_INCHI': True,
            'OP_KEEP_BIGGEST': in_depth,
            'OP_ADD_HYDROGEN': add_hs,
            'OP_KEKULIZE': False,
            'OP_NEUTRALISE_CHARGE_LATE': True
        }
        standardizer = Standardizer(sequence_fun='sequence_tunable', params=params)
        rdmol = standardizer.compute(rdmol)
        if in_depth:
            logging.debug(
                "Performing heavy standardisation for compound {}".format(
                    MolToInchiKey(rdmol)))
        return rdmol
    except Exception as e:
        logging.warning(e)
        raise e
def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}):
    """Convert chemical depiction to other types of depictions

    Usage example:
    - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
    - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3',
      itype='inchi', otype={'inchi', 'smiles', 'inchikey'})

    :param idepic: Input string
    :param itype: The type of input. Valid options: smiles, inchi
    :param otype: Types of output. Valid options: inchi, smiles, inchikey.
        The default set is only iterated, never mutated.

    :type idepic: str
    :type itype: str
    :type otype: set

    :rtype: dict
    :return: Dictionary mapping each requested output type to its depiction
    :raises NotImplementedError: if ``itype`` or an item of ``otype`` is
        not one of the valid options
    :raises self.DepictionError: if no molecule could be built from
        ``idepic``
    """
    # Import
    if itype == 'smiles':
        rdmol = MolFromSmiles(idepic, sanitize=True)
    elif itype == 'inchi':
        rdmol = MolFromInchi(idepic, sanitize=True)
    else:
        raise NotImplementedError('"{}" is not a valid input type'.format(itype))
    if rdmol is None:  # Check import: the parser produced no molecule
        raise self.DepictionError(
            'Import error from depiction "{}" of type "{}"'.format(idepic, itype))

    # Export
    odepic = dict()
    for item in otype:
        if item == 'smiles':
            # MolToSmiles is tricky, one may want to check the possible
            # options.
            odepic[item] = MolToSmiles(rdmol)
        elif item == 'inchi':
            odepic[item] = MolToInchi(rdmol)
        elif item == 'inchikey':
            odepic[item] = MolToInchiKey(rdmol)
        else:
            # Report the offending item rather than the whole collection
            # of requested output types.
            raise NotImplementedError('"{}" is not a valid output type'.format(item))
    return odepic
def test4MolToInchiKey(self):
    """The key built from the InChI string equals the key built from the mol."""
    mol = MolFromSmiles("CC=C(N)C")
    # Route 1: molecule -> InChI string -> InChIKey
    key_via_inchi = InchiToInchiKey(MolToInchi(mol))
    # Route 2: molecule -> InChIKey directly
    key_direct = MolToInchiKey(mol)
    self.assertEqual(key_via_inchi, key_direct)
def train_model(self, n_sample=100):
    """Train the model one epoch at a time, sampling molecules periodically.

    For each of ``self.num_epochs`` iterations the learning rate is set via
    ``self.set_lr``, the model is fitted for one epoch (with a validation
    generator when ``self.validation`` is truthy), a checkpoint is written
    and the losses and learning rate are logged to a TensorBoard summary
    writer. Every ``self.sample_after`` epochs, ``n_sample`` molecules are
    sampled, compared against the InChIKeys of the loaded molecules,
    appended to ``./generated/<run_name>_generated.csv`` and summarised.
    When ``self.reinforce`` is set and enough novel molecules were found,
    the three novel molecules ranked first by CATS descriptor distance are
    randomized, padded and added to ``self.padded``.

    :param n_sample: number of molecules to sample at each sampling epoch
    """
    print("Training model...")
    log_dir = "logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")
    writer = tf.summary.create_file_writer(log_dir)
    # The context manager guarantees the output file is flushed and closed,
    # also when training is aborted by an exception.
    with open("./generated/" + self.run_name + "_generated.csv", 'a') as mol_file:
        i = 0
        while i < self.num_epochs:
            print("\n------ ITERATION %i ------" % i)
            self.set_lr(i)
            print("\nCurrent learning rate: %.5f" %
                  tf.keras.backend.get_value(self.model.optimizer.lr))
            chkpntr = tf.keras.callbacks.ModelCheckpoint(
                filepath=self.checkpoint_dir + 'model_epoch_{:02d}.hdf5'.format(i),
                verbose=1)
            if self.validation:
                generator_train = DataGenerator(self.padded, self.train_mols,
                                                self.maxlen - 1, self.token_indices,
                                                self.step, self.batch_size)
                generator_val = DataGenerator(self.padded, self.val_mols,
                                              self.maxlen - 1, self.token_indices,
                                              self.step, self.batch_size)
                history = self.model.fit(generator_train, epochs=1,
                                         validation_data=generator_val,
                                         use_multiprocessing=self.multi,
                                         workers=self.workers,
                                         callbacks=[chkpntr])
                with writer.as_default():
                    tf.summary.scalar('val_loss', history.history['val_loss'][-1], step=i)
            else:
                generator = DataGenerator(self.padded, range(self.n_mols),
                                          self.maxlen - 1, self.token_indices,
                                          self.step, self.batch_size)
                history = self.model.fit(generator, epochs=1,
                                         use_multiprocessing=self.multi,
                                         workers=self.workers,
                                         callbacks=[chkpntr])

            # write losses to tensorboard log
            with writer.as_default():
                tf.summary.scalar('loss', history.history['loss'][-1], step=i)
                tf.summary.scalar('lr', tf.keras.backend.get_value(
                    self.model.optimizer.lr), step=i)

            if (i + 1) % self.sample_after == 0:
                valid_mols = self.sample_points(n_sample, self.temp)
                n_valid = len(valid_mols)
                if n_valid:
                    print("Comparing novelty...")
                    inchi_valid = np.array(
                        [MolToInchiKey(MolFromSmiles(s)) for s in valid_mols])
                    novel = np.array(
                        compare_mollists(inchi_valid, np.array(self.inchi), False))
                    # fraction of distinct novel entries among the valid samples
                    n_novel = float(len(set(novel))) / n_valid
                    mol_file.write("\n----- epoch %i -----\n" % i)
                    mol_file.write("\n".join(set(valid_mols)))
                else:
                    novel = []
                    n_novel = 0

                # write generated compound summary to tensorboard log
                with writer.as_default():
                    tf.summary.scalar('valid', (float(n_valid) / n_sample), step=i)
                    tf.summary.scalar('novel', n_novel, step=i)
                    tf.summary.scalar('unique_valid', len(set(valid_mols)), step=i)
                print("\nValid:\t{}/{}".format(n_valid, n_sample))
                print("Unique:\t{}".format(len(set(valid_mols))))
                print("Novel:\t{}\n".format(len(novel)))

                if self.reinforce:
                    # reinforce = add most similar generated compounds to
                    # the training pool
                    if len(novel) > (n_sample / 5):
                        if self.mw_filter:
                            # only consider molecules in the given MW range;
                            # unparsable SMILES get weight 0
                            mw = np.array([
                                Descriptors.MolWt(MolFromSmiles(s))
                                if MolFromSmiles(s) else 0 for s in novel
                            ])
                            mw_idx = np.where((int(self.mw_filter[0]) < mw) &
                                              (mw < int(self.mw_filter[1])))[0]
                            novel = np.array(novel)[mw_idx]

                        print(
                            "Calculating CATS similarities of novel generated molecules to SMILES pool..."
                        )
                        fp_novel = cats_descriptor(
                            [MolFromSmiles(s) for s in novel])
                        if self.reference:
                            # if a reference mol(s) is given, calculate the
                            # distance to that one
                            fp_train = cats_descriptor(
                                [MolFromSmiles(self.reference)])
                        else:
                            # else calculate the distance to all training mols
                            fp_train = cats_descriptor(
                                [MolFromSmiles(s) for s in self.smiles])
                        sims = parallel_pairwise_similarities(
                            fp_novel, fp_train, metric='euclidean')
                        # NOTE(review): the indexing below depends on the
                        # shape returned by parallel_pairwise_similarities,
                        # which is not visible here - confirm.
                        top = sims[range(len(novel)),
                                   np.argsort(sims, axis=1)[:, 0, 0]].flatten()

                        # take the 3 novel mols ranked first by `top` and
                        # add randomized versions of them to self.padded
                        print(
                            "Adding top 3 most similar but novel molecules to SMILES pool"
                        )
                        add = randomize_smileslist(novel[np.argsort(top)[:3]], num=3)
                        padd_add = pad_seqs(["^%s$" % m for m in add], ' ',
                                            given_len=self.maxlen)
                        self.padded = np.hstack((self.padded, padd_add))
                        self.padded = np.random.choice(
                            self.padded, len(self.padded), False)  # shuffle

            i += 1  # next epoch