def testPythonDescriptorFunctor(self):
    """Register a Python-defined property functor and compute it through rdMD.Properties.

    A ``NumAtoms`` functor is registered, computed for methane, and checked
    to be listed among the available properties. The same checks are then
    repeated after the local reference to the functor has been deleted.
    Finally the default property set and a two-name subset are computed
    for benzene and printed.
    """

    class NumAtoms(Descriptors.PropertyFunctor):
        """Property functor whose value is ``mol.GetNumAtoms()``."""

        def __init__(self):
            Descriptors.PropertyFunctor.__init__(self, "NumAtoms", "1.0.0")

        def __call__(self, mol):
            return mol.GetNumAtoms()

    numAtoms = NumAtoms()
    rdMD.Properties.RegisterProperty(numAtoms)
    props = rdMD.Properties(["NumAtoms"])
    # assertEqual/assertIn replace the deprecated assertEquals alias (removed
    # in Python 3.12) and give a more informative failure message than
    # assertTrue(x in y).
    self.assertEqual(1, props.ComputeProperties(Chem.MolFromSmiles("C"))[0])
    self.assertIn("NumAtoms", rdMD.Properties.GetAvailableProperties())

    # check memory: the functor must remain usable after the local name is
    # dropped (presumably the registry holds its own reference).
    del numAtoms
    self.assertEqual(1, props.ComputeProperties(Chem.MolFromSmiles("C"))[0])
    self.assertIn("NumAtoms", rdMD.Properties.GetAvailableProperties())

    m = Chem.MolFromSmiles("c1ccccc1")

    # Default constructor: every currently available property.
    properties = rdMD.Properties()
    for name, value in zip(properties.GetPropertyNames(),
                           properties.ComputeProperties(m)):
        print(name, value)

    # Explicit subset of property names.
    properties = rdMD.Properties(['exactmw', 'lipinskiHBA'])
    for name, value in zip(properties.GetPropertyNames(),
                           properties.ComputeProperties(m)):
        print(name, value)
def testProperties(self):
    """Check that single-property calculators agree with the full property set.

    The default ``rdMD.Properties()`` must expose exactly the available
    property names. For every name, a calculator built from just that name
    must return one value equal to the corresponding entry of the full
    result. Invalid constructor arguments must raise: non-string names a
    ``TypeError`` and unknown names a ``RuntimeError``.
    """
    props = rdMD.Properties()
    names = list(props.GetAvailableProperties())
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(names, list(props.GetPropertyNames()))

    m = Chem.MolFromSmiles("C1CC1CC")
    results = props.ComputeProperties(m)

    for i, name in enumerate(names):
        props = rdMD.Properties([name])
        res = props.ComputeProperties(m)
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0], results[i])
        self.assertEqual(props.GetPropertyNames()[0], names[i])
        self.assertEqual(len(props.GetPropertyNames()), 1)

    # assertRaises fails the test when nothing is raised and lets any other
    # exception type propagate, matching the old try/except/"should not get
    # here" construction.
    with self.assertRaises(TypeError):
        rdMD.Properties([1, 2, 3])

    with self.assertRaises(RuntimeError):
        rdMD.Properties(["property that doesn't exist"])
def _RDKit_properties_FUTURE(ifile) -> (dict):
    '''Compute the RDKit property set for every molecule of an SDF file.

    The per-molecule work is delegated to ``_calc_descriptors``, which is
    handed the RDKit ``ComputeProperties`` callable, the input file and the
    property names.

    Parameters
    ----------
    ifile: str
        SDF input file

    Returns
    -------
    results_dict: dict
        Whatever ``_calc_descriptors`` returns. It is expected (helper not
        visible here) to carry the following keys:
        'matrix': ndarray, properties matrix. Full form, including rows for
            unprocessed and failed molecules.
        'names': list, matrix column names (the property names).
        'success_arr': ndarray of bool, False where the supplier yielded
            None or the descriptor row contains NaNs.
    '''
    calculator = rdMolDescriptors.Properties()
    column_names = list(calculator.GetPropertyNames())

    LOG.info('computing {} RDKit properties per molecule...'.format(
        len(column_names)))

    return _calc_descriptors(calculator.ComputeProperties, ifile, column_names)
def compute_properties(self, feature_name=None):
    """Compute RDKit properties for ``self.Molecule``.

    Args:
        feature_name: list of property names to compute. When ``None``,
            every property of the default RDKit calculator is computed.

    Returns:
        dict mapping each property name to the value computed for
        ``self.Molecule``.

    Raises:
        AssertionError: if ``self.Molecule`` is not exactly a
            ``Chem.rdchem.Mol`` (subclasses are rejected by this check).
    """
    assert type(self.Molecule) == Chem.rdchem.Mol

    if feature_name is None:
        calculator = rdDesc.Properties()
    else:
        calculator = rdDesc.Properties(feature_name)

    names = calculator.GetPropertyNames()
    values = calculator.ComputeProperties(self.Molecule)
    return dict(zip(names, values))
def getChemicalProperties(SMILE):
    """Compute the default RDKit property set for a SMILES string.

    Parameters
    ----------
    SMILE : str
        SMILES representation of the molecule.

    Returns
    -------
    (property_names, properties) : tuple of two lists
        On success, the RDKit property names and the matching values.
        On failure (e.g. the SMILES cannot be turned into a molecule, or
        the computation raises), two equally long lists filled with the
        placeholder string "N/A".
    """
    # Width used only when RDKit cannot even enumerate its properties;
    # 25 is the count the original implementation assumed.
    fallback_width = 25

    try:
        generate_properties = rdMolDescriptors.Properties()
        property_names = list(generate_properties.GetPropertyNames())
    except Exception:
        return ["N/A"] * fallback_width, ["N/A"] * fallback_width

    try:
        molecule = Chem.MolFromSmiles(SMILE)
        properties = list(generate_properties.ComputeProperties(molecule))
    except Exception:
        # `except Exception` (not a bare `except`) so KeyboardInterrupt and
        # SystemExit still propagate. The placeholder is sized from the
        # actual property list rather than a fixed 25: the default property
        # set is not a constant (properties can be registered at runtime),
        # and failed rows must line up with successful ones.
        width = len(property_names)
        return ["N/A"] * width, ["N/A"] * width

    return property_names, properties
def get_rdkit_properties(df):
    """Append one column per RDKit property to ``df``.

    Molecules are parsed from the ``CompoundSMILES`` column, the default
    RDKit property set is computed for each, and the values are written
    back into ``df`` (modified in place) under the property names.

    Returns the same ``df`` together with the RDKit ``Properties``
    calculator that was used.
    """
    properties = rdMolDescriptors.Properties()
    molecules = [MolFromSmiles(smiles) for smiles in df.CompoundSMILES]
    column_names = list(properties.GetPropertyNames())

    # One row of property values per molecule, in property-name order.
    rows = [
        [value for _, value in zip(properties.GetPropertyNames(),
                                   properties.ComputeProperties(mol))]
        for mol in molecules
    ]

    # Transpose row-wise results into one DataFrame column per property.
    for column_index, column_name in enumerate(column_names):
        df[column_name] = [row[column_index] for row in rows]

    return df, properties
def _do_norm_X(self, smiles_list, find_norm=True):
    """Build and standardise the feature matrix for a list of SMILES.

    Each row consists of the three logS predictions (ESOL, random forest,
    NFP) followed by one RDKit property per name in ``self._feats``. Every
    column is then standardised as ``(x - mean) / std``.

    Args:
        smiles_list: sequence of SMILES strings.
        find_norm: when True, the column means / standard deviations (and
            the row width) are computed from this data and stored on the
            instance; when False, the previously stored
            ``self._mean_props`` / ``self._std_props`` are reused, so a
            call with ``find_norm=True`` must have happened before.

    Returns:
        list of lists: the standardised rows, in ``smiles_list`` order.

    NOTE(review): a column with zero standard deviation yields NaN/inf
    here, and an unparsable SMILES hands ``None`` to RDKit; neither case is
    handled in this method -- confirm callers cope with that.
    """
    logS_list_esol = self.esol_calculator.predict(smiles_list)
    logS_list_rf = self.rf_regression.predict(smiles_list)
    logS_list_nfp = self.nfp_regression.predict(smiles_list)

    # Loop-invariant: build one single-property calculator per feature once
    # and reuse it for every molecule, instead of constructing a new
    # Properties object for each (molecule, feature) pair.
    calculators = [rdMolDescriptors.Properties([name]) for name in self._feats]

    X = []
    for i, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        props = [list(calc.ComputeProperties(mol))[0] for calc in calculators]
        vals = [logS_list_esol[i], logS_list_rf[i], logS_list_nfp[i]]
        X.append(vals + props)

    if find_norm:
        self._size = len(X[0])
        self._mean_props = np.mean(X, axis=0)
        self._std_props = np.std(X, axis=0)

        logging.debug("Size of props")
        logging.debug(self._size)
        logging.debug("Means of X")
        logging.debug(self._mean_props)
        logging.debug("Std of X")
        logging.debug(self._std_props)

    # Standardise in place, column by column.
    for row in X:
        for j, X_ij in enumerate(row):
            row[j] = (X_ij - self._mean_props[j]) / self._std_props[j]

    return X
def __init__(self, fp='ecfp', radius=2, fp_length=1024, prop=False, n_ests=200):
    """Set up the ensemble regressor and its three base predictors.

    Args:
        fp: fingerprint type; stored and forwarded to ``RFPredictor``.
        radius: fingerprint radius (stored only).
        fp_length: fingerprint length (stored only).
        prop: accepted for interface compatibility; not used here.
        n_ests: number of estimators forwarded to ``RFPredictor``.
    """
    super().__init__()
    self._name = "EnsembleRegressor"

    # Fingerprint configuration.
    self._fp = fp
    self._fp_r = radius
    self._fp_length = fp_length

    # Names of every property offered by the default RDKit calculator.
    self._feats = list(rdMolDescriptors.Properties().GetPropertyNames())

    # Final model plus the base predictors whose outputs feed it.
    self.model = None
    self.esol_calculator = ESOLCalculator()
    self.rf_regression = RFPredictor(n_ests=n_ests, fp_type=self._fp)
    self.nfp_regression = NfpPredictor()

    # Normalisation statistics, filled in later.
    self._means_logS = None
    self._std_logS = None
    # `_do_norm_X` stores and reads the column means as `_mean_props`, so
    # that name is initialised here; `_means_props` is kept so any existing
    # reader of the old spelling still finds the attribute.
    self._mean_props = None
    self._means_props = None
    self._std_props = None
    self._size = -1

    self._epochs = 10
def _RDKit_properties(ifile, **kwargs) -> (bool, (np.ndarray, list, list)):
    ''' Compute RDKit properties for every molecule of the SDF file `ifile`.

    Returns a tuple (success, payload):

    - on success: True and a dict with keys
        'matrix': the property values of the molecules that were computed
            (rows stacked in file order; with exactly one computed molecule
            this is the single property vector, as before),
        'names': list with the property names (matrix columns),
        'success_arr': list of bool, one entry per molecule read, False
            where the supplier returned None or the properties contain NaN;
    - on failure: False and an error message string.

    `**kwargs` is accepted but not used.
    '''
    try:
        suppl = Chem.SDMolSupplier(ifile)
    except Exception as e:
        LOG.error(f'Unable to create supplier with exception {e}')
        return False, 'unable to create supplier'

    LOG.info('computing RDKit properties...')

    properties = rdMolDescriptors.Properties()
    md_name = list(properties.GetPropertyNames())

    success_list = []
    rows = []      # property vectors of the successfully computed molecules
    num_obj = 0    # number of successfully computed molecules so far

    try:
        for mol in suppl:
            if mol is None:
                LOG.error(
                    f'Unable to process molecule #{num_obj+1} in {ifile}')
                success_list.append(False)
                continue

            descriptors = properties.ComputeProperties(mol)

            # The NaN test is applied to the freshly computed vector for
            # every molecule. Previously the first molecule was tested
            # against the still-empty matrix, so its NaNs went unnoticed.
            if np.isnan(descriptors).any():
                success_list.append(False)
                continue

            rows.append(descriptors)
            success_list.append(True)
            num_obj += 1

    except Exception as e:
        LOG.error(
            f'Failed computing RDKit properties for molecule #{num_obj+1} in {ifile}'
            f' with exception: {e}')
        return False, 'Failed computing RDKit properties for molecule' + str(
            num_obj + 1) + 'in file ' + ifile

    # Stack once at the end instead of re-allocating the matrix for every
    # molecule. The result shapes match the former incremental vstack:
    # empty list, single vector, or 2-D array.
    if not rows:
        xmatrix = []
    elif len(rows) == 1:
        xmatrix = rows[0]
    else:
        xmatrix = np.vstack(rows)

    LOG.debug(
        f'computed RDKit properties matrix with shape {np.shape(xmatrix)}')

    if num_obj == 0:
        return False, 'Unable to compute RDKit properties for molecule ' + ifile

    results = {
        'matrix': xmatrix,
        'names': md_name,
        'success_arr': success_list
    }
    return True, results
def _RDKit_properties(ifile, **kwargs) -> (bool, (np.ndarray, list, list)):
    ''' Compute RDKit properties for every molecule of the SDF file `ifile`.

    Returns a tuple (success, payload):

    - on success: True and a dict with keys
        'matrix': 2-D ndarray, one row per successfully computed molecule
            (file order) and one column per property,
        'names': list with the property names (matrix columns),
        'success_arr': list of bool, one entry per molecule read, False
            where the supplier returned None, the molecule has no heavy
            atoms, or the properties contain NaN/inf;
    - on failure: False and an error message string.

    `**kwargs` is accepted but not used.
    '''
    try:
        suppl = Chem.SDMolSupplier(ifile)
    except Exception as e:
        LOG.error(f'Unable to create supplier with exception {e}')
        return False, 'unable to create supplier'

    LOG.info('computing RDKit properties...')

    properties = rdMolDescriptors.Properties()
    md_name = list(properties.GetPropertyNames())

    success_list = []

    # Pre-allocate for the number of records in the file; rows of molecules
    # that fail stay unused and are trimmed off after the loop.
    est_obj = len(suppl)
    xmatrix = np.zeros((est_obj, len(md_name)))

    num_obj = 0    # number of successfully computed molecules so far
    try:
        for mol in suppl:
            if mol is None:
                LOG.error(
                    f'Unable to process molecule #{num_obj+1} in {ifile}')
                success_list.append(False)
                continue

            if mol.GetNumHeavyAtoms() == 0:
                LOG.error('Empty molecule' f'#{num_obj+1} in {ifile}')
                success_list.append(False)
                continue

            descriptors = properties.ComputeProperties(mol)

            if np.isnan(descriptors).any() or np.isinf(descriptors).any():
                success_list.append(False)
                continue

            xmatrix[num_obj] = descriptors
            success_list.append(True)
            num_obj += 1

    except Exception as e:
        LOG.error(
            f'Failed computing RDKit properties for molecule #{num_obj+1} in {ifile}'
            f' with exception: {e}')
        return False, 'Failed computing RDKit properties for molecule' + str(
            num_obj + 1) + 'in file ' + ifile

    if num_obj < est_obj:
        # Some molecules failed: drop the unused trailing rows in a single
        # slice (the former loop called np.delete once per surplus row,
        # copying the whole matrix each time).
        xmatrix = xmatrix[:num_obj]

    LOG.debug(
        f'computed RDKit properties matrix with shape {np.shape(xmatrix)}')

    if num_obj == 0:
        return False, 'Unable to compute RDKit properties for molecule ' + ifile

    results = {
        'matrix': xmatrix,
        'names': md_name,
        'success_arr': success_list
    }
    return True, results