def featurize_mol(self, coords, mol, max_num_atoms):
    """Build padded coordinates, a neighbor list and atomic numbers for a molecule.

    Parameters
    ----------
    coords:
        Atomic coordinates. They are handed unpadded to
        `compute_neighbor_list` and afterwards padded to
        `(max_num_atoms, 3)`.
    mol:
        Molecule object exposing `GetAtoms()`.
    max_num_atoms:
        Target size used when padding the per-atom outputs.

    Returns
    -------
    tuple
        `(coords, neighbor_list, z)`: the padded coordinates, the neighbor
        list computed from the unpadded coordinates, and the padded atomic
        numbers obtained from `self.get_Z_matrix`.
    """
    atom_count = len(mol.GetAtoms())
    logging.info("Featurizing molecule of size: %d", atom_count)

    # The neighbor list is derived from the raw coordinates, before padding.
    neighbors = compute_neighbor_list(coords, self.neighbor_cutoff,
                                      self.max_num_neighbors, None)

    atomic_numbers = pad_array(self.get_Z_matrix(mol, max_num_atoms),
                               max_num_atoms)
    padded_coords = pad_array(coords, (max_num_atoms, 3))
    return padded_coords, neighbors, atomic_numbers
def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """
    Compute the Coulomb-matrix eigenvalue spectrum of a molecule.

    For every matrix returned by `self.coulomb_matrix`, the eigenvalues are
    ordered by decreasing absolute value and passed through `pad_array`
    with `self.max_atoms`.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        The eigenvalues of Coulomb matrix for molecules.
        The default shape is `(num_confs, max_atoms)`.
        If num_confs == 1, the shape is `(max_atoms,)`.
    """

    def sorted_spectrum(matrix):
        # Only the eigenvalues are needed; the eigenvectors are discarded.
        eigenvalues = np.linalg.eig(matrix)[0]
        descending = np.argsort(np.abs(eigenvalues))[::-1]
        return pad_array(eigenvalues[descending], self.max_atoms)

    features = np.asarray(
        [sorted_spectrum(matrix) for matrix in self.coulomb_matrix(mol)])
    if features.shape[0] != 1:
        return features
    # `(1, max_atoms)` -> `(max_atoms,)`
    return np.squeeze(features, axis=0)
def _featurize(self, struct: PymatgenStructure) -> np.ndarray:
    """
    Compute the sine Coulomb matrix features of a pymatgen structure.

    Parameters
    ----------
    struct: pymatgen.Structure
        A periodic crystal composed of a lattice and a sequence of atomic
        sites with 3D coordinates and elements.

    Returns
    -------
    features: np.ndarray
        2D sine Coulomb matrix with shape (max_atoms, max_atoms),
        or 1D matrix eigenvalues with shape (max_atoms,).
    """
    # Full N x N sine Coulomb matrix as produced by the wrapped featurizer.
    scm_output = self.scm.featurize(struct)

    if not self.flatten:
        return np.asarray(pad_array(scm_output, self.max_atoms))

    # Flattened mode: keep only the eigenvalues, written into a zero vector
    # of length `max_atoms`. The first entry of the eigenvalue output is used.
    eigenvalues = np.linalg.eig(scm_output)[0]
    leading = eigenvalues[0]
    padded = np.zeros(self.max_atoms)
    padded[:len(leading)] = leading
    return padded
def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
    """Calculate symmetry function.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit Mol object
    **kwargs:
        May contain the deprecated keyword `mol`. When present it replaces
        `datapoint` and a `DeprecationWarning` is emitted.

    Returns
    -------
    np.ndarray
        A numpy array of symmetry function. The shape is `(max_atoms, 4)`.
    """
    if 'mol' in kwargs:
        import warnings

        # Honour the deprecated keyword and warn, instead of raising the
        # warning class as an exception (which made the fallback unusable).
        datapoint = kwargs.get("mol")
        warnings.warn(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    coordinates = self.coordfeat._featurize(datapoint)
    atom_numbers = np.array(
        [atom.GetAtomicNum() for atom in datapoint.GetAtoms()])
    # Turn the atomic numbers into a column so they can be placed in front
    # of the coordinate columns.
    atom_numbers = np.expand_dims(atom_numbers, axis=1)
    assert atom_numbers.shape[0] == coordinates.shape[0]
    features = np.concatenate([atom_numbers, coordinates], axis=1)
    return pad_array(features, (self.max_atoms, 4))
def get_Z_matrix(self, mol, max_atoms):
    """Return the atomic numbers of `mol`, padded to `max_atoms`.

    Parameters
    ----------
    mol:
        Molecule object exposing `GetAtoms()`; each atom must provide
        `GetAtomicNum()`.
    max_atoms:
        Largest permitted number of atoms, also used as the padding size.

    Returns
    -------
    The result of `pad_array` applied to the array of atomic numbers.

    Raises
    ------
    ValueError
        If the molecule has more than `max_atoms` atoms.
    """
    too_large = len(mol.GetAtoms()) > max_atoms
    if too_large:
        raise ValueError("A molecule is larger than permitted by max_atoms. "
                         "Increase max_atoms and try again.")

    atomic_numbers = np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()])
    return pad_array(atomic_numbers, max_atoms)
def coulomb_matrix(self, mol: RDKitMol) -> np.ndarray:
    """
    Build one Coulomb matrix per conformer of the given molecule.

    If the molecule has no conformer, hydrogens are added and one conformer
    is embedded with ETKDG. Hydrogens are stripped afterwards when
    `self.remove_hydrogens` is set. Every matrix is passed through
    `pad_array` with `self.max_atoms`; with `self.randomize` enabled, each
    matrix yielded by `self.randomize_coulomb_matrix` is stored instead of
    the plain one.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        The coulomb matrices of the given molecule
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem
    except ModuleNotFoundError:
        raise ImportError("This class requires RDKit to be installed.")

    # Without any conformer there are no coordinates yet: embed one.
    if len(mol.GetConformers()) == 0:
        mol = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol, AllChem.ETKDG())

    if self.remove_hydrogens:
        mol = Chem.RemoveHs(mol)

    atomic_numbers = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    diagonal = range(mol.GetNumAtoms())
    # These depend only on the atoms, not on the conformer.
    charge_products = np.outer(atomic_numbers, atomic_numbers)
    self_terms = 0.5 * np.array(atomic_numbers)**2.4

    matrices = []
    for conformer in mol.GetConformers():
        distances = self.get_interatomic_distances(conformer)
        # Off-diagonal entries are Z_i * Z_j / d_ij; the diagonal is then
        # replaced by 0.5 * Z_i ** 2.4.
        coulomb = charge_products / distances
        coulomb[diagonal, diagonal] = self_terms

        if not self.randomize:
            matrices.append(pad_array(coulomb, self.max_atoms))
            continue
        for shuffled in self.randomize_coulomb_matrix(coulomb):
            matrices.append(pad_array(shuffled, self.max_atoms))

    return np.asarray(matrices)
def _featurize(self, datapoint: PymatgenStructure, **kwargs) -> np.ndarray:
    """
    Calculate sine Coulomb matrix from pymatgen structure.

    Parameters
    ----------
    datapoint: pymatgen.core.Structure
        A periodic crystal composed of a lattice and a sequence of atomic
        sites with 3D coordinates and elements.
    **kwargs:
        May contain the deprecated keyword `struct`. It is used only when
        `datapoint` is None, and a `DeprecationWarning` is emitted.

    Returns
    -------
    features: np.ndarray
        2D sine Coulomb matrix with shape (max_atoms, max_atoms),
        or 1D matrix eigenvalues with shape (max_atoms,).

    Raises
    ------
    ImportError
        If matminer is needed to build the wrapped featurizer but is not
        installed.
    """
    if 'struct' in kwargs and datapoint is None:
        import warnings

        # Fall back to the deprecated keyword and warn, instead of raising
        # the warning class as an exception (which made the fallback dead).
        datapoint = kwargs.get("struct")
        warnings.warn(
            'Struct is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    # Create the matminer featurizer lazily, on first use.
    if self.scm is None:
        try:
            from matminer.featurizers.structure import SineCoulombMatrix as SCM
            self.scm = SCM(flatten=False)
        except ModuleNotFoundError:
            raise ImportError(
                "This class requires matminer to be installed.")

    # Get full N x N SCM
    sine_mat = self.scm.featurize(datapoint)

    if self.flatten:
        # Keep only the eigenvalues, written into a zero vector of length
        # `max_atoms`.
        eigs, _ = np.linalg.eig(sine_mat)
        zeros = np.zeros(self.max_atoms)
        zeros[:len(eigs[0])] = eigs[0]
        features = zeros
    else:
        features = pad_array(sine_mat, self.max_atoms)

    return np.asarray(features)
def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Compute the symmetry-function input features of a molecule.

    The atomic numbers are placed as a leading column in front of the
    coordinates returned by `self.coordfeat._featurize`, and the result is
    padded with `pad_array`.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        A numpy array of symmetry function. The shape is `(max_atoms, 4)`.
    """
    coords = self.coordfeat._featurize(mol)

    # One row per atom, a single column holding the atomic number.
    atomic_column = np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()
                             ])[:, np.newaxis]
    assert atomic_column.shape[0] == coords.shape[0]

    stacked = np.concatenate((atomic_column, coords), axis=1)
    target_shape = (self.max_atoms, 4)
    return pad_array(stacked, target_shape)
def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
    """
    Calculate eigenvalues of Coulomb matrix for molecules. Eigenvalues
    are returned sorted by absolute value in descending order and padded
    by max_atoms.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit Mol object
    **kwargs:
        May contain the deprecated keyword `mol`. When present it replaces
        `datapoint` and a `DeprecationWarning` is emitted.

    Returns
    -------
    np.ndarray
        The eigenvalues of Coulomb matrix for molecules.
        The default shape is `(num_confs, max_atoms)`.
        If num_confs == 1, the shape is `(max_atoms,)`.
    """
    if 'mol' in kwargs:
        import warnings

        # Honour the deprecated keyword and warn, instead of raising the
        # warning class as an exception (which made the fallback unusable).
        datapoint = kwargs.get("mol")
        warnings.warn(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    cmat = self.coulomb_matrix(datapoint)
    features_list = []
    for f in cmat:
        # Only the eigenvalues are used; the eigenvectors are discarded.
        w, _ = np.linalg.eig(f)
        # Indices that order the eigenvalues by decreasing magnitude.
        sortidx = np.argsort(np.abs(w))[::-1]
        features_list.append(pad_array(w[sortidx], self.max_atoms))

    features = np.asarray(features_list)
    if features.shape[0] == 1:
        # `(1, max_atoms)` -> `(max_atoms,)`
        features = np.squeeze(features, axis=0)
    return features
def _featurize(self, struct: PymatgenStructure) -> np.ndarray:
    """
    Compute the sine Coulomb matrix features of a pymatgen structure.

    The matminer `SineCoulombMatrix` featurizer is created on first use and
    cached on `self.scm`.

    Parameters
    ----------
    struct: pymatgen.core.Structure
        A periodic crystal composed of a lattice and a sequence of atomic
        sites with 3D coordinates and elements.

    Returns
    -------
    features: np.ndarray
        2D sine Coulomb matrix with shape (max_atoms, max_atoms),
        or 1D matrix eigenvalues with shape (max_atoms,).

    Raises
    ------
    ImportError
        If matminer is needed to build the wrapped featurizer but is not
        installed.
    """
    if self.scm is None:
        try:
            from matminer.featurizers.structure import SineCoulombMatrix as SCM
            self.scm = SCM(flatten=False)
        except ModuleNotFoundError:
            raise ImportError("This class requires matminer to be installed.")

    # Full N x N sine Coulomb matrix as produced by the wrapped featurizer.
    scm_output = self.scm.featurize(struct)

    if not self.flatten:
        return np.asarray(pad_array(scm_output, self.max_atoms))

    # Flattened mode: keep only the eigenvalues, written into a zero vector
    # of length `max_atoms`. The first entry of the eigenvalue output is used.
    eigenvalues = np.linalg.eig(scm_output)[0]
    leading = eigenvalues[0]
    padded = np.zeros(self.max_atoms)
    padded[:len(leading)] = leading
    return padded