def CalculateMinHashFingerprint(mol: Chem.Mol, radius: int = 3, rtype: str = 'bitstring', bits: int = 2048) -> Any:
    """Calculate the MinHash Fingerprint (MHFP) of a molecule.

    doi: 10.1186/s13321-018-0321-8.

    :param mol: molecule to fingerprint.
    :param radius: maximum radius of atom-centered substructures.
    :param rtype: Type of output, may either be:
                  bitstring (default), returns a binary string
                  numpy, return the underlying numpy array
                  dict, for a dict of bits turned on
                  Any other value falls through to the bitstring output.
    :param bits: Number of folded bits (ignored if rtype != 'bitstring')
    :return: exactly one value whose type depends on ``rtype``: a ``str``
             for 'bitstring', a ``dict`` for 'dict', or the raw hash values
             for 'numpy'.
    """
    encoder = MHFPEncoder()
    # NOTE(review): the three trailing positional arguments are presumably
    # rings=True, kekulize=True, min_radius=1 -- confirm against the mhfp API.
    shingles = encoder.shingling_from_mol(mol, radius, True, True, 1)
    hash_values = encoder.hash(shingles)

    if rtype == 'numpy':
        return hash_values
    if rtype == 'dict':
        return {value: 1 for value in hash_values.tolist()}

    # Default branch: fold the hash values down to ``bits`` positions and
    # concatenate their string forms into a single string.
    folded = encoder.fold(hash_values, bits)
    return ''.join(map(str, folded))
class MAP4Calculator:
    """MAP4 calculator class.

    Builds atom-pair shingles of the form ``env_a|distance|env_b`` from the
    rooted SMILES of the circular environments around every pair of atoms,
    then either MinHashes them (default) or hashes and folds them.
    """

    def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False):
        """Configure the calculator.

        Arguments:
            dimensions -- size passed to the encoder; also the folded length
                          when ``is_folded`` is true
            radius -- largest environment radius (in bonds) to consider
            is_counted -- when true, repeated shingles are made distinct by
                          appending their running occurrence count
            is_folded -- when true, use ``MHFPEncoder`` hash + fold instead of
                         the ``tm.Minhash`` encoder
        """
        # Kept on the instance so _fold can honour the requested length.
        self.dimensions = dimensions
        self.radius = radius
        self.is_counted = is_counted
        self.is_folded = is_folded

        if self.is_folded:
            self.encoder = MHFPEncoder(dimensions)
        else:
            self.encoder = tm.Minhash(dimensions)

    def calculate(self, mol):
        """Calculates the atom pair minhashed fingerprint

        Arguments:
            mol -- rdkit mol object

        Returns:
            tmap VectorUint -- minhashed fingerprint
            (the folded fingerprint instead when ``is_folded`` is set)
        """
        atom_env_pairs = self._calculate(mol)
        if self.is_folded:
            return self._fold(atom_env_pairs)
        return self.encoder.from_string_array(atom_env_pairs)

    def calculate_many(self, mols):
        """Calculates the atom pair minhashed fingerprint

        Arguments:
            mols -- list of mols

        Returns:
            list of tmap VectorUint -- minhashed fingerprints list
            (a list of folded fingerprints when ``is_folded`` is set)
        """
        atom_env_pairs_list = [self._calculate(mol) for mol in mols]
        if self.is_folded:
            return [self._fold(pairs) for pairs in atom_env_pairs_list]
        return self.encoder.batch_from_string_array(atom_env_pairs_list)

    def _calculate(self, mol):
        """Return the de-duplicated, UTF-8 encoded shingles of ``mol``."""
        return self._all_pairs(mol, self._get_atom_envs(mol))

    def _fold(self, pairs):
        """Hash the unique shingles and fold them to ``self.dimensions``."""
        fp_hash = self.encoder.hash(set(pairs))
        # Pass the length explicitly; otherwise the encoder's own default is
        # used and the ``dimensions`` given to the constructor is ignored.
        return self.encoder.fold(fp_hash, self.dimensions)

    def _get_atom_envs(self, mol):
        """Map each atom index to its environment SMILES for radii 1..radius."""
        atoms_env = {}
        for atom in mol.GetAtoms():
            idx = atom.GetIdx()
            for radius in range(1, self.radius + 1):
                atoms_env.setdefault(idx, []).append(
                    self._find_env(mol, idx, radius))
        return atoms_env

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the environment of ``radius`` around ``idx``.

        An empty string is returned when the centre atom is not part of the
        extracted sub-molecule.
        """
        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        atom_map = {}
        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
        if idx not in atom_map:
            return ''
        return Chem.MolToSmiles(submol, rootedAtAtom=atom_map[idx],
                                canonical=True, isomericSmiles=False)

    def _all_pairs(self, mol, atoms_env):
        """Build the unique ``env|dist|env`` shingles for every atom pair."""
        shingles = set()
        shingle_counts = defaultdict(int)
        distance_matrix = GetDistanceMatrix(mol)

        for idx1, idx2 in itertools.combinations(range(mol.GetNumAtoms()), 2):
            dist = str(int(distance_matrix[idx1][idx2]))
            for i in range(self.radius):
                # Sort the two environments so the shingle does not depend on
                # which atom of the pair is visited first.
                first, second = sorted((atoms_env[idx1][i], atoms_env[idx2][i]))
                shingle = '{}|{}|{}'.format(first, dist, second)
                if self.is_counted:
                    shingle_counts[shingle] += 1
                    shingle += '|' + str(shingle_counts[shingle])
                shingles.add(shingle.encode('utf-8'))

        return list(shingles)
class Map4Fingerprint:
    """Calculates the atom pair minhashed fingerprint for a given molecular object.

    Fingerprint is as described by `DOI: 10.1186/1758-2946-5-26` and
    implemented in the
    [corresponding repository](https://github.com/reymond-group/map4).
    """
    # NOTE(review): the DOI quoted above should be double-checked against the
    # MAP4 publication -- it may point at a different paper.

    def __init__(self, dimensions=1024, radius=2, is_counted=False,
                 is_folded=False, return_strings=False):
        """
        Parameters
        ----------
        dimensions : int (default = 1024)
            Number of entries in the output map4 fingerprint.
        radius : int (default = 2)
            Number of bonds away from atom centre to consider.
        is_counted : bool (default = False)
            If True, repeated substructure strings are made distinct by
            appending their running occurrence count.
        is_folded : bool (default = False)
            If True, hash and fold with `MHFPEncoder` instead of using the
            `tm.Minhash` encoder. Takes precedence over `return_strings`.
        return_strings : bool (default = False)
            If True then returns substructure strings rather than hashed
            fingerprint.
        """
        self.dimensions = int(dimensions)
        self.radius = int(radius)
        self.is_counted = bool(is_counted)
        self.is_folded = bool(is_folded)
        self.return_strings = bool(return_strings)

        # Hand the encoder the coerced value so it always agrees with
        # ``self.dimensions`` (the raw argument may not be a plain int).
        if self.is_folded:
            self.encoder = MHFPEncoder(self.dimensions)
        else:
            self.encoder = tm.Minhash(self.dimensions)

    def __call__(self, mol):
        """Calculates the atom pair minhashed fingerprint for a given molecular object.

        Parameters
        ----------
        mol : rdkit.Chem.rdchem.Mol
            `rdkit` mol object.

        Returns
        -------
        fp_arr : np.ndarray
            Map4 fingerprint. When `is_folded` is set this is the folded
            fingerprint; otherwise, when `return_strings` is set, it is the
            array of encoded substructure strings; otherwise it is the
            minhashed fingerprint.
        """
        atom_envs = self._get_atom_envs(mol)
        atom_env_pairs = self._all_pairs(mol, atom_envs)

        if self.is_folded:
            fp_arr = self._fold(atom_env_pairs)
        elif self.return_strings:
            fp_arr = atom_env_pairs
        else:
            fp_arr = self.encoder.from_string_array(atom_env_pairs)
        return np.asarray(fp_arr)

    def _fold(self, pairs):
        """Hash the unique substructure strings and fold to `self.dimensions`."""
        fp_hash = self.encoder.hash(set(pairs))
        return self.encoder.fold(fp_hash, self.dimensions)

    def _get_atom_envs(self, mol):
        """Map each atom index to its environment SMILES for radii 1..radius."""
        atoms_env = {}
        for atom in mol.GetAtoms():
            idx = atom.GetIdx()
            for radius in range(1, self.radius + 1):
                atoms_env.setdefault(idx, []).append(
                    self._find_env(mol, idx, radius))
        return atoms_env

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the environment of `radius` around `idx`.

        An empty string is returned when the centre atom is not part of the
        extracted sub-molecule.
        """
        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        atom_map = {}
        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
        if idx not in atom_map:
            return ''
        return Chem.MolToSmiles(submol, rootedAtAtom=atom_map[idx],
                                canonical=True, isomericSmiles=False)

    def _all_pairs(self, mol, atoms_env):
        """Build the unique ``env|dist|env`` strings for every atom pair.

        Returns a sorted list of UTF-8 encoded strings.
        """
        shingles = set()
        shingle_counts = defaultdict(int)
        distance_matrix = GetDistanceMatrix(mol)

        for idx1, idx2 in itertools.combinations(range(mol.GetNumAtoms()), 2):
            dist = str(int(distance_matrix[idx1][idx2]))
            for i in range(self.radius):
                # Sort the two environments so the string does not depend on
                # which atom of the pair is visited first.
                first, second = sorted((atoms_env[idx1][i], atoms_env[idx2][i]))
                shingle = '{}|{}|{}'.format(first, dist, second)
                if self.is_counted:
                    shingle_counts[shingle] += 1
                    shingle += '|' + str(shingle_counts[shingle])
                shingles.add(shingle.encode('utf-8'))

        # Set iteration order over bytes changes between interpreter runs
        # (hash randomisation); sorting keeps the `return_strings` output
        # reproducible. The hashed outputs are built from the same set of
        # strings either way.
        return sorted(shingles)