def get_all(self):
    """Tag every substrate with its reacting atoms and write it out.

    For each substrate/product pair returned by ``self.get_mol()`` the
    bond-order adjacency matrices (cast to ``int``) and the atom-map lookup
    tables are built and passed to ``self.compare_adj``. The values it returns
    are stored, each preceded by a space, in the substrate's ``SOM`` property;
    the substrate's ``_Name`` property is set and the molecule is handed to
    the module-level writer ``w``.
    """
    sub_mol, pro_mol, name = self.get_mol()
    for idx in tqdm(range(len(sub_mol))):
        substrate = sub_mol[idx]
        product = pro_mol[idx]

        sub_adj = rdmolops.GetAdjacencyMatrix(substrate, useBO=1).astype(int)
        pro_adj = rdmolops.GetAdjacencyMatrix(product, useBO=1).astype(int)
        sub_map2id = self.map2id(substrate)
        pro_map2id = self.map2id(product)
        sub_id2map = self.id2map(substrate)
        pro_id2map = self.id2map(product)
        sub2pro_id = self.sub_map2pro_map2pro_id(substrate, product)
        pro2sub_id = self.pro_map2sub_map2sub_id(substrate, product)

        reacting = self.compare_adj(set(), sub_adj, pro_adj, sub_id2map,
                                    sub_map2id, pro_map2id, pro_id2map,
                                    sub2pro_id, pro2sub_id)

        # Every entry is prefixed with a space, so a non-empty result starts
        # with a leading blank; an empty result gives ''.
        som = ''.join(' ' + str(atom) for atom in reacting)

        # NOTE(review): the original layout was lost; this assumes the
        # molecule is written even when no reacting atom was found -- confirm.
        substrate.SetProp('SOM', som)
        substrate.SetProp('_Name', name[idx])
        w.write(substrate)
    print('done')
def score(self, smiles):
    """Return the normalised composite score of a SMILES string.

    The result is the sum of three standardised terms, each shifted and
    scaled by the matching ``self._*_mean`` / ``self._*_std`` attribute:

    * the negated ``sascorer.calculateScore`` value,
    * ``MolLogP`` (``-1000`` when it cannot be computed),
    * a ring penalty: minus the number of atoms by which the longest cycle
      in the networkx cycle basis exceeds six.

    Args:
        smiles: SMILES string of the molecule to score.

    Returns:
        The summed normalised score.
    """
    mol = Chem.MolFromSmiles(smiles)
    try:
        logp = MolLogP(mol)
    except Exception:
        # Was a bare ``except``; narrowed so Ctrl-C / SystemExit propagate.
        logp = -1000
    sa_score = -sascorer.calculateScore(mol)

    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    longest_ring = max((len(ring) for ring in rings), default=0)
    cycle_score = -max(longest_ring - 6, 0)

    sa_score_norm = (sa_score - self._sa_mean) / self._sa_std
    logp_norm = (logp - self._logp_mean) / self._logp_std
    cycle_score_norm = (cycle_score - self._cycle_mean) / self._cycle_std
    return sa_score_norm + logp_norm + cycle_score_norm
def calc_score(smiles):
    """Score a SMILES string, returning ``-1e10`` for anything unusable.

    ``-1e10`` is returned when ``verify_sequence`` rejects the string, when
    the molecular weight exceeds 500, or when any step raises. Otherwise the
    score is the sum of the standardised negated SA score, logP and ring
    penalty, using the module-level ``*_mean`` / ``*_std`` constants.
    """
    rejected = -1e10
    if not verify_sequence(smiles):
        return rejected

    try:
        molecule = MolFromSmiles(smiles)
        if Descriptors.MolWt(molecule) > 500:
            return rejected

        log_p = Descriptors.MolLogP(molecule)
        sa = -sascorer.calculateScore(molecule)

        rings = nx.cycle_basis(
            nx.Graph(rdmolops.GetAdjacencyMatrix(molecule)))
        longest = max((len(ring) for ring in rings), default=0)
        # Only atoms beyond a six-membered ring are penalised.
        ring_score = -(longest - 6 if longest > 6 else 0)

        sa_norm = (sa - SA_mean) / SA_std
        log_p_norm = (log_p - logP_mean) / logP_std
        ring_norm = (ring_score - cycle_mean) / cycle_std
        return sa_norm + log_p_norm + ring_norm
    except Exception:
        return rejected
def _tensorize(self, batch_x):
    """Pack a batch of molecules into zero-padded feature/adjacency arrays.

    Args:
        batch_x: iterable of molecules exposing ``GetNumAtoms`` and accepted
            by ``rdmolops.GetAdjacencyMatrix`` and ``self.get_atom_features``.

    Returns:
        ``[atom_tensor, adjm_tensor]`` with shapes
        ``(batch, self.num_atoms, self.get_num_features())`` and
        ``(batch, self.num_atoms, self.num_atoms)``.
    """
    batch_size = len(batch_x)
    atom_tensor = np.zeros(
        (batch_size, self.num_atoms, self.get_num_features()))
    adjm_tensor = np.zeros((batch_size, self.num_atoms, self.num_atoms))

    for row, mol in enumerate(batch_x):
        n = mol.GetNumAtoms()
        atom_tensor[row, :n, :] = self.get_atom_features(mol)

        # A_hat = A + I, then D^(-1/2) * A_hat * D^(-1/2) (Kipf et al. 2016).
        a_hat = np.array(rdmolops.GetAdjacencyMatrix(mol), dtype="float")
        a_hat += np.eye(n)
        inv_sqrt = np.power(np.array(a_hat.sum(1)), -0.5)
        inv_sqrt[np.isinf(inv_sqrt)] = 0.
        d_mat = np.diag(inv_sqrt)
        adjm_tensor[row, :n, :n] = d_mat @ a_hat @ d_mat

    return [atom_tensor, adjm_tensor]
def logp_evaluator(self, new_compound, rank):
    """Score the first SMILES in ``new_compound`` and squash it into (-1, 1).

    The raw score is ``-SA + logP - ring_penalty`` (no normalisation), where
    the ring penalty is the number of atoms by which the longest cycle in the
    networkx cycle basis exceeds six. It is mapped through
    ``x / (1 + |x|)``. Unparseable input scores ``-1000 / 1001``.

    Args:
        new_compound: sequence whose first element is the SMILES to score.
        rank: unused; kept so existing callers keep working.

    Returns:
        Tuple ``(score, new_compound[0])``.
    """
    smiles = new_compound[0]
    try:
        mol = Chem.MolFromSmiles(str(smiles))
    except Exception:
        # Was ``except BaseException``, which also swallowed Ctrl-C.
        mol = None

    if mol is None:
        return -1000 / (1 + 1000), smiles

    try:
        logp = Descriptors.MolLogP(mol)
    except Exception:
        logp = -1000

    # The molecule parsed above is reused rather than re-parsing the SMILES
    # for every descriptor.
    sa_score = -sascorer.calculateScore(mol)
    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    longest_ring = max((len(ring) for ring in rings), default=0)
    cycle_score = -max(longest_ring - 6, 0)

    score_one = sa_score + logp + cycle_score
    return score_one / (1 + abs(score_one)), smiles
def compute_mol_score(s):
    """Return the standardised SA + logP + ring score of SMILES ``s``.

    Reference distributions come from ``get_rdkit_score()``; each term is
    z-scored against the mean and standard deviation of the matching
    reference list. The ring term is minus the number of atoms by which the
    longest cycle in the networkx cycle basis exceeds six.

    Args:
        s: SMILES string.

    Returns:
        Sum of the three normalised terms.
    """
    # Only the raw reference lists are needed; the pre-normalised ones that
    # get_rdkit_score() also returns are ignored.
    logP_values, SA_scores, cycle_scores, _, _, _ = get_rdkit_score()

    # Parse once and share the molecule between the three descriptors.
    mol = MolFromSmiles(s)
    log_p = Descriptors.MolLogP(mol)
    sa = -sascorer.calculateScore(mol)
    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    longest_ring = max((len(ring) for ring in rings), default=0)
    ring_score = -max(longest_ring - 6, 0)

    sa_norm = (sa - np.mean(SA_scores)) / np.std(SA_scores)
    log_p_norm = (log_p - np.mean(logP_values)) / np.std(logP_values)
    ring_norm = (ring_score - np.mean(cycle_scores)) / np.std(cycle_scores)
    return sa_norm + log_p_norm + ring_norm
def load_bbbp(N=40):
    """Load ``bbbp/BBBP.csv`` into a list of graph ``Data`` objects.

    Rows whose SMILES cannot be parsed, or whose molecule has more than ``N``
    atoms, are skipped. For each remaining molecule the bond-order adjacency
    matrix is zero-padded to ``(N, N)`` and the atomic numbers are padded
    with zeros to length ``N`` before one-hot encoding.

    Args:
        N: maximum number of atoms per molecule (also the padded size).

    Returns:
        List of ``Data`` objects with fields ``x``, ``edge_index``,
        ``edge_attr``, ``y``, ``A`` and ``mol_num``.
    """
    print('Loading data...')
    df = pd.read_csv('bbbp/BBBP.csv')

    feature_matrices = []
    adj_matrices = []
    labels = []
    nums = []
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        mol = Chem.MolFromSmiles(row.smiles)
        if mol is None:
            continue

        adj = rdmolops.GetAdjacencyMatrix(mol, useBO=True)
        s0, s1 = adj.shape
        if s0 > N:
            continue

        adj_matrix = np.zeros((N, N))
        adj_matrix[:s0, :s1] = adj
        adj_matrices.append(adj_matrix)

        atomic_nums = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
        feature_matrices.append(atomic_nums + [0] * (N - len(atomic_nums)))

        labels.append(row.p_np)
        # Recorded only for kept rows. Previously this was appended before
        # the skip checks, so nums[i] drifted away from labels[i] as soon as
        # one row was dropped.
        nums.append(row.num)

    try:
        enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the ``sparse`` keyword.
        enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    one_hot_feature_matrices = enc.fit_transform(feature_matrices)
    # NOTE(review): the reshape assumes exactly 8 one-hot columns per atom
    # position -- confirm this holds for the encoder output on this dataset.
    one_hot_feature_matrices = np.reshape(one_hot_feature_matrices,
                                          (-1, N, 8))

    dataset = []
    for features, adj_matrix, label, num in zip(one_hot_feature_matrices,
                                                adj_matrices, labels, nums):
        X = torch.from_numpy(features).float()
        A = torch.from_numpy(adj_matrix).float()
        y = torch.Tensor([[label]]).float()
        mol_num = torch.Tensor([num])

        # Sparse (COO) view of the padded adjacency: indices plus weights.
        A_coo = coo_matrix(A)
        edge_index = torch.from_numpy(
            np.vstack([A_coo.row, A_coo.col])).long()
        edge_weight = torch.from_numpy(A_coo.data).float()

        dataset.append(
            Data(x=X,
                 edge_index=edge_index,
                 edge_attr=edge_weight,
                 y=y,
                 A=A,
                 mol_num=mol_num))
    return dataset
def get_atom_features(mol, dist_matrix):
    """
    Compute the following features for each atom in 'mol':
        - atom type: one-hot over C.SYMBOLS
        - degree: one-hot over C.DEGREES
        - hybridization: one-hot over C.HYBRIDIZATIONS
        - is aromatic: bool {0, 1}
        - formal charge: int
        - atomic number
        - average bond length: mean of 'dist_matrix' over bonded neighbours
        - average atomic number of the neighbouring atoms

    Returns an array of shape (n_atoms, C.N_ATOM_FEATURES); columns beyond
    the features listed above stay zero.
    """
    features = np.zeros((mol.GetNumAtoms(), C.N_ATOM_FEATURES))
    adj_matrix = rdmolops.GetAdjacencyMatrix(mol)

    for atom in mol.GetAtoms():
        idx = atom.GetIdx()
        bonds = adj_matrix[idx]
        if bonds.sum() > 0:
            # Mean distance to, and mean atomic number of, bonded neighbours.
            ave_bond_length = np.mean(dist_matrix[idx][bonds == 1])
            ave_neighbor_wt = np.mean(
                [nbr.GetAtomicNum() for nbr in atom.GetNeighbors()])
        else:
            # Isolated atom: no neighbours to average over.
            ave_bond_length, ave_neighbor_wt = 0.0, 0.0

        scalar_feats = [
            atom.GetIsAromatic(),
            atom.GetFormalCharge(),
            atom.GetAtomicNum(),
            ave_bond_length,
            ave_neighbor_wt,
        ]
        a_feats = (one_hot_encoding(atom.GetSymbol(), C.SYMBOLS)
                   + one_hot_encoding(atom.GetDegree(), C.DEGREES)
                   + one_hot_encoding(atom.GetHybridization(),
                                      C.HYBRIDIZATIONS)
                   + scalar_feats)
        features[idx, :len(a_feats)] = np.array(a_feats)

    return features
def mol2graph_igraph(mol):
    """
    Convert a molecule to an undirected igraph graph.

    ``mol`` must provide ``to_rdkit()``. The graph is built from the RDKit
    adjacency matrix only; no vertex or edge attributes (atomic numbers,
    bond types) are attached.

    Adapted from
    https://iwatobipen.wordpress.com/2016/12/30/convert-rdkit-molecule-object-to-igraph-graph-object/
    """
    rd_mol = mol.to_rdkit()
    adjacency = rdmolops.GetAdjacencyMatrix(rd_mol).tolist()
    # Adjacency() builds a directed graph from the symmetric matrix;
    # as_undirected() collapses each pair of opposite arcs into one edge.
    return igraph.Graph.Adjacency(adjacency).as_undirected()
def calc_score(mol):
    """Return the standardised SA + logP + ring score of an RDKit molecule.

    Molecules heavier than 500 (``Descriptors.MolWt``) get ``-1e10``.
    Otherwise each term is z-scored with the fixed reference statistics below
    and the three are summed. The ring term is minus the number of atoms by
    which the longest cycle in the networkx cycle basis exceeds six.

    Args:
        mol: molecule accepted by the RDKit descriptor functions.

    Returns:
        The score as a float.
    """
    # Pre-computed statistics of the reference data set.
    logP_mean = 2.457
    logP_std = 1.434
    SA_mean = -3.053
    SA_std = 0.834
    cycle_mean = -0.048
    cycle_std = 0.287

    if Descriptors.MolWt(mol) > 500:
        return -1e10

    log_p = Descriptors.MolLogP(mol)
    sa = -sascorer.calculateScore(mol)
    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    longest_ring = max((len(ring) for ring in rings), default=0)
    ring_score = -max(longest_ring - 6, 0)

    sa_norm = (sa - SA_mean) / SA_std
    log_p_norm = (log_p - logP_mean) / logP_std
    ring_norm = (ring_score - cycle_mean) / cycle_std
    # The score used to be computed and then dropped, so the function
    # returned None for every molecule under the weight limit.
    return sa_norm + log_p_norm + ring_norm
def gaussion_workers(chem_model, val):
    """MPI worker loop: receive a task, roll out a molecule, send its score.

    On a ``START`` message the task's state and second element are passed to
    ``chem_kn_simulation``; the first generated SMILES is scored as the sum
    of the standardised negated SA score, logP and ring penalty (module-level
    ``*_mean`` / ``*_std``), or ``-1000`` when it cannot be parsed. The list
    ``[score, smiles, rank]`` is sent back to rank 0 with tag ``DONE``.
    On an ``EXIT`` message ``MPI.Abort`` is called.

    Relies on the module-level names ``comm``, ``status``, ``rank``,
    ``START``, ``DONE`` and ``EXIT``.

    Args:
        chem_model: model handed to ``chem_kn_simulation``.
        val: token vocabulary used for the roll-out and SMILES decoding.
    """
    while True:
        simulation_time = time.time()
        task = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
        tag = status.Get_tag()
        if tag == START:
            state = task[0]
            m = task[1]
            all_posible = chem_kn_simulation(chem_model, state, val, m)
            generate_smile = predict_smile(all_posible, val)
            new_compound = make_input_smile(generate_smile)
            score = []
            try:
                mol = Chem.MolFromSmiles(str(new_compound[0]))
            except Exception:
                # Was a bare ``except``; narrowed so Ctrl-C still works.
                mol = None

            if mol is not None:
                try:
                    logp = Descriptors.MolLogP(mol)
                except Exception:
                    logp = -1000
                # Reuse the parsed molecule instead of re-parsing the SMILES.
                SA_score = -sascorer.calculateScore(mol)
                cycle_list = nx.cycle_basis(
                    nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
                longest = max(len(c) for c in cycle_list) if cycle_list else 0
                cycle_score = -(longest - 6 if longest > 6 else 0)

                SA_score_norm = (SA_score - SA_mean) / SA_std
                logp_norm = (logp - logP_mean) / logP_std
                cycle_score_norm = (cycle_score - cycle_mean) / cycle_std
                score.append(SA_score_norm + logp_norm + cycle_score_norm)
            else:
                score.append(-1000)

            score.append(new_compound[0])
            score.append(rank)
            comm.send(score, dest=0, tag=DONE)
            simulation_fi_time = time.time() - simulation_time
            # Single-argument form prints the same text on Python 2 and 3
            # (the old ``print a, b`` statement is a syntax error on 3).
            print("simulation_fi_time: %s" % simulation_fi_time)
        if tag == EXIT:
            MPI.Abort(MPI.COMM_WORLD)

    # NOTE(review): unreachable as laid out here -- the loop has no break and
    # only ends through MPI.Abort. Kept from the original; confirm intent.
    comm.send(None, dest=0, tag=EXIT)
def calc_score(smiles):
    """Return the standardised SA + logP + ring score of a SMILES string.

    Each term is z-scored against the module-level reference lists
    ``SA_scores``, ``logP_values`` and ``cycle_scores``.

    Raises:
        ValueError: if ``verify_sequence`` rejects ``smiles``.
    """
    if not verify_sequence(smiles):
        raise ValueError("Error in calc_score: smiles is invalid.")

    molecule = MolFromSmiles(smiles)
    log_p = Descriptors.MolLogP(molecule)
    sa = -sascorer.calculateScore(molecule)

    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(molecule)))
    longest = max((len(ring) for ring in rings), default=0)
    # Atoms beyond a six-membered ring count as a penalty.
    ring_score = -(longest - 6 if longest > 6 else 0)

    sa_norm = (sa - np.mean(SA_scores)) / np.std(SA_scores)
    log_p_norm = (log_p - np.mean(logP_values)) / np.std(logP_values)
    ring_norm = (ring_score - np.mean(cycle_scores)) / np.std(cycle_scores)
    return sa_norm + log_p_norm + ring_norm
def check_node_type(new_compound):
    """Dock every parseable compound that has no cycle longer than six.

    A compound is kept when its SMILES parses, the longest cycle in its
    networkx cycle basis has at most six atoms, and ``rdock_score`` returns a
    value below ``10**10``.

    Args:
        new_compound: sequence of SMILES strings.

    Returns:
        Tuple ``(node_index, score, valid_compound)``: positions of the kept
        compounds in ``new_compound``, their docking scores, and their
        SMILES.
    """
    node_index = []
    valid_compound = []
    score = []
    for i, smiles in enumerate(new_compound):
        # Parsed once and reused (the original re-parsed for every
        # descriptor and also computed an SA score it never used).
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue

        rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        if max((len(ring) for ring in rings), default=0) > 6:
            continue

        docking = rdock_score(smiles)
        if docking < 10**10:
            node_index.append(i)
            valid_compound.append(smiles)
            score.append(docking)
    return node_index, score, valid_compound
def is_ts_correct(rsmi, psmi, irc_start_xyz, irc_end_xyz):
    """
    Compare the input SMILES with the SMILES of the IRC end points.

    The transition state counts as found when the two IRC end points
    reproduce the reactant and product SMILES in either direction. The
    adjacency-matrix comparison that follows only prints diagnostics and
    does not affect the result.

    Args:
        rsmi: reactant SMILES.
        psmi: product SMILES.
        irc_start_xyz: geometry of the reverse IRC end point.
        irc_end_xyz: geometry of the forward IRC end point.

    Returns:
        True if the SMILES of the end points match reactant and product.
    """
    print(rsmi, psmi)
    rmol = smiles_to_mol(rsmi)
    pmol = smiles_to_mol(psmi)
    charge = GetFormalCharge(rmol)
    ts_found = False

    # SMILES check
    irc_start_smi, _, _ = get_smiles(irc_start_xyz, charge)
    print("reverse SMILES: ", irc_start_smi)
    irc_end_smi, _, _ = get_smiles(irc_end_xyz, charge)
    print("forward smiles: ", irc_end_smi)

    if irc_start_smi == rsmi and irc_end_smi == psmi:
        ts_found = True
        print("SMILES MATCH: TS FOUND: reactant = reverse")
    if irc_start_smi == psmi and irc_end_smi == rsmi:
        ts_found = True
        print("SMILES MATCH: TS FOUND: reactant = forward")

    # Adjacency (atom connectivity) check
    r_ac = rdmolops.GetAdjacencyMatrix(rmol)
    p_ac = rdmolops.GetAdjacencyMatrix(pmol)
    irc_start_ac = rdmolops.GetAdjacencyMatrix(smiles_to_mol(irc_start_smi))
    irc_end_ac = rdmolops.GetAdjacencyMatrix(smiles_to_mol(irc_end_smi))

    # np.array_equal returns False for differently shaped matrices, whereas
    # ``np.all(a == b)`` either broadcasts or raises on a shape mismatch.
    if np.array_equal(irc_start_ac, irc_end_ac):
        print("found TS for conformational change")
    else:
        print("found non-coonformational change")

    if np.array_equal(r_ac, irc_start_ac) and np.array_equal(p_ac, irc_end_ac):
        print("AC MATCH: reactant = reverse")
    if np.array_equal(p_ac, irc_start_ac) and np.array_equal(r_ac, irc_end_ac):
        print("AC MATCH: reactant = forward")

    return ts_found
def compute(self, mol):
    """Return by how many atoms the longest basis cycle of ``mol`` exceeds six.

    Cycles come from ``nx.cycle_basis`` on the RDKit adjacency matrix.
    Molecules without cycles, or whose longest cycle has six atoms or fewer,
    give 0.
    """
    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    if not rings:
        return 0
    longest = max(len(ring) for ring in rings)
    return longest - 6 if longest > 6 else 0
def max_pair_distance_pairs(mol: RDKitMol,
                            max_pair_distance: Optional[int]) -> np.ndarray:
    """Helper method which finds atom pairs within max_pair_distance graph distance.

    Two atoms are within graph distance ``k`` of each other exactly when a
    walk of some length ``<= k`` connects them. Starting from the identity
    (every atom reaches itself), the set of atoms reachable by a walk of
    length ``d`` is extended one bond at a time by multiplying with the
    adjacency matrix, and the union over ``d = 0 .. max_pair_distance`` is
    collected.

    Only reachability is tracked (boolean matrices), not the number of
    walks: walk counts grow exponentially with the distance and overflow the
    integer adjacency matrix for larger molecules.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit molecules
    max_pair_distance: Optional[int], (default None)
        This value can be a positive integer or None. This parameter
        determines the maximum graph distance at which pair features are
        computed. For example, if `max_pair_distance==2`, then pair features
        are computed only for atoms at most graph distance 2 apart. If
        `max_pair_distance` is `None`, or is at least the number of atoms,
        all connected pairs are considered.

    Returns
    -------
    np.ndarray
        Of shape `(2, num_pairs)` where `num_pairs` is the total number of
        pairs within `max_pair_distance` of one another, self-pairs `(i, i)`
        included. Row 0 holds the first atom index of each pair, row 1 the
        second.

    Raises
    ------
    ValueError
        If `max_pair_distance` is smaller than the number of atoms and not
        positive.
    """
    from rdkit.Chem import rdmolops

    N = len(mol.GetAtoms())
    if max_pair_distance is None or max_pair_distance >= N:
        max_distance = N
    elif max_pair_distance <= 0:
        raise ValueError(
            "max_pair_distance must either be a positive integer or None")
    else:
        max_distance = max_pair_distance

    adj = (np.asarray(rdmolops.GetAdjacencyMatrix(mol)) != 0).astype(np.int64)

    # ``walk`` marks pairs joined by a walk of exactly the current length;
    # ``reachable`` accumulates all lengths seen so far. Starting from the
    # identity handles the self-pairs (i, i).
    walk = np.eye(N, dtype=bool)
    reachable = np.eye(N, dtype=bool)
    for _ in range(max_distance):
        # Entries of the product are bounded by N, so nothing can overflow.
        walk = (walk @ adj) > 0
        reachable |= walk

    # Stack the row and column indices into shape (2, num_pairs).
    pair_edges = np.array(np.where(reachable))
    return pair_edges
def check_node_type(new_compound, SA_mean, SA_std, logP_mean, logP_std,
                    cycle_mean, cycle_std):
    """Score every parseable SMILES of at most 81 characters.

    The score is the sum of the standardised negated SA score, logP
    (``-1000`` when it cannot be computed) and ring penalty, where the ring
    penalty is minus the number of atoms by which the longest cycle in the
    networkx cycle basis exceeds six.

    Args:
        new_compound: sequence of SMILES strings.
        SA_mean, SA_std, logP_mean, logP_std, cycle_mean, cycle_std:
            statistics used to standardise the three terms.

    Returns:
        Tuple ``(node_index, score, valid_compound, all_smile)``: positions
        and scores of the accepted compounds, their SMILES, and every input
        SMILES regardless of validity.
    """
    node_index = []
    valid_compound = []
    all_smile = []
    score = []
    for i, smiles in enumerate(new_compound):
        try:
            mol = Chem.MolFromSmiles(str(smiles))
        except Exception:
            print(None)
            # Without this reset the molecule of the previous iteration (or
            # an unbound name on the first one) was used after a failure.
            mol = None

        if mol is not None and len(smiles) <= 81:
            try:
                logp = Descriptors.MolLogP(mol)
            except Exception:
                logp = -1000
            node_index.append(i)
            valid_compound.append(smiles)

            # Reuse the parsed molecule instead of re-parsing the SMILES.
            SA_score = -sascorer.calculateScore(mol)
            rings = nx.cycle_basis(
                nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
            longest_ring = max((len(ring) for ring in rings), default=0)
            cycle_score = -max(longest_ring - 6, 0)

            SA_score_norm = (SA_score - SA_mean) / SA_std
            logp_norm = (logp - logP_mean) / logP_std
            cycle_score_norm = (cycle_score - cycle_mean) / cycle_std
            score.append(SA_score_norm + logp_norm + cycle_score_norm)

        # NOTE(review): assumed to run for every compound, valid or not (the
        # original indentation was lost) -- confirm.
        all_smile.append(smiles)
    return node_index, score, valid_compound, all_smile
def add_sc_angle_features(df, xyzs, dist_matrices):
    """
    Adds the following angle features to 'df':
        - diangle: absolute dihedral angle when the shortest bond path
          between the two coupled atoms has four atoms (3J couplings)
        - cos_angle: cosine of the angle along the path when it has three
          atoms (2J couplings)
        - cos_angle0: cosine of the angle (closest neighbour of atom 0,
          atom 0, atom 1); skipped for coupling types 0 and 2
        - cos_angle1: cosine of the angle (atom 0, atom 1, closest neighbour
          of atom 1 other than atom 0)

    Molecules are looked up in the module-level 'mols' dictionary. Rows for
    which a feature is not defined get 0; note that the final fillna is
    applied to the whole frame. 'df' is modified in place and returned.
    """
    for column in ('diangle', 'cos_angle', 'cos_angle0', 'cos_angle1'):
        df[column] = 0.0
    diangles, cos_angles, cos_angles0, cos_angles1 = {}, {}, {}, {}

    print('Add scalar coupling angle based features.')
    n = len(df)
    for i, (idx, row) in enumerate(df.iterrows()):
        print_progress(i, n, 500000)
        mol_name = row['molecule_name']
        mol = mols[mol_name]
        xyz = xyzs[mol_name]
        dist_matrix = dist_matrices[mol_name]
        adj_matrix = rdmolops.GetAdjacencyMatrix(mol)
        idx0, idx1 = row['atom_index_0'], row['atom_index_1']

        atom_ids = rdmolops.GetShortestPath(mol, idx0, idx1)
        path_length = len(atom_ids)
        if path_length == 4:
            diangles[idx] = dihedral(xyz[atom_ids,:])
        elif path_length == 3:
            cos_angles[idx] = cosine_angle(xyz[atom_ids,:])

        if row['type'] not in [0, 2]:
            neighbors0 = np.where(adj_matrix[idx0]==1)[0]
            if len(neighbors0) > 0:
                # Bonded neighbour of atom 0 with the smallest distance.
                nearest0 = neighbors0[dist_matrix[idx0][neighbors0].argmin()]
                cos_angles0[idx] = cosine_angle(
                    xyz[[nearest0, idx0, idx1],:])

        # NOTE(review): assumed to run for every coupling type, as the
        # docstring states (the original indentation was lost) -- confirm.
        neighbors1 = np.setdiff1d(np.where(adj_matrix[idx1]==1)[0], [idx0])
        if len(neighbors1) > 0:
            nearest1 = neighbors1[dist_matrix[idx1][neighbors1].argmin()]
            cos_angles1[idx] = cosine_angle(xyz[[idx0, idx1, nearest1],:])

    # Index-aligned assignment: rows without a value become NaN, then 0.
    df['diangle'] = pd.Series(diangles).abs()
    df['cos_angle'] = pd.Series(cos_angles)
    df['cos_angle0'] = pd.Series(cos_angles0)
    df['cos_angle1'] = pd.Series(cos_angles1)
    df.fillna(0., inplace=True)
    return df
def _cycle_score(mol):
    """Return how many atoms the longest basis cycle of ``mol`` has beyond six.

    Cycles are taken from ``nx.cycle_basis`` on the RDKit adjacency matrix;
    the result is 0 when there is no cycle or none longer than six atoms.
    """
    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    longest = max((len(ring) for ring in rings), default=0)
    return longest - 6 if longest > 6 else 0
def simulation(chem_model, state, node):
    """Roll out one molecule from ``state`` and return its squashed score.

    The first generated SMILES is scored as ``-SA + logP - ring_penalty``
    (no normalisation) and mapped through ``x / (1 + |x|)``. When the SMILES
    cannot be parsed the score is ``-1000 / 1001``.

    Args:
        chem_model: model handed to ``chem_kn_simulation``.
        state: current token sequence to continue from.
        node: unused; kept so existing callers keep working.

    Returns:
        The squashed score.
    """
    # Token vocabulary; positions must match what the model was trained on.
    val = [
        '\n', '&', 'C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F',
        '[C@@H]', 'n', '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
        's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]', '[N+]',
        '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]', '[S-]', '6', '7',
        'I', '[n-]', 'P', '[OH+]', '[NH-]', '[P@@H]', '[P@@]', '[PH2]',
        '[P@]', '[P+]', '[S+]', '[o+]', '[CH2-]', '[CH-]', '[SH+]', '[O+]',
        '[s+]', '[PH+]', '[PH]', '8', '[S@@+]'
    ]
    all_posible = chem_kn_simulation(chem_model, state, val)
    generate_smile = predict_smile(all_posible, val)
    new_compound = make_input_smile(generate_smile)

    try:
        mol = Chem.MolFromSmiles(str(new_compound[0]))
    except Exception:
        # Was a bare ``except``; narrowed so Ctrl-C / SystemExit propagate.
        mol = None

    if mol is None:
        return -1000 / (1 + 1000)

    try:
        logp = Descriptors.MolLogP(mol)
    except Exception:
        logp = -1000

    # Reuse the parsed molecule instead of re-parsing the SMILES twice more.
    SA_score = -sascorer.calculateScore(mol)
    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    longest_ring = max((len(ring) for ring in rings), default=0)
    cycle_score = -max(longest_ring - 6, 0)

    score_one = SA_score + logp + cycle_score
    return score_one / (1 + abs(score_one))
def construct_adj_matrix(mol, out_size=-1, self_connection=True):
    """Returns the adjacent matrix of the given molecule.

    This function returns the adjacent matrix of the given molecule.
    Contrary to the specification of
    :func:`rdkit.Chem.rdmolops.GetAdjacencyMatrix`,
    the diagonal entries of the returned matrix are all-one when
    ``self_connection`` is ``True``.

    Args:
        mol (rdkit.Chem.Mol): Input molecule.
        out_size (int): The size of the returned matrix.
            If this option is negative, it does not take any effect.
            Otherwise, it must be larger than or equal to the number of
            atoms in the input molecule. In that case, the adjacent
            matrix is expanded and zeros are padded to right
            columns and bottom rows.
        self_connection (bool): Add self connection or not.
            If True, diagonal element of adjacency matrix is filled with 1.

    Returns:
        adj_array (numpy.ndarray): The adjacent matrix of the input molecule,
            as ``float32``. It is 2-dimensional array with shape
            (atoms1, atoms2), where atoms1 & atoms2 represent from and to of
            the edge respectively. If ``out_size`` is non-negative, the
            returned array has that size in both dimensions. Otherwise, its
            size equals the number of atoms in the molecule.

    Raises:
        ValueError: If the adjacency matrix is not square, or if
            ``out_size`` is non-negative but smaller than the number of
            atoms.
    """
    adj = rdmolops.GetAdjacencyMatrix(mol)
    s0, s1 = adj.shape
    if s0 != s1:
        # A space was missing between the first two fragments, so the
        # message used to read "...moleculehas an invalid shape".
        raise ValueError('The adjacent matrix of the input molecule '
                         'has an invalid shape: ({}, {}). '
                         'It must be square.'.format(s0, s1))

    if self_connection:
        adj = adj + numpy.eye(s0)

    if out_size < 0:
        return adj.astype(numpy.float32)
    if out_size < s0:
        raise ValueError(
            '`out_size` (={}) must be negative or larger than or equal to the '
            'number of atoms in the input molecules (={}).'
            .format(out_size, s0))

    # Zero-pad to (out_size, out_size); the molecule sits in the top-left.
    adj_array = numpy.zeros((out_size, out_size), dtype=numpy.float32)
    adj_array[:s0, :s1] = adj
    return adj_array
def mol2graph(mol):
    """Convert an RDKit molecule into an undirected, annotated igraph graph.

    Vertices carry ``AtomicNum`` and ``AtomicSymbole``; each edge carries the
    bond type as a double in ``BondType``.
    """
    adjacency = rdmolops.GetAdjacencyMatrix(mol).tolist()
    g = igraph.Graph.Adjacency(adjacency).as_undirected()

    for vertex_id in g.vs.indices:
        atom = mol.GetAtomWithIdx(vertex_id)
        g.vs[vertex_id]["AtomicNum"] = atom.GetAtomicNum()
        g.vs[vertex_id]["AtomicSymbole"] = atom.GetSymbol()

    # Collect the end points first, then look each bond up again by its
    # atom pair, as the graph edge is addressed by the same pair.
    bonded_pairs = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
                    for bond in mol.GetBonds()]
    for begin, end in bonded_pairs:
        bond_type = mol.GetBondBetweenAtoms(begin, end).GetBondTypeAsDouble()
        g.es[g.get_eid(begin, end)]["BondType"] = bond_type
    return g
def get_atom_features(mol, dist_matrix):
    """
    Compute the following features for each atom in 'mol':
        - atom type: one-hot over C.SYMBOLS
        - degree: one-hot over C.DEGREES
        - hybridization: one-hot over C.HYBRIDIZATIONS
        - is aromatic: bool {0, 1}
        - formal charge: int
        - atomic number
        - average bond length: mean of 'dist_matrix' over bonded neighbours
        - average atomic number of the neighbouring atoms
        - donor: bool {0, 1}, second-to-last column
        - acceptor: bool {0, 1}, last column

    Returns an array of shape (n_atoms, C.N_ATOM_FEATURES).
    """
    features = np.zeros((mol.GetNumAtoms(), C.N_ATOM_FEATURES))
    adj_matrix = rdmolops.GetAdjacencyMatrix(mol)

    for atom in mol.GetAtoms():
        idx = atom.GetIdx()
        bonds = adj_matrix[idx]
        if bonds.sum() > 0:
            ave_bond_length = np.mean(dist_matrix[idx][bonds == 1])
            ave_neighbor_wt = np.mean(
                [nbr.GetAtomicNum() for nbr in atom.GetNeighbors()])
        else:
            # Isolated atom: nothing to average over.
            ave_bond_length, ave_neighbor_wt = 0.0, 0.0

        scalar_feats = [
            atom.GetIsAromatic(),
            atom.GetFormalCharge(),
            atom.GetAtomicNum(),
            ave_bond_length,
            ave_neighbor_wt,
        ]
        a_feats = (one_hot_encoding(atom.GetSymbol(), C.SYMBOLS)
                   + one_hot_encoding(atom.GetDegree(), C.DEGREES)
                   + one_hot_encoding(atom.GetHybridization(),
                                      C.HYBRIDIZATIONS)
                   + scalar_feats)
        features[idx, :len(a_feats)] = np.array(a_feats)

    feat_factory = ChemicalFeatures.BuildFeatureFactory(C.FDEF)
    try:
        for chem_feat in feat_factory.GetFeaturesForMol(mol):
            family = chem_feat.GetFamily()
            if family == 'Donor':
                column = -2
            elif family == 'Acceptor':
                column = -1
            else:
                continue
            for atom_id in chem_feat.GetAtomIds():
                features[atom_id, column] = 1
    except RuntimeError as e:
        # Feature perception failed; report it and keep whatever donor and
        # acceptor flags were already set.
        print(e)
    return features
def mol_to_graph_data(mol):
    """Return the adjacency matrix and node/edge label dictionaries of ``mol``.

    Args:
        mol: RDKit molecule.

    Returns:
        Tuple ``(A, node_features, edge_features)`` where ``A`` is the RDKit
        adjacency matrix, ``node_features[idx]`` is
        ``{"label": atomic number}`` and ``edge_features[(begin, end)]`` is
        ``{"label": int(bond type as double)}``, i.e. the bond order
        truncated towards zero.
    """
    A = rdmolops.GetAdjacencyMatrix(mol)

    # Each entry needs its own inner dict; indexing into the empty outer
    # dict, as the code used to do, raised KeyError on the first atom/bond.
    node_features = {
        idx: {"label": int(mol.GetAtomWithIdx(idx).GetAtomicNum())}
        for idx in range(A.shape[0])
    }
    edge_features = {}
    for bond in mol.GetBonds():
        key = (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
        edge_features[key] = {"label": int(bond.GetBondTypeAsDouble())}
    return A, node_features, edge_features
def cycle_score(m):
    """
    Input : a mol object
    Output : cycle score penalty (scalar float): the number of atoms by which
             the longest cycle in the networkx cycle basis exceeds six,
             0.0 if there is none
    """
    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(m)))
    longest = max((len(ring) for ring in rings), default=0)
    excess = longest - 6 if longest > 6 else 0
    return float(excess)
def _get_valid_mols(self):
    """Return the molecules of ``self.filename`` that yield a usable graph.

    A molecule is dropped when the SD supplier returns ``None`` for it, when
    ``rdmolops.GetAdjacencyMatrix`` raises (the error is printed), or when
    ``get_mol_edge_index`` returns an edge index without elements.
    """
    kept = []
    for candidate in Chem.SDMolSupplier(self.filename):
        if candidate is None:
            continue
        try:
            rdmolops.GetAdjacencyMatrix(candidate)
        except Exception as e:
            print(e)
            continue
        edge_index, _ = get_mol_edge_index(candidate, self.edge_types)
        if edge_index.nelement() != 0:
            kept.append(candidate)
    return kept
def simulation(chem_model, state, node):
    """Roll out one molecule from ``state`` and store its reward on ``node``.

    The first generated SMILES is scored as the sum of the standardised
    negated SA score, logP and ring penalty (module-level ``*_mean`` /
    ``*_std``) and mapped through ``x / (1 + |x|)``. When the SMILES cannot
    be parsed the reward is ``-1000 / 1001``.

    Args:
        chem_model: model handed to ``chem_kn_simulation``.
        state: current token sequence to continue from.
        node: object whose ``reward`` attribute receives the score.

    Returns:
        ``node``, with ``node.reward`` set.
    """
    # Token vocabulary used to decode the generated sequence.
    val = [
        '\n', '&', 'C', 'O', '(', 'F', ')', '1', '2', '=', '#', '[C@H]',
        '[C@@H]', '3', '[O-]', '[C@@]', '[C]', '[CH]', '/', '[C@]', '[CH2]',
        '4', '[O+]', '[O]', '5'
    ]
    all_posible = chem_kn_simulation(chem_model, state)
    generate_smile = predict_smile(all_posible, val)
    new_compound = make_input_smile(generate_smile)

    try:
        mol = Chem.MolFromSmiles(str(new_compound[0]))
    except Exception:
        # Was a bare ``except``; narrowed so Ctrl-C / SystemExit propagate.
        mol = None

    if mol is not None:
        try:
            logp = Descriptors.MolLogP(mol)
        except Exception:
            logp = -1000

        # Reuse the parsed molecule instead of re-parsing the SMILES.
        SA_score = -sascorer.calculateScore(mol)
        rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        longest_ring = max((len(ring) for ring in rings), default=0)
        cycle_score = -max(longest_ring - 6, 0)

        SA_score_norm = (SA_score - SA_mean) / SA_std
        logp_norm = (logp - logP_mean) / logP_std
        cycle_score_norm = (cycle_score - cycle_mean) / cycle_std
        score_one = SA_score_norm + logp_norm + cycle_score_norm
        score = score_one / (1 + abs(score_one))
    else:
        score = -1000 / (1 + 1000)

    node.reward = score
    return node
def get_molecules():
    """
    Constructs rdkit mol objects derived from the .xyz files. Also returns:
        - mol ids (position of the molecule in 'xyz_filepath_list')
        - set of molecule level features
        - arrays of xyz coordinates
        - euclidean distance matrices
        - graph distance matrices.
    All objects are returned in dictionaries with 'mol_name' as keys.
    """
    mols, mol_ids, mol_feats = {}, {}, {}
    xyzs, dist_matrices, graph_dist_matrices = {}, {}, {}

    print('Create molecules and distance matrices.')
    for i in range(C.N_MOLS):
        print_progress(i, C.N_MOLS)
        filepath = xyz_filepath_list[i]
        # File name without directory and without its last four characters
        # (presumably the '.xyz' extension).
        mol_name = filepath.split('/')[-1][:-4]

        # Structure, coordinates and distance matrix from the xyz file.
        mol, xyz, dist_matrix = mol_from_xyz(filepath)
        mols[mol_name] = mol
        xyzs[mol_name] = xyz
        dist_matrices[mol_name] = dist_matrix
        mol_ids[mol_name] = i

        # Graph distance matrix, zero-padded on the right to
        # C.MAX_N_ATOMS columns, with one row per atom.
        n_atoms = len(xyz)
        column_padding = [(0, 0), (0, C.MAX_N_ATOMS - n_atoms)]
        graph_dist_matrix = pd.DataFrame(
            np.pad(rdmolops.GetDistanceMatrix(mol), column_padding,
                   'constant'))
        # Same molecule id on every atom row, e.g. CH4 -> [i, i, i, i, i].
        graph_dist_matrix['molecule_id'] = n_atoms * [i]
        graph_dist_matrices[mol_name] = graph_dist_matrix

        # Molecule level features: mean/std of the bond lengths and the mean
        # atomic number. The lower triangle of the adjacency matrix selects
        # each bond exactly once.
        adj_matrix = rdmolops.GetAdjacencyMatrix(mol)
        atomic_num_list, _, _ = read_xyz_file(filepath)
        is_bond = np.tril(adj_matrix).ravel() == 1
        dists = dist_matrix.ravel()[is_bond]
        mol_feats[mol_name] = pd.Series(
            [np.mean(dists), np.std(dists), np.mean(atomic_num_list)],
            index=mol_feat_columns)

    return mols, mol_ids, mol_feats, xyzs, dist_matrices, graph_dist_matrices
def get_score_components_from_mol(this_mol):
    """Return the raw ``(logP, SA_score, cycle_score)`` of a molecule.

    Args:
        this_mol: RDKit molecule.

    Returns:
        Tuple of
        * logP from ``Descriptors.MolLogP`` (``0.0`` if it cannot be
          computed),
        * the negated ``sascorer.calculateScore`` value,
        * minus the number of atoms by which the longest cycle in the
          networkx cycle basis exceeds six.
    """
    try:
        logP = Descriptors.MolLogP(this_mol)
    except Exception:
        # Was a bare ``except``; narrowed so Ctrl-C / SystemExit propagate.
        logP = 0.0

    SA_score = -sascorer.calculateScore(this_mol)

    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(this_mol)))
    longest_ring = max((len(ring) for ring in rings), default=0)
    cycle_score = -max(longest_ring - 6, 0)
    return logP, SA_score, cycle_score
def tensorize(self, batch_x, batch_c):
    """Pack molecules and coordinates into zero-padded arrays.

    Args:
        batch_x: iterable of molecules.
        batch_c: per-molecule coordinates; ``batch_c[m][a]`` must be
            indexable with 0, 1 and 2.

    Returns:
        ``[atom_tensor, adjm_tensor, posn_tensor]`` with shapes
        ``(batch, num_atoms, num_features)``,
        ``(batch, num_atoms, num_atoms)`` and
        ``(batch, num_atoms, num_atoms, 3)``. ``posn_tensor[m, i, j]`` is the
        vector pointing from atom ``j`` to atom ``i``.
    """
    batch_size = len(batch_x)
    atom_tensor = np.zeros(
        (batch_size, self.num_atoms, self.get_num_features()))
    adjm_tensor = np.zeros((batch_size, self.num_atoms, self.num_atoms))
    posn_tensor = np.zeros(
        (batch_size, self.num_atoms, self.num_atoms, 3))

    for row, mol in enumerate(batch_x):
        # NOTE(review): the return value is discarded, so this looks like a
        # no-op on ``mol`` -- confirm whether hydrogens were meant to be
        # stripped (that would also change the indexing into batch_c).
        Chem.RemoveHs(mol)
        n = mol.GetNumAtoms()

        atom_tensor[row, :n, :] = self.get_atom_features(mol)

        # A_hat = A + I, then D^(-1/2) * A_hat * D^(-1/2) (Kipf et al. 2016).
        a_hat = np.array(rdmolops.GetAdjacencyMatrix(mol), dtype="float")
        a_hat += np.eye(n)
        inv_sqrt = np.power(np.array(a_hat.sum(1)), -0.5)
        inv_sqrt[np.isinf(inv_sqrt)] = 0.
        d_mat = np.diag(inv_sqrt)
        adjm_tensor[row, :n, :n] = d_mat @ a_hat @ d_mat

        # Relative positions, direction neighbour -> centre.
        coords = batch_c[row]
        for center in range(n):
            pos_c = coords[center]
            for neighbor in range(n):
                pos_n = coords[neighbor]
                posn_tensor[row, center, neighbor, :] = [
                    pos_c[0] - pos_n[0],
                    pos_c[1] - pos_n[1],
                    pos_c[2] - pos_n[2],
                ]

    return [atom_tensor, adjm_tensor, posn_tensor]