def __getitem__(self, idx):
    """Load one pickled (ligand, protein) pair and build its graph sample.

    The returned dict holds the stacked atom features ``H``, a block-diagonal
    bond adjacency ``A1`` (self-loops included), ``A2`` (same as ``A1`` but
    with the ligand-protein distance matrix written into the off-diagonal
    blocks), the binary label ``Y`` (1 when the key contains ``'CHEMBL'``),
    the ligand-atom mask ``V`` and the sample ``key``.
    """
    key = self.keys[idx]
    with open(self.data_dir+'/'+key, 'rb') as handle:
        ligand, protein = pickle.load(handle)

    # Ligand: coordinates, adjacency with self-loops, atom features.
    n_lig = ligand.GetNumAtoms()
    lig_xyz = np.array(ligand.GetConformers()[0].GetPositions())
    lig_adj = GetAdjacencyMatrix(ligand) + np.eye(n_lig)
    lig_feat = get_atom_feature(ligand, True)

    # Protein: same quantities.
    n_prot = protein.GetNumAtoms()
    prot_xyz = np.array(protein.GetConformers()[0].GetPositions())
    prot_adj = GetAdjacencyMatrix(protein) + np.eye(n_prot)
    prot_feat = get_atom_feature(protein, False)

    features = np.concatenate([lig_feat, prot_feat], 0)

    # Covalent graph: the two molecules sit on the diagonal blocks.
    total = n_lig + n_prot
    bonded = np.zeros((total, total))
    bonded[:n_lig, :n_lig] = lig_adj
    bonded[n_lig:, n_lig:] = prot_adj

    # Second graph: additionally carries inter-molecular distances.
    spatial = np.copy(bonded)
    cross = distance_matrix(lig_xyz, prot_xyz)
    spatial[:n_lig, n_lig:] = np.copy(cross)
    spatial[n_lig:, :n_lig] = np.copy(np.transpose(cross))

    # Mask selecting the ligand nodes.
    ligand_mask = np.zeros((total,))
    ligand_mask[:n_lig] = 1

    label = 1 if 'CHEMBL' in key else 0

    return {
        'H': features,
        'A1': bonded,
        'A2': spatial,
        'Y': label,
        'V': ligand_mask,
        'key': key,
    }
def __getitem__(self, idx):
    """Return the masked-SMILES training sample at position ``idx``.

    The token ids, the masking labels and the adjacency mask produced by
    ``self.random_masking`` are wrapped with start/end (or pad) entries,
    truncated to ``self.seq_len`` and right-padded with zeros. The molecule
    built from ``self.adj_dataset[idx]`` supplies the zero-padded adjacency
    matrix and the QED value.

    Returns:
        dict mapping ``smiles_bert_input``, ``smiles_bert_label``,
        ``smiles_bert_adj_mask``, ``smiles_bert_adjmat`` and
        ``smiles_bert_value`` to ``torch.Tensor`` objects.
    """
    seq_len = self.seq_len
    item = self.smiles_dataset[idx]
    input_random, input_label, input_adj_mask = self.random_masking(item)

    input_data = [self.vocab.start_index] + input_random + [self.vocab.end_index]
    input_label = [self.vocab.pad_index] + input_label + [self.vocab.pad_index]
    input_adj_mask = [0] + input_adj_mask + [0]
    if self.mat_pos == 'start':
        # Flag only the first position but keep the mask as long as the token
        # sequence; a bare ``[1]`` would end up shorter than ``seq_len``.
        input_adj_mask = [1] + [0] * (len(input_adj_mask) - 1)

    def fit(values):
        # Truncate to seq_len, then right-pad with zeros up to seq_len.
        clipped = values[:seq_len]
        return clipped + [0] * (seq_len - len(clipped))

    smiles_bert_input = fit(input_data)
    smiles_bert_label = fit(input_label)
    smiles_bert_adj_mask = fit(input_adj_mask)

    mol = Chem.MolFromSmiles(self.adj_dataset[idx])
    adj_mat = GetAdjacencyMatrix(mol)
    smiles_bert_adjmat = self.zero_padding(adj_mat, (seq_len, seq_len))

    output = {
        "smiles_bert_input": smiles_bert_input,
        "smiles_bert_label": smiles_bert_label,
        "smiles_bert_adj_mask": smiles_bert_adj_mask,
        "smiles_bert_adjmat": smiles_bert_adjmat,
        "smiles_bert_value": QED.qed(mol),
    }
    return {key: torch.tensor(value) for key, value in output.items()}
def get_adj_matrix(mol):
    """Return the adjacency matrix of ``mol`` with self-loops added."""
    with_loops = GetAdjacencyMatrix(mol) + np.eye(mol.GetNumAtoms())
    return np.array(with_loops)
def __getitem__(self, idx):
    """Return the labelled SMILES sample at position ``idx``.

    The SMILES string is tokenised with ``self.CharToNum``, wrapped with the
    vocabulary start/end indices, truncated to ``self.seq_len`` and
    right-padded with zeros. The adjacency matrix of the molecule parsed from
    ``self.adj_dataset[idx]`` is zero-padded to ``(seq_len, seq_len)``; an
    all-zero matrix is used when RDKit cannot parse the SMILES.

    Returns:
        dict mapping ``smiles_bert_input``, ``smiles_bert_label``,
        ``smiles_bert_adj_mask`` and ``smiles_bert_adjmat`` to
        ``torch.Tensor`` objects.
    """
    seq_len = self.seq_len
    item = self.smiles_dataset[idx]
    label = self.label[idx]
    input_token, input_adj_masking = self.CharToNum(item)

    input_data = [self.vocab.start_index] + input_token + [self.vocab.end_index]
    input_adj_masking = [0] + input_adj_masking + [0]
    if self.mat_pos == 'start':
        # Flag only the first position. The original assigned to (and read
        # from) the undefined name ``input_adj_mask``, raising NameError.
        input_adj_masking = [1] + [0] * (len(input_adj_masking) - 1)

    smiles_bert_input = input_data[:seq_len]
    smiles_bert_adj_mask = input_adj_masking[:seq_len]

    padding = [0] * (seq_len - len(smiles_bert_input))
    smiles_bert_input.extend(padding)
    smiles_bert_adj_mask.extend(padding)

    mol = Chem.MolFromSmiles(self.adj_dataset[idx])
    if mol is not None:
        adj_mat = GetAdjacencyMatrix(mol)
        smiles_bert_adjmat = self.zero_padding(adj_mat, (seq_len, seq_len))
    else:
        # Unparsable SMILES: fall back to an empty graph.
        smiles_bert_adjmat = np.zeros((seq_len, seq_len), dtype=np.float32)

    output = {
        "smiles_bert_input": smiles_bert_input,
        "smiles_bert_label": label,
        "smiles_bert_adj_mask": smiles_bert_adj_mask,
        "smiles_bert_adjmat": smiles_bert_adjmat,
    }
    return {key: torch.tensor(value) for key, value in output.items()}
def __getitem__(self, idx):
    """Return the padded graph sample for the SMILES at position ``idx``.

    Returns:
        dict with ``'X'`` (atom features, padded with zero rows of width 28
        up to ``self.max_natoms`` rows), ``'A'`` (adjacency matrix with
        self-loops, zero-padded to ``max_natoms x max_natoms``) and ``'Y'``
        (``self.properties[idx]``).
    """
    s = self.smiles[idx]
    m = Chem.MolFromSmiles(s)
    natoms = m.GetNumAtoms()

    # Adjacency matrix with self-loops, embedded in the top-left corner.
    A = GetAdjacencyMatrix(m) + np.eye(natoms)
    A_padding = np.zeros((self.max_natoms, self.max_natoms))
    A_padding[:natoms, :natoms] = A

    # Atom features. Pad with the same ``self.max_natoms`` used for ``A`` so
    # both tensors always agree in size (the original read a module-level
    # ``max_natoms`` here).
    X = [self.atom_feature(m, i) for i in range(natoms)]
    X.extend(np.zeros(28) for _ in range(natoms, self.max_natoms))
    X = np.array(X)

    sample = dict()
    sample['X'] = torch.from_numpy(X)
    sample['A'] = torch.from_numpy(A_padding)
    sample['Y'] = self.properties[idx]
    return sample
def __getitem__(self, idx):
    """Return the padded graph sample (plus SMILES) at position ``idx``.

    Returns:
        dict with ``'X'`` (atom features, padded with zero rows of width 28
        up to ``self.max_natoms`` rows), ``'A'`` (adjacency matrix with
        self-loops, zero-padded to ``max_natoms x max_natoms``), ``'Y'``
        (``self.properties[idx]``) and ``'smi'`` (the SMILES string).
    """
    s = self.smiles[idx]
    m = Chem.MolFromSmiles(s)
    natoms = m.GetNumAtoms()

    # Adjacency matrix with self-loops, embedded in the top-left corner.
    A = GetAdjacencyMatrix(m) + np.eye(natoms)
    A_padding = np.zeros((self.max_natoms, self.max_natoms))
    A_padding[:natoms, :natoms] = A
    A_padding = torch.from_numpy(A_padding)

    # Atom features. Pad with the same ``self.max_natoms`` used for ``A`` so
    # both tensors always agree in size (the original read a module-level
    # ``max_natoms`` here).
    X = [self.atom_feature(m, i) for i in range(natoms)]
    X.extend(np.zeros(28) for _ in range(natoms, self.max_natoms))
    X = np.array(X)

    sample = dict()
    sample['X'] = torch.from_numpy(X)
    sample['A'] = A_padding
    sample['Y'] = self.properties[idx]
    sample["smi"] = s
    return sample
def get_mol_A_(s):
    """Return the self-looped adjacency matrix of SMILES ``s``, zero-padded.

    The result is a ``max_natoms x max_natoms`` array (``max_natoms`` is read
    from module scope) whose top-left block holds the molecule's matrix.
    """
    mol = Chem.MolFromSmiles(s)
    n = mol.GetNumAtoms()
    with_loops = GetAdjacencyMatrix(mol) + np.eye(n)
    padded = np.zeros((max_natoms, max_natoms))
    padded[:n, :n] = with_loops
    return padded
def __getitem__(self, idx):
    """Return the padded graph sample for the SMILES at position ``idx``.

    Returns:
        dict with ``'X'`` (atom features, padded with zero rows of width 28
        up to ``self.max_natoms`` rows), ``'A'`` (adjacency matrix with
        self-loops, zero-padded to ``max_natoms x max_natoms``) and ``'Y'``
        (``self.properties[idx]``).
    """
    s = self.smiles[idx]
    m = Chem.MolFromSmiles(s)
    natoms = m.GetNumAtoms()

    # Adjacency matrix with self-loops, embedded in the top-left corner.
    A = GetAdjacencyMatrix(m) + np.eye(natoms)
    A_padding = np.zeros((self.max_natoms, self.max_natoms))
    A_padding[:natoms, :natoms] = A
    A_padding = torch.from_numpy(A_padding)

    # Atom features. Pad with the same ``self.max_natoms`` used for ``A`` so
    # both tensors always agree in size (the original read a module-level
    # ``max_natoms`` here).
    X = [self.atom_feature(m, i) for i in range(natoms)]
    X.extend(np.zeros(28) for _ in range(natoms, self.max_natoms))
    X = np.array(X)

    sample = dict()
    sample['X'] = torch.from_numpy(X)
    sample['A'] = A_padding
    sample['Y'] = self.properties[idx]
    return sample
def distance_fix_pair(m):
    """Return a 0/1-capped pair matrix derived from the bond graph of ``m``.

    Steps: take the adjacency matrix with self-loops, remember its square,
    add the ring and conjugation matrices, square the sum, add the remembered
    square back and cap every entry at 1.
    """
    base = GetAdjacencyMatrix(m).astype(float)
    base = base + np.eye(len(base)).astype(float)

    # Square of the plain (self-looped) adjacency, taken before the ring and
    # conjugation terms are mixed in.
    two_hop = base @ base

    extended = base + make_ring_matrix(m).astype(float)
    extended = extended + make_conjugate_matrix(m).astype(float)

    combined = extended @ extended + two_hop
    return np.minimum(combined, 1)
def get_mol_fea(drug_smiles_list):
    """Featurise every SMILES string into node/edge/incidence arrays.

    For each molecule this builds
      * ``node``: ``(n_atoms, node_dim)`` atom features,
      * ``edge``: ``(n_bonds, edge_dim)`` bond features, ordered by
        ``(lower atom index, higher atom index)``,
      * ``n2n``:  the RDKit adjacency matrix,
      * ``e2n``:  ``(n_atoms, n_bonds)`` incidence matrix with a 1 for both
        end atoms of each bond.

    A single trailing space on a SMILES string is removed before parsing.
    The process exits with status 1 if RDKit cannot parse a string.

    Returns:
        tuple ``(drug_node_list, drug_edge_list, drug_n2n_list,
        drug_e2n_list)`` of per-molecule lists.
    """
    drug_node_list = []
    drug_edge_list = []
    drug_n2n_list = []
    drug_e2n_list = []

    node_dim = len(atom_type_list + hybridization_list + num_h_list + formal_charge_list) + 1
    edge_dim = len(bond_type_list) + 1 + 1

    for smiles in tqdm.tqdm(drug_smiles_list, total=len(drug_smiles_list)):
        # ``endswith`` is safe on an empty string, unlike ``smiles[-1]``.
        if smiles.endswith(' '):
            smiles = smiles[:-1]

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print("molecule {} is not defined well".format(smiles))
            # Same effect as ``exit(1)`` without depending on the ``site``
            # module having installed that builtin.
            raise SystemExit(1)

        atom_list = mol.GetAtoms()
        n_node = len(atom_list)
        n_edge = len(mol.GetBonds())

        node = np.zeros((n_node, node_dim))
        for j, atom in enumerate(atom_list):
            node[j] += get_atom_feature(atom)

        n2n = GetAdjacencyMatrix(mol)

        edge = np.zeros((n_edge, edge_dim))
        e2n = np.zeros((n_node, n_edge))
        edge_idx = 0
        # Probe every unordered atom pair so edges come out in (j, k) order.
        for j in range(n_node):
            for k in range(j + 1, n_node):
                bond = mol.GetBondBetweenAtoms(j, k)
                if bond is None:
                    continue
                edge[edge_idx] += bond_features(bond)
                e2n[j, edge_idx] += 1
                e2n[k, edge_idx] += 1
                edge_idx += 1

        drug_node_list.append(node)
        drug_n2n_list.append(n2n)
        drug_edge_list.append(edge)
        drug_e2n_list.append(e2n)

    return (drug_node_list, drug_edge_list, drug_n2n_list, drug_e2n_list)
def molecule_to_adjacency(molecule):
    """
    Build the adjacency matrix of a molecule via RDKit.

    Parameters
    ----------
    molecule: :molLego:`Molecule`
        Molecule whose adjacency matrix is wanted.

    Returns
    -------
    The value of ``GetAdjacencyMatrix`` for the RDKit conversion of
    ``molecule``.

    """
    # Convert to an RDKit mol and hand it straight to RDKit.
    return GetAdjacencyMatrix(molecule_to_rdkit(molecule))
def getNNAtoms(self, molStr, cAtoms, hight): atoms = copy.deepcopy(cAtoms) # create an RDKit mol mol = Chem.MolFromMolBlock(molStr, True, False) if not mol: print "Could not create mol for compound " return [] adj = GetAdjacencyMatrix(mol) visitedAtoms = [] for n in range(hight): for atom in copy.deepcopy(atoms): if atom not in visitedAtoms: lNN = findNeighbors(atom, adj) visitedAtoms.append(atom) for lnn in lNN: if lnn not in atoms: atoms.append(lnn) atoms.sort() return atoms
def characteristic_poly(mol_list, useBO=False):
    """Return a zero-padded 2-D array of ``CharacteristicPolynomial`` values.

    One row per molecule in ``mol_list``; shorter rows are right-padded with
    zeros to the length of the longest one.

    NOTE(review): ``useBO`` is accepted but not used - the adjacency matrix
    is always requested with ``useBO=True``. Confirm whether that is intended
    before relying on the parameter.
    """
    rows = [
        list(CharacteristicPolynomial(mol, GetAdjacencyMatrix(mol, useBO=True)))
        for mol in mol_list
    ]
    width = max((len(row) for row in rows), default=0)
    padded = [row + [0] * (width - len(row)) for row in rows]
    return np.array(padded)
def adjacency_matrix_eigenvalues(mol_list, useBO=False):
    """Return the adjacency-matrix eigenvalues of each molecule, zero-padded.

    One row per molecule in ``mol_list`` (eigenvalues in the order returned
    by ``np.linalg.eigvals``); shorter rows are right-padded with zeros to
    the length of the longest one. ``useBO`` is forwarded to
    ``GetAdjacencyMatrix``.
    """
    rows = [
        list(np.linalg.eigvals(GetAdjacencyMatrix(mol, useBO=useBO)))
        for mol in mol_list
    ]
    width = max((len(row) for row in rows), default=0)
    padded = [row + [0] * (width - len(row)) for row in rows]
    return np.array(padded)
def cal_internal_vdw(m):
    """Sum a 12-6 pair term over the non-bonded atom pairs of ``m``.

    For each pair ``(i1, i2)`` with ``i2 < i1`` the term is
    ``e * ((d / r)**12 - 2 * (d / r)**6)`` where ``(d, e)`` come from
    ``GetUFFVdWParams`` and ``r`` is the distance in the first conformer.
    Pairs are skipped when no parameters are returned, when the atoms are
    directly bonded, or when their topological distance is below 4.
    """
    n_atoms = m.GetNumAtoms()
    coords = np.array(m.GetConformers()[0].GetPositions())
    spatial = distance_matrix(coords, coords)
    bonded = GetAdjacencyMatrix(m)
    hops = GetDistanceMatrix(m)

    energy = 0
    for i1 in range(n_atoms):
        for i2 in range(i1):
            param = GetUFFVdWParams(m, i1, i2)
            if param is None:
                continue
            x_ij, eps_ij = param
            x_ij = x_ij * 1.0
            # Exclude directly bonded atoms and pairs closer than 4 bonds.
            if bonded[i1, i2] == 1 or hops[i1, i2] < 4:
                continue
            ratio = x_ij / spatial[i1, i2]
            energy += eps_ij * (ratio**12 - 2 * (ratio**6))
    return energy
def process_molecule(molecule, max_num_atoms):
    """Return ``(padded_adj, padded_features)`` for ``molecule``.

    Each atom gets a two-entry one-hot: ``[0, 1]`` when its atomic number is
    greater than 1, ``[1, 0]`` otherwise. Both the feature rows and the
    adjacency matrix are zero-padded at the end up to ``max_num_atoms``.
    Asserts that the molecule has at most ``max_num_atoms`` atoms.
    """
    num_atoms = molecule.GetNumAtoms()
    assert num_atoms <= max_num_atoms

    heavy, light = [0, 1], [1, 0]
    features = [
        list(heavy) if molecule.GetAtomWithIdx(i).GetAtomicNum() > 1 else list(light)
        for i in range(num_atoms)
    ]

    adj = GetAdjacencyMatrix(molecule)

    # Pad only at the trailing end of each axis.
    extra = max_num_atoms - num_atoms
    padded_features = np.pad(features, ((0, extra), (0, 0)))
    padded_adj = np.pad(adj, ((0, extra), (0, extra)))
    return padded_adj, padded_features
def mol_to_feature(m1, m1_uff, m2, interaction_data, pos_noise_std):
    """Build the feature dictionary for one ligand (``m1``) / protein (``m2``) pair.

    Hydrogens are removed from both molecules, Gaussian noise with standard
    deviation ``pos_noise_std`` is added to both coordinate sets, and a
    randomly rotated copy of the ligand is used for ``dmv_rot``.

    ``m1_uff`` is only used for the SASA difference ``dsasa``.
    ``interaction_data`` is not read by the active code (its only use is in a
    commented-out line).

    Returns a dict with atom features, self-looped adjacency matrices,
    interaction matrix, distance vectors, positions, SASA terms, rotatable
    bond count, (all-zero) charges, vdW parameters, validity and non-metal
    masks; see the ``sample`` literal at the end for the exact keys.
    """
    # Remove hydrogens
    m1 = Chem.RemoveHs(m1)
    m2 = Chem.RemoveHs(m2)

    # extract valid amino acids
    # m2 = extract_valid_amino_acid(m2, self.amino_acids)

    # random rotation
    # NOTE(review): this first angle/axis draw is overwritten below before
    # use; it still advances NumPy's global random state.
    angle = np.random.uniform(0, 360, 1)[0]
    axis = np.random.uniform(-1, 1, 3)
    # m1 = rotate(m1, angle, axis, False)
    # m2 = rotate(m2, angle, axis, False)

    angle = np.random.uniform(0, 360, 1)[0]
    axis = np.random.uniform(-1, 1, 3)
    # Rotate a deep copy so m1 itself keeps its original pose.
    m1_rot = rotate(copy.deepcopy(m1), angle, axis, True)

    # prepare ligand
    n1 = m1.GetNumAtoms()
    d1 = np.array(m1.GetConformers()[0].GetPositions())
    d1 += np.random.normal(0.0, pos_noise_std, d1.shape)
    d1_rot = np.array(m1_rot.GetConformers()[0].GetPositions())
    adj1 = GetAdjacencyMatrix(m1) + np.eye(n1)
    h1 = get_atom_feature(m1, True)

    # prepare protein
    n2 = m2.GetNumAtoms()
    c2 = m2.GetConformers()[0]
    d2 = np.array(c2.GetPositions())
    d2 += np.random.normal(0.0, pos_noise_std, d2.shape)
    adj2 = GetAdjacencyMatrix(m2) + np.eye(n2)
    h2 = get_atom_feature(m2, True)

    # prepare distance vector
    dmv = dm_vector(d1, d2)
    dmv_rot = dm_vector(d1_rot, d2)

    # get interaction matrix
    # A_int = get_interaction_matrix(d1, d2, interaction_data)
    # One (n1, n2) slice per interaction type; only slices -2, 1 and -1 are
    # filled here, the rest stay zero.
    A_int = np.zeros(
        (len(interaction_types), m1.GetNumAtoms(), m2.GetNumAtoms()))
    A_int[-2] = get_A_hydrophobic(m1, m2)
    A_int[1] = get_A_hbond(m1, m2)
    A_int[-1] = get_A_metal_complexes(m1, m2)

    # cal sasa
    sasa = cal_sasa(m1)
    dsasa = sasa - cal_sasa(m1_uff)

    # count rotatable bonds
    rotor = CalcNumRotatableBonds(m1)
    # dm = distance_matrix(d1, d2)
    # rotor = count_active_rotatable_bond(m1, dm)

    # charge
    # charge1 = cal_charge(m1)
    # charge2 = cal_charge(m2)
    # Charges are currently fixed to zero; the string statement below is
    # disabled MMFF code kept by the original author.
    charge1 = np.zeros((n1, ))
    charge2 = np.zeros((n2, ))
    """ mp1 = AllChem.MMFFGetMoleculeProperties(m1) mp2 = AllChem.MMFFGetMoleculeProperties(m2) charge1 = [mp1.GetMMFFPartialCharge(i) for i in range(m1.GetNumAtoms())] charge2 = [mp2.GetMMFFPartialCharge(i) for i in range(m2.GetNumAtoms())] """
    # partial charge calculated by gasteiger
    charge1 = np.array(charge1)
    charge2 = np.array(charge2)

    # There is nan for some cases.
    charge1 = np.nan_to_num(charge1, nan=0, neginf=0, posinf=0)
    charge2 = np.nan_to_num(charge2, nan=0, neginf=0, posinf=0)

    # valid
    valid1 = np.ones((n1, ))
    valid2 = np.ones((n2, ))

    # no metal: 1 for atoms whose symbol is not in metal_symbols, else 0
    metal_symbols = ["Zn", "Mn", "Co", "Mg", "Ni", "Fe", "Ca", "Cu"]
    no_metal1 = np.array([
        1 if a.GetSymbol() not in metal_symbols else 0 for a in m1.GetAtoms()
    ])
    no_metal2 = np.array([
        1 if a.GetSymbol() not in metal_symbols else 0 for a in m2.GetAtoms()
    ])

    # vdw radius
    vdw_radius1 = np.array([get_vdw_radius(a) for a in m1.GetAtoms()])
    vdw_radius2 = np.array([get_vdw_radius(a) for a in m2.GetAtoms()])
    vdw_epsilon, vdw_sigma = get_epsilon_sigma(m1, m2, False)

    # uff energy difference (disabled; always 0.0)
    # delta_uff = cal_uff(m1)-cal_uff(m1_uff)
    # delta_uff = get_torsion_energy(m1) - get_torsion_energy(m1_uff)
    # delta_uff = cal_torsion_energy(m1)+cal_internal_vdw(m1)
    delta_uff = 0.0

    sample = {
        "h1": h1,
        "adj1": adj1,
        "h2": h2,
        "adj2": adj2,
        "A_int": A_int,
        "dmv": dmv,
        "dmv_rot": dmv_rot,
        "pos1": d1,
        "pos2": d2,
        "sasa": sasa,
        "dsasa": dsasa,
        "rotor": rotor,
        "charge1": charge1,
        "charge2": charge2,
        "vdw_radius1": vdw_radius1,
        "vdw_radius2": vdw_radius2,
        "vdw_epsilon": vdw_epsilon,
        "vdw_sigma": vdw_sigma,
        "delta_uff": delta_uff,
        "valid1": valid1,
        "valid2": valid2,
        "no_metal1": no_metal1,
        "no_metal2": no_metal2,
    }
    return sample
def anal_mols(key, m1, m2):
    """Build the ligand/protein graph sample for ``key`` while printing diagnostics.

    Hydrogens are added to both molecules first. The returned dict has the
    stacked atom features ``H``, the block-diagonal adjacency ``A1``, ``A2``
    (``A1`` plus ligand-protein distances in the off-diagonal blocks), the
    label ``Y`` (1 when ``key`` contains ``'CHEMBL'``), the ligand mask ``V``
    and ``key``. Shapes, dtypes and intermediate arrays are printed along
    the way.
    """
    fnm = whoami()

    #
    # prepare ligand
    #
    m1 = Chem.AddHs(m1, addCoords=True, addResidueInfo=True)
    n1 = m1.GetNumAtoms()
    c1 = m1.GetConformers(
    )[0]  # original author's note: m1.GetConformers() returns only a single rdkit.Chem.rdchem.Conformer object
    d1 = np.array(c1.GetPositions())
    #print('{}:#1:n_atoms:{} --> shape:{}\n{}'.format(fnm, n1, d1.shape, d1))
    print('{}:#1:n_atoms:{} --> shape:{}'.format(fnm, n1, d1.shape))
    print('+' * 3)
    # List every ligand atom with its element symbol and coordinates.
    for i, coord in enumerate(d1):
        symbol = m1.GetAtomWithIdx(i).GetSymbol()
        print(' #{:>3}:{}:{}'.format(i, symbol, coord))
        pass
    print('+' * 10)
    adj1 = GetAdjacencyMatrix(m1) + np.eye(n1)  # adj1.dtype: float64
    print('{}:#2:adj1:shape:{}, dtype:{}\n{}'.format(fnm, adj1.shape,
                                                     adj1.dtype, adj1))
    print('+' * 3)
    print('{}:m1:{}, n1:{}'.format(fnm, m1, n1))
    H1 = get_atom_feature(m1, n1, True)
    print('#' * 80)

    #
    # prepare protein
    #
    m2 = Chem.AddHs(m2, addCoords=True, addResidueInfo=True)
    n2 = m2.GetNumAtoms()
    c2 = m2.GetConformers(
    )[0]  # original author's note: m2.GetConformers() returns only a single rdkit.Chem.rdchem.Conformer object
    d2 = np.array(c2.GetPositions())
    print('{}:#1:n_atoms:{} --> shape:{}\n{}'.format(fnm, n2, d2.shape, d2))
    print('+' * 10)
    adj2 = GetAdjacencyMatrix(m2) + np.eye(n2)
    print('{}:#2:adj2:shape:{}, dtype:{}\n{}'.format(fnm, adj2.shape,
                                                     adj2.dtype, adj2))
    print('+' * 3)
    H2 = get_atom_feature(m2, n2, False)
    print('#' * 80)

    # aggregation
    H = np.concatenate([H1, H2], axis=0)
    print('{}: H:shape:{}, type:{}'.format(fnm, H.shape, H.dtype), flush=True)
    print('+' * 10)
    print('n:{} = n1:{} + n2:{}'.format(n1 + n2, n1, n2))

    #
    # agg_adj1: adjacency matrix (1)
    #
    # - upper-left block : adjacency inside the ligand
    # - lower-right block: adjacency inside the protein
    # - everything outside those two blocks is zero-padded
    #
    agg_adj1 = np.zeros((n1 + n2, n1 + n2))
    agg_adj1[:n1, :n1] = adj1
    agg_adj1[n1:, n1:] = adj2
    print('{}: agg_adj1:shape:{}, type:{}'.format(fnm, agg_adj1.shape,
                                                  agg_adj1.dtype),
          flush=True)

    #
    # agg_adj2: adjacency matrix (2)
    #
    # - upper-left block : adjacency inside the ligand
    # - upper-right block: ligand-to-protein distances (rows = ligand atoms)
    # - lower-left block : protein-to-ligand distances (rows = protein atoms)
    # - lower-right block: adjacency inside the protein
    #
    agg_adj2 = np.copy(agg_adj1)
    print('{}: agg_adj2:shape:{}, type:{}'.format(fnm, agg_adj2.shape,
                                                  agg_adj2.dtype),
          flush=True)
    dm = distance_matrix(d1, d2)
    print('{}: dm:shape:{}, type:{}, min:{}, max:{}'.format(
        fnm, dm.shape, dm.dtype, dm.min(), dm.max()),
          flush=True)
    agg_adj2[:n1, n1:] = np.copy(dm)
    agg_adj2[n1:, :n1] = np.copy(np.transpose(dm))

    #node indice for aggregation: 1 for ligand atoms, 0 for protein atoms
    valid = np.zeros((n1 + n2, ))
    valid[:n1] = 1
    print('valid:{}, sum(valid):{}'.format(valid, sum(valid)))

    #pIC50 to class
    Y = 1 if 'CHEMBL' in key else 0

    sample = {
        'H': H,
        'A1': agg_adj1,
        'A2': agg_adj2,
        'Y': Y,
        'V': valid,
        'key': key
    }
    return sample
def get_adj(mol):
    """Return the adjacency matrix of ``mol`` with ones added on the diagonal."""
    bonds = GetAdjacencyMatrix(mol)
    self_loops = np.eye(mol.GetNumAtoms())
    return bonds + self_loops
def process(self):
    """Featurise the QM9 molecules and save one dict of concatenated tensors.

    Reads the spatial ``.npz`` data (``self.raw_spatial_data``) and the CSV
    with SMILES and targets (``self.raw_qm9_file``) from
    ``self.qm9_directory``. For every molecule it builds atom features, bond
    features, a COO edge index and up to 10 Laplacian eigenvectors and
    eigenvalues, then writes everything to
    ``<qm9_directory>/processed/<self.processed_file>`` with ``torch.save``.
    Per-molecule data are concatenated; ``atom_slices`` / ``edge_slices``
    hold the cumulative offsets needed to split them again.
    """
    print('processing data from ({}) and saving it to ({})'.format(
        self.qm9_directory, os.path.join(self.qm9_directory, 'processed')))

    # load qm9 data with spatial coordinates
    data_qm9 = dict(
        np.load(os.path.join(self.qm9_directory, self.raw_spatial_data),
                allow_pickle=True))
    coordinates = torch.tensor(data_qm9['R'], dtype=torch.float)
    # Read the QM9 data with SMILES information
    molecules_df = pd.read_csv(
        os.path.join(self.qm9_directory, self.raw_qm9_file))

    atom_slices = [0]
    edge_slices = [0]
    total_eigvecs = []
    total_eigvals = []
    all_atom_features = []
    all_edge_features = []
    edge_indices = []  # edges of each molecule in coo format
    targets = []  # the 19 properties that should be predicted for the QM9 dataset
    total_atoms = 0
    total_edges = 0
    avg_degree = 0  # average degree in the dataset
    # go through all molecules in the npz file
    for mol_idx, n_atoms in tqdm(enumerate(data_qm9['N'])):
        # get the molecule using the smiles representation from the csv file
        mol = Chem.MolFromSmiles(
            molecules_df['smiles'][data_qm9['id'][mol_idx]])
        # add hydrogen bonds to molecule because they are not in the smiles representation
        mol = Chem.AddHs(mol)

        atom_features_list = []
        for atom in mol.GetAtoms():
            atom_features_list.append(atom_to_feature_vector(atom))
        all_atom_features.append(
            torch.tensor(atom_features_list, dtype=torch.long))

        adj = GetAdjacencyMatrix(mol, useBO=False, force=True)
        max_freqs = 10
        adj = torch.tensor(adj).float()
        D = torch.diag(adj.sum(dim=0))
        L = D - adj
        N = adj.sum(dim=0) ** -0.5
        # NOTE(review): N is a 1-D tensor, so ``N * L * N`` is element-wise
        # broadcasting (each column of L scaled by N twice), not the matrix
        # product D^-1/2 L D^-1/2 - confirm this is intended.
        L_sym = torch.eye(n_atoms) - N * L * N
        # NOTE(review): torch.symeig is deprecated in favour of
        # torch.linalg.eigh in newer PyTorch releases - confirm the pinned
        # version still provides it.
        eig_vals, eig_vecs = torch.symeig(L_sym, eigenvectors=True)
        idx = eig_vals.argsort()[0: max_freqs]  # Keep up to the maximum desired number of frequencies
        eig_vals, eig_vecs = eig_vals[idx], eig_vecs[:, idx]

        # Sort, normalize and pad EigenVectors
        eig_vecs = eig_vecs[:, eig_vals.argsort()]  # increasing order
        eig_vecs = F.normalize(eig_vecs, p=2, dim=1, eps=1e-12, out=None)
        # Molecules with fewer than max_freqs atoms get NaN-padded columns.
        if n_atoms < max_freqs:
            eig_vecs = F.pad(eig_vecs, (0, max_freqs - n_atoms),
                             value=float('nan'))
            eig_vals = F.pad(eig_vals, (0, max_freqs - n_atoms),
                             value=float('nan'))

        total_eigvecs.append(eig_vecs)
        total_eigvals.append(eig_vals.unsqueeze(0))

        edges_list = []
        edge_features_list = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_feature = bond_to_feature_vector(bond)
            # add edges in both directions
            edges_list.append((i, j))
            edge_features_list.append(edge_feature)
            edges_list.append((j, i))
            edge_features_list.append(edge_feature)
        # Graph connectivity in COO format with shape [2, num_edges]
        edge_index = torch.tensor(edges_list, dtype=torch.long).T
        edge_features = torch.tensor(edge_features_list, dtype=torch.long)

        # edges_list holds both directions, hence the division by 2
        avg_degree += (len(edges_list) / 2) / n_atoms

        # get all 19 attributes that should be predicted, so we drop the first two entries (name and smiles)
        target = torch.tensor(
            molecules_df.iloc[data_qm9['id'][mol_idx]][2:],
            dtype=torch.float)
        targets.append(target)
        edge_indices.append(edge_index)
        all_edge_features.append(edge_features)

        total_edges += len(edges_list)
        total_atoms += n_atoms
        edge_slices.append(total_edges)
        atom_slices.append(total_atoms)

    # convert targets to eV units
    targets = torch.stack(targets) * torch.tensor(
        list(self.unit_conversion.values()))[None, :]

    data_dict = {'mol_id': data_qm9['id'],
                 'n_atoms': torch.tensor(data_qm9['N'], dtype=torch.long),
                 'atom_slices': torch.tensor(atom_slices, dtype=torch.long),
                 'edge_slices': torch.tensor(edge_slices, dtype=torch.long),
                 'eig_vecs': torch.cat(total_eigvecs).float(),
                 'eig_vals': torch.cat(total_eigvals).float(),
                 'edge_indices': torch.cat(edge_indices, dim=1),
                 'atom_features': torch.cat(all_atom_features, dim=0),
                 'edge_features': torch.cat(all_edge_features, dim=0),
                 'atomic_number_long': torch.tensor(data_qm9['Z'], dtype=torch.long)[:, None],
                 'coordinates': coordinates,
                 'targets': targets,
                 'avg_degree': avg_degree / len(data_qm9['id'])
                 }

    if not os.path.exists(os.path.join(self.qm9_directory, 'processed')):
        os.mkdir(os.path.join(self.qm9_directory, 'processed'))
    torch.save(data_dict,
               os.path.join(self.qm9_directory, 'processed',
                            self.processed_file))
def createSignImg(self, smi, signature, atomColor, imgPath, endHeight=None):
    """Locate ``signature`` in the molecule ``smi`` and draw it highlighted.

    Returns a 4-tuple ``(imagePath, molStr, atoms, colors)``:
      * ``imagePath`` - ``imgPath`` when the image was written, ``""`` when
        no image was requested or drawing failed, and
        ``"signatureNOTfound"`` when the signature is not present;
      * ``molStr``    - mol block text of the compound;
      * ``atoms``     - sorted indices of the highlighted atoms;
      * ``colors``    - ``atomColor`` repeated once per highlighted atom.
    ``("", "", [], [])`` is returned when inputs are missing or the molecule
    cannot be created.
    """
    colors = []
    print "Creating signature image..."
    if not signature or not atomColor or not smi:
        print "Missing inputs:", str([smi, signature, atomColor])
        return "", "", [], []
    if hasattr(self.model, "specialType") and self.model.specialType == 1:
        # Create an Orange ExampleTable with a smiles attribute
        smilesAttr = orange.EnumVariable("SMILEStoPred", values=[smi])
        myDomain = orange.Domain([smilesAttr], 0)
        smilesData = dataUtilities.DataTable(myDomain, [[smi]])
        preCalcData = None
        startHeight = 0
        dataSign, cmpdSignDict, cmpdSignList, sdfStr = getSignatures.getSignatures(
            smilesData, startHeight, endHeight, preCalcData, returnAtomID=True)
        cmpdSignList = cmpdSignList[0]
        CLabDesc = []
        # create a mol file
        tmpFile = miscUtilities.generateUniqueFile(desc="NN", ext="mol")
        # NOTE(review): the handle is not closed if a write raises.
        file = open(tmpFile, "w")
        molStr = ""
        # Copy the first SDF record up to (not including) its "$$$$" line.
        for line in sdfStr[0]:
            if "$$$$" in line:
                break
            molStr += line
            file.write(line)
        file.close()
    else:
        CLabDesc, cmpdSignList, tmpFile, molStr = self.getClabDescSignList(
            smi, getMolFile=True)
    if not cmpdSignList or not tmpFile:
        print "Couldn't get the cmpd list or the mol file"
        return "", "", [], []
    # create an RDKit mol; retry with the second argument set to False if
    # the first attempt fails
    mol = Chem.MolFromMolFile(tmpFile, True, False)
    if not mol:
        mol = Chem.MolFromMolFile(tmpFile, False, False)
    if not mol:
        print "Could not create mol for: ", smi
        return "", "", [], []
    adj = GetAdjacencyMatrix(mol)
    # find the NN
    # Split the flat signature list into consecutive chunks of one entry per
    # atom; the chunk index is used below as the signature height.
    hights = []
    for i in miscUtilities.Range(0, len(cmpdSignList), mol.GetNumAtoms()):
        hList = cmpdSignList[i:i + mol.GetNumAtoms()]
        if len(hList):
            hights.append(cmpdSignList[i:i + mol.GetNumAtoms()])
    atoms = []
    hight = None
    # The first chunk containing the signature gives the centre atoms.
    for idx, h in enumerate(hights):
        if signature in h:
            for i, a in enumerate(h):
                if a == signature:
                    atoms.append(i)
            hight = idx
            break
    if len(atoms) == 0:
        print "ERROR: Could not find the atom for ", signature
        return "signatureNOTfound", "", [], []
    #print "IniAtoms: ",atoms
    # Grow the atom set by ``hight`` rounds of neighbour expansion.
    visitedAtoms = []
    for n in range(hight):
        for atom in copy.deepcopy(atoms):
            if atom not in visitedAtoms:
                lNN = findNeighbors(atom, adj)
                visitedAtoms.append(atom)
                for lnn in lNN:
                    if lnn not in atoms:
                        atoms.append(lnn)
    atoms.sort()
    # NOTE(review): shell-based delete; os.remove(tmpFile) would avoid
    # quoting problems - confirm before changing.
    os.system("rm " + tmpFile)
    #Specify the atom colors
    colors = [atomColor] * len(atoms)

    if not imgPath:
        return "", molStr, atoms, colors
    try:
        #Draw the image
        MolDrawing.elemDict = defaultdict(lambda: (0, 0, 0))
        Draw.MolToImageFile(mol,
                            imgPath,
                            size=(300, 300),
                            kekulize=True,
                            wedgeBonds=True,
                            highlightAtoms=atoms)
        #Color the Highlighted atoms with the choosen atomColor.
        # Only using one color
        if atomColor == 'r':
            rgb = (255, 0, 0)
        elif atomColor == 'g':
            rgb = (0, 255, 0)
        else:
            rgb = (0, 0, 255)  #Blue
        img = Image.open(imgPath)
        img = img.convert("RGBA")
        pixdata = img.getdata()
        newData = list()
        # Recolour every pure-red pixel with the requested colour.
        for item in pixdata:
            if item[0] == 255 and item[1] == 0 and item[2] == 0:
                newData.append(rgb + (255, ))
            else:
                newData.append(item)
        img.putdata(newData)
        img.save(imgPath)
        if os.path.isfile(imgPath):
            return imgPath, molStr, atoms, colors
        else:
            return "", molStr, atoms, colors
    # NOTE(review): bare except hides every drawing error, including
    # KeyboardInterrupt.
    except:
        return "", molStr, atoms, colors
def process(self):
    """Featurise the conformer-ensemble dataset and save it as one dict.

    Reads ``summary_qm9.json`` from ``self.directory``; for every entry whose
    pickle file exists and contains ``'ensembleenergy'`` it builds atom and
    bond features, a COO edge index and the coordinates of up to 10
    conformers (concatenated along dim 1; the first conformer is repeated
    when fewer than 10 exist). Per-molecule data are concatenated and written
    with ``torch.save`` to ``<directory>/processed/<self.processed_file>``;
    ``atom_slices`` / ``edge_slices`` hold the cumulative offsets.

    NOTE(review): the Laplacian eigenvectors/eigenvalues are computed but
    never stored in ``data_dict``, and ``atomic_number_long`` is saved as an
    empty tensor because nothing is ever appended to it - confirm intended.
    NOTE(review): the pickle files are loaded with ``pickle.load``; only use
    this on trusted data.
    """
    print('processing data from ({}) and saving it to ({})'.format(
        self.directory, os.path.join(self.directory, 'processed')))

    with open(os.path.join(self.directory, "summary_qm9.json"), "r") as f:
        summary = json.load(f)

    atom_slices = [0]
    edge_slices = [0]
    total_eigvecs = []
    total_eigvals = []
    all_atom_features = []
    all_edge_features = []
    targets = {
        'ensembleenergy': [],
        'ensembleentropy': [],
        'ensemblefreeenergy': [],
        'lowestenergy': [],
        'poplowestpct': [],
        'temperature': [],
        'uniqueconfs': []
    }
    edge_indices = []  # edges of each molecule in coo format
    atomic_number_long = []
    n_atoms_list = []
    coordinates = []
    smiles_list = []
    total_atoms = 0
    total_edges = 0
    avg_degree = 0  # average degree in the dataset
    max_freqs = 10  # number of Laplacian frequencies kept per molecule
    max_confs = 10  # number of conformers stored per molecule

    for smiles, sub_dic in tqdm(list(summary.items())):
        pickle_path = os.path.join(self.directory,
                                   sub_dic.get("pickle_path", ""))
        if not os.path.isfile(pickle_path):
            continue
        # Context manager so the handle is always closed (the original
        # leaked one open file per molecule).
        with open(pickle_path, 'rb') as pickle_file:
            mol_dict = pickle.load(pickle_file)
        if 'ensembleenergy' not in mol_dict:
            continue

        conformers = mol_dict['conformers']
        mol = conformers[0]['rd_mol']
        n_atoms = len(mol.GetAtoms())

        atom_features_list = [
            atom_to_feature_vector(atom) for atom in mol.GetAtoms()
        ]
        all_atom_features.append(
            torch.tensor(atom_features_list, dtype=torch.long))

        adj = GetAdjacencyMatrix(mol, useBO=False, force=True)
        adj = torch.tensor(adj).float()
        D = torch.diag(adj.sum(dim=0))
        L = D - adj
        N = adj.sum(dim=0)**-0.5
        # NOTE(review): N is 1-D, so this is element-wise broadcasting rather
        # than D^-1/2 L D^-1/2; kept unchanged to preserve existing features.
        L_sym = torch.eye(n_atoms) - N * L * N
        try:
            eig_vals, eig_vecs = torch.symeig(L_sym, eigenvectors=True)
        except Exception:
            # if we have disconnected components: zero degrees give inf in N,
            # so retry with those degrees clamped to 1
            deg = adj.sum(dim=0)
            deg[deg == 0] = 1
            N = deg**-0.5
            L_sym = torch.eye(n_atoms) - N * L * N
            eig_vals, eig_vecs = torch.symeig(L_sym, eigenvectors=True)

        # Keep up to the maximum desired number of frequencies
        idx = eig_vals.argsort()[0:max_freqs]
        eig_vals, eig_vecs = eig_vals[idx], eig_vecs[:, idx]

        # Sort, normalize and pad EigenVectors
        eig_vecs = eig_vecs[:, eig_vals.argsort()]  # increasing order
        eig_vecs = F.normalize(eig_vecs, p=2, dim=1, eps=1e-12, out=None)
        if n_atoms < max_freqs:
            eig_vecs = F.pad(eig_vecs, (0, max_freqs - n_atoms),
                             value=float('nan'))
            eig_vals = F.pad(eig_vals, (0, max_freqs - n_atoms),
                             value=float('nan'))

        total_eigvecs.append(eig_vecs)
        total_eigvals.append(eig_vals.unsqueeze(0))

        edges_list = []
        edge_features_list = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_feature = bond_to_feature_vector(bond)
            # add edges in both directions
            edges_list.append((i, j))
            edge_features_list.append(edge_feature)
            edges_list.append((j, i))
            edge_features_list.append(edge_feature)
        # Graph connectivity in COO format with shape [2, num_edges]
        edge_index = torch.tensor(edges_list, dtype=torch.long).T
        edge_features = torch.tensor(edge_features_list, dtype=torch.long)

        # edges_list holds both directions, hence the division by 2
        avg_degree += (len(edges_list) / 2) / n_atoms

        # The target names double as the keys inside the pickled dict.
        for name in targets:
            targets[name].append(mol_dict[name])

        conformers = [
            torch.tensor(conformer['rd_mol'].GetConformer().GetPositions(),
                         dtype=torch.float)
            for conformer in conformers[:max_confs]
        ]
        # if there are less than 10 conformers we add the first one a few times
        if len(conformers) < max_confs:
            conformers.extend([conformers[0]] * (max_confs - len(conformers)))

        all_edge_features.append(edge_features)
        coordinates.append(torch.cat(conformers, dim=1))
        edge_indices.append(edge_index)

        total_edges += len(edges_list)
        total_atoms += n_atoms
        smiles_list.append(smiles)
        edge_slices.append(total_edges)
        atom_slices.append(total_atoms)
        n_atoms_list.append(n_atoms)

    # Turn every target list into a column tensor of shape [n_molecules, 1].
    for key, value in targets.items():
        targets[key] = torch.tensor(value)[:, None]

    data_dict = {
        'smiles': smiles_list,
        'n_atoms': torch.tensor(n_atoms_list, dtype=torch.long),
        'atom_slices': torch.tensor(atom_slices, dtype=torch.long),
        'edge_slices': torch.tensor(edge_slices, dtype=torch.long),
        'atom_features': torch.cat(all_atom_features, dim=0),
        'edge_features': torch.cat(all_edge_features, dim=0),
        'atomic_number_long': torch.tensor(atomic_number_long,
                                           dtype=torch.long),
        'edge_indices': torch.cat(edge_indices, dim=1),
        'coordinates': torch.cat(coordinates, dim=0).float(),
        'targets': targets,
        'avg_degree': avg_degree / len(n_atoms_list)
    }
    # Also expose each target under its own top-level key.
    data_dict.update(targets)

    if not os.path.exists(os.path.join(self.directory, 'processed')):
        os.mkdir(os.path.join(self.directory, 'processed'))
    torch.save(
        data_dict,
        os.path.join(self.directory, 'processed', self.processed_file))
def get_adjacency_matrix(self):
    """
    Compute the adjacency matrix of the molecular graph (defined atoms).

    :return: result of ``GetAdjacencyMatrix`` on ``self.export_mol()``
    """
    exported = self.export_mol()
    return GetAdjacencyMatrix(exported)
N = len(atoms) if N <= 2: continue for j, atom in enumerate(atoms): X[j] = atom_features(atom) for j in range(N): for k in range(N): bond = mol.GetBondBetweenAtoms(j, k) if bond is not None: E_idx.append([j, k]) E_fea.append(bond_features(bond)) A = GetAdjacencyMatrix(mol) Y = [outs[i]] # global properties g = nx.Graph() g.add_nodes_from(list(range(N))) g.add_edges_from(E_idx) if not nx.is_connected(g): print("{} is not connected".format(smile)) continue for j in range(N): for k in range(N): if j == k:
def __getitem__(self, idx):
    """Load one pickled (ligand, protein) pair as a fixed-size padded sample.

    The ligand and protein adjacency matrices (with self-loops) are
    zero-padded or truncated to ``N_PADDED_LIGAND`` / ``N_PADDED_PROTEIN``
    and placed on the diagonal blocks of an ``N_PADDED_ALL`` square matrix
    (``A1``). ``A2`` additionally carries the ligand-protein distance matrix
    in its off-diagonal blocks. Also updates the instance counters
    ``n_queried``, ``n_file_opened``, ``n_max_n1``, ``n_max_n2`` and
    ``n_max_adj``, and prints shape information on the first call.

    NOTE(review): the block assignments into ``agg_adj1`` presumably rely on
    ``N_PADDED_ALL == N_PADDED_LIGAND + N_PADDED_PROTEIN`` - confirm against
    the constant definitions.
    """
    fnm = __class__.__name__ + '.' + whoami()
    self.n_queried += 1

    #idx = 0
    key = self.keys[idx]
    data_file_path = os.path.join(self.data_dir, key)
    #with open(self.data_dir+'/'+key, 'rb') as f:
    with open(data_file_path, 'rb') as f:
        m1, m2 = pickle.load(f)
        self.n_file_opened += 1
        pass

    if not self.proc_info_printed:
        print('{}: data_file_path:{}, type(m1):{}, type(m2):{}'.format(
            fnm, data_file_path, type(m1), type(m2)))
        pass

    #
    # prepare ligand
    #
    #m1 = Chem.AddHs(m1, addCoords=True, addResidueInfo=True) # 2020-03-26 added by caleb
    n1 = m1.GetNumAtoms()
    c1 = m1.GetConformers(
    )[0]  # original author's note: m1.GetConformers() returns only a single rdkit.Chem.rdchem.Conformer object
    d1 = np.array(c1.GetPositions())
    #adj1 = GetAdjacencyMatrix(m1) + np.eye(n1)
    adj = GetAdjacencyMatrix(m1) + np.eye(n1)
    # Zero-pad small ligands; truncate ones larger than N_PADDED_LIGAND.
    if n1 <= N_PADDED_LIGAND:
        adj1 = np.zeros((N_PADDED_LIGAND, N_PADDED_LIGAND), dtype=np.float64)
        adj1[:n1, :n1] = adj
        pass
    else:
        adj1 = adj[:N_PADDED_LIGAND, :N_PADDED_LIGAND]
        pass
    #H1 = get_atom_feature(m1, True)
    H1 = get_atom_feature(m1, n1, True)

    #
    # prepare protein
    #
    #m2 = Chem.AddHs(m2, addCoords=True, addResidueInfo=True) # 2020-03-26 added by caleb
    n2 = m2.GetNumAtoms()
    c2 = m2.GetConformers()[0]
    d2 = np.array(c2.GetPositions())
    #adj2 = GetAdjacencyMatrix(m2)+np.eye(n2)
    adj = GetAdjacencyMatrix(m2) + np.eye(n2)
    # Zero-pad small proteins; truncate ones larger than N_PADDED_PROTEIN.
    if n2 <= N_PADDED_PROTEIN:
        adj2 = np.zeros((N_PADDED_PROTEIN, N_PADDED_PROTEIN),
                        dtype=np.float64)
        adj2[:n2, :n2] = adj
        pass
    else:
        adj2 = adj[:N_PADDED_PROTEIN, :N_PADDED_PROTEIN]
        pass
    #H2 = get_atom_feature(m2, False)
    H2 = get_atom_feature(m2, n2, False)

    # aggregation
    # NOTE(review): whether H1/H2 are already padded to the fixed sizes
    # depends on get_atom_feature, which is not visible here.
    H = np.concatenate([H1, H2], axis=0)
    # The string statement below is the previous unpadded implementation,
    # kept by the original author.
    ''' agg_adj1 = np.zeros((n1+n2, n1+n2)) agg_adj1[:n1, :n1] = adj1 agg_adj1[n1:, n1:] = adj2 agg_adj2 = np.copy(agg_adj1) dm = distance_matrix(d1,d2) agg_adj2[:n1,n1:] = np.copy(dm) agg_adj2[n1:,:n1] = np.copy(np.transpose(dm)) #node indice for aggregation valid = np.zeros((n1+n2,)) valid[:n1] = 1 '''
    agg_adj1 = np.zeros((N_PADDED_ALL, N_PADDED_ALL))
    agg_adj1[:N_PADDED_LIGAND, :N_PADDED_LIGAND] = adj1
    agg_adj1[N_PADDED_LIGAND:, N_PADDED_LIGAND:] = adj2
    agg_adj2 = np.copy(agg_adj1)
    dm = distance_matrix(d1, d2)
    #
    # 2020-03-27
    #  * (for computational convenience) crudely fill a matrix of the
    #    (assumed) maximum size with a fixed value
    #  * entries without distance information were set to a large distance
    #    (100.0 here) --> now simply set to 0
    #
    #dm_padded = np.full((N_PADDED_LIGAND_MAX, N_PADDED_PROTEIN_MAX), fill_value=100.0, dtype=np.float64)
    dm_padded = np.zeros((N_PADDED_LIGAND_MAX, N_PADDED_PROTEIN_MAX),
                         dtype=np.float64)
    # NOTE(review): this assignment fails with a shape error if n1 or n2
    # exceeds the *_MAX constants.
    dm_padded[:n1, :n2] = dm
    dm = dm_padded[:N_PADDED_LIGAND, :N_PADDED_PROTEIN]
    #agg_adj2[:n1,n1:] = np.copy(dm)
    #agg_adj2[n1:,:n1] = np.copy(np.transpose(dm))
    agg_adj2[:N_PADDED_LIGAND, N_PADDED_LIGAND:] = np.copy(dm)
    agg_adj2[N_PADDED_LIGAND:, :N_PADDED_LIGAND] = np.copy(
        np.transpose(dm))

    #node indice for aggregation
    #valid = np.zeros((n1+n2,))
    #valid[:n1] = 1
    # All N_PADDED_LIGAND slots are flagged, including padded ligand rows.
    valid = np.zeros((N_PADDED_ALL, ))
    valid[:N_PADDED_LIGAND] = 1

    #pIC50 to class
    Y = 1 if 'CHEMBL' in key else 0

    #if n1+n2 > 300 : return None
    sample = {
                'H' : H , \
                'A1' : agg_adj1, \
                'A2' : agg_adj2, \
                'Y' : Y , \
                'V' : valid , \
                'key': key , \
             }

    # Track the largest molecule sizes seen so far.
    if self.n_max_n1 < n1:
        self.n_max_n1 = n1
        pass
    if self.n_max_n2 < n2:
        self.n_max_n2 = n2
        pass
    if self.n_max_adj < n1 + n2:
        self.n_max_adj = n1 + n2
        pass

    if not self.proc_info_printed:
        #print('{}: n1:{}, n2:{}, H.shape:{}, A1.shape:{}, A2.shape:{}, Y.shape:{}, V.shape:{}, key:{}'.format(
        #    fnm, n1, n2, H.shape, adj1.shape, adj2.shape, Y.shape, V.shape, key))
        #print('{}: n1:{}, n2:{}, type(H):{}, type(adj1):{}, type(adj2):{}, type(Y):{}, type(valid):{}({}), key:{}'.format(
        #    fnm, n1, n2, type(H), type(adj1), type(adj2), type(Y), type(valid)(valid[:10]), key[:10]))
        print(
            '{}: n1:{}, n2:{}, H.shape:{}, adj1.shape:{}, adj2.shape:{}, type(Y):{}, type(valid):{}, type(key):{}:{}'
            .format(fnm, n1, n2, H.shape, adj1.shape, adj2.shape, type(Y),
                    type(valid), type(key), key))
        pass
    # Print the diagnostics above only for the first sample served.
    self.proc_info_printed = True

    return sample