def __getitem__(self, index):
    """Load one molecule graph, keep the selected coupling types and add ACSF.

    Args:
        index: Position in ``self.id`` of the molecule to load.

    Returns:
        The unpickled graph object. Its ``coupling.id``, ``coupling.index``,
        ``coupling.type`` and ``coupling.value`` arrays are restricted to the
        types listed in ``self.coupling_types``; ``node`` (with the ACSF
        descriptor appended) and ``edge`` are each concatenated along the
        last axis.
    """
    molecule_name = self.id[index]
    graph_file = (get_path() + 'data/graphs/' + self.graph_dir +
                  '/%s.pickle' % molecule_name)
    graph = read_pickle_from_file(graph_file)
    assert (graph.molecule_name == molecule_name)

    # Builtin ``bool`` instead of ``np.bool``: the alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    mask = np.zeros(len(graph.coupling.type), bool)
    for t in self.coupling_types:
        mask |= (graph.coupling.type == COUPLING_TYPE.index(t))

    graph.coupling.id = graph.coupling.id[mask]
    # graph.coupling.contribution is not filtered here.
    graph.coupling.index = graph.coupling.index[mask]
    graph.coupling.type = graph.coupling.type[mask]
    graph.coupling.value = graph.coupling.value[mask]

    # graph.axyz holds (symbols, positions); the ACSF descriptor becomes an
    # extra node feature block.
    atom = System(symbols=graph.axyz[0], positions=graph.axyz[1])
    acsf = ACSF_GENERATOR.create(atom)
    graph.node += [acsf, ]

    graph.node = np.concatenate(graph.node, -1)
    graph.edge = np.concatenate(graph.edge, -1)
    return graph
def __getitem__(self, index):
    """Return the pickled graph of one molecule with ACSF node features added.

    Args:
        index: Position in ``self.id`` of the molecule to load.

    Returns:
        The unpickled graph object whose ``node`` (including the appended
        ACSF descriptor) and ``edge`` feature lists have each been
        concatenated along the last axis.
    """
    name = self.id[index]
    graph = read_pickle_from_file(
        DATA_DIR + '/structure/graph1/%s.pickle' % name)
    assert (graph.molecule_name == name)

    # Coupling-type filtering (e.g. keeping only '1JHC' / '2JHH') is
    # disabled in this variant: every coupling of the molecule is kept.

    # graph.axyz holds (symbols, positions).
    symbols, positions = graph.axyz[0], graph.axyz[1]
    descriptor = ACSF_GENERATOR.create(
        System(symbols=symbols, positions=positions))
    graph.node += [descriptor]

    graph.node = np.concatenate(graph.node, -1)
    graph.edge = np.concatenate(graph.edge, -1)
    return graph
def func_acsf(params):
    """Build the descriptor for one ``(index, molecule)`` work item.

    Args:
        params: Two-element sequence ``(i, molecule)``. The index is not
            used; ``molecule`` is the key looked up in the module-level
            ``st_dict``.

    Returns:
        Whatever ``gen.create`` returns for the molecule's atoms.
    """
    _, molecule = params
    structure = st_dict[molecule]
    symbols = structure["atom"].values
    positions = structure[["x", "y", "z"]].values
    return gen.create(System(symbols=symbols, positions=positions))
def __getitem__(self, index):
    """Load a ``graph_v8`` pickle and extend its node and edge features.

    The pickled record is turned into a list so its slots can be replaced.
    Slot 0 is checked against the molecule name, slot 2 supplies
    ``(symbols, positions)``, slot 3 is the list of node feature blocks and
    slot 4 the list of edge feature blocks.

    Args:
        index: Position in ``self.id`` of the molecule to load.

    Returns:
        The record as a list, with slots 3 and 4 replaced by single arrays
        (concatenated along the last axis) in which NaNs are set to 0.
    """
    molecule_name = self.id[index]
    graph = list(read_pickle_from_file(
        '../data/graph_v8/%s.pickle' % molecule_name))
    assert (graph[0] == molecule_name)

    # Node features: append the ACSF descriptor, then flatten to one array.
    symbols, positions = graph[2][0], graph[2][1]
    acsf = ACSF_GENERATOR.create(System(symbols=symbols, positions=positions))
    graph[3] += [acsf, ]
    # Block 7 is reshaped into a column so it can be concatenated.
    graph[g_node_idx][7] = graph[g_node_idx][7].reshape([-1, 1])
    graph[3] = np.concatenate(graph[3], -1)

    # Edge features: column 4 of the concatenated edge blocks is used as the
    # distance; inverse powers of it are appended as extra blocks.
    dist = np.concatenate(graph[4], -1)[:, 4].reshape(-1, 1)
    for inverse_power in (1 / dist, 1 / dist**2, 1 / dist**3, 1 / dist**6):
        graph[4].append(inverse_power)
    graph[4] = np.concatenate(graph[4], -1)

    # Only NaNs are zeroed; infinities (if any) are left as they are.
    for slot in (3, 4):
        features = graph[slot]
        features[np.isnan(features)] = 0

    return graph
def get_scsf(data):
    """Compute per-atom ACSF descriptors for a list of molecules.

    Args:
        data: Mapping whose ``"mol_names"`` entry is an iterable of molecule
            names, each a valid group key of the module-level
            ``gb_structure``.

    Returns:
        A DataFrame stacking, for every molecule, the ``molecule_name`` and
        ``atom_index`` columns next to the descriptor columns
        ``acsf_0 .. acsf_{k-1}``.
    """
    frames = []
    for molecule_name in data["mol_names"]:
        atoms_df = gb_structure.get_group(molecule_name).sort_values(
            ['atom_index'], ascending=True)

        symbols = atoms_df.atom.values.tolist()
        positions = atoms_df[['x', 'y', 'z']].values
        descriptor = ACSF_GENERATOR.create(
            System(symbols=symbols, positions=positions))

        acsf_df = pd.DataFrame(descriptor)
        acsf_df.columns = [f"acsf_{c}" for c in range(acsf_df.shape[1])]

        # Both sides are re-indexed from 0 so they align row by row.
        keys = atoms_df[["molecule_name", "atom_index"]].reset_index(drop=True)
        frames.append(
            pd.concat([keys, acsf_df.reset_index(drop=True)], axis=1))

    return pd.concat(frames, axis=0)
def get_system(self, system):
    """Convert an atomic system into the internally used System object.

    The System class inherits from ase.Atoms, but includes built-in caching
    for geometric quantities that may be re-used by the descriptors.

    Args:
        system (:class:`ase.Atoms` | :class:`.System`): Input system.

    Returns:
        :class:`.System`: The input itself when it is exactly a System,
        otherwise a System built from it with ``System.from_atoms``.

    Raises:
        ValueError: If ``system`` is not an ``Atoms`` instance.
    """
    if not isinstance(system, Atoms):
        raise ValueError("Invalid system with type: '{}'.".format(
            type(system)))

    # Exact type comparison: instances of System subclasses are converted
    # through from_atoms as well, not returned as-is.
    if type(system) == System:
        return system
    return System.from_atoms(system)
def structure_to_graph(structure_file):
    """Turn an XYZ structure file into edge and atom feature arrays.

    Args:
        structure_file: Path of the structure file. It is parsed by
            ``MolFromXYZ`` and also read with pandas as a space-separated
            table (first row skipped) with columns atom, x, y, z.

    Returns:
        Tuple ``(edge_array, edge_features, atom_features, smile, xyz)``:
        ``edge_array`` is the transposed array of all ordered atom pairs
        ``(i, j)`` with ``i != j``; ``edge_features`` has one row per pair;
        ``atom_features`` has one row per atom; ``smile`` is the second
        value returned by ``MolFromXYZ``; ``xyz`` is the coordinate array.
    """
    mol, smile = MolFromXYZ(structure_file)
    factory = ChemicalFeatures.BuildFeatureFactory(
        os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
    feature = factory.GetFeaturesForMol(mol)

    structure = pd.read_csv(structure_file, skiprows=1, header=None, sep=" ",
                            names=["atom", "x", "y", "z"])
    structure["radius"] = structure["atom"].map(
        {'H': 0.38, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.71})
    radii = structure["radius"].values
    xyz = structure[["x", "y", "z"]]
    norm_xyz = preprocessing.normalize(xyz, norm='l2')

    n_atoms = mol.GetNumAtoms()

    # ---- edge features: one entry per ordered pair (i, j), i != j ----
    edge_array = []
    bond_features = []
    distance = []
    rel_distance = []
    angle = []
    bond_vector = []
    # permutations(range(n), 2) yields the pairs in the same order as the
    # full product with the diagonal skipped.
    for i, j in itertools.permutations(range(n_atoms), 2):
        edge_array.append((i, j))

        bond = mol.GetBondBetweenAtoms(i, j)
        bond_type = bond.GetBondType() if bond else None
        bond_features.append(one_hot_encoding(bond_type, BONDS))

        delta = xyz.iloc[i] - xyz.iloc[j]
        r = (delta**2).sum()**0.5
        distance.append([r])
        # distance divided by the sum of the two tabulated radii
        rel_distance.append([r / (radii[i] + radii[j])])
        # dot product of the L2-normalised coordinate rows of i and j
        angle.append([(norm_xyz[i] * norm_xyz[j]).sum()])
        bond_vector.append(delta.tolist())

    edge_array = np.array(edge_array).T
    edge_features = np.concatenate([
        np.array(bond_features),
        np.array(distance) / 4 - 1,
        np.array(rel_distance) / 4 - 1,
        np.array(angle),
        np.array(bond_vector),  # coordinate difference of atoms i and j
    ], axis=1)

    # ---- atom features ----
    rd_atoms = [mol.GetAtomWithIdx(i) for i in range(n_atoms)]
    symbol = [one_hot_encoding(at.GetSymbol(), SYMBOLS) for at in rd_atoms]
    aromatic = [[at.GetIsAromatic()] for at in rd_atoms]
    hybridization = [one_hot_encoding(at.GetHybridization(), HYBRIDIZATIONS)
                     for at in rd_atoms]
    num_h = [[at.GetTotalNumHs(includeNeighbors=True)] for at in rd_atoms]
    atomic = [[at.GetAtomicNum()] for at in rd_atoms]

    acsf = ACSF_GENERATOR.create(
        System(symbols=structure["atom"].values, positions=xyz.values))

    # Flags for atoms that belong to a 'Donor' / 'Acceptor' feature.
    acceptor = np.zeros((n_atoms, 1), np.uint8)
    donor = np.zeros((n_atoms, 1), np.uint8)
    for feat in feature:
        family = feat.GetFamily()
        if family == 'Donor':
            for i in feat.GetAtomIds():
                donor[i] = 1
        elif family == 'Acceptor':
            for i in feat.GetAtomIds():
                acceptor[i] = 1

    print(len(acsf), len(acceptor), len(donor))
    atom_features = np.concatenate(
        [symbol, aromatic, hybridization, num_h, atomic, acsf,
         acceptor, donor],
        axis=1)
    return edge_array, edge_features, atom_features, smile, xyz.values
def make_graph(name, gb_structure, gb_scalar_coupling):
    """Build the ``Graph`` record (node, edge and coupling data) of a molecule.

    Args:
        name: Molecule name; used as group key in both group-by objects and
            to locate the molecule's ``.xyz`` file.
        gb_structure: Group-by over the structure table; each group has the
            columns molecule_name, atom_index, atom, x, y, z.
        gb_scalar_coupling: Group-by over the coupling table; each group has
            the columns id, atom_index_0, atom_index_1, type,
            scalar_coupling_constant, fc, sd, pso, dso.

    Returns:
        A ``Graph`` built from: the name, the SMILES string, ``[a, xyz]``
        (with the scaled coordinates), the list of node feature blocks, the
        list of edge feature blocks, the edge index array and a ``Coupling``
        record.

    Raises:
        ValueError: If a coupling's atom pair is not one of the graph's edges.
    """
    coupling_df = gb_scalar_coupling.get_group(name)

    df = gb_structure.get_group(name)
    df = df.sort_values(['atom_index'], ascending=True)
    a = df.atom.values.tolist()
    xyz = df[['x', 'y', 'z']].values
    mol = mol_from_axyz(a, xyz)

    # The Open Babel molecule is only used for per-atom partial charges.
    mol_op = openbabel.OBMol()
    obConversion.ReadFile(
        mol_op, f'../input/champs-scalar-coupling/structures/{name}.xyz')

    factory = ChemicalFeatures.BuildFeatureFactory(
        os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
    feature = factory.GetFeaturesForMol(mol)

    # ---- node features ----
    num_atom = mol.GetNumAtoms()
    symbol = np.zeros((num_atom, len(SYMBOL)), np.uint8)  # category
    acceptor = np.zeros((num_atom, 1), np.uint8)
    donor = np.zeros((num_atom, 1), np.uint8)
    aromatic = np.zeros((num_atom, 1), np.uint8)
    hybridization = np.zeros((num_atom, len(HYBRIDIZATION)), np.uint8)
    num_h = np.zeros((num_atom, 1), np.float32)  # real
    atomic = np.zeros((num_atom, 1), np.float32)

    degree = np.zeros((num_atom, 1), np.uint8)
    formalCharge = np.zeros((num_atom, 1), np.float32)
    chiral_tag = np.zeros((num_atom, 1), np.uint8)
    crippen_contribs = np.zeros((num_atom, 2), np.float32)
    tpsa = np.zeros((num_atom, 1), np.float32)
    labute_asac = np.zeros((num_atom, 1), np.float32)
    gasteiger_charges = np.zeros((num_atom, 1), np.float32)
    esataindices = np.zeros((num_atom, 1), np.float32)
    atomic_radiuss = np.zeros((num_atom, 1), np.float32)
    electronegate = np.zeros((num_atom, 1), np.float32)
    electronegate_sqre = np.zeros((num_atom, 1), np.float32)
    mass = np.zeros((num_atom, 1), np.float32)
    van = np.zeros((num_atom, 1), np.float32)
    cov = np.zeros((num_atom, 1), np.float32)
    ion = np.zeros((num_atom, 1), np.float32)

    # These descriptors are computed for the whole molecule at once, so they
    # are evaluated a single time here instead of once per atom.
    crippen_all = rdMolDescriptors._CalcCrippenContribs(mol)
    tpsa_all = rdMolDescriptors._CalcTPSAContribs(mol)
    labute_all = rdMolDescriptors._CalcLabuteASAContribs(mol)[0]
    estate_all = EState.EStateIndices(mol)

    for i in range(num_atom):
        atom = mol.GetAtomWithIdx(i)
        atom_op = mol_op.GetAtomById(i)
        element = atom.GetSymbol()

        symbol[i] = one_hot_encoding(element, SYMBOL)
        aromatic[i] = atom.GetIsAromatic()
        hybridization[i] = one_hot_encoding(atom.GetHybridization(),
                                            HYBRIDIZATION)
        num_h[i] = atom.GetTotalNumHs(includeNeighbors=True)
        atomic[i] = atom.GetAtomicNum()

        degree[i] = atom.GetTotalDegree()
        formalCharge[i] = atom.GetFormalCharge()
        chiral_tag[i] = int(atom.GetChiralTag())
        crippen_contribs[i] = crippen_all[i]
        tpsa[i] = tpsa_all[i]
        labute_asac[i] = labute_all[i]
        gasteiger_charges[i] = atom_op.GetPartialCharge()
        esataindices[i] = estate_all[i]

        # Per-element lookup tables defined at module level.
        atomic_radiuss[i] = atomic_radius[element]
        electronegate[i] = electronegativity[element]
        electronegate_sqre[i] = electronegativity_square[element]
        mass[i] = atomic_mass[element]
        van[i] = vanderwaalsradius[element]
        cov[i] = covalenzradius[element]
        ion[i] = ionization_energy[element]

    for feat in feature:
        family = feat.GetFamily()
        if family == 'Donor':
            for i in feat.GetAtomIds():
                donor[i] = 1
        elif family == 'Acceptor':
            for i in feat.GetAtomIds():
                acceptor[i] = 1

    # ---- edge features: every ordered pair (i, j) with i != j ----
    num_edge = num_atom * num_atom - num_atom
    edge_index = np.zeros((num_edge, 2), np.uint32)
    bond_type = np.zeros((num_edge, len(BOND_TYPE)), np.uint32)
    distance = np.zeros((num_edge, 1), np.float32)
    angle = np.zeros((num_edge, 1), np.float32)

    norm_xyz = preprocessing.normalize(xyz, norm='l2')

    # (i, j) -> row in edge_index, so coupling pairs can be located in O(1)
    # instead of converting edge_index to a list and scanning it each time.
    edge_position = {}
    ij = 0
    for i in range(num_atom):
        for j in range(num_atom):
            if i == j:
                continue
            edge_index[ij] = [i, j]
            edge_position[(i, j)] = ij

            bond = mol.GetBondBetweenAtoms(i, j)
            if bond is not None:
                bond_type[ij] = one_hot_encoding(bond.GetBondType(), BOND_TYPE)

            distance[ij] = np.linalg.norm(xyz[i] - xyz[j])
            angle[ij] = (norm_xyz[i] * norm_xyz[j]).sum()
            ij += 1

    # Coordinates are rescaled after the distances above were computed; the
    # scaled values feed the ACSF descriptor and are stored in the graph.
    # NOTE(review): the factor looks like an Angstrom -> Bohr conversion.
    xyz = xyz * 1.889726133921252
    atom = System(symbols=a, positions=xyz)
    acsf = ACSF_GENERATOR.create(atom)

    # ---- couplings ----
    coupling_pairs = coupling_df[['atom_index_0', 'atom_index_1']].values
    try:
        l = np.array([edge_position[tuple(pair)]
                      for pair in coupling_pairs.tolist()])
    except KeyError as err:
        # Same exception type the former list.index() lookup raised.
        raise ValueError(
            f"coupling atom pair {err.args[0]} is not an edge of "
            f"molecule {name}") from err

    coupling_edge_index = np.concatenate(
        [coupling_pairs, l.reshape(len(l), 1)], axis=1)

    coupling = Coupling(
        coupling_df['id'].values,
        coupling_df[['fc', 'sd', 'pso', 'dso']].values,
        coupling_edge_index,
        np.array([COUPLING_TYPE.index(t) for t in coupling_df.type.values],
                 np.int32),
        coupling_df['scalar_coupling_constant'].values,
    )

    graph = Graph(
        name,
        Chem.MolToSmiles(mol),
        [a, xyz],
        [acsf, symbol, acceptor, donor, aromatic, hybridization, num_h,
         atomic, degree, formalCharge, chiral_tag, crippen_contribs, tpsa,
         labute_asac, gasteiger_charges, esataindices, atomic_radiuss,
         electronegate, electronegate_sqre, mass, van, cov, ion],
        [bond_type, distance, angle, ],
        edge_index,
        coupling,
    )
    return graph
def __getitem__(self, idx):
    """Assemble the feature tensors of one molecule.

    Descriptor tables held on ``self`` are combined with pre-computed
    features read from three HDF5 files, randomly rotated coordinates, an
    ACSF descriptor and bond vectors.

    Args:
        idx: Position of the molecule in ``self.molecules_ids`` /
            ``self.molecules``.

    NOTE(review): the visible body ends after ``inputs`` is built, with no
    ``return``, and the original indentation was lost, so the extent of the
    ``if self.name == 'train':`` body is an assumption. Confirm both against
    the full source.
    """
    molecule_id = self.molecules_ids[idx]
    molecule = self.molecules[idx]

    atom_descriptor = self.atom_descriptors.loc[molecule_id]
    bond_descriptor = self.bond_descriptors.loc[molecule_id]

    # Cycles: molecules absent from the table get an empty frame.
    if molecule_id in self.cycles.index:
        cycles = self.cycles.loc[molecule_id]
    else:
        cycles = pd.DataFrame(columns=self.cycles.columns)

    cycles_edge_index = cycles['edge_index'].values.astype(np.int64)
    cycles_id = cycles['cycle_id'].values.astype(np.int64)

    # Edge connectivity: pairs of edges, each edge with a vector.
    edges_connectivity = self.edges_connectivity.loc[molecule_id]
    edges_connectivity_ids = np.copy(
        edges_connectivity[['edge_index_0', 'edge_index_1']]
        .values.astype(np.int64).T)

    edges_connectivity_vectors_0 = edges_connectivity[['vx_0', 'vy_0', 'vz_0']].values
    edges_connectivity_vectors_1 = edges_connectivity[['vx_1', 'vy_1', 'vz_1']].values

    # feature_1 / feature_2: norms of the two vectors; feature_3: norm of
    # their sum; feature_0: their dot product divided by both norms.
    edges_connectivity_feature_1 = np.sqrt(
        np.square(edges_connectivity_vectors_0).sum(axis=1)).reshape(-1, 1)
    edges_connectivity_feature_2 = np.sqrt(
        np.square(edges_connectivity_vectors_1).sum(axis=1)).reshape(-1, 1)
    edges_connectivity_feature_3 = np.sqrt(
        np.square(edges_connectivity_vectors_0 + edges_connectivity_vectors_1)
        .sum(axis=1)).reshape(-1, 1)
    edges_connectivity_feature_0 = (
        (edges_connectivity_vectors_0 * edges_connectivity_vectors_1)
        .sum(axis=1).reshape(-1, 1)
        / edges_connectivity_feature_1 / edges_connectivity_feature_2)

    edges_connectivity_features = np.concatenate(
        [edges_connectivity_feature_0, edges_connectivity_feature_1,
         edges_connectivity_feature_2, edges_connectivity_feature_3],
        axis=1)

    atom = list(atom_descriptor['atom'])
    xyz = atom_descriptor[['x', 'y', 'z']].values
    xyz = apply_random_rotation(xyz)

    connectivity = bond_descriptor[['atom_index_0', 'atom_index_1']].values

    atom_indexes = atom_descriptor['index'].values
    atom_index_min = atom_indexes.min()
    atom_index_max = atom_indexes.max()

    bond_indexes = bond_descriptor['index'].values
    bond_index_min = bond_indexes.min()
    bond_index_max = bond_indexes.max()

    # The HDF5 files are opened per call and closed by the ``with`` block
    # even when a read raises, so handles are never leaked. The handles are
    # still published on ``self`` as before; they are closed on exit.
    with h5py.File(script_dir + '/../processed_data/global_116.h5', mode='r') as global_h5, \
            h5py.File(script_dir + '/../processed_data/atom_116.h5', mode='r') as atom_h5, \
            h5py.File(script_dir + '/../processed_data/bond_116.h5', mode='r') as bond_h5:
        self.global_features = global_h5
        self.atom_features = atom_h5
        self.bond_features = bond_h5

        global_feature_numeric = np.copy(
            global_h5['numeric'][molecule_id].reshape(1, -1))
        global_feature_embeddings = np.copy(
            global_h5['embeddings'][molecule_id].reshape(1, -1))

        # Read the contiguous [min, max] slab once, then pick the rows.
        atom_feature_numeric = np.copy(
            atom_h5['numeric'][atom_index_min:atom_index_max + 1][atom_indexes - atom_index_min])
        atom_feature_embeddings = np.copy(
            atom_h5['embeddings'][atom_index_min:atom_index_max + 1][atom_indexes - atom_index_min])

        bond_feature_numeric = np.copy(
            bond_h5['numeric'][bond_index_min:bond_index_max + 1][bond_indexes - bond_index_min])
        bond_feature_embeddings = np.copy(
            bond_h5['embeddings'][bond_index_min:bond_index_max + 1][bond_indexes - bond_index_min])

    # chemical descriptors
    atom = System(symbols=atom, positions=xyz)
    acsf = ACSF_GENERATOR.create(atom)

    atom_feature_numeric = np.concatenate(
        [atom_feature_numeric, xyz, acsf], axis=1)

    bond_vectors = build_bond_vector(connectivity, xyz)
    bond_feature_numeric = np.concatenate(
        [bond_feature_numeric, bond_vectors], axis=1)

    if self.name == 'train':
        # Target
        target = bond_descriptor['scalar_coupling_constant'].values.reshape(-1, 1)
        target_mask = (bond_descriptor['type'] != 'VOID').values.reshape(-1, 1)
        target_types = bond_descriptor['type_id'].values.reshape(-1, 1)
        target_idx = bond_descriptor['edge_index'].values.reshape(-1, 1)

        # data
        data = Data(
            x_numeric=torch.tensor(atom_feature_numeric, dtype=torch.float32),
            x_embeddings=torch.tensor(atom_feature_embeddings, dtype=torch.int64),
            edge_attr_numeric=torch.tensor(bond_feature_numeric, dtype=torch.float32),
            edge_attr_embeddings=torch.tensor(bond_feature_embeddings, dtype=torch.int64),
            u_numeric=torch.tensor(global_feature_numeric, dtype=torch.float32),
            u_embeddings=torch.tensor(global_feature_embeddings, dtype=torch.int64),
            edge_index=torch.tensor(connectivity.T),
            num_nodes=atom_feature_numeric.shape[0],
            molecule_ids=torch.tensor([molecule_id], dtype=torch.int64),
            y=torch.tensor(target, dtype=torch.float32),
            y_mask=torch.tensor(target_mask, dtype=torch.float32),
            y_types=torch.tensor(target_types, dtype=torch.int64),
            y_idx=torch.tensor(target_idx, dtype=torch.int32),
            cycles_edge_index=torch.tensor(cycles_edge_index),
            cycles_id=torch.tensor(cycles_id),
            edges_connectivity_ids=torch.tensor(edges_connectivity_ids),
            edges_connectivity_features=torch.tensor(
                edges_connectivity_features, dtype=torch.float32),
        )

        inputs = [
            data.u_embeddings,
            data.x_embeddings,
            data.edge_attr_embeddings,
            data.u_numeric,
            data.x_numeric,
            data.edge_attr_numeric,
        ]
def system_stats(system_iterator):
    """Gather summary statistics over a collection of atomic systems.

    Args:
        system_iterator (iterable containing ASE.Atoms or System): The atomic
            systems for which to gather statistics.

    Returns:
        Dict: A dictionary of different statistics for the systems. The
        dictionary will contain:

            n_atoms_max: The maximum number of atoms in a system.
            max_atomic_number: The highest atomic number
            min_atomic_number: The lowest atomic number
            atomic_numbers: List of present atomic numbers
            element_symbols: List of present atomic symbols
            min_distance: Minimum distance in the systems, or None when no
                system contributes a pair distance (e.g. only non-periodic
                single-atom systems were given).

    Raises:
        ValueError: If no atoms were found, i.e. the iterable is empty.
    """
    n_atoms_max = 0
    atomic_numbers = set()
    symbols = set()
    min_distance = None

    for system in system_iterator:
        n_atoms = len(system)

        # Make ASE.Atoms into a System object
        if isinstance(system, Atoms):
            system = System.from_atoms(system)

        # Gather atomic numbers and symbols
        atomic_numbers.update(system.get_atomic_numbers())
        symbols.update(system.get_chemical_symbols())

        # Gather maximum number of atoms
        n_atoms_max = max(n_atoms_max, n_atoms)

        # Gather min distance. For periodic systems we must also consider
        # distances from an atom to its periodic copy, as given by
        # get_distance_matrix() on the diagonal.
        distance_matrix = system.get_distance_matrix()
        diagonal_offset = 0 if np.any(system.get_pbc()) else 1
        triu_indices = np.triu_indices(len(distance_matrix), k=diagonal_offset)
        distances = distance_matrix[triu_indices]

        # A non-periodic single-atom system has no pair distances; calling
        # .min() on the empty selection would raise, so skip it.
        if distances.size == 0:
            continue

        i_min_dist = distances.min()
        if min_distance is None or i_min_dist < min_distance:
            min_distance = i_min_dist

    if not atomic_numbers:
        raise ValueError(
            "Cannot compute statistics: no atoms were found in the given "
            "systems.")

    return {
        "n_atoms_max": n_atoms_max,
        "max_atomic_number": max(atomic_numbers),
        "min_atomic_number": min(atomic_numbers),
        "atomic_numbers": list(atomic_numbers),
        "element_symbols": list(symbols),
        "min_distance": min_distance,
    }