def __init__(self, root, transform=None, pre_transform=None, pre_filter=None,
             dataset='zinc250k', empty=False):
    """
    Adapted from qm9.py. Disabled the download functionality.

    :param root: directory of the dataset, containing a raw and processed
        dir. The raw dir should contain the file containing the smiles,
        and the processed dir can either empty or a previously processed
        file
    :param dataset: name of the dataset. Currently only implemented for
        zinc250k, chembl_with_labels, tox21, hiv, bace, bbbp, clintox,
        esol, freesolv, lipophilicity, muv, pcba, sider, toxcast
    :param empty: if True, then will not load any data obj. For
        initializing empty dataset
    """
    self.dataset = dataset
    self.root = root
    super(MoleculeDataset, self).__init__()

    # Only process the raw smiles file on first use; afterwards the
    # cached pdparams files are loaded directly.
    if not os.path.exists(self.root + "/new/data.pdparams"):
        self.read()

    # The saved containers hold the payload at index 1.
    self.smiles_list = paddle.load(self.root + "/new/smiles.pdparams")[1]
    data_list = paddle.load(self.root + "/new/data.pdparams")[1]

    # Rebuild graph objects; node count is the number of rows in the
    # node-feature matrix `x`.
    self.data_list = [
        G.Graph(i['edge_index'], i['x'].shape[0],
                {'feature': i['x']}, {'feature': i['edge_attr']})
        for i in data_list
    ]
    # Attach the per-graph label to each rebuilt graph object.
    # (BUG FIX: removed the dead trailing statement `x = 0`.)
    for i in range(len(self.data_list)):
        self.data_list[i].y = data_list[i]['y']
def _load_edge_data(self):
    """Read the whitespace-separated edge file and build the graph.

    Raw node ids are remapped to consecutive ids starting at 1 in
    first-seen order; the graph is allocated with one extra node slot so
    that id 0 stays unused.
    """
    remap = dict()
    seen_nodes = set()
    edge_list = []

    def _mapped(raw):
        # Assign ids 1..N as new raw ids are encountered.
        if raw not in remap:
            remap[raw] = len(remap) + 1
        return remap[raw]

    with open(self._data_dir, "r") as reader:
        for record in reader:
            raw_src, raw_dst = record.strip("\n\r").split(" ")
            src = _mapped(int(raw_src))
            dst = _mapped(int(raw_dst))
            seen_nodes.add(src)
            seen_nodes.add(dst)
            edge_list.append((src, dst))
            if self._undirected:
                edge_list.append((dst, src))

    self.graph = graph.Graph(num_nodes=len(seen_nodes) + 1, edges=edge_list)
    self.nodes = np.array(list(seen_nodes))
    self.node_dict = remap
def __call__(self, batch_data_list):
    """
    Function caller to convert a batch of data into a big batch feed
    dictionary.

    Args:
        batch_data_list: a batch of the compound graph data.

    Returns:
        feed_dict: a dictionary contains `graph/xxx` inputs for PGL.
    """
    g_list = []
    label_list = []
    for data in batch_data_list:
        # One PGL graph per molecule; features are reshaped to column
        # vectors because PGL expects 2-D feature arrays.
        g = graph.Graph(num_nodes=len(data['atom_type']),
                        edges=data['edges'],
                        node_feat={
                            'atom_type': data['atom_type'].reshape([-1, 1]),
                            'chirality_tag': data['chirality_tag'].reshape([-1, 1]),
                        },
                        edge_feat={
                            'bond_type': data['bond_type'].reshape([-1, 1]),
                            'bond_direction': data['bond_direction'].reshape([-1, 1]),
                        })
        g_list.append(g)
        if self.with_graph_label:
            label_list.append(data['label'])

    # Merge the per-molecule graphs into one batched graph for the wrapper.
    join_graph = pgl.graph.MultiGraph(g_list)
    feed_dict = self.graph_wrapper.to_feed(join_graph)

    if self.with_graph_label:
        if self.task_type == 'cls':
            batch_label = np.array(label_list).reshape(
                -1, self.num_cls_tasks)
        elif self.task_type == 'reg':
            # Keep only the selected regression target per graph.
            label_list = [
                label[self.reg_target_id] for label in label_list
            ]
            batch_label = np.array(label_list).reshape(-1, 1)

        # label: -1 -> 0, 1 -> 1; an original 0 maps to 0.5 and is
        # excluded from the loss through the `valid` mask below.
        batch_label = ((batch_label + 1.0) / 2).astype('float32')
        batch_valid = (batch_label != 0.5).astype("float32")
        feed_dict['label'] = batch_label
        feed_dict['valid'] = batch_valid

    if self.with_pos_neg_mask:
        pos_mask, neg_mask = MoleculeCollateFunc.get_pos_neg_mask(g_list)
        feed_dict['pos_mask'] = pos_mask
        feed_dict['neg_mask'] = neg_mask

    return feed_dict
def _load_data(self):
    """Load the Cora citation dataset: features, labels, edges, splits."""
    content_path = os.path.join(self.path, 'cora.content')
    cites_path = os.path.join(self.path, 'cora.cites')

    features = []
    ids = []
    labels = []
    label_to_idx = {}

    with open(content_path, 'r') as reader:
        for raw in reader:
            fields = raw.strip().split()
            ids.append(int(fields[0]))
            cls_name = fields[-1]
            if cls_name not in label_to_idx:
                label_to_idx[cls_name] = len(label_to_idx)
            labels.append(label_to_idx[cls_name])
            vec = np.array([int(tok) for tok in fields[1:-1]],
                           dtype="float32")
            # Row-normalize; epsilon guards against all-zero rows.
            features.append(vec / (np.sum(vec) + 1e-15))

    paper2vid = {pid: idx for idx, pid in enumerate(ids)}
    num_nodes = len(ids)
    node_feature = np.array(features, dtype="float32")

    edge_pairs = []
    with open(cites_path, 'r') as reader:
        for raw in reader:
            a, b = raw.split()
            a, b = paper2vid[int(a)], paper2vid[int(b)]
            edge_pairs.append((a, b))
            if self.symmetry_edges:
                edge_pairs.append((b, a))

    if self.self_loop:
        edge_pairs.extend((i, i) for i in range(num_nodes))

    # Deduplicate before constructing the graph.
    all_edges = list(set(edge_pairs))
    self.graph = graph.Graph(
        num_nodes=num_nodes,
        edges=all_edges,
        node_feat={"words": node_feature})

    # NOTE: the split is taken from the file order (shuffling disabled).
    perm = np.arange(0, num_nodes)
    self.train_index = perm[:140]
    self.val_index = perm[200:500]
    self.test_index = perm[500:1500]
    self.y = np.array(labels, dtype="int64")
    self.num_classes = len(label_to_idx)
def __call__(self, batch_data_list):
    """Collate a list of molecule records into one PGL feed dictionary."""
    graphs = []
    labels = []
    for item in batch_data_list:
        # Feature arrays are reshaped to column vectors for PGL.
        node_feats = {
            'atom_type': item['atom_type'].reshape([-1, 1]),
            'chirality_tag': item['chirality_tag'].reshape([-1, 1]),
        }
        edge_feats = {
            'bond_type': item['bond_type'].reshape([-1, 1]),
            'bond_direction': item['bond_direction'].reshape([-1, 1]),
        }
        graphs.append(graph.Graph(num_nodes=len(item['atom_type']),
                                  edges=item['edges'],
                                  node_feat=node_feats,
                                  edge_feat=edge_feats))
        if self.with_graph_label:
            labels.append(item['label'])

    # Batch all molecule graphs and convert to wrapper feed inputs.
    feed_dict = self.graph_wrapper.to_feed(pgl.graph.MultiGraph(graphs))

    if self.with_graph_label:
        if self.task_type == 'cls':
            batch_label = np.array(labels).reshape(-1, self.num_cls_tasks)
        elif self.task_type == 'reg':
            picked = [label[self.reg_target_id] for label in labels]
            batch_label = np.array(picked).reshape(-1, 1)

        # label: -1 -> 0, 1 -> 1; a raw 0 lands on 0.5 and is flagged
        # invalid below.
        batch_label = ((batch_label + 1.0) / 2).astype('float32')
        feed_dict['label'] = batch_label
        feed_dict['valid'] = (batch_label != 0.5).astype("float32")

    if self.with_pos_neg_mask:
        pos_mask, neg_mask = self.get_pos_neg_mask(graphs)
        feed_dict['pos_mask'] = pos_mask
        feed_dict['neg_mask'] = neg_mask

    return feed_dict
def _load_data(self):
    # Fixed seed so the positive/negative edge split is reproducible.
    np.random.seed(self.np_random_seed)
    edge_path = os.path.join(self.path, 'ca-AstroPh.txt')
    bi_edges = set()
    self.neg_edges = []
    self.pos_edges = []
    self.node2id = dict()

    def node_id(node):
        # Map raw node labels to consecutive ids in first-seen order.
        if node not in self.node2id:
            self.node2id[node] = len(self.node2id)
        return self.node2id[node]

    with io.open(edge_path) as inf:
        # Skip the 4 header lines of the edge file.
        for _ in range(4):
            inf.readline()
        for line in inf:
            u, v = line.strip('\n').split('\t')
            u, v = node_id(u), node_id(v)
            # Store each undirected edge once, smaller id first.
            if u < v:
                bi_edges.add((u, v))
            else:
                bi_edges.add((v, u))

    num_nodes = len(self.node2id)
    # Rejection-sample negative (non-existing, non-self) pairs until we
    # have as many as half the real edge count.
    while len(self.neg_edges) < len(bi_edges) // 2:
        random_edges = np.random.choice(num_nodes, [len(bi_edges), 2])
        for (u, v) in random_edges:
            if u != v and (u, v) not in bi_edges and (v, u) not in bi_edges:
                self.neg_edges.append((u, v))
                if len(self.neg_edges) == len(bi_edges) // 2:
                    break

    bi_edges = list(bi_edges)
    np.random.shuffle(bi_edges)
    # Hold out half of the real edges as positives; the remaining half
    # forms the training graph (made symmetric below).
    self.pos_edges = bi_edges[:len(bi_edges) // 2]
    bi_edges = bi_edges[len(bi_edges) // 2:]
    all_edges = []
    for edge in bi_edges:
        u, v = edge
        all_edges.append((u, v))
        all_edges.append((v, u))
    self.graph = graph.Graph(num_nodes=num_nodes, edges=all_edges)
def test_graph_gather(self):
    """test_graph_gather """
    np.random.seed(1)
    num_graph = 10
    graph_list = []
    # Build a batch of random graphs (RNG call order preserved).
    for _ in range(num_graph):
        n = np.random.randint(5, 20)
        rand_edges = np.random.randint(low=0, high=n, size=(10, 2))
        rand_feats = {
            "feature": np.random.rand(n, 4).astype("float32")
        }
        g = graph.Graph(num_nodes=n, edges=rand_edges, node_feat=rand_feats)
        graph_list.append(g)

    gg = graph.MultiGraph(graph_list)

    use_cuda = False
    place = F.CUDAPlace(0) if use_cuda else F.CPUPlace()

    prog = F.Program()
    startup_prog = F.Program()
    with F.program_guard(prog, startup_prog):
        # `g` still refers to the last graph; its feat-info describes all.
        gw = graph_wrapper.GraphWrapper(name='graph',
                                        place=place,
                                        node_feat=g.node_feat_info(),
                                        edge_feat=g.edge_feat_info())
        index = L.data(name="index", dtype="int32", shape=[-1])
        feats = pgl.layers.graph_gather(gw, gw.node_feat["feature"], index)

    exe = F.Executor(place)
    exe.run(startup_prog)
    feed_dict = gw.to_feed(gg)
    # Gather node 0 of every graph in the batch.
    feed_dict["index"] = np.zeros(num_graph, dtype="int32")
    ret = exe.run(prog, feed=feed_dict, fetch_list=[feats])

    self.assertEqual(list(ret[0].shape), [num_graph, 4])
    # Each gathered row must equal the first node feature of its graph.
    for i in range(num_graph):
        diff = ret[0][i] - graph_list[i].node_feat["feature"][0]
        self.assertLess(np.sum(diff ** 2), 1e-15)
def test_gin(self):
    """test_gin """
    np.random.seed(1)
    hidden_size = 8
    num_nodes = 10

    edges = [(1, 4), (0, 5), (1, 9), (1, 8), (2, 8), (2, 5), (3, 6),
             (3, 7), (3, 4), (3, 8)]
    # Make the graph undirected by appending the reversed edges.
    inver_edges = [(v, u) for u, v in edges]
    edges.extend(inver_edges)

    node_feat = {"feature": np.random.rand(10, 4).astype("float32")}
    g = graph.Graph(num_nodes=num_nodes, edges=edges, node_feat=node_feat)

    use_cuda = False
    # BUG FIX: paddle.fluid exposes `CUDAPlace`, not `GPUPlace` (the
    # sibling tests in this file use CUDAPlace); the original would raise
    # AttributeError as soon as use_cuda was enabled.
    place = F.CUDAPlace(0) if use_cuda else F.CPUPlace()

    prog = F.Program()
    startup_prog = F.Program()
    with F.program_guard(prog, startup_prog):
        gw = graph_wrapper.GraphWrapper(name='graph',
                                        place=place,
                                        node_feat=g.node_feat_info(),
                                        edge_feat=g.edge_feat_info())
        output = gin(gw,
                     gw.node_feat['feature'],
                     hidden_size=hidden_size,
                     activation='relu',
                     name='gin',
                     init_eps=1,
                     train_eps=True)

    exe = F.Executor(place)
    exe.run(startup_prog)
    ret = exe.run(prog, feed=gw.to_feed(g), fetch_list=[output])

    # One output row per node, `hidden_size` channels each.
    self.assertEqual(ret[0].shape[0], num_nodes)
    self.assertEqual(ret[0].shape[1], hidden_size)
def _load_data(self):
    """Load nodes, group memberships, and edges from the CSV files."""
    edge_path = os.path.join(self.path, 'edges.csv')
    node_path = os.path.join(self.path, 'nodes.csv')
    group_edge_path = os.path.join(self.path, 'group-edges.csv')

    # Node count equals the number of lines in nodes.csv.
    with io.open(node_path) as inf:
        num_nodes = len(inf.readlines())

    # Multi-hot group-membership matrix; ids in the files are 1-based.
    node_feature = np.zeros((num_nodes, self.num_groups))
    with io.open(group_edge_path) as inf:
        for line in inf:
            nid, gid = line.strip('\n').split(',')
            node_feature[int(nid) - 1][int(gid) - 1] = 1

    all_edges = []
    with io.open(edge_path) as inf:
        for line in inf:
            left, right = line.strip('\n').split(',')
            left, right = int(left) - 1, int(right) - 1
            all_edges.append((left, right))
            if self.symmetry_edges:
                all_edges.append((right, left))

    if self.self_loop:
        all_edges.extend((i, i) for i in range(num_nodes))

    # Deduplicate edges before building the graph.
    all_edges = list(set(all_edges))
    self.graph = graph.Graph(
        num_nodes=num_nodes,
        edges=all_edges,
        node_feat={"group_id": node_feature})

    # Random 50/50 train/test split over node ids.
    perm = np.arange(0, num_nodes)
    np.random.shuffle(perm)
    train_num = int(num_nodes * 0.5)
    self.train_index = perm[:train_num]
    self.test_index = perm[train_num:]
def _load_data(self):
    """Load data"""
    import networkx as nx
    # Pickled Planetoid-style objects: labeled train features/labels
    # (x, y), test (tx, ty), full set (allx, ally), and adjacency dict.
    objnames = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(objnames)):
        with open("{}/ind.{}.{}".format(self.path, self.name, objnames[i]),
                  'rb') as f:
            objects.append(_pickle_load(f))

    x, y, tx, ty, allx, ally, _graph = objects
    test_idx_reorder = _parse_index_file("{}/ind.{}.test.index".format(
        self.path, self.name))
    test_idx_range = np.sort(test_idx_reorder)

    allx = allx.todense()
    tx = tx.todense()
    if self.name == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder),
                                    max(test_idx_reorder) + 1)
        tx_extended = np.zeros((len(test_idx_range_full), x.shape[1]),
                               dtype="float32")
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]),
                               dtype="float32")
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    # Reorder the test rows so row index matches node id.
    features = np.vstack([allx, tx])
    features[test_idx_reorder, :] = features[test_idx_range, :]
    # Row-normalize; epsilon avoids division by zero for empty rows.
    features = features / (np.sum(features, axis=-1) + 1e-15)
    features = np.array(features, dtype="float32")

    _graph = nx.DiGraph(nx.from_dict_of_lists(_graph))

    # Same reordering for the one-hot labels.
    onehot_labels = np.vstack((ally, ty))
    onehot_labels[test_idx_reorder, :] = onehot_labels[test_idx_range, :]
    labels = np.argmax(onehot_labels, 1)

    # Split: test ids from file; first len(y) nodes train; next 500 val.
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)

    all_edges = []
    for i in _graph.edges():
        u, v = tuple(i)
        all_edges.append((u, v))
        if self.symmetry_edges:
            all_edges.append((v, u))

    if self.self_loop:
        for i in range(_graph.number_of_nodes()):
            all_edges.append((i, i))

    # Deduplicate edges before building the graph.
    all_edges = list(set(all_edges))
    self.graph = graph.Graph(num_nodes=_graph.number_of_nodes(),
                             edges=all_edges,
                             node_feat={"words": features})
    self.y = np.array(labels, dtype="int64")
    self.num_classes = onehot_labels.shape[1]
    self.train_index = np.array(idx_train, dtype="int32")
    self.val_index = np.array(idx_val, dtype="int32")
    self.test_index = np.array(idx_test, dtype="int32")
def test_batched_graph_wrapper(self):
    """test_batch_graph_wrapper """
    np.random.seed(1)
    graph_list = []
    num_graph = 5
    feed_num_nodes = []
    feed_num_edges = []
    feed_edges = []
    feed_node_feats = []
    # Build random graphs and record the raw arrays that will be fed
    # directly to the BatchGraphWrapper path.
    for _ in range(num_graph):
        num_nodes = np.random.randint(5, 20)
        edges = np.random.randint(low=0, high=num_nodes, size=(10, 2))
        node_feat = {
            "feature": np.random.rand(num_nodes, 4).astype("float32")
        }
        single_graph = graph.Graph(num_nodes=num_nodes,
                                   edges=edges,
                                   node_feat=node_feat)
        feed_num_nodes.append(num_nodes)
        feed_num_edges.append(len(edges))
        feed_edges.append(edges)
        feed_node_feats.append(node_feat["feature"])
        graph_list.append(single_graph)
    multi_graph = graph.MultiGraph(graph_list)

    np.random.seed(1)
    hidden_size = 8
    num_nodes = 10
    # NOTE(review): the CUDA place is hard-coded; the commented-out
    # condition suggests a CPU fallback was intended — this test will
    # fail on machines without a GPU. Confirm intent.
    place = F.CUDAPlace(0)  # if use_cuda else F.CPUPlace()
    prog = F.Program()
    startup_prog = F.Program()
    with F.program_guard(prog, startup_prog):
        with F.unique_name.guard():
            # Standard Graph Wrapper
            gw = graph_wrapper.GraphWrapper(
                name='graph',
                place=place,
                node_feat=[("feature", [-1, 4], "float32")])

            output = gcn(gw,
                         gw.node_feat['feature'],
                         hidden_size=hidden_size,
                         activation='relu',
                         name='gcn')

            # BatchGraphWrapper: consumes flat num_nodes/num_edges/edges/
            # feature tensors instead of a pre-built MultiGraph.
            num_nodes = L.data(name="num_nodes", shape=[-1], dtype="int32")
            num_edges = L.data(name="num_edges", shape=[-1], dtype="int32")
            edges = L.data(name="edges", shape=[-1, 2], dtype="int32")
            node_feat = L.data(name="node_feats",
                               shape=[-1, 4],
                               dtype="float32")
            batch_gw = graph_wrapper.BatchGraphWrapper(
                num_nodes=num_nodes,
                num_edges=num_edges,
                edges=edges,
                node_feats={"feature": node_feat})

            output2 = gcn(batch_gw,
                          batch_gw.node_feat['feature'],
                          hidden_size=hidden_size,
                          activation='relu',
                          name='gcn')

    exe = F.Executor(place)
    exe.run(startup_prog)
    # Feed both paths from the same underlying data.
    feed_dict = gw.to_feed(multi_graph)
    feed_dict["num_nodes"] = np.array(feed_num_nodes, dtype="int32")
    feed_dict["num_edges"] = np.array(feed_num_edges, dtype="int32")
    feed_dict["edges"] = np.array(np.concatenate(feed_edges, 0),
                                  dtype="int32").reshape([-1, 2])
    feed_dict["node_feats"] = np.array(np.concatenate(feed_node_feats, 0),
                                       dtype="float32").reshape([-1, 4])

    # Run
    O1, O2 = exe.run(prog, feed=feed_dict, fetch_list=[output, output2])

    # The output from two kind of models should be same.
    for o1, o2 in zip(O1, O2):
        dist = np.sum((o1 - o2)**2)
        self.assertLess(dist, 1e-15)
def _load_data(self):
    """Load the weighted graph: node labels, (optionally weighted) edges,
    the sampling distributions for edges and nodes, and the train/test
    split.
    """
    edge_path = os.path.join(self.path, 'edges.txt')
    node_path = os.path.join(self.path, 'nodes.txt')
    nodes_label_path = os.path.join(self.path, 'nodes_label.txt')

    all_edges = []
    edges_weight = []

    # Node count equals the number of lines in nodes.txt.
    with io.open(node_path) as inf:
        num_nodes = len(inf.readlines())

    # Multi-hot label matrix; ids in the files are 1-based.
    node_feature = np.zeros((num_nodes, self.num_groups))
    with io.open(nodes_label_path) as inf:
        for line in inf:
            # group_id means the label of the node
            node_id, group_id = line.strip('\n').split(',')
            node_id = int(node_id) - 1
            labels = group_id.split(' ')
            for i in labels:
                node_feature[node_id][int(i) - 1] = 1

    node_degree_list = [1 for _ in range(num_nodes)]
    with io.open(edge_path) as inf:
        for line in inf:
            items = line.strip().split('\t')
            if len(items) == 2:
                u, v = int(items[0]), int(items[1])
                weight = 1  # binary weight, default set to 1
            else:
                # BUG FIX: this assignment and the `u, v = u - 1, v - 1`
                # below had been fused into one invalid chained statement;
                # they are two separate statements.
                u, v, weight = int(items[0]), int(items[1]), float(items[2])
            u, v = u - 1, v - 1
            all_edges.append((u, v))
            edges_weight.append(weight)
            if self.symmetry_edges:
                all_edges.append((v, u))
                edges_weight.append(weight)
            # sum the weights of the same node as the outdegree
            node_degree_list[u] += weight

    if self.self_loop:
        for i in range(num_nodes):
            all_edges.append((i, i))
            edges_weight.append(1.)

    all_edges = list(set(all_edges))
    self.graph = graph.Graph(num_nodes=num_nodes,
                             edges=all_edges,
                             node_feat={"group_id": node_feature})

    # Random train/test split over node ids.
    perm = np.arange(0, num_nodes)
    np.random.shuffle(perm)
    train_num = int(num_nodes * self.train_percentage)
    self.train_index = perm[:train_num]
    self.test_index = perm[train_num:]

    # Alias table for edge sampling proportional to edge weight.
    edge_distribution = np.array(edges_weight, dtype=np.float32)
    self.edge_distribution = edge_distribution / np.sum(edge_distribution)
    self.edge_sampling = AliasSampling(prob=edge_distribution)

    # Negative node sampling distribution proportional to degree^0.75.
    node_dist = np.array(node_degree_list, dtype=np.float32)
    node_negative_distribution = np.power(node_dist, 0.75)
    self.node_negative_distribution = node_negative_distribution / np.sum(
        node_negative_distribution)
    self.node_sampling = AliasSampling(prob=node_negative_distribution)

    # Map each edge's source node to its edge index (and back).
    self.node_index = {}
    self.node_index_reversed = {}
    for index, e in enumerate(self.graph.edges):
        self.node_index[e[0]] = index
        self.node_index_reversed[index] = e[0]