Exemplo n.º 1
0
    def __init__(self,
                 root,
                 transform=None,
                 pre_transform=None,
                 pre_filter=None,
                 dataset='zinc250k',
                 empty=False):
        """
        Adapted from qm9.py. Disabled the download functionality.

        :param root: directory of the dataset, containing a raw and processed
            dir. The raw dir should contain the file containing the smiles, and
            the processed dir can either be empty or hold a previously
            processed file.
        :param transform: unused here; kept for interface compatibility.
        :param pre_transform: unused here; kept for interface compatibility.
        :param pre_filter: unused here; kept for interface compatibility.
        :param dataset: name of the dataset. Currently only implemented for
            zinc250k, chembl_with_labels, tox21, hiv, bace, bbbp, clintox,
            esol, freesolv, lipophilicity, muv, pcba, sider, toxcast
        :param empty: if True, then will not load any data obj. For
            initializing empty dataset
        """
        self.dataset = dataset
        self.root = root

        super(MoleculeDataset, self).__init__()
        # Build the processed cache on first use only.
        if not os.path.exists(self.root + "/new/data.pdparams"):
            self.read()
        # Both cached files store a container whose element [1] holds the data.
        self.smiles_list = paddle.load(self.root + "/new/smiles.pdparams")[1]
        data_list = paddle.load(self.root + "/new/data.pdparams")[1]
        # Wrap each raw record into a pgl Graph: edge_index + node count,
        # with node features under 'feature' and edge features under 'feature'.
        self.data_list = [
            G.Graph(i['edge_index'], i['x'].shape[0], {'feature': i['x']},
                    {'feature': i['edge_attr']}) for i in data_list
        ]
        # Attach the label tensor of each record to its graph.
        for i in range(len(self.data_list)):
            self.data_list[i].y = data_list[i]['y']
Exemplo n.º 2
0
    def _load_edge_data(self):
        """Read the space-separated edge list and build the graph.

        Raw node ids are remapped to consecutive ids starting at 1 in
        first-seen order; id 0 is left unused, which is why the graph is
        created with ``num_nodes + 1`` nodes.
        """
        remap = dict()

        def _to_id(raw_id):
            # Assign the next consecutive id (starting from 1) on first sight.
            if raw_id not in remap:
                remap[raw_id] = len(remap) + 1
            return remap[raw_id]

        seen_nodes = set()
        edge_list = []
        with open(self._data_dir, "r") as fin:
            for raw_line in fin:
                u_raw, v_raw = raw_line.strip("\n\r").split(" ")
                u = _to_id(int(u_raw))
                v = _to_id(int(v_raw))
                seen_nodes.add(u)
                seen_nodes.add(v)
                edge_list.append((u, v))
                if self._undirected:
                    # Mirror every edge when the graph is undirected.
                    edge_list.append((v, u))

        self.graph = graph.Graph(num_nodes=len(seen_nodes) + 1,
                                 edges=edge_list)
        self.nodes = np.array(list(seen_nodes))
        self.node_dict = remap
Exemplo n.º 3
0
    def __call__(self, batch_data_list):
        """
        Function caller to convert a batch of data into a big batch feed dictionary.

        Args:
            batch_data_list: a batch of the compound graph data.

        Returns:
            feed_dict: a dictionary contains `graph/xxx` inputs for PGL.
        """
        graphs = []
        labels = []
        for sample in batch_data_list:
            node_feat = {
                'atom_type': sample['atom_type'].reshape([-1, 1]),
                'chirality_tag': sample['chirality_tag'].reshape([-1, 1]),
            }
            edge_feat = {
                'bond_type': sample['bond_type'].reshape([-1, 1]),
                'bond_direction': sample['bond_direction'].reshape([-1, 1]),
            }
            graphs.append(
                graph.Graph(num_nodes=len(sample['atom_type']),
                            edges=sample['edges'],
                            node_feat=node_feat,
                            edge_feat=edge_feat))
            if self.with_graph_label:
                labels.append(sample['label'])

        big_graph = pgl.graph.MultiGraph(graphs)
        feed_dict = self.graph_wrapper.to_feed(big_graph)

        if self.with_graph_label:
            if self.task_type == 'cls':
                batch_label = np.array(labels).reshape(-1, self.num_cls_tasks)
            elif self.task_type == 'reg':
                # Keep only the selected regression target per sample.
                batch_label = np.array(
                    [lbl[self.reg_target_id] for lbl in labels]).reshape(-1, 1)

            # Map labels from {-1, 1} to {0, 1}; original zeros land on 0.5
            # and are masked out through `valid`.
            batch_label = ((batch_label + 1.0) / 2).astype('float32')
            feed_dict['label'] = batch_label
            feed_dict['valid'] = (batch_label != 0.5).astype("float32")

        if self.with_pos_neg_mask:
            pos_mask, neg_mask = MoleculeCollateFunc.get_pos_neg_mask(graphs)
            feed_dict['pos_mask'] = pos_mask
            feed_dict['neg_mask'] = neg_mask

        return feed_dict
Exemplo n.º 4
0
    def _load_data(self):
        """Parse cora.content / cora.cites and build the citation graph."""
        content_path = os.path.join(self.path, 'cora.content')
        cites_path = os.path.join(self.path, 'cora.cites')

        features = []
        paper_ids = []
        labels = []
        class_to_idx = {}
        with open(content_path, 'r') as fin:
            for raw in fin:
                fields = raw.strip().split()
                pid = int(fields[0])
                cls_name = fields[-1]
                # Assign class ids in first-seen order.
                if cls_name not in class_to_idx:
                    class_to_idx[cls_name] = len(class_to_idx)
                vec = np.array([int(tok) for tok in fields[1:-1]],
                               dtype="float32")
                # L1-normalize each row; epsilon guards against all-zero rows.
                vec = vec / (np.sum(vec) + 1e-15)
                features.append(vec)
                labels.append(class_to_idx[cls_name])
                paper_ids.append(pid)

        # Map raw paper ids to consecutive vertex ids.
        paper2vid = {pid: idx for idx, pid in enumerate(paper_ids)}
        num_nodes = len(paper_ids)
        features = np.array(features, dtype="float32")

        edges = []
        with open(cites_path, 'r') as fin:
            for raw in fin:
                src, dst = raw.split()
                src, dst = paper2vid[int(src)], paper2vid[int(dst)]
                edges.append((src, dst))
                if self.symmetry_edges:
                    edges.append((dst, src))

        if self.self_loop:
            edges.extend((i, i) for i in range(num_nodes))

        # Deduplicate edges before constructing the graph.
        edges = list(set(edges))
        self.graph = graph.Graph(
            num_nodes=num_nodes,
            edges=edges,
            node_feat={"words": features})

        # Planetoid-style fixed split over the unshuffled node ordering.
        perm = np.arange(0, num_nodes)
        self.train_index = perm[:140]
        self.val_index = perm[200:500]
        self.test_index = perm[500:1500]
        self.y = np.array(labels, dtype="int64")
        self.num_classes = len(class_to_idx)
Exemplo n.º 5
0
    def __call__(self, batch_data_list):
        """Collate a batch of compound graphs into a PGL feed dictionary."""
        graph_list = []
        label_list = []
        for item in batch_data_list:
            graph_list.append(
                graph.Graph(
                    num_nodes=len(item['atom_type']),
                    edges=item['edges'],
                    node_feat={
                        'atom_type': item['atom_type'].reshape([-1, 1]),
                        'chirality_tag':
                        item['chirality_tag'].reshape([-1, 1]),
                    },
                    edge_feat={
                        'bond_type': item['bond_type'].reshape([-1, 1]),
                        'bond_direction':
                        item['bond_direction'].reshape([-1, 1]),
                    }))
            if self.with_graph_label:
                label_list.append(item['label'])

        feed_dict = self.graph_wrapper.to_feed(
            pgl.graph.MultiGraph(graph_list))

        if self.with_graph_label:
            if self.task_type == 'cls':
                labels = np.array(label_list).reshape(-1, self.num_cls_tasks)
            elif self.task_type == 'reg':
                # Select a single regression target per sample.
                labels = np.array(
                    [lbl[self.reg_target_id] for lbl in label_list]
                ).reshape(-1, 1)

            # Shift {-1, 1} labels to {0, 1}; zeros become 0.5 and are
            # excluded via the `valid` mask.
            labels = ((labels + 1.0) / 2).astype('float32')
            feed_dict['label'] = labels
            feed_dict['valid'] = (labels != 0.5).astype("float32")

        if self.with_pos_neg_mask:
            pos_mask, neg_mask = self.get_pos_neg_mask(graph_list)
            feed_dict['pos_mask'] = pos_mask
            feed_dict['neg_mask'] = neg_mask

        return feed_dict
Exemplo n.º 6
0
    def _load_data(self):
        """Load the ca-AstroPh edge list and split edges for link prediction.

        Produces `self.pos_edges` (held-out positives), `self.neg_edges`
        (sampled absent pairs), and `self.graph` built from the remaining
        edges mirrored in both directions.
        """
        np.random.seed(self.np_random_seed)
        edge_path = os.path.join(self.path, 'ca-AstroPh.txt')

        undirected_edges = set()
        self.neg_edges = []
        self.pos_edges = []
        self.node2id = dict()

        def to_id(raw):
            # Consecutive ids assigned in first-seen order.
            if raw not in self.node2id:
                self.node2id[raw] = len(self.node2id)
            return self.node2id[raw]

        with io.open(edge_path) as fin:
            # The first four lines are header lines; skip them.
            for _ in range(4):
                fin.readline()
            for line in fin:
                a, b = line.strip('\n').split('\t')
                a, b = to_id(a), to_id(b)
                # Store each undirected edge once, smaller id first.
                if a < b:
                    undirected_edges.add((a, b))
                else:
                    undirected_edges.add((b, a))

        num_nodes = len(self.node2id)

        # Rejection-sample negative (absent) node pairs until we have half as
        # many as there are undirected edges.
        while len(self.neg_edges) < len(undirected_edges) // 2:
            candidates = np.random.choice(num_nodes,
                                          [len(undirected_edges), 2])
            for (a, b) in candidates:
                if a != b and (a, b) not in undirected_edges \
                        and (b, a) not in undirected_edges:
                    self.neg_edges.append((a, b))
                    if len(self.neg_edges) == len(undirected_edges) // 2:
                        break

        edge_list = list(undirected_edges)
        np.random.shuffle(edge_list)
        # First half becomes held-out positives; the rest form the graph.
        self.pos_edges = edge_list[:len(edge_list) // 2]
        remaining = edge_list[len(edge_list) // 2:]
        all_edges = []
        for (a, b) in remaining:
            all_edges.append((a, b))
            all_edges.append((b, a))
        self.graph = graph.Graph(num_nodes=num_nodes, edges=all_edges)
Exemplo n.º 7
0
    def test_graph_gather(self):
        """test_graph_gather

        Builds 10 random graphs, batches them with MultiGraph, gathers the
        feature of node `index` (all zeros here, i.e. each sub-graph's first
        node) via pgl.layers.graph_gather, and checks the result's shape and
        values against the per-graph node features.
        """
        np.random.seed(1)

        graph_list = []

        num_graph = 10
        for _ in range(num_graph):
            # Random graph: 5-19 nodes, 10 random edges, 4-dim node features.
            num_nodes = np.random.randint(5, 20)
            edges = np.random.randint(low=0, high=num_nodes, size=(10, 2))
            node_feat = {
                "feature": np.random.rand(num_nodes, 4).astype("float32")
            }
            g = graph.Graph(num_nodes=num_nodes,
                            edges=edges,
                            node_feat=node_feat)
            graph_list.append(g)

        gg = graph.MultiGraph(graph_list)

        use_cuda = False
        place = F.CUDAPlace(0) if use_cuda else F.CPUPlace()

        prog = F.Program()
        startup_prog = F.Program()
        with F.program_guard(prog, startup_prog):
            # NOTE: `g` is the last graph from the loop above; all graphs share
            # the same feature schema, so its feat info describes the batch.
            gw = graph_wrapper.GraphWrapper(name='graph',
                                            place=place,
                                            node_feat=g.node_feat_info(),
                                            edge_feat=g.edge_feat_info())

            # Per-graph node index to gather (one entry per sub-graph).
            index = L.data(name="index", dtype="int32", shape=[-1])
            feats = pgl.layers.graph_gather(gw, gw.node_feat["feature"], index)

        exe = F.Executor(place)
        exe.run(startup_prog)
        feed_dict = gw.to_feed(gg)
        # Gather node 0 of every sub-graph.
        feed_dict["index"] = np.zeros(num_graph, dtype="int32")
        ret = exe.run(prog, feed=feed_dict, fetch_list=[feats])
        self.assertEqual(list(ret[0].shape), [num_graph, 4])
        for i in range(num_graph):
            # Gathered row i must equal graph i's node-0 feature (squared
            # distance effectively zero).
            dist = (ret[0][i] - graph_list[i].node_feat["feature"][0])
            dist = np.sum(dist**2)
            self.assertLess(dist, 1e-15)
Exemplo n.º 8
0
    def test_gin(self):
        """test_gin

        Builds a small fixed undirected graph, runs a GIN layer over random
        node features, and checks the output shape is
        (num_nodes, hidden_size).
        """
        np.random.seed(1)
        hidden_size = 8

        num_nodes = 10

        # Fixed edge list, mirrored to make the graph undirected.
        edges = [(1, 4), (0, 5), (1, 9), (1, 8), (2, 8), (2, 5), (3, 6),
                 (3, 7), (3, 4), (3, 8)]
        inver_edges = [(v, u) for u, v in edges]
        edges.extend(inver_edges)

        node_feat = {"feature": np.random.rand(10, 4).astype("float32")}

        g = graph.Graph(num_nodes=num_nodes, edges=edges, node_feat=node_feat)

        use_cuda = False
        # BUGFIX: paddle.fluid exposes CUDAPlace, not GPUPlace. The dead
        # branch (use_cuda is False) previously referenced the nonexistent
        # F.GPUPlace and would raise AttributeError if ever enabled. Sibling
        # tests in this file use F.CUDAPlace(0).
        place = F.CUDAPlace(0) if use_cuda else F.CPUPlace()

        prog = F.Program()
        startup_prog = F.Program()
        with F.program_guard(prog, startup_prog):
            gw = graph_wrapper.GraphWrapper(name='graph',
                                            place=place,
                                            node_feat=g.node_feat_info(),
                                            edge_feat=g.edge_feat_info())

            output = gin(gw,
                         gw.node_feat['feature'],
                         hidden_size=hidden_size,
                         activation='relu',
                         name='gin',
                         init_eps=1,
                         train_eps=True)

        exe = F.Executor(place)
        exe.run(startup_prog)
        ret = exe.run(prog, feed=gw.to_feed(g), fetch_list=[output])

        self.assertEqual(ret[0].shape[0], num_nodes)
        self.assertEqual(ret[0].shape[1], hidden_size)
Exemplo n.º 9
0
    def _load_data(self):
        """Build a graph from nodes.csv / edges.csv with multi-hot group
        membership (group-edges.csv) as node features, plus a 50/50 split."""
        edge_path = os.path.join(self.path, 'edges.csv')
        node_path = os.path.join(self.path, 'nodes.csv')
        group_edge_path = os.path.join(self.path, 'group-edges.csv')

        # Node count is simply the number of lines in nodes.csv.
        with io.open(node_path) as fin:
            num_nodes = len(fin.readlines())

        # Multi-hot group membership matrix, one row per node. Ids in the
        # files are 1-based; shift to 0-based.
        group_feature = np.zeros((num_nodes, self.num_groups))
        with io.open(group_edge_path) as fin:
            for row in fin:
                nid, gid = row.strip('\n').split(',')
                group_feature[int(nid) - 1][int(gid) - 1] = 1

        edge_list = []
        with io.open(edge_path) as fin:
            for row in fin:
                src, dst = row.strip('\n').split(',')
                src, dst = int(src) - 1, int(dst) - 1
                edge_list.append((src, dst))
                if self.symmetry_edges:
                    edge_list.append((dst, src))

        if self.self_loop:
            edge_list.extend((i, i) for i in range(num_nodes))

        # Deduplicate before constructing the graph.
        edge_list = list(set(edge_list))
        self.graph = graph.Graph(
            num_nodes=num_nodes,
            edges=edge_list,
            node_feat={"group_id": group_feature})

        # Random 50/50 train/test split over node ids.
        perm = np.arange(0, num_nodes)
        np.random.shuffle(perm)
        split = int(num_nodes * 0.5)
        self.train_index = perm[:split]
        self.test_index = perm[split:]
Exemplo n.º 10
0
    def _load_data(self):
        """Load a Planetoid-style citation dataset (ind.<name>.* files).

        Reads the pickled feature/label shards, reorders the test rows into
        their canonical index order, builds the (optionally symmetrized,
        self-looped) graph, and sets train/val/test index splits.
        """
        import networkx as nx
        # Standard Planetoid shard names: train/test features+labels,
        # the full labeled set, and the adjacency dict.
        objnames = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        for i in range(len(objnames)):
            with open("{}/ind.{}.{}".format(self.path, self.name, objnames[i]),
                      'rb') as f:
                objects.append(_pickle_load(f))

        x, y, tx, ty, allx, ally, _graph = objects
        # test.index lists test node ids in shuffled order.
        test_idx_reorder = _parse_index_file("{}/ind.{}.test.index".format(
            self.path, self.name))
        test_idx_range = np.sort(test_idx_reorder)

        allx = allx.todense()
        tx = tx.todense()
        if self.name == 'citeseer':
            # Fix citeseer dataset (there are some isolated nodes in the graph)
            # Find isolated nodes, add them as zero-vecs into the right position
            test_idx_range_full = range(min(test_idx_reorder),
                                        max(test_idx_reorder) + 1)
            tx_extended = np.zeros((len(test_idx_range_full), x.shape[1]),
                                   dtype="float32")
            tx_extended[test_idx_range - min(test_idx_range), :] = tx
            tx = tx_extended
            ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]),
                                   dtype="float32")
            ty_extended[test_idx_range - min(test_idx_range), :] = ty
            ty = ty_extended

        # Stack train+test features, then swap the shuffled test rows back
        # into sorted-index positions.
        features = np.vstack([allx, tx])
        features[test_idx_reorder, :] = features[test_idx_range, :]
        # Row-normalize; epsilon avoids division by zero for empty rows.
        features = features / (np.sum(features, axis=-1) + 1e-15)
        features = np.array(features, dtype="float32")
        _graph = nx.DiGraph(nx.from_dict_of_lists(_graph))

        # Same reordering for the one-hot labels.
        onehot_labels = np.vstack((ally, ty))
        onehot_labels[test_idx_reorder, :] = onehot_labels[test_idx_range, :]
        labels = np.argmax(onehot_labels, 1)

        # Planetoid split: first len(y) nodes train, next 500 validation,
        # the listed test indices for test.
        idx_test = test_idx_range.tolist()
        idx_train = range(len(y))
        idx_val = range(len(y), len(y) + 500)
        all_edges = []
        for i in _graph.edges():
            u, v = tuple(i)
            all_edges.append((u, v))
            if self.symmetry_edges:
                all_edges.append((v, u))

        if self.self_loop:
            for i in range(_graph.number_of_nodes()):
                all_edges.append((i, i))
        all_edges = list(set(all_edges))

        self.graph = graph.Graph(num_nodes=_graph.number_of_nodes(),
                                 edges=all_edges,
                                 node_feat={"words": features})
        self.y = np.array(labels, dtype="int64")
        self.num_classes = onehot_labels.shape[1]
        self.train_index = np.array(idx_train, dtype="int32")
        self.val_index = np.array(idx_val, dtype="int32")
        self.test_index = np.array(idx_test, dtype="int32")
Exemplo n.º 11
0
    def test_batched_graph_wrapper(self):
        """test_batch_graph_wrapper

        Builds random graphs, runs the same GCN once through a standard
        GraphWrapper fed via MultiGraph and once through a BatchGraphWrapper
        fed via raw num_nodes/num_edges/edges/node_feats arrays, and checks
        both outputs match.
        """
        np.random.seed(1)

        graph_list = []

        num_graph = 5
        feed_num_nodes = []
        feed_num_edges = []
        feed_edges = []
        feed_node_feats = []

        for _ in range(num_graph):
            # Random graph: 5-19 nodes, 10 random edges, 4-dim features.
            num_nodes = np.random.randint(5, 20)
            edges = np.random.randint(low=0, high=num_nodes, size=(10, 2))
            node_feat = {
                "feature": np.random.rand(num_nodes, 4).astype("float32")
            }
            single_graph = graph.Graph(num_nodes=num_nodes,
                                       edges=edges,
                                       node_feat=node_feat)
            feed_num_nodes.append(num_nodes)
            feed_num_edges.append(len(edges))
            feed_edges.append(edges)
            feed_node_feats.append(node_feat["feature"])
            graph_list.append(single_graph)

        multi_graph = graph.MultiGraph(graph_list)

        np.random.seed(1)
        hidden_size = 8
        num_nodes = 10

        # BUGFIX: the place was unconditionally F.CUDAPlace(0) with the CPU
        # fallback commented out, so this test crashed on CPU-only machines.
        # Restore the use_cuda guard used by the sibling tests in this file.
        use_cuda = False
        place = F.CUDAPlace(0) if use_cuda else F.CPUPlace()
        prog = F.Program()
        startup_prog = F.Program()

        with F.program_guard(prog, startup_prog):
            with F.unique_name.guard():
                # Standard Graph Wrapper
                gw = graph_wrapper.GraphWrapper(name='graph',
                                                place=place,
                                                node_feat=[("feature", [-1, 4],
                                                            "float32")])

                output = gcn(gw,
                             gw.node_feat['feature'],
                             hidden_size=hidden_size,
                             activation='relu',
                             name='gcn')

                # BatchGraphWrapper
                num_nodes = L.data(name="num_nodes", shape=[-1], dtype="int32")
                num_edges = L.data(name="num_edges", shape=[-1], dtype="int32")
                edges = L.data(name="edges", shape=[-1, 2], dtype="int32")
                node_feat = L.data(name="node_feats",
                                   shape=[-1, 4],
                                   dtype="float32")
                batch_gw = graph_wrapper.BatchGraphWrapper(
                    num_nodes=num_nodes,
                    num_edges=num_edges,
                    edges=edges,
                    node_feats={"feature": node_feat})

                # Same GCN name -> shared parameters with the first branch.
                output2 = gcn(batch_gw,
                              batch_gw.node_feat['feature'],
                              hidden_size=hidden_size,
                              activation='relu',
                              name='gcn')

        exe = F.Executor(place)
        exe.run(startup_prog)
        feed_dict = gw.to_feed(multi_graph)
        feed_dict["num_nodes"] = np.array(feed_num_nodes, dtype="int32")
        feed_dict["num_edges"] = np.array(feed_num_edges, dtype="int32")
        feed_dict["edges"] = np.array(np.concatenate(feed_edges, 0),
                                      dtype="int32").reshape([-1, 2])
        feed_dict["node_feats"] = np.array(np.concatenate(feed_node_feats, 0),
                                           dtype="float32").reshape([-1, 4])

        # Run
        O1, O2 = exe.run(prog, feed=feed_dict, fetch_list=[output, output2])

        # The output from two kind of models should be same.
        for o1, o2 in zip(O1, O2):
            dist = np.sum((o1 - o2)**2)
            self.assertLess(dist, 1e-15)
Exemplo n.º 12
0
    def _load_data(self):
        """Load a (possibly weighted) edge-list graph with multi-label nodes.

        Builds `self.graph` with multi-hot label features, a random
        train/test split, and alias-sampling tables over edge weights and
        node degrees (for LINE-style edge/negative sampling).
        """
        edge_path = os.path.join(self.path, 'edges.txt')
        node_path = os.path.join(self.path, 'nodes.txt')
        nodes_label_path = os.path.join(self.path, 'nodes_label.txt')

        all_edges = []
        edges_weight = []

        # Node count equals the number of lines in nodes.txt.
        with io.open(node_path) as inf:
            num_nodes = len(inf.readlines())

        # Multi-hot label matrix: one row per node, one column per group.
        node_feature = np.zeros((num_nodes, self.num_groups))

        with io.open(nodes_label_path) as inf:
            for line in inf:
                # group_id means the label of the node
                # File ids are 1-based; a node may carry several
                # space-separated labels.
                node_id, group_id = line.strip('\n').split(',')
                node_id = int(node_id) - 1
                labels = group_id.split(' ')
                for i in labels:
                    node_feature[node_id][int(i) - 1] = 1

        # Degree accumulator, initialized to 1 per node (smoothing).
        node_degree_list = [1 for _ in range(num_nodes)]

        with io.open(edge_path) as inf:
            for line in inf:
                items = line.strip().split('\t')
                if len(items) == 2:
                    u, v = int(items[0]), int(items[1])
                    weight = 1  # binary weight, default set to 1
                else:
                    u, v, weight = int(items[0]), int(items[1]), float(
                        items[2]),
                u, v = u - 1, v - 1
                all_edges.append((u, v))
                edges_weight.append(weight)

                if self.symmetry_edges:
                    all_edges.append((v, u))
                    edges_weight.append(weight)

                # sum the weights of the same node as the outdegree
                node_degree_list[u] += weight

        if self.self_loop:
            for i in range(num_nodes):
                all_edges.append((i, i))
                edges_weight.append(1.)

        # NOTE(review): deduplication reorders/shrinks all_edges while
        # edges_weight keeps one entry per *original* edge, so the two lists
        # may no longer align — confirm downstream only uses edges_weight for
        # the sampling distributions below.
        all_edges = list(set(all_edges))
        self.graph = graph.Graph(num_nodes=num_nodes,
                                 edges=all_edges,
                                 node_feat={"group_id": node_feature})

        # Random train/test split over node ids.
        perm = np.arange(0, num_nodes)
        np.random.shuffle(perm)
        train_num = int(num_nodes * self.train_percentage)
        self.train_index = perm[:train_num]
        self.test_index = perm[train_num:]

        # Edge sampling proportional to weight.
        # NOTE(review): the normalized distribution is stored, but the
        # sampler is built from the *unnormalized* weights — presumably
        # AliasSampling normalizes internally; verify.
        edge_distribution = np.array(edges_weight, dtype=np.float32)
        self.edge_distribution = edge_distribution / np.sum(edge_distribution)
        self.edge_sampling = AliasSampling(prob=edge_distribution)

        # Negative node sampling proportional to degree^0.75 (word2vec-style).
        node_dist = np.array(node_degree_list, dtype=np.float32)
        node_negative_distribution = np.power(node_dist, 0.75)
        self.node_negative_distribution = node_negative_distribution / np.sum(
            node_negative_distribution)
        self.node_sampling = AliasSampling(prob=node_negative_distribution)

        # NOTE(review): this maps each edge's *source node* to the index of
        # the last edge that starts there (later edges overwrite earlier
        # ones), despite the name "node_index" — confirm intended semantics.
        self.node_index = {}
        self.node_index_reversed = {}
        for index, e in enumerate(self.graph.edges):
            self.node_index[e[0]] = index
            self.node_index_reversed[index] = e[0]