Example #1
    def _add_ndata(self):
        vectorizer = CountVectorizer(min_df=5)
        features = vectorizer.fit_transform(
            self.data['plot_keywords'].fillna('').values)
        self.g.nodes['movie'].data['feat'] = torch.from_numpy(
            features.toarray()).float()
        self.g.nodes['movie'].data['label'] = torch.from_numpy(
            self.labels).long()

        # Features of the actor and director nodes are the average of their associated movie nodes' features
        self.g.multi_update_all(
            {
                'ma': (fn.copy_u('feat', 'm'), fn.mean('m', 'feat')),
                'md': (fn.copy_u('feat', 'm'), fn.mean('m', 'feat'))
            }, 'sum')

        n_movies = len(self.movies)
        train_idx, val_idx, test_idx = split_idx(np.arange(n_movies), 400, 400,
                                                 self._seed)
        self.g.nodes['movie'].data['train_mask'] = generate_mask_tensor(
            idx2mask(train_idx, n_movies))
        self.g.nodes['movie'].data['val_mask'] = generate_mask_tensor(
            idx2mask(val_idx, n_movies))
        self.g.nodes['movie'].data['test_mask'] = generate_mask_tensor(
            idx2mask(test_idx, n_movies))
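
This and the following examples all rely on the same small set of helpers. Below is a minimal sketch of how idx2mask, generate_mask_tensor, and split_idx might look, inferred only from how they are called in these snippets; the actual implementations may differ:

import numpy as np
import torch


def idx2mask(idx, n):
    # Boolean array of length n, True at the given indices
    mask = np.zeros(n, dtype=bool)
    mask[idx] = True
    return mask


def generate_mask_tensor(mask):
    # Wrap a NumPy boolean mask as a torch.bool tensor
    return torch.from_numpy(np.asarray(mask)).bool()


def split_idx(idx, n_train, n_val, seed):
    # Shuffle and split indices into train/val/test parts of sizes
    # n_train, n_val, and the remainder
    rng = np.random.default_rng(seed)
    idx = rng.permutation(idx)
    return idx[:n_train], idx[n_train:n_train + n_val], idx[n_train + n_val:]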
Example #2
    def process(self):
        with open(os.path.join(self.raw_dir, 'ACM3025.pkl'), 'rb') as f:
            data = pickle.load(f)
        features = torch.from_numpy(
            data['feature'].todense()).float()  # (3025, 1870)
        labels = torch.from_numpy(
            data['label'].todense()).long().nonzero(as_tuple=True)[1]  # (3025)

        # Adjacency matrices for meta-path based neighbors
        # (Mufei): I verified both of them are binary adjacency matrices with self loops
        author_g = dgl.from_scipy(data['PAP'])
        subject_g = dgl.from_scipy(data['PLP'])
        self.gs = [author_g, subject_g]

        num_nodes = data['label'].shape[0]
        train_mask = generate_mask_tensor(
            idx2mask(data['train_idx'][0], num_nodes))
        val_mask = generate_mask_tensor(idx2mask(data['val_idx'][0],
                                                 num_nodes))
        test_mask = generate_mask_tensor(
            idx2mask(data['test_idx'][0], num_nodes))
        for g in self.gs:
            g.ndata['feat'] = features
            g.ndata['label'] = labels
            g.ndata['train_mask'] = train_mask
            g.ndata['val_mask'] = val_mask
            g.ndata['test_mask'] = test_mask
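
For context, a typical way such per-graph masks are consumed downstream; dataset and model here are hypothetical names, not part of the snippet above:

import torch.nn.functional as F  # here F is torch.nn.functional

# Hypothetical consumer of the masks set above
g = dataset.gs[0]                                   # first meta-path graph
train_nodes = g.ndata['train_mask'].nonzero(as_tuple=True)[0]
logits = model(g, g.ndata['feat'])                  # any node classifier
loss = F.cross_entropy(logits[train_nodes], g.ndata['label'][train_nodes])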
Example #3
    def process(self):
        # Process the raw data into a graph, labels, and train/val/test masks
        # Build the graph
        root = self.raw_dir
        #edges_data = pd.read_csv("{}/coraheader.csv".format(root))
        #edges_data = pd.read_csv("{}/citseerheader.csv".format(root))
        edges_data = pd.read_csv("{}/pubmedheader.csv".format(root))
        src = edges_data['SRC'].to_numpy()
        dst = edges_data['DST'].to_numpy()
        g = dgl.graph((src, dst))

        # Dataset split indices
        # Cora: 7 classes
        # idx_train = np.arange(0, 140).tolist()
        # idx_test = np.arange(1707, 2708).tolist()
        # idx_val = np.arange(140, 640).tolist()

        # CiteSeer: 3703 features, 6 classes
        # idx_train = np.arange(0, 120).tolist()
        # idx_test = np.arange(2327, 3327).tolist()
        # idx_val = np.arange(120, 620).tolist()

        # PubMed: 500 features, 3 classes
        idx_train = np.arange(0, 60).tolist()
        idx_test = np.arange(18717, 19717).tolist()
        idx_val = np.arange(60, 500).tolist()

        # Node labels
        #labels=th.LongTensor(pd.read_csv("{}/coralabels.csv".format(root),header=None)[1])
        #labels=th.LongTensor(pd.read_csv("{}/citseerlabels.csv".format(root),header=None)[1])
        labels = th.LongTensor(
            pd.read_csv("{}/pubmedlabels.csv".format(root), header=None)[1])
        g.ndata['labels'] = labels

        # Build the split masks
        train_mask = _sample_mask(idx_train, labels.shape[0])
        val_mask = _sample_mask(idx_val, labels.shape[0])
        test_mask = _sample_mask(idx_test, labels.shape[0])

        # Attach the masks to the graph
        g.ndata['train_mask'] = generate_mask_tensor(train_mask)
        g.ndata['val_mask'] = generate_mask_tensor(val_mask)
        g.ndata['test_mask'] = generate_mask_tensor(test_mask)

        # Node features
        #features = np.load("{}/corafeatures.npy".format(root))
        #features =np.load("{}/citseerfeatures.npy".format(root))
        features = np.load("{}/pubmedfeatures.npy".format(root))
        g.ndata['feat'] = th.FloatTensor(features)

        # Edge weights
        g.edata['w'] = th.tensor(edges_data['W'].to_numpy())
        self._labels = labels
        self._g = g
        self._features = features
        self._train_mask = train_mask
        self._test_mask = test_mask
        self._val_mask = val_mask
Example #4
    def load(self):
        graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
        graphs, _ = load_graphs(graph_path)
        self._graph = graphs[0]
        self._graph.ndata['train_mask'] = generate_mask_tensor(
            self._graph.ndata['train_mask'].numpy())
        self._graph.ndata['val_mask'] = generate_mask_tensor(
            self._graph.ndata['val_mask'].numpy())
        self._graph.ndata['test_mask'] = generate_mask_tensor(
            self._graph.ndata['test_mask'].numpy())
        self._print_info()
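
The load above re-wraps the masks through generate_mask_tensor, presumably because the serialized tensors may not come back as booleans. A plausible counterpart save method, assuming the standard save_graphs helper (from dgl.data.utils import save_graphs) and the same os import:

    def save(self):
        # Counterpart to load(): persist the processed graph to disk
        graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
        save_graphs(graph_path, [self._graph])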
Example #5
    def _add_ndata(self):
        _raw_file2 = os.path.join(self.raw_dir, 'DBLP4057_GAT_with_idx.mat')
        if not os.path.exists(_raw_file2):
            raise FileNotFoundError(
                'Please download the file {} manually (extraction code: 6b3h) '
                'and save it to {}'.format(self._url2, _raw_file2))
        mat = sio.loadmat(_raw_file2)
        self.g.nodes['author'].data['feat'] = torch.from_numpy(mat['features']).float()
        self.g.nodes['author'].data['label'] = torch.tensor(self.authors['label'].to_list())

        n_authors = len(self.authors)
        train_idx, val_idx, test_idx = split_idx(np.arange(n_authors), 800, 400, self._seed)
        self.g.nodes['author'].data['train_mask'] = generate_mask_tensor(idx2mask(train_idx, n_authors))
        self.g.nodes['author'].data['val_mask'] = generate_mask_tensor(idx2mask(val_idx, n_authors))
        self.g.nodes['author'].data['test_mask'] = generate_mask_tensor(idx2mask(test_idx, n_authors))

        self.g.nodes['conf'].data['label'] = torch.tensor(self.confs['label'].to_list())
Example #6
    def process(self):
        data = sio.loadmat(os.path.join(self.raw_dir, 'DBLP4057_GAT_with_idx.mat'))
        apa_g = dgl.graph(data['net_APA'].nonzero())
        apcpa_g = dgl.graph(data['net_APCPA'].nonzero())
        aptpa_g = dgl.graph(data['net_APTPA'].nonzero())
        self.gs = [apa_g, apcpa_g, aptpa_g]

        features = torch.from_numpy(data['features']).float()
        labels = torch.from_numpy(data['label'].nonzero()[1])
        num_nodes = data['label'].shape[0]
        train_mask = generate_mask_tensor(idx2mask(data['train_idx'][0], num_nodes))
        val_mask = generate_mask_tensor(idx2mask(data['val_idx'][0], num_nodes))
        test_mask = generate_mask_tensor(idx2mask(data['test_idx'][0], num_nodes))
        for g in self.gs:
            g.ndata['feat'] = features
            g.ndata['label'] = labels
            g.ndata['train_mask'] = train_mask
            g.ndata['val_mask'] = val_mask
            g.ndata['test_mask'] = test_mask
Example #7
    def process(self):
        # graph
        coo_adj = sp.load_npz(os.path.join(self._raw_dir, "amazon_graph.npz"))
        self._graph = from_scipy(coo_adj)
        # features and labels
        amazon_data = np.load(os.path.join(self._raw_dir, "amazon_data.npz"))
        features = amazon_data["feature"]
        labels = amazon_data["label"]
        # train/val/test indices
        node_types = amazon_data["node_types"]
        train_mask = (node_types == 1)
        val_mask = (node_types == 2)
        test_mask = (node_types == 3)
        self._graph.ndata['train_mask'] = generate_mask_tensor(train_mask)
        self._graph.ndata['val_mask'] = generate_mask_tensor(val_mask)
        self._graph.ndata['test_mask'] = generate_mask_tensor(test_mask)
        # F here follows DGL's convention of importing dgl.backend as F
        self._graph.ndata['feat'] = F.tensor(features,
                                             dtype=F.data_type_dict['float32'])
        self._graph.ndata['label'] = F.tensor(labels,
                                              dtype=F.data_type_dict['int64'])
        self._print_info()
Example #8
    def process(self):
        data = sio.loadmat(os.path.join(self.raw_dir, 'imdb5k.mat'))
        mam_g = dgl.graph(data['MAM'].nonzero())
        mdm_g = dgl.graph(data['MDM'].nonzero())
        # mym_g = dgl.graph(data['MYM'].nonzero())
        self.gs = [mam_g, mdm_g]

        features = torch.from_numpy(data['feature']).float()
        num_nodes = features.shape[0]
        labels = torch.full((num_nodes, ), -1, dtype=torch.long)
        idx, label = data['label'].nonzero()
        labels[idx] = torch.from_numpy(label)
        train_mask = generate_mask_tensor(
            idx2mask(data['train_idx'][0], num_nodes))
        val_mask = generate_mask_tensor(idx2mask(data['val_idx'][0],
                                                 num_nodes))
        test_mask = generate_mask_tensor(
            idx2mask(data['test_idx'][0], num_nodes))
        for g in self.gs:
            g.ndata['feat'] = features
            g.ndata['label'] = labels
            g.ndata['train_mask'] = train_mask
            g.ndata['val_mask'] = val_mask
            g.ndata['test_mask'] = test_mask
Example #9
    def process(self):
        self.g = dgl.heterograph(self._read_edges())

        feats = self._read_feats()
        for ntype, feat in feats.items():
            self.g.nodes[ntype].data['feat'] = feat

        labels = torch.from_numpy(
            np.load(os.path.join(self.raw_path, 'labels.npy'))).long()
        self._num_classes = labels.max().item() + 1
        self.g.nodes[self.predict_ntype].data['label'] = labels

        n = self.g.num_nodes(self.predict_ntype)
        for split in ('train', 'val', 'test'):
            idx = np.load(os.path.join(self.raw_path, f'{split}_60.npy'))
            mask = generate_mask_tensor(idx2mask(idx, n))
            self.g.nodes[self.predict_ntype].data[f'{split}_mask'] = mask

        pos = sp.load_npz(os.path.join(self.raw_path, 'pos.npz'))
        pos_i, pos_j = pos.nonzero()
        self.pos_i = torch.from_numpy(pos_i).long()
        self.pos_j = torch.from_numpy(pos_j).long()
Example #10
    def process(self):
        data = sio.loadmat(os.path.join(self.raw_dir, 'ACM.mat'))
        p_vs_l = data['PvsL']  # paper-field?
        p_vs_a = data['PvsA']  # paper-author
        p_vs_t = data['PvsT']  # paper-term, bag of words
        p_vs_c = data['PvsC']  # paper-conference, labels come from that

        # We assign
        # (1) KDD papers as class 0 (data mining),
        # (2) SIGMOD and VLDB papers as class 1 (database),
        # (3) SIGCOMM and MobiCOMM papers as class 2 (communication)
        conf_ids = [0, 1, 9, 10, 13]
        label_ids = [0, 1, 2, 2, 1]

        p_vs_c_filter = p_vs_c[:, conf_ids]
        p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
        p_vs_l = p_vs_l[p_selected]
        p_vs_a = p_vs_a[p_selected]
        p_vs_t = p_vs_t[p_selected]
        p_vs_c = p_vs_c[p_selected]

        self.g = dgl.heterograph({
            ('paper', 'pa', 'author'): p_vs_a.nonzero(),
            ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
            ('paper', 'pf', 'field'): p_vs_l.nonzero(),
            ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero()
        })
        paper_features = torch.FloatTensor(p_vs_t.toarray())  # (4025, 1903)

        pc_p, pc_c = p_vs_c.nonzero()
        paper_labels = np.zeros(len(p_selected), dtype=np.int64)
        for conf_id, label_id in zip(conf_ids, label_ids):
            paper_labels[pc_p[pc_c == conf_id]] = label_id
        paper_labels = torch.from_numpy(paper_labels)

        float_mask = np.zeros(len(pc_p))
        for conf_id in conf_ids:
            pc_c_mask = (pc_c == conf_id)
            float_mask[pc_c_mask] = np.random.permutation(
                np.linspace(0, 1, pc_c_mask.sum()))
        train_idx = np.where(float_mask <= 0.2)[0]
        val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
        test_idx = np.where(float_mask > 0.3)[0]

        num_paper_nodes = self.g.num_nodes('paper')
        train_mask = generate_mask_tensor(idx2mask(train_idx, num_paper_nodes))
        val_mask = generate_mask_tensor(idx2mask(val_idx, num_paper_nodes))
        test_mask = generate_mask_tensor(idx2mask(test_idx, num_paper_nodes))

        self.g.nodes['paper'].data['feat'] = paper_features
        self.g.nodes['paper'].data['label'] = paper_labels
        self.g.nodes['paper'].data['train_mask'] = train_mask
        self.g.nodes['paper'].data['val_mask'] = val_mask
        self.g.nodes['paper'].data['test_mask'] = test_mask
        # Features of author nodes are the average of their associated paper nodes' features
        self.g.multi_update_all(
            {'pa': (fn.copy_u('feat', 'm'), fn.mean('m', 'feat'))}, 'sum')
        self.g.nodes['field'].data['feat'] = torch.eye(
            self.g.num_nodes('field'))
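
The multi_update_all call averages, for each author, the features of that author's neighboring paper nodes over the 'pa' edges. A hand-rolled equivalent of just that step, illustrative only and assuming it runs at the end of process() above:

        # Hand-rolled equivalent of the 'pa' mean aggregation above
        g = self.g                                   # heterograph built earlier
        pa_src, pa_dst = g.edges(etype='pa')         # paper -> author edges
        num_authors = g.num_nodes('author')
        feat_sum = torch.zeros(num_authors, paper_features.shape[1])
        feat_sum.index_add_(0, pa_dst, paper_features[pa_src])
        deg = torch.bincount(pa_dst, minlength=num_authors).clamp(min=1)
        author_feat = feat_sum / deg.unsqueeze(1).float()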