예제 #1
0
    def _add_ndata(self):
        """Attach features, labels and split masks to the nodes of ``self.g``.

        Movie nodes get count-vectorised ``plot_keywords`` features and the
        labels stored in ``self.labels``. The destination nodes of the
        ``'ma'`` and ``'md'`` edge types then receive the mean of the movie
        features that reach them. Finally boolean train/val/test masks are
        written to the movie nodes.
        """
        keywords = self.data['plot_keywords'].fillna('').values
        bag_of_words = CountVectorizer(min_df=5).fit_transform(keywords)

        movie_data = self.g.nodes['movie'].data
        movie_data['feat'] = torch.from_numpy(bag_of_words.toarray()).float()
        movie_data['label'] = torch.from_numpy(self.labels).long()

        # Actor and director features are the mean of the features of the
        # movies they are connected to.
        self.g.multi_update_all(
            {
                etype: (fn.copy_u('feat', 'm'), fn.mean('m', 'feat'))
                for etype in ('ma', 'md')
            },
            'sum')

        n_movies = len(self.movies)
        # NOTE(review): the two 400s are presumably the train and validation
        # set sizes expected by split_idx -- confirm against its definition.
        train_idx, val_idx, test_idx = split_idx(
            np.arange(n_movies), 400, 400, self._seed)
        splits = (
            ('train_mask', train_idx),
            ('val_mask', val_idx),
            ('test_mask', test_idx),
        )
        for mask_name, node_ids in splits:
            movie_data[mask_name] = generate_mask_tensor(
                idx2mask(node_ids, n_movies))
예제 #2
0
    def process(self):
        """Load ``ACM3025.pkl`` and build one graph per meta-path.

        Two homogeneous graphs are created from the ``'PAP'`` and ``'PLP'``
        adjacency matrices and stored in ``self.gs``. Every graph carries the
        same node features, labels and train/val/test masks.
        """
        pkl_path = os.path.join(self.raw_dir, 'ACM3025.pkl')
        with open(pkl_path, 'rb') as fin:
            # NOTE(review): unpickling runs arbitrary code; only load this
            # file from a trusted source.
            raw = pickle.load(fin)

        feat = torch.from_numpy(raw['feature'].todense()).float()  # (3025, 1870)
        label_matrix = torch.from_numpy(raw['label'].todense()).long()
        # The column index of each non-zero entry is used as the class id
        # (presumably the label matrix is one-hot -- verify against the data).
        label = label_matrix.nonzero(as_tuple=True)[1]  # (3025)

        # Adjacency matrices for meta-path based neighbors
        # (Mufei): I verified both of them are binary adjacency matrices with self loops
        # 'PAP' yields the author-based graph, 'PLP' the subject-based graph.
        self.gs = [dgl.from_scipy(raw[metapath]) for metapath in ('PAP', 'PLP')]

        n_nodes = raw['label'].shape[0]
        node_data = {'feat': feat, 'label': label}
        for mask_key, idx_key in (('train_mask', 'train_idx'),
                                  ('val_mask', 'val_idx'),
                                  ('test_mask', 'test_idx')):
            node_data[mask_key] = generate_mask_tensor(
                idx2mask(raw[idx_key][0], n_nodes))

        for graph in self.gs:
            for key, value in node_data.items():
                graph.ndata[key] = value
예제 #3
0
    def _add_ndata(self):
        """Attach features, labels and split masks to the nodes of ``self.g``.

        Author features are read from ``DBLP4057_GAT_with_idx.mat`` in
        ``self.raw_dir``; author and conference labels come from the
        ``label`` columns of ``self.authors`` and ``self.confs``.

        Raises:
            FileNotFoundError: if the ``.mat`` file is missing. The message
                (in Chinese) asks the user to download it manually from
                ``self._url2`` (extraction code 6b3h) and save it to the
                expected path.
        """
        mat_path = os.path.join(self.raw_dir, 'DBLP4057_GAT_with_idx.mat')
        if not os.path.exists(mat_path):
            raise FileNotFoundError('请手动下载文件 {} 提取码:6b3h 并保存到 {}'.format(
                self._url2, mat_path
            ))

        author_data = self.g.nodes['author'].data
        author_data['feat'] = torch.from_numpy(
            sio.loadmat(mat_path)['features']).float()
        author_data['label'] = torch.tensor(self.authors['label'].to_list())

        n_authors = len(self.authors)
        # NOTE(review): 800 and 400 are presumably the train and validation
        # set sizes expected by split_idx -- confirm against its definition.
        train_idx, val_idx, test_idx = split_idx(
            np.arange(n_authors), 800, 400, self._seed)
        for mask_name, node_ids in (('train_mask', train_idx),
                                    ('val_mask', val_idx),
                                    ('test_mask', test_idx)):
            author_data[mask_name] = generate_mask_tensor(
                idx2mask(node_ids, n_authors))

        conf_labels = self.confs['label'].to_list()
        self.g.nodes['conf'].data['label'] = torch.tensor(conf_labels)
예제 #4
0
    def process(self):
        """Load ``DBLP4057_GAT_with_idx.mat`` and build one graph per meta-path.

        Three homogeneous graphs are created from the ``net_APA``,
        ``net_APCPA`` and ``net_APTPA`` adjacency matrices and stored in
        ``self.gs``. Every graph carries the same node features, labels and
        train/val/test masks.
        """
        data = sio.loadmat(os.path.join(self.raw_dir, 'DBLP4057_GAT_with_idx.mat'))

        features = torch.from_numpy(data['features']).float()
        # The column index of each non-zero label entry is the class id.
        # Cast to int64 explicitly: nonzero() on a SciPy sparse matrix can
        # return int32 indices, while losses expect long targets.
        labels = torch.from_numpy(data['label'].nonzero()[1]).long()
        num_nodes = data['label'].shape[0]

        # Pass the node count explicitly. Otherwise it is inferred from the
        # largest edge endpoint, and a trailing node without edges would make
        # the graph smaller than the feature/label tensors assigned below.
        self.gs = [
            dgl.graph(data[key].nonzero(), num_nodes=num_nodes)
            for key in ('net_APA', 'net_APCPA', 'net_APTPA')
        ]

        train_mask = generate_mask_tensor(idx2mask(data['train_idx'][0], num_nodes))
        val_mask = generate_mask_tensor(idx2mask(data['val_idx'][0], num_nodes))
        test_mask = generate_mask_tensor(idx2mask(data['test_idx'][0], num_nodes))
        for g in self.gs:
            g.ndata['feat'] = features
            g.ndata['label'] = labels
            g.ndata['train_mask'] = train_mask
            g.ndata['val_mask'] = val_mask
            g.ndata['test_mask'] = test_mask
예제 #5
0
    def process(self):
        """Load ``imdb5k.mat`` and build one graph per meta-path.

        Two homogeneous graphs are created from the ``MAM`` and ``MDM``
        adjacency matrices and stored in ``self.gs``. Every graph carries the
        same node features, labels and train/val/test masks. Nodes that have
        no entry in the label matrix keep the label ``-1``.
        """
        data = sio.loadmat(os.path.join(self.raw_dir, 'imdb5k.mat'))

        features = torch.from_numpy(data['feature']).float()
        num_nodes = features.shape[0]

        # Pass the node count explicitly. Otherwise it is inferred from the
        # largest edge endpoint, and a trailing node without edges would make
        # the graph smaller than the feature/label tensors assigned below.
        # A third graph from data['MYM'] was disabled in the original code
        # and stays unused here.
        self.gs = [
            dgl.graph(data[key].nonzero(), num_nodes=num_nodes)
            for key in ('MAM', 'MDM')
        ]

        labels = torch.full((num_nodes, ), -1, dtype=torch.long)
        idx, label = data['label'].nonzero()
        # Normalise both index and value to int64: nonzero() on a SciPy
        # sparse matrix can return int32 arrays, which do not match the
        # dtype of ``labels`` on assignment.
        labels[torch.from_numpy(idx).long()] = torch.from_numpy(label).long()

        train_mask = generate_mask_tensor(
            idx2mask(data['train_idx'][0], num_nodes))
        val_mask = generate_mask_tensor(idx2mask(data['val_idx'][0],
                                                 num_nodes))
        test_mask = generate_mask_tensor(
            idx2mask(data['test_idx'][0], num_nodes))
        for g in self.gs:
            g.ndata['feat'] = features
            g.ndata['label'] = labels
            g.ndata['train_mask'] = train_mask
            g.ndata['val_mask'] = val_mask
            g.ndata['test_mask'] = test_mask
예제 #6
0
    def process(self):
        """Build the heterograph and load node data from ``self.raw_path``.

        Sets ``self.g`` (graph with per-type features, plus labels and
        train/val/test masks on ``self.predict_ntype``), ``self._num_classes``
        (largest label + 1) and ``self.pos_i`` / ``self.pos_j`` (row and
        column indices of the non-zero entries of ``pos.npz``).
        """
        self.g = dgl.heterograph(self._read_edges())

        for ntype, feat in self._read_feats().items():
            self.g.nodes[ntype].data['feat'] = feat

        label_file = os.path.join(self.raw_path, 'labels.npy')
        labels = torch.from_numpy(np.load(label_file)).long()
        self._num_classes = labels.max().item() + 1

        target_data = self.g.nodes[self.predict_ntype].data
        target_data['label'] = labels

        n = self.g.num_nodes(self.predict_ntype)
        for split in ('train', 'val', 'test'):
            # The "_60" files are the ones read here; other split variants,
            # if any exist, are not used.
            split_file = os.path.join(self.raw_path, f'{split}_60.npy')
            node_ids = np.load(split_file)
            target_data[f'{split}_mask'] = generate_mask_tensor(
                idx2mask(node_ids, n))

        pos = sp.load_npz(os.path.join(self.raw_path, 'pos.npz'))
        rows, cols = pos.nonzero()
        self.pos_i = torch.from_numpy(rows).long()
        self.pos_j = torch.from_numpy(cols).long()
예제 #7
0
    def process(self):
        """Build the ACM heterograph from ``ACM.mat`` and attach node data.

        Only papers published in the five selected conferences are kept. The
        graph has paper-author and paper-field edges in both directions.
        Paper nodes receive the paper-term matrix as features, a 3-class
        label derived from the conference, and random train/val/test masks
        (roughly 20% / 10% / 70% within each conference). Author features
        are the mean of their papers' features; field features are an
        identity matrix.

        NOTE: the split draws from the global NumPy RNG, so it differs
        between runs unless the caller seeds ``np.random`` beforehand.
        """
        data = sio.loadmat(os.path.join(self.raw_dir, 'ACM.mat'))
        p_vs_l = data['PvsL']  # paper-field?
        p_vs_a = data['PvsA']  # paper-author
        p_vs_t = data['PvsT']  # paper-term, bag of words
        p_vs_c = data['PvsC']  # paper-conference, labels come from that

        # We assign
        # (1) KDD papers as class 0 (data mining),
        # (2) SIGMOD and VLDB papers as class 1 (database),
        # (3) SIGCOMM and MobiCOMM papers as class 2 (communication)
        conf_ids = [0, 1, 9, 10, 13]
        label_ids = [0, 1, 2, 2, 1]

        # Keep the papers whose row sum over the selected conference columns
        # is non-zero, and restrict every relation matrix to those rows.
        p_vs_c_filter = p_vs_c[:, conf_ids]
        p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
        p_vs_l = p_vs_l[p_selected]
        p_vs_a = p_vs_a[p_selected]
        p_vs_t = p_vs_t[p_selected]
        p_vs_c = p_vs_c[p_selected]

        self.g = dgl.heterograph({
            ('paper', 'pa', 'author'): p_vs_a.nonzero(),
            ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
            ('paper', 'pf', 'field'): p_vs_l.nonzero(),
            ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero(),
        })
        paper_features = torch.FloatTensor(p_vs_t.toarray())  # (4025, 1903)

        # pc_p[k] / pc_c[k] are the paper row and conference column of the
        # k-th non-zero entry of the filtered paper-conference matrix.
        pc_p, pc_c = p_vs_c.nonzero()
        paper_labels = np.zeros(len(p_selected), dtype=np.int64)
        for conf_id, label_id in zip(conf_ids, label_ids):
            paper_labels[pc_p[pc_c == conf_id]] = label_id
        paper_labels = torch.from_numpy(paper_labels)

        # Within each conference, give its entries a random permutation of
        # evenly spaced values in [0, 1]; thresholding them yields the split.
        float_mask = np.zeros(len(pc_p))
        for conf_id in conf_ids:
            pc_c_mask = (pc_c == conf_id)
            float_mask[pc_c_mask] = np.random.permutation(
                np.linspace(0, 1, pc_c_mask.sum()))

        # float_mask is indexed by position in the non-zero list, not by
        # paper id, and pc_p is not guaranteed to be 0..N-1 in order (it
        # depends on the sparse storage format). Map positions through pc_p
        # so each paper is governed by the value drawn for its own conference.
        paper_ids = pc_p.astype(np.int64)
        train_idx = paper_ids[float_mask <= 0.2]
        val_idx = paper_ids[(float_mask > 0.2) & (float_mask <= 0.3)]
        test_idx = paper_ids[float_mask > 0.3]

        num_paper_nodes = self.g.num_nodes('paper')
        train_mask = generate_mask_tensor(idx2mask(train_idx, num_paper_nodes))
        val_mask = generate_mask_tensor(idx2mask(val_idx, num_paper_nodes))
        test_mask = generate_mask_tensor(idx2mask(test_idx, num_paper_nodes))

        self.g.nodes['paper'].data['feat'] = paper_features
        self.g.nodes['paper'].data['label'] = paper_labels
        self.g.nodes['paper'].data['train_mask'] = train_mask
        self.g.nodes['paper'].data['val_mask'] = val_mask
        self.g.nodes['paper'].data['test_mask'] = test_mask
        # Author features are the mean of the features of the papers they
        # are connected to.
        self.g.multi_update_all(
            {'pa': (fn.copy_u('feat', 'm'), fn.mean('m', 'feat'))}, 'sum')
        self.g.nodes['field'].data['feat'] = torch.eye(
            self.g.num_nodes('field'))