Example #1
    def __init__(self, args=None):
        dataset = "jknet_cora"
        path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
        if not osp.exists(path):
            os.makedirs(path)
        super(CoraDataset, self).__init__(path)
        with open(self.processed_paths[0], 'rb') as fin:
            load_data = pickle.load(fin)
        self.num_nodes = load_data['node_num']

        data = Data()
        data.x = load_data['xs']
        data.y = load_data['ys']

        train_size = int(self.num_nodes * 0.8)
        train_mask = np.zeros((self.num_nodes, ), dtype=bool)
        train_idx = np.random.choice(np.arange(self.num_nodes), size=train_size, replace=False)
        train_mask[train_idx] = True
        test_mask = np.ones((self.num_nodes, ), dtype=bool)
        test_mask[train_idx] = False
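        # NB: no separate validation split here; val_mask aliases the test mask below.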
        val_mask = test_mask

        edges = load_data['edges']
        edges = np.array(edges, dtype=int).transpose((1, 0))

        data.edge_index = torch.from_numpy(edges)
        data.train_mask = torch.from_numpy(train_mask)
        data.test_mask = torch.from_numpy(test_mask)
        data.val_mask = torch.from_numpy(val_mask)
        data.x = torch.Tensor(data.x)
        data.y = torch.LongTensor(data.y)

        self.data = data
        self.num_classes = torch.max(self.data.y).item() + 1
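Note that val_mask above is the same array as test_mask, so validation and test evaluate the same nodes. A minimal sketch of a disjoint 80/10/10 split in the same NumPy-mask style (illustrative only; the node count and ratios are assumptions, not part of the original class):

import numpy as np

num_nodes = 100  # hypothetical node count
idx = np.random.permutation(num_nodes)
train_end, val_end = int(0.8 * num_nodes), int(0.9 * num_nodes)
train_mask = np.zeros(num_nodes, dtype=bool)
val_mask = np.zeros(num_nodes, dtype=bool)
test_mask = np.zeros(num_nodes, dtype=bool)
train_mask[idx[:train_end]] = True
val_mask[idx[train_end:val_end]] = True
test_mask[idx[val_end:]] = True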
Example #2
    def split_dataset(cls, dataset, args):
        if "ModelNet" in args.dataset:
            train_data = [Data(x=d.pos, y=d.y) for d in dataset["train"]]
            test_data = [Data(x=d.pos, y=d.y) for d in dataset["test"]]
            train_loader = DataLoader(train_data,
                                      batch_size=args.batch_size,
                                      num_workers=6)
            test_loader = DataLoader(test_data,
                                     batch_size=args.batch_size,
                                     num_workers=6,
                                     shuffle=False)
            return train_loader, test_loader, test_loader
        else:
            random.shuffle(dataset)
            train_size = int(len(dataset) * args.train_ratio)
            test_size = int(len(dataset) * args.test_ratio)
            bs = args.batch_size
            train_loader = DataLoader(dataset[:train_size], batch_size=bs)
            test_loader = DataLoader(dataset[-test_size:], batch_size=bs)
            if args.train_ratio + args.test_ratio < 1:
                valid_loader = DataLoader(dataset[train_size:-test_size],
                                          batch_size=bs)
            else:
                valid_loader = test_loader
            return train_loader, valid_loader, test_loader
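A minimal usage sketch for the splitter above; the Model class and the argparse-style namespace are hypothetical stand-ins for the caller's configuration. Note the ModelNet branch reuses the test loader as the validation loader.

from types import SimpleNamespace

args = SimpleNamespace(dataset="proteins", batch_size=32,
                       train_ratio=0.7, test_ratio=0.2)
# split_dataset is a classmethod, so it is called on the model class
train_loader, valid_loader, test_loader = Model.split_dataset(dataset, args)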
Example #3
def read_gatne_data(folder):
    train_data = {}
    with open(osp.join(folder, 'train.txt'), 'r') as f:
        for line in f:
            items = line.strip().split()
            if items[0] not in train_data:
                train_data[items[0]] = []
            train_data[items[0]].append([int(items[1]), int(items[2])])

    valid_data = {}
    with open(osp.join(folder, 'valid.txt'), 'r') as f:
        for line in f:
            items = line.strip().split()
            if items[0] not in valid_data:
                valid_data[items[0]] = [[], []]
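            # items[3] is a 0/1 label: index 0 collects positive pairs, index 1 negative ones.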
            valid_data[items[0]][1 - int(items[3])].append(
                [int(items[1]), int(items[2])])

    test_data = {}
    with open(osp.join(folder, 'test.txt'), 'r') as f:
        for line in f:
            items = line.strip().split()
            if items[0] not in test_data:
                test_data[items[0]] = [[], []]
            test_data[items[0]][1 - int(items[3])].append(
                [int(items[1]), int(items[2])])

    data = Data()
    data.train_data = train_data
    data.valid_data = valid_data
    data.test_data = test_data
    return data
Example #4
    def __init__(self, root, name1, name2):
        edge_index_1, dict_1, self.node2id_1 = self._preprocess(root, name1)
        edge_index_2, dict_2, self.node2id_2 = self._preprocess(root, name2)
        self.data = [
            Data(x=None, edge_index=edge_index_1, y=dict_1),
            Data(x=None, edge_index=edge_index_2, y=dict_2),
        ]
        self.transform = None
Example #5
def read_planetoid_data(folder, prefix):
    prefix = prefix.lower()
    names = ["x", "tx", "allx", "y", "ty", "ally", "graph", "test.index"]
    objects = []
    for item in names[:-1]:
        with open(f"{folder}/ind.{prefix}.{item}", "rb") as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding="latin1"))
            else:
                objects.append(pkl.load(f))
    test_index = parse_index_file(f"{folder}/ind.{prefix}.{names[-1]}")
    test_index = torch.Tensor(test_index).long()
    test_index_reorder = test_index.sort()[0]

    x, tx, allx, y, ty, ally, graph = tuple(objects)
    x, tx, allx = tuple(
        [torch.from_numpy(item.todense()).float() for item in [x, tx, allx]])
    y, ty, ally = tuple(
        [torch.from_numpy(item).float() for item in [y, ty, ally]])

    train_index = torch.arange(y.size(0), dtype=torch.long)
    val_index = torch.arange(y.size(0), y.size(0) + 500, dtype=torch.long)

    if prefix == "citeseer":
        # There are some isolated nodes in the Citeseer graph, resulting in
        # non-consecutive test indices. We need to identify them and add them
        # as zero vectors to `tx` and `ty`.
        len_test_indices = (test_index.max() - test_index.min()).item() + 1

        tx_ext = torch.zeros(len_test_indices, tx.size(1))
        tx_ext[test_index_reorder - test_index.min(), :] = tx
        ty_ext = torch.zeros(len_test_indices, ty.size(1))
        ty_ext[test_index_reorder - test_index.min(), :] = ty

        tx, ty = tx_ext, ty_ext

    x = torch.cat([allx, tx], dim=0).float()
    y = torch.cat([ally, ty], dim=0).max(dim=1)[1].long()

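    # Restore the stored (sorted) test rows to their original index positions.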
    x[test_index] = x[test_index_reorder]
    y[test_index] = y[test_index_reorder]

    train_mask = index_to_mask(train_index, size=y.size(0))
    val_mask = index_to_mask(val_index, size=y.size(0))
    test_mask = index_to_mask(test_index, size=y.size(0))

    edge_index = edge_index_from_dict(graph, num_nodes=y.size(0))

    data = Data(x=x, edge_index=edge_index, y=y)
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask

    return data
Example #6
    def preprocessing(self, data, gdc_type="ppr"):
        # generate adjacency matrix from sparse representation
        adj_matrix = self._get_adj_matrix(data.x, data.edge_index)

        if gdc_type == "none":
            print("No GDC filters chosen")
            processed_matrix = adj_matrix
        elif gdc_type == "ppr":
            print("PPR filters chosen")
            processed_matrix = self._get_ppr_matrix(adj_matrix,
                                                    alpha=self.alpha)
        elif gdc_type == "heat":
            print("Heat filters chosen")
            processed_matrix = self._get_heat_matrix(adj_matrix, t=self.t)
        else:
            raise ValueError(f"Unknown gdc_type: {gdc_type}")

        if gdc_type == "ppr" or gdc_type == "heat":
            if self.k:
                print(f"Selecting top {self.k} edges per node.")
                processed_matrix = self._get_top_k_matrix(processed_matrix,
                                                          k=self.k)
            elif self.eps:
                print(f"Selecting edges with weight greater than {self.eps}.")
                processed_matrix = self._get_clipped_matrix(processed_matrix,
                                                            eps=self.eps)
            else:
                raise ValueError("Either `self.k` or `self.eps` must be set to sparsify the matrix.")

        # create PyG Data object
        edges_i = []
        edges_j = []
        edge_attr = []
        for i, row in enumerate(processed_matrix):
            for j in np.where(row > 0)[0]:
                edges_i.append(i)
                edges_j.append(j)
                edge_attr.append(processed_matrix[i, j])
        edge_index = [edges_i, edges_j]

        data = Data(
            x=data.x,
            edge_index=torch.LongTensor(edge_index),
            edge_attr=torch.FloatTensor(edge_attr),
            y=data.y,
            train_mask=data.train_mask,
            test_mask=data.test_mask,
            val_mask=data.val_mask,
        )
        data.apply(lambda x: x.to(self.device))

        return data
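The nested edge-extraction loop above runs in Python per matrix entry; a sketch of an equivalent vectorized version, assuming processed_matrix is a dense NumPy array:

rows, cols = np.nonzero(processed_matrix)
edge_index = torch.LongTensor(np.vstack((rows, cols)))
edge_attr = torch.FloatTensor(processed_matrix[rows, cols])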
Example #7
    def __init__(self, root, name):
        self.name = name
        super(GCCDataset, self).__init__(root)

        name1 = name.split("_")[0]
        name2 = name.split("_")[1]
        edge_index_1, dict_1, self.node2id_1 = self.preprocess(root, name1)
        edge_index_2, dict_2, self.node2id_2 = self.preprocess(root, name2)
        self.data = [
            Data(x=None, edge_index=edge_index_1, y=dict_1),
            Data(x=None, edge_index=edge_index_2, y=dict_2),
        ]
        self.transform = None
Example #8
    def read_gtn_data(self, folder):
        mat = sio.loadmat(osp.join(folder, 'data.mat'))
        if self.name == 'han-acm' or self.name == 'han-imdb':
            truelabels, truefeatures = mat['label'], mat['feature'].astype(float)
        elif self.name == 'han-dblp':
            truelabels, truefeatures = mat['label'], mat['features'].astype(float)
        num_nodes = truefeatures.shape[0]
        if self.name == 'han-acm':
            rownetworks = [mat['PAP'] - np.eye(num_nodes),
                           mat['PLP'] - np.eye(num_nodes)]
        elif self.name == 'han-dblp':
            rownetworks = [mat['net_APA'] - np.eye(num_nodes),
                           mat['net_APCPA'] - np.eye(num_nodes),
                           mat['net_APTPA'] - np.eye(num_nodes)]
        elif self.name == 'han-imdb':
            rownetworks = [mat['MAM'] - np.eye(num_nodes),
                           mat['MDM'] - np.eye(num_nodes),
                           mat['MYM'] - np.eye(num_nodes)]

        y = truelabels
        train_idx = mat['train_idx']
        val_idx = mat['val_idx']
        test_idx = mat['test_idx']

        train_mask = sample_mask(train_idx, y.shape[0])
        val_mask = sample_mask(val_idx, y.shape[0])
        test_mask = sample_mask(test_idx, y.shape[0])

        y_train = np.argmax(y[train_mask, :], axis=1)
        y_val = np.argmax(y[val_mask, :], axis=1)
        y_test = np.argmax(y[test_mask, :], axis=1)

        data = Data()
        A = []                     
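        # one (edge_index, edge_weight) pair per meta-path, plus an identity (self-loop) adjacency below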
        for i, edge in enumerate(rownetworks):
            edge_tmp = torch.from_numpy(np.vstack((edge.nonzero()[0], edge.nonzero()[1]))).type(torch.LongTensor)
            value_tmp = torch.ones(edge_tmp.shape[1]).type(torch.FloatTensor)
            A.append((edge_tmp, value_tmp))
        edge_tmp = torch.stack((torch.arange(0,num_nodes), torch.arange(0,num_nodes))).type(torch.LongTensor)
        value_tmp = torch.ones(num_nodes).type(torch.FloatTensor)
        A.append((edge_tmp, value_tmp))
        data.adj = A

        data.x = torch.from_numpy(truefeatures).type(torch.FloatTensor)

        data.train_node = torch.from_numpy(train_idx[0]).type(torch.LongTensor)
        data.train_target = torch.from_numpy(y_train).type(torch.LongTensor)
        data.valid_node = torch.from_numpy(val_idx[0]).type(torch.LongTensor)
        data.valid_target = torch.from_numpy(y_val).type(torch.LongTensor)
        data.test_node = torch.from_numpy(test_idx[0]).type(torch.LongTensor)
        data.test_target = torch.from_numpy(y_test).type(torch.LongTensor)

        self.data = data
Example #9
    def setup_class(self):
        self.dataset = build_dataset_from_name("cora")
        self.data = Data.from_pyg_data(self.dataset[0])
        self.num_nodes = self.data.num_nodes
        self.num_edges = self.data.num_edges
        self.num_features = self.data.num_features
        print("Call Setup")
Example #10
    def __init__(self, args):
        super(GraphClassification, self).__init__(args)

        dataset = build_dataset(args)
        self.data = [
            Data(x=data.x, y=data.y, edge_index=data.edge_index,
                 edge_attr=data.edge_attr, pos=data.pos).apply(lambda x: x.cuda())
            for data in dataset
        ]

        args.num_features = dataset.num_features
        args.num_classes = dataset.num_classes
        args.use_unsup = False
        if args.degree_feature:
            self.data = node_degree_as_feature(self.data)
            args.num_features = self.data[0].num_features

        model = build_model(args)
        self.model = model.cuda()
        self.patience = args.patience
        self.max_epoch = args.max_epoch

        self.train_loader, self.val_loader, self.test_loader = self.model.split_dataset(dataset, args)

        self.optimizer = torch.optim.Adam(
            self.model.parameters(), lr=args.lr, weight_decay=args.weight_decay
        )
        self.scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer=self.optimizer,
            step_size=50,
            gamma=0.5
        )
Example #11
    def process(self):
        filenames = self.raw_paths
        with open(f"{filenames[0]}", "r") as f:
            edge_index = f.read().strip().split("\n")
        edge_index = [[int(i) for i in x.split("\t")] for x in edge_index]
        edge_index = np.array(edge_index, dtype=np.int64).transpose()
        edge_index = torch.from_numpy(edge_index)
        rev_edge_index = torch.stack([edge_index[1], edge_index[0]])
        edge_index = torch.cat((edge_index, rev_edge_index), dim=1)

        non_self_loop_mask = edge_index[0] != edge_index[1]
        edge_index = edge_index[:, non_self_loop_mask]

        with open(f"{filenames[1]}", "r") as f:
            cmty = f.read().strip().split("\n")
        cmty = [[int(i) for i in x.split("\t")] for x in cmty]

        num_classes = len(cmty)
        num_nodes = torch.max(edge_index).item() + 1

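        # build a multi-hot label matrix: one indicator column per community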
        labels = np.zeros((num_nodes, num_classes), dtype=np.float64)
        for i, cls in enumerate(cmty):
            labels[cls, i] = 1.0

        labels = torch.from_numpy(labels)
        data = Data(x=None, y=labels, edge_index=edge_index)
        torch.save(data, self.processed_paths[0])
Example #12
    def __init__(self,
                 data_type="unsupervised",
                 root="testchem",
                 transform=None,
                 pre_transform=None,
                 pre_filter=None):
        super(TestChemDataset, self).__init__(root, transform, pre_transform,
                                              pre_filter)
        num_nodes = 10
        num_edges = 10
        num_graphs = 100

        def cycle_index(num, shift):
            arr = torch.arange(num) + shift
            arr[-shift:] = torch.arange(shift)
            return arr

        upp = torch.cat([torch.arange(0, num_nodes)] * num_graphs)
        dwn = torch.cat([cycle_index(num_nodes, 1)] * num_graphs)
        edge_index = torch.stack([upp, dwn])

        edge_attr = torch.zeros(num_edges * num_graphs, 2)
        x = torch.zeros(num_graphs * num_nodes, 2)
        for idx, val in enumerate(
                torch.randint(0, 6, size=(num_edges * num_graphs, ))):
            edge_attr[idx][0] = val
        for idx, val in enumerate(
                torch.randint(0, 3, size=(num_edges * num_graphs, ))):
            edge_attr[idx][1] = val
        for idx, val in enumerate(
                torch.randint(0, 120, size=(num_edges * num_graphs, ))):
            x[idx][0] = val
        for idx, val in enumerate(
                torch.randint(0, 3, size=(num_edges * num_graphs, ))):
            x[idx][1] = val

        self.data = Data(
            x=x.to(torch.long),
            edge_index=edge_index.to(torch.long),
            edge_attr=edge_attr.to(torch.long),
        )

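        # slice offsets mark where each of the 100 graphs starts and ends in the concatenated tensors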
        self.slices = {
            "x": torch.arange(0, (num_graphs + 1) * num_nodes, num_nodes),
            "edge_index": torch.arange(0, (num_graphs + 1) * num_edges,
                                       num_edges),
            "edge_attr": torch.arange(0, (num_graphs + 1) * num_edges,
                                      num_edges),
        }

        if data_type == "supervised":
            pretrain_tasks = 10
            go_target_pretrain = torch.zeros(pretrain_tasks * num_graphs) - 1
            for i in range(num_graphs):
                val = np.random.randint(0, pretrain_tasks)
                go_target_pretrain[i * pretrain_tasks + val] = 1
            self.data.y = go_target_pretrain
            self.slices["y"] = torch.arange(0,
                                            (num_graphs + 1) * pretrain_tasks,
                                            pretrain_tasks)
Example #13
    def process(self):
        for s, split in enumerate(['train', 'valid', 'test']):
            path = osp.join(self.raw_dir, f'{split}_graph.json')
            with open(path, 'r') as f:
                G = nx.DiGraph(json_graph.node_link_graph(json.load(f)))

            x = np.load(osp.join(self.raw_dir, f'{split}_feats.npy'))
            x = torch.from_numpy(x).to(torch.float)

            y = np.load(osp.join(self.raw_dir, f'{split}_labels.npy'))
            y = torch.from_numpy(y).to(torch.float)

            data_list = []
            path = osp.join(self.raw_dir, f'{split}_graph_id.npy')
            idx = torch.from_numpy(np.load(path)).to(torch.long)
            idx = idx - idx.min()

            for i in range(idx.max().item() + 1):
                mask = idx == i

                G_s = G.subgraph(mask.nonzero().view(-1).tolist())
                edge_index = torch.tensor(list(G_s.edges)).t().contiguous()
                edge_index = edge_index - edge_index.min()
                edge_index, _ = remove_self_loops(edge_index)

                data = Data(edge_index=edge_index, x=x[mask], y=y[mask])

                if self.pre_filter is not None and not self.pre_filter(data):
                    continue

                if self.pre_transform is not None:
                    data = self.pre_transform(data)

                data_list.append(data)
            torch.save(self.collate(data_list), self.processed_paths[s])
Example #14
    def __init__(self):
        super(TestSmallDataset, self).__init__("test")
        x = torch.FloatTensor([[-2, -1], [-2, 1], [-1, 0], [0, 0], [0, 1],
                               [1, 0], [2, 1], [3, 0], [2, -1], [4, 0], [4, 1],
                               [5, 0]])
        edge_index = torch.LongTensor([
            [
                0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7,
                8, 8, 9, 9, 9, 10, 10, 11, 11
            ],
            [
                1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 3, 6, 7, 8, 5, 7, 5, 6, 8, 9,
                5, 7, 7, 10, 11, 9, 11, 9, 10
            ],
        ])
        y = torch.LongTensor([0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3])
        self.data = Data(x, edge_index, None, y, None)
        self.data.train_mask = torch.tensor([
            True, False, False, True, False, True, False, False, False, True,
            False, False
        ])
        self.data.val_mask = torch.tensor([
            False, True, False, False, False, False, True, False, False, False,
            False, True
        ])
        self.data.test_mask = torch.tensor([
            False, False, True, False, True, False, False, True, True, False,
            True, False
        ])
        # self.num_classes = 4
        self.transform = None
Example #15
    def __init__(self, root, name):
        dataset = NodePropPredDataset(name, root)
        graph, y = dataset[0]
        x = torch.tensor(graph["node_feat"])
        y = torch.tensor(y.squeeze())
        row, col, edge_attr = coalesce(graph["edge_index"][0],
                                       graph["edge_index"][1],
                                       graph["edge_feat"])
        edge_index = torch.stack([row, col], dim=0)
        edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)
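        # symmetrize the graph: append the reverse of every edge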
        row = torch.cat([edge_index[0], edge_index[1]])
        col = torch.cat([edge_index[1], edge_index[0]])
        edge_index = torch.stack([row, col], dim=0)
        if edge_attr is not None:
            edge_attr = torch.cat([edge_attr, edge_attr], dim=0)

        self.data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
        self.data.num_nodes = graph["num_nodes"]
        assert self.data.num_nodes == self.data.x.shape[0]

        # split
        split_index = dataset.get_idx_split()
        self.data.train_mask = torch.zeros(self.data.num_nodes,
                                           dtype=torch.bool)
        self.data.test_mask = torch.zeros(self.data.num_nodes,
                                          dtype=torch.bool)
        self.data.val_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
        self.data.train_mask[split_index["train"]] = True
        self.data.test_mask[split_index["test"]] = True
        self.data.val_mask[split_index["valid"]] = True

        self.transform = None
Example #16
def read_edgelist_label_data(folder, prefix):
    graph_path = osp.join(folder, '{}.ungraph'.format(prefix))
    cmty_path = osp.join(folder, '{}.cmty'.format(prefix))

    G = nx.read_edgelist(graph_path, nodetype=int, create_using=nx.Graph())
    num_node = G.number_of_nodes()
    print('node number: ', num_node)
    with open(graph_path) as f:
        context = f.readlines()
        print('edge number: ', len(context))
        edge_index = np.zeros((2, len(context)))
        for i, line in enumerate(context):
            edge_index[:, i] = list(map(int, line.strip().split('\t')))
    edge_index = torch.from_numpy(edge_index).to(torch.long)

    with open(cmty_path) as f:
        context = f.readlines()
        print('class number: ', len(context))
        label = np.zeros((num_node, len(context)))

        for i, line in enumerate(context):
            line = map(int, line.strip().split('\t'))
            for node in line:
                label[node, i] = 1

    y = torch.from_numpy(label).to(torch.float)
    data = Data(x=None, edge_index=edge_index, y=y)

    return data
Example #17
    def __init__(self, root, name):
        self.name = name
        edge_list_path = os.path.join(root, name + ".edgelist")
        node_label_path = os.path.join(root, name + ".nodelabel")
        edge_index, y, self.node2id = self._preprocess(edge_list_path, node_label_path)
        self.data = Data(x=None, edge_index=edge_index, y=y)
        self.transform = None
Example #18
    def process(self):
        num_nodes = 100
        num_edges = 300
        feat_dim = 30

        # load or generate your dataset
        edge_index = torch.randint(0, num_nodes, (2, num_edges))
        x = torch.randn(num_nodes, feat_dim)
        y = torch.randint(0, 2, (num_nodes, ))

        # set train/val/test mask in node_classification task
        train_mask = torch.zeros(num_nodes).bool()
        train_mask[0:int(0.3 * num_nodes)] = True
        val_mask = torch.zeros(num_nodes).bool()
        val_mask[int(0.3 * num_nodes):int(0.7 * num_nodes)] = True
        test_mask = torch.zeros(num_nodes).bool()
        test_mask[int(0.7 * num_nodes):] = True
        data = Data(x=x,
                    edge_index=edge_index,
                    y=y,
                    train_mask=train_mask,
                    val_mask=val_mask,
                    test_mask=test_mask)
        torch.save(data, "mydata.pt")
        return data
Example #19
    def get(self, idx):
        data = Data()
        for key in self.data.keys:
            item, slices = self.data[key], self.slices[key]
            s = list(repeat(slice(None), item.dim()))
            s[self.data.cat_dim(key, item)] = slice(slices[idx], slices[idx + 1])
            data[key] = item[s]
        return data
Example #20
    def process(self):
        edge = self.read_txt_label(
            osp.join(self.raw_dir, '{}.txt'.format(self.name)), dtype=torch.int)
        edge_index = edge[:-1, :]
        edge_attr = edge[-1:, :]
        data = Data(edge_index=edge_index, edge_attr=edge_attr, x=None, y=None)
        data = data if self.pre_transform is None else self.pre_transform(data)
        torch.save(data, self.processed_paths[0])
Example #21
def read_triplet_data(folder):
    filenames = ["train2id.txt", "valid2id.txt", "test2id.txt"]
    count = 0
    edge_index = []
    edge_attr = []
    count_list = []
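    # cumulative triple counts after each split file; used to build the masks below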
    for filename in filenames:
        with open(osp.join(folder, filename), "r") as f:
            _ = int(f.readline().strip())
            for line in f:
                items = line.strip().split()
                edge_index.append([int(items[0]), int(items[1])])
                edge_attr.append(int(items[2]))
                count += 1
            count_list.append(count)

    edge_index = torch.LongTensor(edge_index).t()
    edge_attr = torch.LongTensor(edge_attr)
    data = Data()
    data.edge_index = edge_index
    data.edge_attr = edge_attr

    def generate_mask(start, end):
        mask = torch.zeros(count, dtype=torch.bool)
        mask[start:end] = True
        return mask

    data.train_mask = generate_mask(0, count_list[0])
    data.val_mask = generate_mask(count_list[0], count_list[1])
    data.test_mask = generate_mask(count_list[1], count_list[2])
    return data
Example #22
    def process(self):
        data = np.load(osp.join(self.raw_dir, "reddit_data.npz"))
        x = torch.from_numpy(data["feature"]).to(torch.float)
        y = torch.from_numpy(data["label"]).to(torch.long)
        split = torch.from_numpy(data["node_types"])

        adj = sp.load_npz(osp.join(self.raw_dir, "reddit_graph.npz"))
        row = torch.from_numpy(adj.row).to(torch.long)
        col = torch.from_numpy(adj.col).to(torch.long)
        edge_index = torch.stack([row, col], dim=0)
        edge_index, _ = coalesce(edge_index, None, x.size(0), x.size(0))

        data = Data(x=x, edge_index=edge_index, y=y)
        data.train_mask = split == 1
        data.val_mask = split == 2
        data.test_mask = split == 3

        torch.save(self.collate([data]), self.processed_paths[0])
Example #23
    def top_k(self, x: torch.Tensor, edge_index: torch.Tensor,
              scores: torch.Tensor) -> Tuple[Data, torch.Tensor]:
        org_n_nodes = x.shape[0]
        num = int(self.pooling_rate * x.shape[0])
        values, indices = torch.topk(scores, max(2, num))

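        # optionally densify the adjacency with 2-hop edges via sparse-sparse matmul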
        if self.aug_adj:
            edge_attr = torch.ones(edge_index.shape[1]).to(x.device)
            edge_index, _ = spspmm(edge_index, edge_attr, edge_index,
                                   edge_attr, org_n_nodes, org_n_nodes,
                                   org_n_nodes)

        batch = Data(x=x, edge_index=edge_index)
        new_batch = batch.subgraph(indices)
        num_nodes = new_batch.x.shape[0]
        new_batch.edge_attr = row_normalization(num_nodes,
                                                new_batch.edge_index)
        return new_batch, indices
Example #24
    def get_subgraph(self, phase, require_norm=True):
        """
        Generate one minibatch for model. In the 'train' mode, one minibatch corresponds
        to one subgraph of the training graph. In the 'valid' or 'test' mode, one batch
        corresponds to the full graph (i.e., full-batch rather than minibatch evaluation
        for validation / test sets).

        Inputs:
            mode                str, can be 'train', 'valid', 'test'
            require_norm        boolean

        Outputs:
            data                Data object, modeling the sampled subgraph
            data.norm_aggr      aggregation normalization
            data.norm_loss      normalization normalization
        """
        if phase in ['val', 'test']:
            node_subgraph = np.arange(self.data.num_nodes)
            data = self.data
            if require_norm:
                data.norm_aggr = torch.ones(self.num_edges)
                data.norm_loss = self.norm_loss_test
        else:
            if len(self.subgraphs_nodes) == 0:
                self.gen_subgraph()

            node_subgraph = self.subgraphs_nodes.pop()
            edge_subgraph = self.subgraphs_edge_index.pop()
            num_nodes_subgraph = node_subgraph.size
            adj = sp.csr_matrix(
                (self.subgraphs_data.pop(), self.subgraphs_indices.pop(),
                 self.subgraphs_indptr.pop()),
                shape=(num_nodes_subgraph, num_nodes_subgraph))

            if require_norm:
                adj.data[:] = self.norm_aggr_train[edge_subgraph][:]
                #normalization
                D = adj.sum(1).flatten()
                norm_diag = sp.dia_matrix((1 / D, 0), shape=adj.shape)
                adj = norm_diag.dot(adj)
                adj.sort_indices()

            adj = adj.tocoo()
            data = Data(
                x=self.data.x[node_subgraph],
                edge_index=torch.LongTensor(np.vstack((adj.row, adj.col))),
                edge_attr=None if self.data.edge_attr is None
                else self.data.edge_attr[edge_subgraph],
                y=self.data.y[node_subgraph],
                pos=None if self.data.pos is None else self.data.pos[node_subgraph],
            )

            if require_norm:
                data.norm_aggr = torch.FloatTensor(adj.data)
                data.norm_loss = self.norm_loss_train[node_subgraph]
            data.train_mask = self.data.train_mask[node_subgraph]
            data.val_mask = self.data.val_mask[node_subgraph]
            data.test_mask = self.data.test_mask[node_subgraph]

        return data
Example #25
    def __init__(self, args=None):
        x = torch.FloatTensor([[-2, -1], [-2, 1], [-1, 0], [0, 0], [0, 1],
                               [1, 0], [2, 1], [3, 0], [2, -1]])
        edge_index = torch.LongTensor(
            [[0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 8],
             [1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 3, 6, 7, 8, 5, 7, 5, 6, 8, 5, 7]])
        y = torch.LongTensor([0, 0, 0, 1, 1, 2, 2, 2, 2])
        self.data = Data(x, edge_index, None, y, None)
        self.data.train_mask = torch.tensor(
            [True, False, False, True, False, True, False, False, False])
        self.data.val_mask = torch.tensor(
            [False, True, False, False, False, False, True, False, False])
        self.data.test_mask = torch.tensor(
            [False, False, True, False, True, False, False, True, True])
        self.num_classes = 3
        self.transform = None
Example #26
def read_saint_data(folder):
    names = [
        "adj_full.npz", "adj_train.npz", "class_map.json", "feats.npy",
        "role.json"
    ]
    names = [osp.join(folder, name) for name in names]
    adj_full = sp.load_npz(names[0])
    adj_train = sp.load_npz(names[1])
    class_map = json.load(open(names[2]))
    feats = np.load(names[3])
    role = json.load(open(names[4]))

    train_mask = index_to_mask(role["tr"], size=feats.shape[0])
    val_mask = index_to_mask(role["va"], size=feats.shape[0])
    test_mask = index_to_mask(role["te"], size=feats.shape[0])

    feats = torch.from_numpy(feats).float()
    item = class_map["0"]
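    # lists in the class map mean multi-label targets; scalars mean single-label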
    if isinstance(item, list):
        labels = np.zeros((feats.shape[0], len(item)), dtype=float)
        for key, val in class_map.items():
            labels[int(key)] = np.array(val)
    else:
        labels = np.zeros(feats.shape[0], dtype=np.int64)
        for key, val in class_map.items():
            labels[int(key)] = val

    labels = torch.from_numpy(labels)

    def get_adj(adj):
        row, col = adj.nonzero()
        data = adj.data
        row = torch.tensor(row, dtype=torch.long)
        col = torch.tensor(col, dtype=torch.long)
        edge_index = torch.stack([row, col], dim=0)
        edge_attr = torch.tensor(data, dtype=torch.float)
        return edge_index, edge_attr

    edge_index_full, edge_attr_full = get_adj(adj_full)
    edge_index_train, edge_attr_train = get_adj(adj_train)

    data = Data(
        x=feats,
        y=labels,
        edge_index=edge_index_full,
        edge_attr=edge_attr_full,
        edge_index_train=edge_index_train,
        edge_attr_train=edge_attr_train,
        train_mask=train_mask,
        val_mask=val_mask,
        test_mask=test_mask,
    )
    return data
Example #27
    def read_gtn_data(self, folder):
        edges = pickle.load(open(osp.join(folder, 'edges.pkl'), 'rb'))
        labels = pickle.load(open(osp.join(folder, 'labels.pkl'), 'rb'))
        node_features = pickle.load(
            open(osp.join(folder, 'node_features.pkl'), 'rb'))

        data = Data()
        data.x = torch.from_numpy(node_features).type(torch.FloatTensor)

        num_nodes = edges[0].shape[0]

        A = []

        for i, edge in enumerate(edges):
            edge_tmp = torch.from_numpy(
                np.vstack((edge.nonzero()[0],
                           edge.nonzero()[1]))).type(torch.LongTensor)
            value_tmp = torch.ones(edge_tmp.shape[1]).type(torch.FloatTensor)
            A.append((edge_tmp, value_tmp))
        edge_tmp = torch.stack(
            (torch.arange(0, num_nodes),
             torch.arange(0, num_nodes))).type(torch.LongTensor)
        value_tmp = torch.ones(num_nodes).type(torch.FloatTensor)
        A.append((edge_tmp, value_tmp))
        data.adj = A

        data.train_node = torch.from_numpy(np.array(labels[0])[:, 0]).type(
            torch.LongTensor)
        data.train_target = torch.from_numpy(np.array(labels[0])[:, 1]).type(
            torch.LongTensor)
        data.valid_node = torch.from_numpy(np.array(labels[1])[:, 0]).type(
            torch.LongTensor)
        data.valid_target = torch.from_numpy(np.array(labels[1])[:, 1]).type(
            torch.LongTensor)
        data.test_node = torch.from_numpy(np.array(labels[2])[:, 0]).type(
            torch.LongTensor)
        data.test_target = torch.from_numpy(np.array(labels[2])[:, 1]).type(
            torch.LongTensor)

        self.data = data
Example #28
def read_planetoid_data(folder, prefix):
    names = ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']
    items = [read_file(folder, prefix, name) for name in names]
    x, tx, allx, y, ty, ally, graph, test_index = items
    train_index = torch.arange(y.size(0), dtype=torch.long)
    val_index = torch.arange(y.size(0), y.size(0) + 500, dtype=torch.long)
    sorted_test_index = test_index.sort()[0]

    if prefix.lower() == 'citeseer':
        # There are some isolated nodes in the Citeseer graph, resulting in
        # non-consecutive test indices. We need to identify them and add them
        # as zero vectors to `tx` and `ty`.
        len_test_indices = (test_index.max() - test_index.min()).item() + 1

        tx_ext = torch.zeros(len_test_indices, tx.size(1))
        tx_ext[sorted_test_index - test_index.min(), :] = tx
        ty_ext = torch.zeros(len_test_indices, ty.size(1))
        ty_ext[sorted_test_index - test_index.min(), :] = ty

        tx, ty = tx_ext, ty_ext

    x = torch.cat([allx, tx], dim=0)
    y = torch.cat([ally, ty], dim=0).max(dim=1)[1]

    x[test_index] = x[sorted_test_index]
    y[test_index] = y[sorted_test_index]

    train_mask = sample_mask(train_index, num_nodes=y.size(0))
    val_mask = sample_mask(val_index, num_nodes=y.size(0))
    test_mask = sample_mask(test_index, num_nodes=y.size(0))

    edge_index = edge_index_from_dict(graph, num_nodes=y.size(0))

    data = Data(x=x, edge_index=edge_index, y=y)
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask

    return data
Example #29
File: kg_data.py Project: zrt/cogdl
def read_triplet_data(folder):
    filenames = ["train2id.txt", "valid2id.txt", "test2id.txt"]
    count = 0
    edge_index = []
    edge_attr = []
    count_list = []
    triples = []
    num_entities = 0
    num_relations = 0
    entity_dic = {}
    relation_dic = {}
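    # entity_dic / relation_dic assign consecutive ids; only the resulting counts are returned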
    for filename in filenames:
        with open(osp.join(folder, filename), "r") as f:
            _ = int(f.readline().strip())
            if "train" in filename:
                train_start_idx = len(triples)
            elif "valid" in filename:
                valid_start_idx = len(triples)
            elif "test" in filename:
                test_start_idx = len(triples)
            for line in f:
                items = line.strip().split()
                edge_index.append([int(items[0]), int(items[1])])
                edge_attr.append(int(items[2]))
                triples.append((int(items[0]), int(items[2]), int(items[1])))
                if items[0] not in entity_dic:
                    entity_dic[items[0]] = num_entities
                    num_entities += 1
                if items[1] not in entity_dic:
                    entity_dic[items[1]] = num_entities
                    num_entities += 1
                if items[2] not in relation_dic:
                    relation_dic[items[2]] = num_relations
                    num_relations += 1
                count += 1
            count_list.append(count)

    edge_index = torch.LongTensor(edge_index).t()
    edge_attr = torch.LongTensor(edge_attr)
    data = Data()
    data.edge_index = edge_index
    data.edge_attr = edge_attr

    def generate_mask(start, end):
        mask = torch.zeros(count, dtype=torch.bool)
        mask[start:end] = True
        return mask

    data.train_mask = generate_mask(0, count_list[0])
    data.val_mask = generate_mask(count_list[0], count_list[1])
    data.test_mask = generate_mask(count_list[1], count_list[2])
    return data, triples, train_start_idx, valid_start_idx, test_start_idx, num_entities, num_relations
Example #30
    def process(self):
        path = osp.join(self.raw_dir, "{}.mat".format(self.name))
        smat = scipy.io.loadmat(path)
        adj_matrix, group = smat["network"], smat["group"]

        y = torch.from_numpy(group.todense()).to(torch.float)

        row_ind, col_ind = adj_matrix.nonzero()
        edge_index = torch.stack([torch.tensor(row_ind), torch.tensor(col_ind)], dim=0)
        edge_attr = torch.tensor(adj_matrix[row_ind, col_ind]).squeeze()  # flatten the 1 x nnz matrix

        data = Data(edge_index=edge_index, edge_attr=edge_attr, x=None, y=y)

        torch.save(data, self.processed_paths[0])