Example #1
def test_to_simple(index_dtype):
    g = dgl.heterograph(
        {
            ('user', 'follow', 'user'): [(0, 1), (1, 3), (2, 2), (1, 3),
                                         (1, 4), (1, 4)],
            ('user', 'plays', 'game'): [(3, 5), (2, 3), (1, 4), (1, 4), (3, 5),
                                        (2, 3), (2, 3)]
        },
        index_dtype=index_dtype)
    sg = dgl.to_simple(g, return_counts='weights', writeback_mapping='new_eid')

    for etype in g.canonical_etypes:
        u, v = g.all_edges(form='uv', order='eid', etype=etype)
        u = F.asnumpy(u).tolist()
        v = F.asnumpy(v).tolist()
        uv = list(zip(u, v))
        eid_map = F.asnumpy(g.edges[etype].data['new_eid'])

        su, sv = sg.all_edges(form='uv', order='eid', etype=etype)
        su = F.asnumpy(su).tolist()
        sv = F.asnumpy(sv).tolist()
        suv = list(zip(su, sv))
        sw = F.asnumpy(sg.edges[etype].data['weights'])

        assert set(uv) == set(suv)
        for i, e in enumerate(suv):
            assert sw[i] == sum(e == _e for _e in uv)
        for i, e in enumerate(uv):
            assert eid_map[i] == suv.index(e)
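
A minimal sketch of the behavior this test exercises, using the boolean form of ``writeback_mapping`` from recent DGL releases (the test above targets an older DGL API where the mapping is written back into the input graph under the given edge-feature name):

import dgl
import torch

# Edge (1, 2) appears twice in this multigraph.
g = dgl.graph((torch.tensor([0, 1, 1]), torch.tensor([1, 2, 2])))
sg, wm = dgl.to_simple(g, return_counts='weights', writeback_mapping=True)
print(sg.edges())           # two unique edges: (0, 1) and (1, 2)
print(sg.edata['weights'])  # multiplicity of each kept edge
print(wm)                   # maps each original edge ID to its simple-graph edge ID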
Example #2
def to_bidirected_with_reverse_mapping(g):
    """Makes a graph bidirectional, and returns a mapping array ``mapping`` where ``mapping[i]``
    is the reverse edge of edge ID ``i``.
    Does not work with graphs that have self-loops.
    """
    g_simple, mapping = dgl.to_simple(dgl.add_reverse_edges(g),
                                      return_counts='count',
                                      writeback_mapping=True)
    c = g_simple.edata['count']
    num_edges = g.num_edges()
    mapping_offset = torch.zeros(g_simple.num_edges() + 1,
                                 dtype=g_simple.idtype)
    mapping_offset[1:] = c.cumsum(0)
    idx = mapping.argsort()
    idx_uniq = idx[mapping_offset[:-1]]
    reverse_idx = torch.where(idx_uniq >= num_edges, idx_uniq - num_edges,
                              idx_uniq + num_edges)
    reverse_mapping = mapping[reverse_idx]

    # Correctness check
    src1, dst1 = g_simple.edges()
    src2, dst2 = g_simple.find_edges(reverse_mapping)
    assert torch.equal(src1, dst2)
    assert torch.equal(src2, dst1)
    return g_simple, reverse_mapping
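
A quick usage sketch of the helper above on a small self-loop-free directed graph (the toy graph is illustrative):

import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 0])))
bg, reverse_mapping = to_bidirected_with_reverse_mapping(g)
src, dst = bg.edges()
rsrc, rdst = bg.find_edges(reverse_mapping)
# Edge i and edge reverse_mapping[i] connect the same nodes in opposite directions.
assert torch.equal(src, rdst) and torch.equal(dst, rsrc)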
Example #3
    def edge2graph(self, edge_batch):
        """Turn a batch of walk node sequences into a weighted simple graph."""
        # Consecutive nodes in each sequence become the graph's edges.
        u = edge_batch[:, :-1].reshape(-1)
        v = edge_batch[:, 1:].reshape(-1)
        if self.symmetric:
            # Also add the reverse of every edge.
            u, v = th.cat((u, v), dim=0), th.cat((v, u), dim=0)

        g = dgl.graph((u, v))
        # Merge duplicate edges; multiplicities are stored in sg.edata['w'].
        sg = dgl.to_simple(g, return_counts='w')
        return sg
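
The same idea as a standalone sketch (the tensors are illustrative): consecutive nodes of each walk become edges, and ``dgl.to_simple`` folds repeated transitions into an edge weight:

import dgl
import torch as th

walks = th.tensor([[0, 1, 2], [0, 1, 3]])  # each row is a node sequence
u = walks[:, :-1].reshape(-1)
v = walks[:, 1:].reshape(-1)
sg = dgl.to_simple(dgl.graph((u, v)), return_counts='w')
print(sg.edata['w'])  # the transition 0 -> 1 occurs twice, so its weight is 2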
Example #4
def coalesce_graph(graph, aggr_type='sum', copy_data=False):
    """
    Coalesce multi-edge graph
    Args:
        graph(DGLGraph): graph
        aggr_type(str): type of aggregator for multi edge weights
        copy_data(bool): if copy ndata and edata in new graph

    Returns:
        graph(DGLGraph): graph


    """
    src, dst = graph.edges()
    graph_df = pd.DataFrame({'src': src, 'dst': dst})
    graph_df['edge_weight'] = graph.edata['edge_weight'].numpy()

    if aggr_type == 'sum':
        tmp = graph_df.groupby(['src', 'dst'])['edge_weight'].sum().reset_index()
    elif aggr_type == 'mean':
        tmp = graph_df.groupby(['src', 'dst'])['edge_weight'].mean().reset_index()
    else:
        raise ValueError(f"Unsupported aggr_type: {aggr_type}")

    if copy_data:
        graph = dgl.to_simple(graph, copy_ndata=True, copy_edata=True)
    else:
        graph = dgl.to_simple(graph)

    src, dst = graph.edges()
    graph_df = pd.DataFrame({'src': src, 'dst': dst})
    graph_df = pd.merge(graph_df, tmp, how='left', on=['src', 'dst'])
    graph.edata['edge_weight'] = torch.from_numpy(graph_df['edge_weight'].values).unsqueeze(1)

    graph.edata.pop('count')
    return graph
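
A hedged usage sketch of ``coalesce_graph`` on a tiny multigraph, assuming pandas/torch are imported as in the snippet above (the weights are illustrative):

import dgl
import torch

g = dgl.graph((torch.tensor([0, 0, 1]), torch.tensor([1, 1, 2])))
g.edata['edge_weight'] = torch.tensor([1., 2., 3.])
sg = coalesce_graph(g, aggr_type='sum')
# The duplicate edge (0, 1) is merged and its weights summed: 1 + 2 = 3.
print(sg.edata['edge_weight'])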
Example #5
    def __init__(self,
                 g,
                 split_edge,
                 hop=1,
                 neg_samples=1,
                 subsample_ratio=1,
                 prefix=None,
                 save_dir=None,
                 num_workers=32,
                 shuffle=True,
                 use_coalesce=True,
                 print_fn=print):
        self.g = g
        self.hop = hop
        self.subsample_ratio = subsample_ratio
        self.prefix = prefix
        self.save_dir = save_dir
        self.print_fn = print_fn

        self.generator = PosNegEdgesGenerator(g=self.g,
                                              split_edge=split_edge,
                                              neg_samples=neg_samples,
                                              subsample_ratio=subsample_ratio,
                                              shuffle=shuffle)
        if use_coalesce:
            for k, v in g.edata.items():
                # dgl.to_simple() with aggregator='sum' needs float edge data
                g.edata[k] = v.float()
            self.g = dgl.to_simple(g,
                                   copy_ndata=True,
                                   copy_edata=True,
                                   aggregator='sum')

        self.ndata = {k: v for k, v in self.g.ndata.items()}
        self.edata = {k: v for k, v in self.g.edata.items()}
        self.g.ndata.clear()
        self.g.edata.clear()
        self.print_fn("Save ndata and edata in class.")
        self.print_fn("Clear ndata and edata in graph.")

        self.sampler = SEALSampler(graph=self.g,
                                   hop=hop,
                                   num_workers=num_workers,
                                   print_fn=print_fn)
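
The ``use_coalesce`` branch leans on ``dgl.to_simple``'s ``aggregator='sum'`` option (available in recent DGL releases), which sums the float features of duplicate edges while merging them; a minimal sketch:

import dgl
import torch

g = dgl.graph((torch.tensor([0, 0]), torch.tensor([1, 1])))  # duplicated edge
g.edata['w'] = torch.tensor([1.0, 2.5])  # float, as the loop above requires
sg = dgl.to_simple(g, copy_edata=True, aggregator='sum')
print(sg.edata['w'])      # tensor([3.5])
print(sg.edata['count'])  # tensor([2])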
Example #6
    def build_hetgnn_graph(self, length, walks, restart_prob):
        # Note: `[[[[], []]] * 3] * 3` would alias the same inner lists, so
        # build the 3 x 3 grid of (src, dst) list pairs explicitly.
        edges = [[[[], []] for _ in range(3)] for _ in range(3)]

        for i in range(self.g.number_of_nodes()):
            nodes = th.tensor([i]).repeat(walks)
            traces, types = dgl.sampling.random_walk(self.g,
                                                     nodes,
                                                     length=length,
                                                     restart_prob=restart_prob)
            concat_vids, _, _, _ = dgl.sampling.pack_traces(traces, types)
            concat_types = th.index_select(self.NTYPE, 0, concat_vids)
            uid = concat_vids[0]
            utype = concat_types[0]
            for (vid, vtype) in zip(concat_vids, concat_types):
                edges[int(utype)][int(vtype)][0].append(self.NID[uid])
                edges[int(utype)][int(vtype)][1].append(self.NID[vid])

        edge_dict = {}
        k = {}
        num_ntypes = int(self.NTYPE.max()) + 1
        for i in range(num_ntypes):
            for j in range(num_ntypes):
                edge = (self.hg.ntypes[j],
                        self.hg.ntypes[j] + '-' + self.hg.ntypes[i],
                        self.hg.ntypes[i])
                edge_dict[edge] = (th.tensor(edges[i][j][1]),
                                   th.tensor(edges[i][j][0]))
                if j == 2:
                    k[edge] = 3
                else:
                    k[edge] = 10

        neighbor_graph = dgl.heterograph(edge_dict, self.num_nodes)

        neighbor_graph = dgl.to_simple(neighbor_graph,
                                       return_counts=self.weight_column)
        counts = neighbor_graph.edata[self.weight_column]
        neighbor_graph = select_topk(neighbor_graph, k, self.weight_column)

        return neighbor_graph
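
A condensed sketch of the count-then-prune pattern used above (the toy graph is illustrative, and ``select_topk`` is assumed to be ``dgl.sampling.select_topk``, as in the method): ``dgl.to_simple`` records how often each neighbor co-occurred, and ``select_topk`` keeps only the k heaviest in-edges per node:

import dgl
import torch as th
from dgl.sampling import select_topk

g = dgl.graph((th.tensor([0, 0, 0, 1, 2, 3]), th.tensor([1, 1, 1, 0, 0, 0])))
sg = dgl.to_simple(g, return_counts='weight')  # edge (0, 1) gets weight 3
topg = select_topk(sg, 2, 'weight')            # keep <= 2 heaviest in-edges per node
print(topg.edges())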
Example #7
    def add_reverse_hetero(g, combine_like=True):
        r"""
        Parameters
        ----------
        g : DGLGraph
            The heterogeneous graph where reverse edges should be added
        combine_like : bool, optional
            Whether reverse edges that have identical source/destination
            node types should be combined with the existing edge type
            rather than creating a new edge type. Default: True.
        """
        relations = {}
        num_nodes_dict = {ntype: g.num_nodes(ntype) for ntype in g.ntypes}
        for metapath in g.canonical_etypes:
            src_ntype, rel_type, dst_ntype = metapath
            src, dst = g.all_edges(etype=rel_type)

            if src_ntype == dst_ntype and combine_like:
                # Make the relation undirected instead of adding a reverse edge type
                relations[metapath] = (th.cat([src, dst], dim=0),
                                       th.cat([dst, src], dim=0))
            else:
                # Original edges
                relations[metapath] = (src, dst)

                reverse_metapath = (dst_ntype, 'rev-' + rel_type, src_ntype)
                relations[reverse_metapath] = (dst, src)  # Reverse edges

        new_g = dgl.heterograph(relations, num_nodes_dict=num_nodes_dict)
        # Remove duplicate edges
        new_g = dgl.to_simple(new_g,
                              return_counts=None,
                              writeback_mapping=False,
                              copy_ndata=True)

        # copy_ndata:
        for ntype in g.ntypes:
            for k, v in g.nodes[ntype].data.items():
                new_g.nodes[ntype].data[k] = v.detach().clone()

        return new_g
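
A hedged usage sketch of ``add_reverse_hetero`` on a toy heterograph, assuming the function is reachable at module scope:

import dgl
import torch as th

g = dgl.heterograph({
    ('user', 'follow', 'user'): (th.tensor([0]), th.tensor([1])),
    ('user', 'plays', 'game'): (th.tensor([0]), th.tensor([0])),
})
new_g = add_reverse_hetero(g)
# 'follow' (same src/dst type) is made symmetric in place; 'plays' gains a
# separate ('game', 'rev-plays', 'user') relation.
print(new_g.canonical_etypes)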
Example #8
def test_to_simple(index_dtype):
    # homogeneous graph
    g = dgl.graph((F.tensor([0, 1, 2, 1]), F.tensor([1, 2, 0, 2])))
    g.ndata['h'] = F.tensor([[0.], [1.], [2.]])
    g.edata['h'] = F.tensor([[3.], [4.], [5.], [6.]])
    sg, wb = dgl.to_simple(g, writeback_mapping=True)
    u, v = g.all_edges(form='uv', order='eid')
    u = F.asnumpy(u).tolist()
    v = F.asnumpy(v).tolist()
    uv = list(zip(u, v))
    eid_map = F.asnumpy(wb)

    su, sv = sg.all_edges(form='uv', order='eid')
    su = F.asnumpy(su).tolist()
    sv = F.asnumpy(sv).tolist()
    suv = list(zip(su, sv))
    sc = F.asnumpy(sg.edata['count'])
    assert set(uv) == set(suv)
    for i, e in enumerate(suv):
        assert sc[i] == sum(e == _e for _e in uv)
    for i, e in enumerate(uv):
        assert eid_map[i] == suv.index(e)
    # shared ndata
    assert F.array_equal(sg.ndata['h'], g.ndata['h'])
    assert 'h' not in sg.edata
    # new ndata to sg
    sg.ndata['hh'] = F.tensor([[0.], [1.], [2.]])
    assert 'hh' not in g.ndata

    sg = dgl.to_simple(g, writeback_mapping=False, copy_ndata=False)
    assert 'h' not in sg.ndata
    assert 'h' not in sg.edata

    # heterogeneous graph
    g = dgl.heterograph({
        ('user', 'follow', 'user'): ([0, 1, 2, 1, 1, 1],
                                     [1, 3, 2, 3, 4, 4]),
        ('user', 'plays', 'game'): ([3, 2, 1, 1, 3, 2, 2], [5, 3, 4, 4, 5, 3, 3])},
        index_dtype=index_dtype)
    g.nodes['user'].data['h'] = F.tensor([0, 1, 2, 3, 4])
    g.nodes['user'].data['hh'] = F.tensor([0, 1, 2, 3, 4])
    g.edges['follow'].data['h'] = F.tensor([0, 1, 2, 3, 4, 5])
    sg, wb = dgl.to_simple(g, return_counts='weights', writeback_mapping=True, copy_edata=True)
    g.nodes['game'].data['h'] = F.tensor([0, 1, 2, 3, 4, 5])

    for etype in g.canonical_etypes:
        u, v = g.all_edges(form='uv', order='eid', etype=etype)
        u = F.asnumpy(u).tolist()
        v = F.asnumpy(v).tolist()
        uv = list(zip(u, v))
        eid_map = F.asnumpy(wb[etype])

        su, sv = sg.all_edges(form='uv', order='eid', etype=etype)
        su = F.asnumpy(su).tolist()
        sv = F.asnumpy(sv).tolist()
        suv = list(zip(su, sv))
        sw = F.asnumpy(sg.edges[etype].data['weights'])

        assert set(uv) == set(suv)
        for i, e in enumerate(suv):
            assert sw[i] == sum(e == _e for _e in uv)
        for i, e in enumerate(uv):
            assert eid_map[i] == suv.index(e)
    # shared ndata
    assert F.array_equal(sg.nodes['user'].data['h'], g.nodes['user'].data['h'])
    assert F.array_equal(sg.nodes['user'].data['hh'], g.nodes['user'].data['hh'])
    assert 'h' not in sg.nodes['game'].data
    # new ndata to sg
    sg.nodes['user'].data['hhh'] = F.tensor([0, 1, 2, 3, 4])
    assert 'hhh' not in g.nodes['user'].data
    # share edata
    feat_idx = F.asnumpy(wb[('user', 'follow', 'user')])
    _, indices = np.unique(feat_idx, return_index=True)
    assert np.array_equal(F.asnumpy(sg.edges['follow'].data['h']),
                          F.asnumpy(g.edges['follow'].data['h'])[indices])

    sg = dgl.to_simple(g, writeback_mapping=False, copy_ndata=False)
    for ntype in g.ntypes:
        assert g.number_of_nodes(ntype) == sg.number_of_nodes(ntype)
    assert 'h' not in sg.nodes['user'].data
    assert 'hh' not in sg.nodes['user'].data
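
A minimal sketch of the ``copy_edata`` semantics the shared-edata check above relies on: with the default ``'arbitrary'`` aggregation, each merged edge keeps the feature of one of its duplicates:

import dgl
import torch

g = dgl.graph((torch.tensor([0, 0]), torch.tensor([1, 1])))
g.edata['h'] = torch.tensor([[1.], [2.]])
sg = dgl.to_simple(g, copy_edata=True)
print(sg.edata['h'])  # one duplicate's feature survives, e.g. tensor([[1.]])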
Example #9
    label_min_index = source_data.y.min()
    label_max_index = source_data.y.max()
    node_label_num = label_max_index - label_min_index + 1
    ## data processing
    # To avoid dropping isolated nodes, add a self-loop for every node to the
    # edge index before building the graph.
    self_loop = torch.arange(source_data.x.shape[0])
    self_loop = self_loop.unsqueeze(1).repeat(1, 2)
    src_edge_index_sl = torch.cat([source_data.edge_index.T, self_loop]).T  # [2, N]

    self_loop = torch.arange(target_data.x.shape[0])
    self_loop = self_loop.unsqueeze(1).repeat(1, 2)
    tgt_edge_index_sl = torch.cat([target_data.edge_index.T, self_loop]).T  # [2, N]
    del self_loop
    ## generate train graph
    source_graph = dgl.to_simple(dgl.graph((src_edge_index_sl[0], src_edge_index_sl[1])))
    target_graph = dgl.to_simple(dgl.graph((tgt_edge_index_sl[0], tgt_edge_index_sl[1])))
    ## make the edge index bidirected
    source_graph = dgl.to_bidirected(source_graph)
    target_graph = dgl.to_bidirected(target_graph)
    src_edge_index_sl = torch.vstack([source_graph.edges()[0], source_graph.edges()[1]])
    tgt_edge_index_sl = torch.vstack([target_graph.edges()[0], target_graph.edges()[1]])
    ## generate all node-pair labels
    source_node_num = source_data.x.shape[0]
    target_node_num = target_data.x.shape[0]
    source_node_feat = source_data.x
    target_node_feat = target_data.x
    source_node_label = source_data.y
    target_node_label = target_data.y
    del source_data, target_data
    src_all_node_pair, src_all_node_pair_label, max_np_label = generate_all_node_pair(source_node_num, src_edge_index_sl, source_node_label,
Example #10
def preprocess_data(dataset, train_ratio):

    if dataset in ['cora', 'citeseer', 'pubmed']:

        edge = np.loadtxt('../low_freq/{}.edge'.format(dataset),
                          dtype=int).tolist()
        feat = np.loadtxt('../low_freq/{}.feature'.format(dataset))
        labels = np.loadtxt('../low_freq/{}.label'.format(dataset), dtype=int)
        train = np.loadtxt('../low_freq/{}.train'.format(dataset), dtype=int)
        val = np.loadtxt('../low_freq/{}.val'.format(dataset), dtype=int)
        test = np.loadtxt('../low_freq/{}.test'.format(dataset), dtype=int)
        nclass = len(set(labels.tolist()))
        print(dataset, nclass)

        U = [e[0] for e in edge]
        V = [e[1] for e in edge]
        g = dgl.graph((U, V))
        g = dgl.to_simple(g)
        g = dgl.remove_self_loop(g)
        g = dgl.to_bidirected(g)

        feat = normalize_features(feat)
        feat = torch.FloatTensor(feat)
        labels = torch.LongTensor(labels)
        train = torch.LongTensor(train)
        val = torch.LongTensor(val)
        test = torch.LongTensor(test)

        return g, nclass, feat, labels, train, val, test

    elif 'syn' in dataset:
        edge = np.loadtxt('../syn/{}.edge'.format(dataset), dtype=int).tolist()
        labels = np.loadtxt('../syn/{}.lab'.format(dataset), dtype=int)
        features = np.loadtxt('../syn/{}.feat'.format(dataset), dtype=float)

        n = labels.shape[0]
        idx = [i for i in range(n)]
        random.shuffle(idx)
        idx_train = np.array(idx[:100])
        idx_test = np.array(idx[100:])

        U = [e[0] for e in edge]
        V = [e[1] for e in edge]
        g = dgl.graph((U, V))

        c1 = 0
        c2 = 0
        lab = labels.tolist()
        for e in edge:
            if lab[e[0]] == lab[e[1]]:
                c1 += 1
            else:
                c2 += 1
        print(c1 / len(edge), c2 / len(edge))

        # Normalization would degenerate the features here, so it is skipped.
        # features = normalize_features(features)
        features = torch.FloatTensor(features)

        nclass = 2
        labels = torch.LongTensor(labels)
        train = torch.LongTensor(idx_train)
        test = torch.LongTensor(idx_test)
        print(dataset, nclass)

        return g, nclass, features, labels, train, train, test

    elif dataset in ['film']:
        graph_adjacency_list_file_path = '../high_freq/{}/out1_graph_edges.txt'.format(
            dataset)
        graph_node_features_and_labels_file_path = '../high_freq/{}/out1_node_feature_label.txt'.format(
            dataset)

        G = nx.DiGraph()
        graph_node_features_dict = {}
        graph_labels_dict = {}

        if dataset == 'film':
            with open(graph_node_features_and_labels_file_path) as graph_node_features_and_labels_file:
                graph_node_features_and_labels_file.readline()
                for line in graph_node_features_and_labels_file:
                    line = line.rstrip().split('\t')
                    assert (len(line) == 3)
                    assert (int(line[0]) not in graph_node_features_dict
                            and int(line[0]) not in graph_labels_dict)
                    feature_blank = np.zeros(932, dtype=np.uint16)
                    feature_blank[np.array(line[1].split(','),
                                           dtype=np.uint16)] = 1
                    graph_node_features_dict[int(line[0])] = feature_blank
                    graph_labels_dict[int(line[0])] = int(line[2])
        else:
            with open(graph_node_features_and_labels_file_path) as graph_node_features_and_labels_file:
                graph_node_features_and_labels_file.readline()
                for line in graph_node_features_and_labels_file:
                    line = line.rstrip().split('\t')
                    assert (len(line) == 3)
                    assert (int(line[0]) not in graph_node_features_dict
                            and int(line[0]) not in graph_labels_dict)
                    graph_node_features_dict[int(line[0])] = np.array(
                        line[1].split(','), dtype=np.uint8)
                    graph_labels_dict[int(line[0])] = int(line[2])

        with open(graph_adjacency_list_file_path) as graph_adjacency_list_file:
            graph_adjacency_list_file.readline()
            for line in graph_adjacency_list_file:
                line = line.rstrip().split('\t')
                assert (len(line) == 2)
                if int(line[0]) not in G:
                    G.add_node(int(line[0]),
                               features=graph_node_features_dict[int(line[0])],
                               label=graph_labels_dict[int(line[0])])
                if int(line[1]) not in G:
                    G.add_node(int(line[1]),
                               features=graph_node_features_dict[int(line[1])],
                               label=graph_labels_dict[int(line[1])])
                G.add_edge(int(line[0]), int(line[1]))

        adj = nx.adjacency_matrix(G, sorted(G.nodes()))
        row, col = np.where(adj.todense() > 0)

        U = row.tolist()
        V = col.tolist()
        g = dgl.graph((U, V))
        g = dgl.to_simple(g)
        g = dgl.to_bidirected(g)
        g = dgl.remove_self_loop(g)

        features = np.array(
            [feat for _, feat in sorted(G.nodes(data='features'),
                                        key=lambda x: x[0])],
            dtype=float)
        labels = np.array(
            [label for _, label in sorted(G.nodes(data='label'),
                                          key=lambda x: x[0])],
            dtype=int)

        n = labels.shape[0]
        idx = [i for i in range(n)]
        #random.shuffle(idx)
        r0 = int(n * train_ratio)
        r1 = int(n * 0.6)
        r2 = int(n * 0.8)

        idx_train = np.array(idx[:r0])
        idx_val = np.array(idx[r1:r2])
        idx_test = np.array(idx[r2:])

        features = normalize_features(features)
        features = torch.FloatTensor(features)

        nclass = 5
        labels = torch.LongTensor(labels)
        train = torch.LongTensor(idx_train)
        val = torch.LongTensor(idx_val)
        test = torch.LongTensor(idx_test)
        print(dataset, nclass)

        return g, nclass, features, labels, train, val, test

    # datasets in Geom-GCN
    elif dataset in ['cornell', 'texas', 'wisconsin', 'chameleon', 'squirrel']:

        graph_adjacency_list_file_path = '../high_freq/{}/out1_graph_edges.txt'.format(
            dataset)
        graph_node_features_and_labels_file_path = '../high_freq/{}/out1_node_feature_label.txt'.format(
            dataset)

        G = nx.DiGraph()
        graph_node_features_dict = {}
        graph_labels_dict = {}

        with open(graph_node_features_and_labels_file_path) as graph_node_features_and_labels_file:
            graph_node_features_and_labels_file.readline()
            for line in graph_node_features_and_labels_file:
                line = line.rstrip().split('\t')
                assert (len(line) == 3)
                assert (int(line[0]) not in graph_node_features_dict
                        and int(line[0]) not in graph_labels_dict)
                graph_node_features_dict[int(line[0])] = np.array(
                    line[1].split(','), dtype=np.uint8)
                graph_labels_dict[int(line[0])] = int(line[2])

        with open(graph_adjacency_list_file_path) as graph_adjacency_list_file:
            graph_adjacency_list_file.readline()
            for line in graph_adjacency_list_file:
                line = line.rstrip().split('\t')
                assert (len(line) == 2)
                if int(line[0]) not in G:
                    G.add_node(int(line[0]),
                               features=graph_node_features_dict[int(line[0])],
                               label=graph_labels_dict[int(line[0])])
                if int(line[1]) not in G:
                    G.add_node(int(line[1]),
                               features=graph_node_features_dict[int(line[1])],
                               label=graph_labels_dict[int(line[1])])
                G.add_edge(int(line[0]), int(line[1]))

        adj = nx.adjacency_matrix(G, sorted(G.nodes()))
        features = np.array(
            [feat for _, feat in sorted(G.nodes(data='features'),
                                        key=lambda x: x[0])])
        labels = np.array(
            [label for _, label in sorted(G.nodes(data='label'),
                                          key=lambda x: x[0])])

        features = normalize_features(features)

        g = dgl.from_scipy(adj)
        g = dgl.to_simple(g)
        g = dgl.to_bidirected(g)
        g = dgl.remove_self_loop(g)

        n = len(labels.tolist())
        idx = [i for i in range(n)]
        #random.shuffle(idx)
        r0 = int(n * train_ratio)
        r1 = int(n * 0.6)
        r2 = int(n * 0.8)
        train = np.array(idx[:r0])
        val = np.array(idx[r1:r2])
        test = np.array(idx[r2:])

        nclass = len(set(labels.tolist()))
        features = torch.FloatTensor(features)
        labels = torch.LongTensor(labels)
        train = torch.LongTensor(train)
        val = torch.LongTensor(val)
        test = torch.LongTensor(test)
        print(dataset, nclass)

        return g, nclass, features, labels, train, val, test

    # datasets in FAGCN
    elif dataset in ['new_chameleon', 'new_squirrel']:
        edge = np.loadtxt('../high_freq/{}/edges.txt'.format(dataset),
                          dtype=int)
        labels = np.loadtxt('../high_freq/{}/labels.txt'.format(dataset),
                            dtype=int).tolist()
        features = np.loadtxt('../high_freq/{}/features.txt'.format(dataset),
                              dtype=float)

        U = [e[0] for e in edge]
        V = [e[1] for e in edge]
        g = dgl.graph((U, V))
        g = dgl.to_simple(g)
        g = dgl.to_bidirected(g)
        g = dgl.remove_self_loop(g)

        n = len(labels)
        idx = [i for i in range(n)]
        #random.shuffle(idx)
        r0 = int(n * train_ratio)
        r1 = int(n * 0.6)
        r2 = int(n * 0.8)
        train = np.array(idx[:r0])
        val = np.array(idx[r1:r2])
        test = np.array(idx[r2:])

        features = normalize_features(features)
        features = torch.FloatTensor(features)

        nclass = 3
        labels = torch.LongTensor(labels)
        train = torch.LongTensor(train)
        val = torch.LongTensor(val)
        test = torch.LongTensor(test)
        print(dataset, nclass)

        return g, nclass, features, labels, train, val, test
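
The branches above all funnel their edge lists through the same cleanup pipeline, with ``to_bidirected`` and ``remove_self_loop`` in varying orders; a condensed sketch of the deduplicate / de-loop / symmetrize sequence (the toy graph is illustrative):

import dgl
import torch

u = torch.tensor([0, 0, 1, 2, 2])
v = torch.tensor([1, 1, 0, 2, 0])  # duplicate (0, 1) and a self-loop (2, 2)
g = dgl.graph((u, v))
g = dgl.to_simple(g)          # merge duplicate edges
g = dgl.remove_self_loop(g)   # drop (2, 2)
g = dgl.to_bidirected(g)      # add the missing reverse edge (0, 2)
print(g.edges())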
Example #11
def preprocess_data(dataset, train_percentage):
    import dgl

    # Modified from AAAI21 FA-GCN
    if dataset in ['cora', 'citeseer', 'pubmed']:
        load_default_split = train_percentage <= 0
        edge = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.edge',
                          dtype=int).tolist()
        features = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.feature')
        labels = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.label',
                            dtype=int)
        if load_default_split:
            train = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.train',
                               dtype=int)
            val = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.val', dtype=int)
            test = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.test',
                              dtype=int)
        else:
            train, val, test = stratified_train_test_split(
                np.arange(len(labels)), labels, len(labels), train_percentage)
        nclass = len(set(labels.tolist()))
        print(dataset, nclass)

        U = [e[0] for e in edge]
        V = [e[1] for e in edge]
        g = dgl.graph((U, V))
        g = dgl.to_simple(g)
        g = dgl.remove_self_loop(g)
        g = dgl.to_bidirected(g)

        features = normalize_features(features)
        features = th.FloatTensor(features)
        labels = th.LongTensor(labels)
        train = th.LongTensor(train)
        val = th.LongTensor(val)
        test = th.LongTensor(test)

    elif dataset in ['airport', 'blogcatalog', 'flickr']:
        load_default_split = train_percentage <= 0
        adj_orig = pickle.load(
            open(f'{DATA_PATH}/{dataset}/{dataset}_adj.pkl', 'rb'))  # sparse
        features = pickle.load(
            open(f'{DATA_PATH}/{dataset}/{dataset}_features.pkl',
                 'rb'))  # sparse
        labels = pickle.load(
            open(f'{DATA_PATH}/{dataset}/{dataset}_labels.pkl',
                 'rb'))  # tensor
        if th.is_tensor(labels):
            labels = labels.numpy()

        if load_default_split:
            tvt_nids = pickle.load(
                open(f'{DATA_PATH}/{dataset}/{dataset}_tvt_nids.pkl',
                     'rb'))  # 3 arrays
            train = tvt_nids[0]
            val = tvt_nids[1]
            test = tvt_nids[2]
        else:
            train, val, test = stratified_train_test_split(
                np.arange(len(labels)), labels, len(labels), train_percentage)
        nclass = len(set(labels.tolist()))
        print(dataset, nclass)

        adj_orig = adj_orig.tocoo()
        U = adj_orig.row.tolist()
        V = adj_orig.col.tolist()
        g = dgl.graph((U, V))
        g = dgl.to_simple(g)
        g = dgl.remove_self_loop(g)
        g = dgl.to_bidirected(g)

        if dataset in ['airport']:
            features = normalize_features(features)

        if sp.issparse(features):
            features = th.FloatTensor(features.toarray())
        else:
            features = th.FloatTensor(features)

        labels = th.LongTensor(labels)
        train = th.LongTensor(train)
        val = th.LongTensor(val)
        test = th.LongTensor(test)

    elif dataset in ['arxiv']:
        # Use a distinct name so the `dataset` string is not shadowed; it is
        # checked again right below.
        arxiv = DglNodePropPredDataset(name='ogbn-arxiv',
                                       root='data/ogb_arxiv')
        split_idx = arxiv.get_idx_split()
        train, val, test = (split_idx["train"], split_idx["valid"],
                            split_idx["test"])
        g, labels = arxiv[0]
        features = g.ndata['feat']
        nclass = 40
        labels = labels.squeeze()
        g = dgl.to_bidirected(g)
    if dataset in ['citeseer']:
        g = dgl.add_self_loop(g)
    return g, features, features.shape[1], nclass, labels, train, val, test