Example #1
import copy
import os

import networkx as nx
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import MinMaxScaler

# download, compute_ppr, preprocess_features and normalize_adj are project
# helpers that this snippet expects to be defined or imported at module level.


def load(dataset):
    datadir = os.path.join('data', dataset)

    if not os.path.exists(datadir):
        os.makedirs(datadir)
        ds = download(dataset)
        adj = nx.to_numpy_array(ds.graph)
        diff = compute_ppr(ds.graph, 0.2)
        feat = ds.features[:]
        labels = ds.labels[:]

        idx_train = np.argwhere(ds.train_mask == 1).reshape(-1)
        idx_val = np.argwhere(ds.val_mask == 1).reshape(-1)
        idx_test = np.argwhere(ds.test_mask == 1).reshape(-1)

        np.save(f'{datadir}/adj.npy', adj)
        np.save(f'{datadir}/diff.npy', diff)
        np.save(f'{datadir}/feat.npy', feat)
        np.save(f'{datadir}/labels.npy', labels)
        np.save(f'{datadir}/idx_train.npy', idx_train)
        np.save(f'{datadir}/idx_val.npy', idx_val)
        np.save(f'{datadir}/idx_test.npy', idx_test)
    else:
        adj = np.load(f'{datadir}/adj.npy')
        diff = np.load(f'{datadir}/diff.npy')
        feat = np.load(f'{datadir}/feat.npy')
        labels = np.load(f'{datadir}/labels.npy')
        idx_train = np.load(f'{datadir}/idx_train.npy')
        idx_val = np.load(f'{datadir}/idx_val.npy')
        idx_test = np.load(f'{datadir}/idx_test.npy')

    if dataset == 'citeseer':
        feat = preprocess_features(feat)

        epsilons = [1e-5, 1e-4, 1e-3, 1e-2]
        avg_degree = np.sum(adj) / adj.shape[0]
        epsilon = epsilons[np.argmin([
            abs(avg_degree - np.argwhere(diff >= e).shape[0] / diff.shape[0])
            for e in epsilons
        ])]

        diff[diff < epsilon] = 0.0
        scaler = MinMaxScaler()
        scaler.fit(diff)
        diff = scaler.transform(diff)

    ori_adj = copy.deepcopy(adj)
    # print(ori_adj)
    adj = normalize_adj(adj + sp.eye(adj.shape[0])).todense()

    return ori_adj, adj, diff, feat, labels, idx_train, idx_val, idx_test
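
The snippet above depends on several helpers that are not shown here (download, compute_ppr, preprocess_features, normalize_adj). As a rough, minimal sketch of what the graph-related helpers typically compute in this kind of pipeline, assuming the closed-form personalized-PageRank diffusion and GCN-style symmetric normalization (the project's own implementations may differ):

import networkx as nx
import numpy as np
import scipy.sparse as sp


def compute_ppr(graph, alpha=0.2, self_loop=True):
    """Dense PPR diffusion: alpha * (I - (1 - alpha) * D^-1/2 A D^-1/2)^-1 (sketch)."""
    a = nx.to_numpy_array(graph)
    if self_loop:
        a = a + np.eye(a.shape[0])                       # add self-loops: A + I
    d_inv_sqrt = np.diag(1.0 / np.sqrt(a.sum(axis=1)))   # D^-1/2 (diagonal)
    at = d_inv_sqrt @ a @ d_inv_sqrt                     # symmetrically normalized adjacency
    return alpha * np.linalg.inv(np.eye(a.shape[0]) - (1.0 - alpha) * at)


def normalize_adj(adj):
    """GCN-style symmetric normalization D^-1/2 A D^-1/2, returned as a sparse matrix (sketch)."""
    adj = sp.coo_matrix(adj)
    deg = np.asarray(adj.sum(axis=1)).flatten().astype(float)
    d_inv_sqrt = np.zeros_like(deg)
    d_inv_sqrt[deg > 0] = deg[deg > 0] ** -0.5
    d_mat = sp.diags(d_inv_sqrt)
    return (d_mat @ adj @ d_mat).tocoo()


def preprocess_features(features):
    """Row-normalize the feature matrix so every non-empty row sums to 1 (sketch)."""
    features = np.asarray(features, dtype=float)
    rowsum = features.sum(axis=1, keepdims=True)
    rowsum[rowsum == 0] = 1.0                            # avoid division by zero for empty rows
    return features / rowsum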
Example #2

    # Method of a dataset-loading class (presumably GraphData, as used in the
    # commented-out call at the bottom of this file); it relies on module-level
    # imports of os, re, numpy as np, networkx as nx and collections.Counter.
    def process_raw_data(self):
        root = os.path.abspath(
            os.path.join(os.path.dirname(__file__), os.path.pardir))
        prefix = os.path.join(root, 'dataset', self.__graph_type, self.__name,
                              self.__name)
        print(prefix)

        graph_node_dict = {}
        with open('{0}_graph_indicator.txt'.format(prefix), 'r') as f:
            for idx, line in enumerate(f):
                graph_node_dict[idx + 1] = int(line.strip('\n'))
        max_nodes = Counter(graph_node_dict.values()).most_common(1)[0][1]  # node count of the largest graph

        node_labels = []
        if os.path.exists('{0}_node_labels.txt'.format(prefix)):
            with open('{0}_node_labels.txt'.format(prefix), 'r') as f:
                for line in f:
                    # node labels are assumed to be 0-based integers
                    node_labels += [int(line.strip('\n'))]
                num_unique_node_labels = max(node_labels) + 1  # assumes contiguous labels 0..max
        else:
            print('No node labels')

        node_attrs = []
        if os.path.exists('{0}_node_attributes.txt'.format(prefix)):
            with open('{0}_node_attributes.txt'.format(prefix), 'r') as f:
                for line in f:
                    node_attrs.append(
                        np.array([
                            float(attr)
                            for attr in re.split(r"[,\s]+", line.strip())
                            if attr
                        ], dtype=float))
        else:
            print('No node attributes')

        graph_labels = []
        unique_labels = set()
        with open('{0}_graph_labels.txt'.format(prefix), 'r') as f:
            for line in f:
                val = int(line.strip('\n'))
                unique_labels.add(val)
                graph_labels.append(val)
        # sort so that the label -> index mapping is deterministic across runs
        label_idx_dict = {val: idx for idx, val in enumerate(sorted(unique_labels))}
        graph_labels = np.array(
            [label_idx_dict[element] for element in graph_labels])

        adj_list = {idx: [] for idx in range(1, len(graph_labels) + 1)}      # edges contained in each graph
        index_graph = {idx: [] for idx in range(1, len(graph_labels) + 1)}   # nodes contained in each graph
        with open('{0}_A.txt'.format(prefix), 'r') as f:
            for line in f:
                u, v = tuple(map(int, line.strip('\n').split(',')))
                adj_list[graph_node_dict[u]].append((u, v))
                index_graph[graph_node_dict[u]] += [u, v]

        for k in index_graph.keys():
            index_graph[k] = [u - 1 for u in set(index_graph[k])]

        graphs, aug_view = [], []
        for idx in range(1, 1 + len(adj_list)):
            graph = nx.from_edgelist(adj_list[idx])
            if max_nodes is not None and graph.number_of_nodes() > max_nodes:
                continue

            graph.graph['label'] = graph_labels[idx - 1]
            for u in graph.nodes():
                if len(node_labels) > 0:
                    node_label_one_hot = [0] * num_unique_node_labels
                    node_label = node_labels[u - 1]
                    node_label_one_hot[node_label] = 1
                    graph.nodes[u]['label'] = node_label_one_hot
                if len(node_attrs) > 0:
                    graph.nodes[u]['feat'] = node_attrs[u - 1]
            if len(node_attrs) > 0:
                graph.graph['feat_dim'] = node_attrs[0].shape[0]

            # relabeling
            mapping = {}
            for node_idx, node in enumerate(graph.nodes()):
                mapping[node] = node_idx

            graphs.append(nx.relabel_nodes(graph, mapping))
            aug_view.append(compute_ppr(graph, alpha=0.2))

        # if no node attributes were provided, build features from node degrees (and labels)
        if 'feat_dim' not in graphs[0].graph:
            max_deg = max(
                [max(dict(graph.degree).values()) for graph in graphs])
            for graph in graphs:
                for u in graph.nodes(data=True):
                    f = np.zeros(max_deg + 1)
                    f[graph.degree[u[0]]] = 1.0
                    if 'label' in u[1]:
                        f = np.concatenate((np.array(u[1]['label'], dtype=float), f))
                    graph.nodes[u[0]]['feat'] = f
        return graphs, aug_view
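
A minimal usage sketch for the method above. The GraphData class name (taken from the commented-out call at the bottom of this file), its constructor argument and the 'PTC_MR' dataset name are illustrative assumptions:

loader = GraphData('PTC_MR')                                # hypothetical constructor call
graphs, views = loader.process_raw_data()

g = graphs[0]
adj = nx.to_numpy_array(g)                                  # adjacency of the first graph
feat = np.stack([g.nodes[u]['feat'] for u in g.nodes()])    # per-node feature matrix
print(g.graph['label'], adj.shape, feat.shape, views[0].shape)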


if __name__ == '__main__':
    import time
    import warnings
    warnings.filterwarnings("ignore")
    # CitationGraphDataset and compute_ppr are expected to be defined or
    # imported at module level.
    # print(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
    ds = CitationGraphDataset(name='pubmed')
    # nx.draw(ds.graph, node_size=10)
    # plt.show()
    # print(nx.to_numpy_array(ds.graph).shape)
    print('start')
    start = time.time()
    # spectral_perturbation(ds.graph)
    compute_ppr(ds.graph, 0.2)
    finish = time.time()
    print('finish ', finish - start)
    # print(ds)
    # print(nx.to_numpy_array(ds.graph).shape)
    # print(n)
    # print(ds.features)

    # path = 'D:/VSrepos/spectral-graph-augmentation/dataset/node'
    # if not os.path.exists(path):
    #     print('create')
    #     os.makedirs(path)
    # else:
    #     print('exist')

    # GraphData('PTC_MR').process_raw_data()
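
For completeness, a corresponding sketch of how the load() function from the first example might be called; the dataset name 'citeseer' (which also exercises the epsilon-thresholding branch above) is only an illustrative choice:

ori_adj, adj, diff, feat, labels, idx_train, idx_val, idx_test = load('citeseer')
print(adj.shape, diff.shape, feat.shape, labels.shape)
print(len(idx_train), len(idx_val), len(idx_test))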