Example #1
def load_dataset(name):
    if name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(root='./data/'+name, name=name)
    elif name == "CoraFull":
        dataset = CoraFull(root='./data/'+name)
    elif name in ["Computers", "Photo"]:
        dataset = Amazon(root='./data/'+name, name=name)
    elif name in ["CS", "Physics"]:
        dataset = Coauthor(root='./data/'+name, name=name)
    else:
        exit("wrong dataset")
    return dataset
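The snippet assumes Planetoid, CoraFull, Amazon, and Coauthor are already imported from torch_geometric.datasets. A minimal usage sketch (the root paths and dataset name are illustrative, not part of the original):

from torch_geometric.datasets import Planetoid, CoraFull, Amazon, Coauthor

dataset = load_dataset("CS")          # downloads to ./data/CS on first call
print(dataset.num_features, dataset.num_classes)
data = dataset[0]                     # the single graph held by the dataset
print(data.num_nodes, data.num_edges)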
Example #2
 def __init__(self, path: str):
     pyg_dataset = Coauthor(os.path.join(path, '_pyg'), "CS")
     if hasattr(pyg_dataset, "__data_list__"):
         delattr(pyg_dataset, "__data_list__")
     if hasattr(pyg_dataset, "_data_list"):
         delattr(pyg_dataset, "_data_list")
     pyg_data = pyg_dataset[0]
     static_graph = GeneralStaticGraphGenerator.create_homogeneous_static_graph(
         {
             'x': pyg_data.x,
             'y': pyg_data.y
         }, pyg_data.edge_index)
     super(CoauthorCSDataset, self).__init__([static_graph])
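__data_list__ and _data_list are internal caches of PyG's InMemoryDataset (the attribute name changed across releases), so deleting both forces pyg_dataset[0] to re-materialize the graph from the collated storage. GeneralStaticGraphGenerator and CoauthorCSDataset belong to the surrounding framework; as a framework-free baseline, the same load in plain PyG might look like this (path illustrative):

import os
from torch_geometric.datasets import Coauthor

pyg_dataset = Coauthor(os.path.join("./data", "_pyg"), "CS")
pyg_data = pyg_dataset[0]
print(pyg_data.x.shape, pyg_data.y.shape, pyg_data.edge_index.shape)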
Example #3
def load_dataset(dataset, transform=None):
    if dataset.lower() in ["cora", "citeseer", "pubmed"]:
        path = os.path.join(".datasets", "Plantoid")
        dataset = Planetoid(path, dataset.lower(), transform=transform)
    elif dataset.lower() in ["cs", "physics"]:
        path = os.path.join(".datasets", "Coauthor", dataset.lower())
        dataset = Coauthor(path, dataset.lower(), transform=transform)
    elif dataset.lower() in ["computers", "photo"]:
        path = os.path.join(".datasets", "Amazon", dataset.lower())
        dataset = Amazon(path, dataset.lower(), transform=transform)
    else:
        raise ValueError(f"Dataset {dataset} not supported!")
    return dataset
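Because the transform argument is forwarded to every constructor, callers can stack preprocessing steps. A hedged sketch, assuming a recent PyG where T.ToUndirected is available:

import torch_geometric.transforms as T

transform = T.Compose([T.NormalizeFeatures(), T.ToUndirected()])
dataset = load_dataset("cora", transform=transform)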
Example #4
def load_data(dataset="Cora"):
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                        "data", dataset)
    if dataset in ["Cora", "Citeseer", "Pubmed"]:
        data = Planetoid(path,
                         dataset,
                         split="public",
                         transform=T.NormalizeFeatures())[0]
        num_nodes = data.x.size(0)
        edge_index, _ = remove_self_loops(data.edge_index)
        edge_index = add_self_loops(edge_index, num_nodes=num_nodes)
        if isinstance(edge_index, tuple):
            data.edge_index = edge_index[0]  # !!! 2*N; this may have changed in newer PyG versions
        else:
            data.edge_index = edge_index
        return data
    elif dataset in ["CoauthorCS"]:
        data = Coauthor(path, "cs", transform=T.NormalizeFeatures())[0]
        num_nodes = data.x.size(0)
        edge_index, _ = remove_self_loops(data.edge_index)
        edge_index = add_self_loops(edge_index, num_nodes=num_nodes)
        if isinstance(edge_index, tuple):
            data.edge_index = edge_index[0]
        else:
            data.edge_index = edge_index

        # divide into training, validation and testing sets
        train_mask = torch.zeros((num_nodes, ), dtype=torch.bool)
        val_mask = torch.zeros((num_nodes, ), dtype=torch.bool)
        test_mask = torch.zeros((num_nodes, ), dtype=torch.bool)
        train_num = 40
        val_num = 150
        for i in range(15):  # number of labels
            index = (data.y == i).nonzero()[:, 0]
            perm = torch.randperm(index.size(0))
            train_mask[index[perm[:train_num]]] = 1
            val_mask[index[perm[train_num:(train_num + val_num)]]] = 1
            test_mask[index[perm[(train_num + val_num):]]] = 1
        data.train_mask = train_mask
        data.val_mask = val_mask
        data.test_mask = test_mask
        return data
    else:
        raise Exception(f"the dataset of {dataset} has not been implemented")
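The CoauthorCS branch draws its split with torch.randperm, so the masks change between runs unless the global RNG is seeded first. A usage sketch:

import torch

torch.manual_seed(0)  # makes the randperm-based split reproducible
data = load_data("CoauthorCS")
print(int(data.train_mask.sum()), int(data.val_mask.sum()), int(data.test_mask.sum()))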
Example #5
    def load_data(self):
        data_name = self._params['data_name']
        if self._params['net'] in {'combined', 'symmetric', 'asymmetric', 'combined_gcn'}:
            self._data_path = './data/{}'.format(data_name)
            gnx = nx.read_gpickle("./data/{}/gnx.pkl".format(data_name))
            bow = pickle.load(open("./data/{}/content.pkl".format(data_name), "rb"))
            nodes = sorted(gnx.nodes)
            node_index = {node: i for i, node in enumerate(nodes)}
            x = torch.Tensor(np.vstack([bow[node] for node in nodes])).to(self._device)
            y = torch.LongTensor([gnx.nodes[node]['label'] for node in nodes]).to(self._device)
            edges = torch.LongTensor(np.vstack([[node_index[u] for u, v in gnx.edges],
                                                [node_index[v] for u, v in gnx.edges]])).to(self._device)
            self._data = Data(x=x, edge_index=edges, y=y)
            self._num_features = x.shape[1]
            self._num_classes = len(gnx.graph['node_labels'])

            # Adjacency matrices
            adj = nx.adjacency_matrix(gnx, nodelist=nodes).astype(np.float32)
            if self._params['net'] == 'symmetric':
                self._adj = handle_matrix_symmetric(adj)
                self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense().to(self._device)
            else:
                self._adj = handle_matrix_concat(adj, should_normalize=True)
                self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense().to(self._device)

            return self._data

        data_transform = T.NormalizeFeatures() if self._params['norm'] else None
        self._data_path = './DataSets/{}'.format(data_name)
        if data_name == "CoraFull":
            self._data_set = CoraFull(self._data_path)
        elif data_name in {"CS", "Physics"}:
            self._data_set = Coauthor(self._data_path, data_name)
        else:
            self._data_set = Planetoid(self._data_path, data_name, transform=data_transform)
        self._data_set.data.to(self._device)
        self._data = self._data_set[0]
        # self._data = self._data_set.data

        self._num_features = self._data_set.num_features
        self._num_classes = self._data_set.num_classes

        return self._data
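The manual node/edge assembly in the gpickle branch can, in recent PyG versions, also be done with torch_geometric.utils.from_networkx. A minimal sketch on a toy graph (features are illustrative):

import networkx as nx
import torch
from torch_geometric.utils import from_networkx

G = nx.path_graph(4)                      # toy graph: 0-1-2-3
data = from_networkx(G)                   # builds edge_index from G's edges
data.x = torch.eye(G.number_of_nodes())   # illustrative one-hot features
print(data.edge_index)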
Example #6
def fetch_dataset(root, name):
    """
    Fetchs datasets from the PyTorch Geometric library
    
    :param root: A path to the root directory a dataset will be placed
    :param name: Name of the dataset. Currently, the following names are supported
                'cora', 'citeseer', "pubmed", 'Computers', "Photo", 'CS',  'Physics'
    :return: A PyTorch Geometric dataset
    """
    print(name.lower())
    if name.lower() in {'cora', 'citeseer', "pubmed"}:
        return Planetoid(root=root, name=name)
    elif name.lower() in {'computers', "photo"}:
        return Amazon(root=root, name=name)
    elif name.lower() in {'cs',  'physics'}:
        return Coauthor(root=root, name=name)
    elif name.lower() == "wiki":
        return WikiCS(osp.join(root, "WikiCS"))
    elif name.lower() == "actor":
        return Actor(osp.join(root, name))
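With the else branch above raising on unknown names, callers fail fast instead of silently receiving None. A usage sketch (root path illustrative):

dataset = fetch_dataset("/tmp/pyg", "CS")
print(len(dataset), dataset.num_classes)

try:
    fetch_dataset("/tmp/pyg", "unknown")
except ValueError as err:
    print(err)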
Example #7
def load_data(dataset_name):
    """
    Loads required data set and normalizes features.
    Implemented data sets are any of type Planetoid and Reddit.
    :param dataset_name: Name of data set
    :return: Tuple of dataset and extracted graph
    """
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    dataset_name)

    if dataset_name == 'cora_full':
        dataset = CoraFull(path, transform=T.NormalizeFeatures())
    elif dataset_name.lower() == 'coauthor':
        dataset = Coauthor(path, 'Physics', transform=T.NormalizeFeatures())
    elif dataset_name.lower() == 'reddit':
        dataset = Reddit(path, transform=T.NormalizeFeatures())
    elif dataset_name.lower() == 'amazon':
        # Amazon requires a product category; the original call omitted it.
        # 'Computers' is used here as an illustrative choice.
        dataset = Amazon(path, 'Computers', transform=T.NormalizeFeatures())
    else:
        dataset = Planetoid(path, dataset_name, transform=T.NormalizeFeatures())

    print(f"Loading data set {dataset_name} from: ", path)
    data = dataset[0]  # Extract graph
    return dataset, data
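load_data returns both the dataset wrapper and its single graph, so call sites unpack the pair. Note that passing transform as a keyword above also avoids colliding with the positional split parameter that newer Planetoid versions take third. A usage sketch:

dataset, data = load_data('Cora')
print(dataset.num_classes)  # label count lives on the dataset wrapper
print(data.num_nodes)       # graph statistics live on the Data object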
Example #8
def load_data(dataset="Cora", supervised=False, full_data=True, args=None):
    '''
    Supports both the semi-supervised and the supervised setting.
    :param dataset: name of the data set to load
    :param supervised: if True, overwrite the public masks with index-based splits
    :return: the loaded graph as a Data object
    '''
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)
    dataset_name = dataset
    if dataset in ["CS", "Physics"]:
        dataset = Coauthor(path, dataset, T.NormalizeFeatures())
    elif dataset in ["Computers", "Photo"]:
        dataset = Amazon(path, dataset, T.NormalizeFeatures())
    elif dataset in ["Cora", "Citeseer", "Pubmed"]:
        dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
    else:
        raise ValueError(f"unsupported dataset: {dataset}")

        # path = path + '/processed/data.pt'
        # dataset = torch.load(path)
    data = dataset[0]
    data['adj'] = load_citation(dataset_name, args.normalization)
    if supervised:
        # boolean masks; uint8 masks are deprecated in recent PyTorch/PyG
        if full_data:
            data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.train_mask[:-1000] = True
            data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.val_mask[-1000:-500] = True
            data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.test_mask[-500:] = True
        else:
            data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.train_mask[:1000] = True
            data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.val_mask[1000:1500] = True
            data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.test_mask[1500:2000] = True
    return data
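Newer PyG releases ship torch_geometric.utils.index_to_mask, which expresses the slice-based masks above more directly. A hedged sketch, assuming a PyG version that provides it and the data object from load_data:

import torch
from torch_geometric.utils import index_to_mask

n = data.num_nodes
data.train_mask = index_to_mask(torch.arange(0, n - 1000), size=n)
data.val_mask = index_to_mask(torch.arange(n - 1000, n - 500), size=n)
data.test_mask = index_to_mask(torch.arange(n - 500, n), size=n)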
Example #9
if pr.net == 1:
    print("Data Cora")
    _data = Planetoid(root="./pcora", name="Cora")
elif pr.net == 2:
    print("Data CiteSeer")
    _data = Planetoid(root="./pciteseer", name="Citeseer")
elif pr.net == 3:
    print("Data Pubmed")
    _data = Planetoid(root="./ppubmed", name="Pubmed")
elif pr.net == 4:
    print("Data CoraFull")
    _data = CoraFull("./Corafull")
elif pr.net == 5:
    print("Data Coauthor CS")
    _data = Coauthor("./CS", "CS")
elif pr.net == 6:
    print("Data Coauthor Physics")
    _data = Coauthor("./Physics", "Physics")
elif pr.net == 7:
    print("Data Amazon Computer")
    _data = Amazon("./Computer", "Computers")
elif pr.net == 8:
    print("Data Amazon Photos")
    _data = Amazon("./Photo", "Photo")

#_data = Coauthor("./Physics","Physics")
#_data = Coauthor("./CS","CS")

#_data = CoraFull("./Corafull")
Example #10
from torch_geometric.datasets import Coauthor
from torch_geometric.utils import to_networkx
import pickle
from GraphRicciCurvature.OllivierRicci import OllivierRicci
from GraphRicciCurvature.FormanRicci import FormanRicci
import numpy as np
import sklearn.preprocessing as pp
import torch
import os

datasets = ['CS'] #,'Physics']

for dataset in datasets:
    coauth = Coauthor('data_coauthor_'+dataset,dataset)
    print("coauth done")
    data = coauth[0]  # the dataset's single graph; no need to read data.pt by hand
    print("assigned val to data")
    G = to_networkx(data, to_undirected=True, remove_self_loops=True)
    print("made G")
    frc = FormanRicci(G)
    frc.compute_ricci_curvature()

    orc = OllivierRicci(G, alpha=0.5, verbose="INFO")
    orc.compute_ricci_curvature()
    print("Ollivier comp done")
    X = data.x.numpy()
    y_arr = data.y.numpy()
    # classes is keyword-only in recent scikit-learn; sorting fixes the column order
    Y = pp.label_binarize(y_arr, classes=sorted(set(y_arr)))
    counts = [0]*15
    train_idx = []
    for i,yval in enumerate(y_arr):
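The example is cut off mid-loop. For reference, after compute_ricci_curvature() both libraries attach per-edge curvature values to their graph copies; according to the GraphRicciCurvature documentation the attribute names are 'ricciCurvature' and 'formanCurvature'. A small hedged sketch:

u, v = next(iter(G.edges()))
print(orc.G[u][v]["ricciCurvature"])   # Ollivier-Ricci curvature of edge (u, v)
print(frc.G[u][v]["formanCurvature"])  # Forman-Ricci curvature of edge (u, v)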
Example #11
File: pyg.py Project: Frozenmad/AutoGL
 def __init__(self, path):
     dataset = "CS"
     # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
     Coauthor(path, dataset)
     super(CoauthorCSDataset, self).__init__(path, dataset)
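The bare Coauthor(path, dataset) call looks discarded, but instantiating a PyG dataset downloads and pre-processes it to disk as a side effect, which is what the wrapper appears to rely on before handing the same path to its parent class. A standalone sketch (path illustrative):

from torch_geometric.datasets import Coauthor

Coauthor("/tmp/coauthor_cs", "CS")             # side effect: download + process
dataset = Coauthor("/tmp/coauthor_cs", "CS")   # second call reuses the cache
print(len(dataset))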
Example #12
def get_small_dataset(dataset_name,
                      normalize_attributes=False,
                      add_self_loops=False,
                      remove_isolated_nodes=False,
                      make_undirected=False,
                      graph_availability=None,
                      seed=0,
                      create_adjacency_lists=True):
    """
    Get the pytorch_geometric.data.Data object associated with the specified dataset name.
    :param dataset_name: str => One of the datasets mentioned below.
    :param normalize_attributes: Whether the attributes for each node should be normalized to sum to 1.
    :param add_self_loops: Add self loops to the input Graph.
    :param remove_isolated_nodes: Remove isolated nodes.
    :param make_undirected: Make the Graph undirected.
    :param graph_availability: Either inductive and transductive. If transductive, all the graph nodes are available
                               during training. Otherwise, only training split nodes are available.
    :param seed: The random seed to use while splitting into train/val/test splits.
    :param create_adjacency_lists: Whether to process and store adjacency lists that can be used for efficient
                                   r-radius neighborhood sampling.
    :return: A pytorch_geometric.data.Data object for that dataset.
    """
    assert dataset_name in {
        'amazon-computers', 'amazon-photo', 'citeseer', 'coauthor-cs',
        'coauthor-physics', 'cora', 'cora-full', 'ppi', 'pubmed', 'reddit'
    }
    assert graph_availability in {'inductive', 'transductive'}

    # Compose transforms that should be applied.
    transforms = []
    if normalize_attributes:
        transforms.append(NormalizeFeatures())
    if remove_isolated_nodes:
        transforms.append(RemoveIsolatedNodes())
    if add_self_loops:
        transforms.append(AddSelfLoops())
    transforms = Compose(transforms) if transforms else None

    # Load the specified dataset and apply transforms.
    root_dir = '/tmp/{dir}'.format(dir=dataset_name)
    processed_dir = os.path.join(root_dir, dataset_name, 'processed')
    # Remove any previously pre-processed data, so pytorch_geometric can pre-process it again.
    if os.path.exists(processed_dir) and os.path.isdir(processed_dir):
        shutil.rmtree(processed_dir)

    data = None

    def split_function(y):
        return _get_train_val_test_masks(y.shape[0], y, 0.2, 0.2, seed)

    if dataset_name in ['citeseer', 'cora', 'pubmed']:
        data = Planetoid(root=root_dir,
                         name=dataset_name,
                         pre_transform=transforms,
                         split='full').data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(
                data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'cora-full':
        data = CoraFull(root=root_dir, pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-computers':
        data = Amazon(root=root_dir,
                      name='Computers',
                      pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-photo':
        data = Amazon(root=root_dir, name='Photo',
                      pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-cs':
        data = Coauthor(root=root_dir, name='CS',
                        pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-physics':
        data = Coauthor(root=root_dir,
                        name='Physics',
                        pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'reddit':
        data = Reddit(root=root_dir, pre_transform=transforms).data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(
                data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'ppi':
        data = SimpleNamespace()
        data.graphs = []
        for split in ['train', 'val', 'test']:
            split_data = PPI(root=root_dir,
                             split=split,
                             pre_transform=transforms)
            x_idxs = split_data.slices['x'].numpy()
            edge_idxs = split_data.slices['edge_index'].numpy()
            split_data = split_data.data
            for x_start, x_end, e_start, e_end in zip(x_idxs, x_idxs[1:],
                                                      edge_idxs,
                                                      edge_idxs[1:]):
                graph = Data(split_data.x[x_start:x_end],
                             split_data.edge_index[:, e_start:e_end],
                             y=split_data.y[x_start:x_end])
                graph.num_nodes = int(x_end - x_start)
                graph.split = split
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.train_mask = all_true if split == 'train' else all_false
                graph.val_mask = all_true if split == 'val' else all_false
                graph.test_mask = all_true if split == 'test' else all_false
                data.graphs.append(graph)
        if seed != 0:
            temp_random = random.Random(seed)
            val_graphs = temp_random.sample(range(len(data.graphs)), 2)
            test_candidates = [
                graph_idx for graph_idx in range(len(data.graphs))
                if graph_idx not in val_graphs
            ]
            test_graphs = temp_random.sample(test_candidates, 2)
            for graph_idx, graph in enumerate(data.graphs):
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.split = 'test' if graph_idx in test_graphs else 'val' if graph_idx in val_graphs else 'train'
                graph.train_mask = all_true if graph.split == 'train' else all_false
                graph.val_mask = all_true if graph.split == 'val' else all_false
                graph.test_mask = all_true if graph.split == 'test' else all_false

    if make_undirected:
        for graph in data.graphs:
            graph.edge_index = to_undirected(graph.edge_index, graph.num_nodes)

    LOG.info(f'Downloaded and transformed {len(data.graphs)} graph(s).')

    # Populate adjacency lists for efficient k-neighborhood sampling.
    # Only retain edges coming into a node and reverse the edges for the purpose of adjacency lists.
    LOG.info('Processing adjacency lists and degree information.')

    for graph in data.graphs:
        train_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        val_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        test_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        adjacency_lists = defaultdict(list)
        not_val_test_mask = (~graph.val_mask & ~graph.test_mask).numpy()
        val_mask = graph.val_mask.numpy()
        test_mask = graph.test_mask.numpy()

        if create_adjacency_lists:
            num_edges = graph.edge_index[0].shape[0]
            sources, dests = graph.edge_index[0].numpy(
            ), graph.edge_index[1].numpy()
            for source, dest in tqdm(zip(sources, dests),
                                     total=num_edges,
                                     leave=False):
                if not_val_test_mask[dest] and not_val_test_mask[source]:
                    train_in_degrees[dest] += 1
                    val_in_degrees[dest] += 1
                elif val_mask[dest] and not test_mask[source]:
                    val_in_degrees[dest] += 1
                test_in_degrees[dest] += 1
                adjacency_lists[dest].append(source)

        graph.adjacency_lists = dict(adjacency_lists)
        graph.train_in_degrees = torch.from_numpy(train_in_degrees).long()
        graph.val_in_degrees = torch.from_numpy(val_in_degrees).long()
        graph.test_in_degrees = torch.from_numpy(test_in_degrees).long()
        if graph_availability == 'transductive':
            # In the transductive setting the full graph is visible during
            # training, so train/val use the same in-degrees as test.
            graph.train_in_degrees = graph.test_in_degrees
            graph.val_in_degrees = graph.test_in_degrees

        graph.graph_availability = graph_availability

        # To accumulate any neighborhood perturbations to the graph.
        graph.perturbed_neighborhoods = defaultdict(set)
        graph.added_nodes = defaultdict(set)
        graph.modified_degrees = {}

        # For small datasets, cache the neighborhoods for all nodes for at least 3 different radii queries.
        graph.use_cache = True
        graph.neighborhood_cache = NeighborhoodCache(graph.num_nodes * 3)

        graph.train_mask_original = graph.train_mask
        graph.val_mask_original = graph.val_mask
        graph.test_mask_original = graph.test_mask

        graph.train_mask = torch.ones(
            graph.num_nodes).bool() & ~graph.val_mask & ~graph.test_mask

    return data
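A hedged usage sketch for the loader above; module-level helpers such as _get_train_val_test_masks, LOG, and NeighborhoodCache are assumed to be defined in the same file:

data = get_small_dataset('cora',
                         normalize_attributes=True,
                         make_undirected=True,
                         graph_availability='transductive')
for graph in data.graphs:
    print(graph.num_nodes, int(graph.train_mask.sum()))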