Example #1
def get_dataset(name):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '.', 'data', name)
    dataset = TUDataset(path, name)
    dataset.data.edge_attr = None

    if dataset.data.x is None:
        max_degree = 0
        degs = []
        for data in dataset:
            degs += [degree(data.edge_index[0], dtype=torch.long)]
            max_degree = max(max_degree, degs[-1].max().item())

        if max_degree < 1000:
            dataset.transform = T.OneHotDegree(max_degree)
        else:
            deg = torch.cat(degs, dim=0).to(torch.float)
            mean, std = deg.mean().item(), deg.std().item()
            dataset.transform = NormalizedDegree(mean, std)

    return dataset
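NormalizedDegree is not a built-in PyTorch Geometric transform; a minimal sketch consistent with how it is constructed above (from the mean and standard deviation of the node degrees) could look as follows.

import torch
from torch_geometric.utils import degree


class NormalizedDegree:
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, data):
        # Assumed behavior: standardize the out-degree and use it as a 1-d node feature.
        deg = degree(data.edge_index[0], dtype=torch.float)
        deg = (deg - self.mean) / self.std
        data.x = deg.view(-1, 1)
        return data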
Example #2
def get_tudataset(name, rwr, cleaned=False):
    transform = None
    if rwr:
        transform = rwr_filter
    path = osp.join(osp.dirname(osp.realpath(__file__)),
                    ('rwr' if rwr else ''))
    dataset = TUDataset(path,
                        name,
                        pre_transform=transform,
                        use_edge_attr=rwr,
                        cleaned=cleaned)

    if dataset.data.x is None:
        max_degree = 0
        degs = []
        for data in dataset:
            degs += [degree(data.edge_index[0], dtype=torch.long)]
            max_degree = max(max_degree, degs[-1].max().item())
        dataset.transform = FilterConstant(10)  #T.OneHotDegree(max_degree)
    return dataset
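FilterConstant is project-specific and not shown in this snippet; a hypothetical minimal version, consistent with its use as a fallback when graphs carry no node features, assigns a constant feature vector of the given dimension to every node.

import torch


class FilterConstant:
    def __init__(self, dim):
        self.dim = dim

    def __call__(self, data):
        # Assumption: give every node the same constant feature vector of size `dim`.
        data.x = torch.ones(data.num_nodes, self.dim)
        return data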
Example #3
 def __init__(self, path: str):
     pyg_dataset = TUDataset(os.path.join(path, '_pyg'), "COLLAB")
     if hasattr(pyg_dataset, "__data_list__"):
         delattr(pyg_dataset, "__data_list__")
     if hasattr(pyg_dataset, "_data_list"):
         delattr(pyg_dataset, "_data_list")
     super(COLLABDataset, self).__init__([
         GeneralStaticGraphGenerator.create_homogeneous_static_graph(
             {}, pyg_data.edge_index, graph_data={'y': pyg_data.y})
         for pyg_data in pyg_dataset
     ])
Example #4
def main():
    args = arg_parse()
    # print(args)

    if args.dataset == 'enzymes':
        dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
        task = 'graph'
    elif args.dataset == 'cora':
        dataset = Planetoid(root='/tmp/Cora', name='Cora')
        task = 'node'
    train(dataset, task, args)
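arg_parse is not included in these snippets; a minimal argparse-based sketch covering only the --dataset flag used here (the parsers in the source projects accept more options) might look like this.

import argparse


def arg_parse():
    parser = argparse.ArgumentParser(description='GNN training arguments.')
    parser.add_argument('--dataset', type=str, default='enzymes',
                        help="Dataset to use: 'enzymes' or 'cora'.")
    return parser.parse_args()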
Example #5
def get_dataset(name, sparse=True, cleaned=False):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', name)
    dataset = TUDataset(path, name)
    dataset.data.edge_attr = None

    if dataset.data.x is None:
        max_degree = 0
        degs = []
        for data in dataset:
            degs += [degree(data.edge_index[0], dtype=torch.long)]
            max_degree = max(max_degree, degs[-1].max().item())

        if max_degree < 1000:
            dataset.transform = T.OneHotDegree(max_degree)
        else:
            deg = torch.cat(degs, dim=0).to(torch.float)
            mean, std = deg.mean().item(), deg.std().item()
            dataset.transform = NormalizedDegree(mean, std)

    if not sparse:
        num_nodes = max_num_nodes = 0
        for data in dataset:
            num_nodes += data.num_nodes
            max_num_nodes = max(data.num_nodes, max_num_nodes)

        # Filter out a few really large graphs in order to apply DiffPool.
        if name == 'REDDIT-BINARY':
            num_nodes = min(int(num_nodes / len(dataset) * 1.5), max_num_nodes)
        else:
            num_nodes = min(int(num_nodes / len(dataset) * 5), max_num_nodes)

        indices = []
        for i, data in enumerate(dataset):
            if data.num_nodes <= num_nodes:
                indices.append(i)
        dataset = dataset[torch.tensor(indices)]

        if dataset.transform is None:
            dataset.transform = T.ToDense(num_nodes)
        else:
            dataset.transform = T.Compose(
                [dataset.transform, T.ToDense(num_nodes)])

    return dataset
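A short usage sketch for the function above: with sparse=False every graph is padded to num_nodes by T.ToDense, so batches can be formed with a dense loader (dataset name chosen for illustration; DenseDataLoader lives in torch_geometric.loader in recent versions and torch_geometric.data in older ones).

from torch_geometric.loader import DenseDataLoader

dataset = get_dataset('PROTEINS', sparse=False)
loader = DenseDataLoader(dataset, batch_size=32, shuffle=True)
for data in loader:
    # data.x: [batch_size, max_nodes, num_features], data.adj: [batch_size, max_nodes, max_nodes]
    pass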
Example #6
def main():
    args = arg_parse()

    if args.dataset == 'enzymes':
        dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
        task = 'graph'
        print(f'ENZYMES number of graphs {len(dataset.data.y)}')
    elif args.dataset == 'cora':
        dataset = Planetoid(root='/tmp/Cora', name='Cora')
        task = 'node'
        print(f'CORA number of nodes {len(dataset.data.y)}')
    train(dataset, task, args)
Example #7
def cross_validation(config):
    dataset_name = config.dataset_name
    dataset_path = osp.join(osp.dirname(os.getcwd()), '..', 'data',
                            dataset_name)
    dataset = TUDataset(dataset_path, name=dataset_name).shuffle()

    prepare_config_for_dataset(config, dataset)

    cross_validation_batches = config.cross_validation_batches
    cross_validation_batch_size = len(dataset) // cross_validation_batches
    results = []
    train_histories = []
    test_histories = []
    for i in range(cross_validation_batches):
        start_index = i * cross_validation_batch_size
        end_index = (
            i + 1
        ) * cross_validation_batch_size if i + 1 < cross_validation_batches else len(
            dataset)
        test_dataset = dataset[start_index:end_index]
        train_dataset = dataset[:start_index] + dataset[end_index:]

        if config.node_features == 'categorical':
            test_loader = DataLoader(test_dataset,
                                     batch_size=config.batch_size)
            train_loader = DataLoader(train_dataset,
                                      batch_size=config.batch_size)
        elif config.node_features == 'node_degree':
            test_loader = NodeDegreeFeatureDataLoader(
                test_dataset,
                config.max_node_degree,
                batch_size=config.batch_size)
            train_loader = NodeDegreeFeatureDataLoader(
                train_dataset,
                config.max_node_degree,
                batch_size=config.batch_size)
        elif config.node_features == 'same':
            test_loader = SameFeatureDataLoader(test_dataset,
                                                config.same_feature_dim,
                                                batch_size=config.batch_size)
            train_loader = SameFeatureDataLoader(train_dataset,
                                                 config.same_feature_dim,
                                                 batch_size=config.batch_size)

        train_history, test_history = train(config, train_dataset,
                                            test_dataset)
        train_histories.append(train_history)
        test_histories.append(test_history)
        results.append(np.max(test_history))

    avg = np.mean(results)
    std = np.std(results)
    return avg, std
Example #8
def main():

  t_args = [
      {'model_type': 'GCN', 'dataset': 'cora', 'num_layers': 2, 'batch_size': 32, 'hidden_dim': 32, 'dropout': 0.5, 'epochs': 500, 'opt': 'adam', 'opt_scheduler': 'none', 'opt_restart': 0, 'weight_decay': 5e-3, 'lr': 0.01},
      {'model_type': 'GraphSage', 'dataset': 'cora', 'num_layers': 2, 'batch_size': 32, 'hidden_dim': 32, 'dropout': 0.5, 'epochs': 500, 'opt': 'adam', 'opt_scheduler': 'none', 'opt_restart': 0, 'weight_decay': 5e-3, 'lr': 0.001},
      {'model_type': 'GAT', 'dataset': 'cora', 'num_layers': 2, 'batch_size': 32, 'hidden_dim': 32, 'dropout': 0.5, 'epochs': 500, 'opt': 'adam', 'opt_scheduler': 'none', 'opt_restart': 0, 'weight_decay': 5e-3, 'lr': 0.01},
      {'model_type': 'GCN', 'dataset': 'enzymes', 'num_layers': 2, 'batch_size': 32, 'hidden_dim': 32, 'dropout': 0.0, 'epochs': 500, 'opt': 'adam', 'opt_scheduler': 'none', 'opt_restart': 0, 'weight_decay': 5e-3, 'lr': 0.001},
      {'model_type': 'GraphSage', 'dataset': 'enzymes', 'num_layers': 3, 'batch_size': 32, 'hidden_dim': 32, 'dropout': 0.0, 'epochs': 500, 'opt': 'adam', 'opt_scheduler': 'none', 'opt_restart': 0, 'weight_decay': 5e-3, 'lr': 0.001},
      {'model_type': 'GAT', 'dataset': 'enzymes', 'num_layers': 2, 'batch_size': 32, 'hidden_dim': 32, 'dropout': 0.0, 'epochs': 500, 'opt': 'adam', 'opt_scheduler': 'none', 'opt_restart': 0, 'weight_decay': 5e-3, 'lr': 0.001},
  ]  # (GraphSage enzyme hyperparameters toggled)

  for i in range(len(t_args)):
    args = t_args[i]
    args = objectview(args)
    if args.dataset == 'enzymes':
        dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
        dataset = dataset.shuffle()
        task = 'graph'
    elif args.dataset == 'cora':
        dataset = Planetoid(root='/tmp/Cora', name='Cora')
        task = 'node'
    RESULTS[i] = train(dataset, task, args) 
Example #9
File: eval.py Project: wonlee2019/dgm
def get_graph_classification_dataset(dataset):
    node_transform = None
    if dataset in [
            'COLLAB', 'REDDIT-BINARY', 'IMDB-BINARY', 'IMDB-MULTI',
            'REDDIT-MULTI-5K'
    ]:
        node_transform = OneHotDegree(max_degree=64)

    path = osp.join(osp.dirname('./graph_datasets/'), dataset)
    dataset = TUDataset(path, name=dataset, pre_transform=node_transform)

    return dataset
Example #10
def test_baseDataset_Loader(filtered_dataset=None):
    root = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    shutil.copytree('../input/smt', root)
    dataset = TUDataset(root, 'SMT')

    assert len(dataset) == 2688
    assert dataset.num_features == 20
    assert dataset.num_classes == 2
    assert dataset.__repr__() == 'SMT(2688)'
    assert dataset[0].keys == ['x', 'edge_index', 'y']  # ==len(data.keys)
    assert len(dataset.shuffle()) == 2688

    loader = DataLoader(dataset, batch_size=len(dataset))
    assert loader.dataset.__repr__() == 'SMT(2688)'
    for batch in loader:
        assert batch.num_graphs == 2688
        assert batch.num_nodes == sum([data.num_nodes
                                       for data in dataset])  # 2788794
        assert batch.num_edges == sum([data.num_edges
                                       for data in dataset])  # 13347768
        assert batch.keys == ['x', 'edge_index', 'y', 'batch']

    num_nodes = sum(dataset.data.num_nodes)
    max_num_nodes = max(dataset.data.num_nodes)
    num_nodes = min(int(num_nodes / len(dataset) * 5), max_num_nodes)

    assert num_nodes == 5187
    assert max_num_nodes == 34623

    indices = []
    for i, data in enumerate(dataset):
        if data.num_nodes < num_nodes:
            indices.append(i)

    if not filtered_dataset:
        filtered_dataset = dataset[torch.tensor(indices)]
        filtered_dataset.transform = T.ToDense(num_nodes)  # add 'adj' attribute

    assert ('adj' in dataset[0]) is False
    assert ('adj' in filtered_dataset[0]) is True
Example #11
def main(args):
    path = pathlib.Path('./src/gkernel')
    if not path.is_file():
        subprocess.call(["make"], cwd="./src", shell=True)
    dataset = TUDataset(root=f'{args.dir}/Pytorch_geometric/{args.dataset}', name=args.dataset)

    if dataset.num_features == 0:
        max_degree = -1
        for data in dataset:
            edge_index = data.edge_index
            degrees = Counter(list(map(int, edge_index[0])))
            if max_degree < max(degrees.values()):
                max_degree = max(degrees.values())

        dataset.transform = OneHotDegree(max_degree=max_degree, cat=False)

    path = pathlib.Path(f'{args.dir}/GraphML/{args.dataset}/{args.dataset.lower()}_{args.kernel}.kernel')
    if not path.is_file():
        save_to_graphml(dataset, f'{args.dir}/GraphML/{args.dataset}')
        cmd = ['./src/gkernel']
        cmd.append('-k')
        cmd.append(args.kernel)
        if args.parameter:
            cmd.append('-p')
            cmd.append(args.parameter)
        cmd.append('-i')
        cmd.append(f'{args.dir}/GraphML/{args.dataset}/{args.dataset.lower()}.list')
        cmd.append('-g')
        cmd.append(f'{args.dir}/GraphML/{args.dataset}/data/')
        cmd.append('-o')
        cmd.append(f'{args.dir}/GraphML/{args.dataset}/{args.dataset.lower()}_{args.kernel}.kernel')
        subprocess.call(cmd)

    K = read_kernel_matrix(f'{args.dir}/GraphML/{args.dataset}/{args.dataset.lower()}_{args.kernel}.kernel')

    y = dataset.data.y.data.numpy()

    ev = Evaluation(K, y, args, verbose=True)

    accs = ev.evaluate(dataset)
Example #12
def test_enzymes():
    root = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    dataset = TUDataset(root, 'ENZYMES')

    assert len(dataset) == 600
    assert dataset.num_features == 3
    assert dataset.num_classes == 6
    assert dataset.__repr__() == 'ENZYMES(600)'

    assert len(dataset[0]) == 3
    assert len(dataset.shuffle()) == 600
    assert len(dataset[:100]) == 100
    assert len(dataset[torch.arange(100, dtype=torch.long)]) == 100
    mask = torch.zeros(600, dtype=torch.uint8)
    mask[:100] = 1
    assert len(dataset[mask]) == 100

    loader = DataLoader(dataset, batch_size=len(dataset))
    for data in loader:
        assert data.num_graphs == 600

        avg_num_nodes = data.num_nodes / data.num_graphs
        assert pytest.approx(avg_num_nodes, abs=1e-2) == 32.63

        avg_num_edges = data.num_edges / (2 * data.num_graphs)
        assert pytest.approx(avg_num_edges, abs=1e-2) == 62.14

        assert len(data) == 4
        assert list(data.x.size()) == [data.num_nodes, 3]
        assert list(data.y.size()) == [data.num_graphs]
        assert data.y.max() + 1 == 6
        assert list(data.batch.size()) == [data.num_nodes]

        assert data.contains_isolated_nodes()
        assert not data.contains_self_loops()
        assert data.is_undirected()

    loader = DataListLoader(dataset, batch_size=len(dataset))
    for data_list in loader:
        assert len(data_list) == 600

    dataset.transform = ToDense(num_nodes=126)
    loader = DenseDataLoader(dataset, batch_size=len(dataset))
    for data in loader:
        assert len(data) == 4
        assert list(data.x.size()) == [600, 126, 3]
        assert list(data.adj.size()) == [600, 126, 126]
        assert list(data.mask.size()) == [600, 126]
        assert list(data.y.size()) == [600, 1]

    dataset = TUDataset(root, 'ENZYMES', use_node_attr=True)
    assert dataset.num_features == 21

    shutil.rmtree(root)
Example #13
def test_lightning_dataset(strategy):
    import pytorch_lightning as pl

    root = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    dataset = TUDataset(root, name='MUTAG').shuffle()
    train_dataset = dataset[:50]
    val_dataset = dataset[50:80]
    test_dataset = dataset[80:90]
    shutil.rmtree(root)

    gpus = 1 if strategy is None else torch.cuda.device_count()
    if strategy == 'ddp_spawn':
        strategy = pl.plugins.DDPSpawnPlugin(find_unused_parameters=False)

    model = LinearGraphModule(dataset.num_features, 64, dataset.num_classes)

    trainer = pl.Trainer(strategy=strategy,
                         gpus=gpus,
                         max_epochs=1,
                         log_every_n_steps=1)
    datamodule = LightningDataset(train_dataset,
                                  val_dataset,
                                  test_dataset,
                                  batch_size=5,
                                  num_workers=3)
    old_x = train_dataset.data.x.clone()
    assert str(datamodule) == ('LightningDataset(train_dataset=MUTAG(50), '
                               'val_dataset=MUTAG(30), '
                               'test_dataset=MUTAG(10), batch_size=5, '
                               'num_workers=3, pin_memory=True, '
                               'persistent_workers=True)')
    trainer.fit(model, datamodule)
    new_x = train_dataset.data.x
    offset = 10 + 6 + 2 * gpus  # `train_steps` + `val_steps` + `sanity`
    assert torch.all(new_x > (old_x + offset - 4))  # Ensure shared data.
    assert trainer._data_connector._val_dataloader_source.is_defined()
    assert trainer._data_connector._test_dataloader_source.is_defined()

    # Test with `val_dataset=None` and `test_dataset=None`:
    warnings.filterwarnings('ignore', '.*Skipping val loop.*')
    trainer = pl.Trainer(strategy=strategy,
                         gpus=gpus,
                         max_epochs=1,
                         log_every_n_steps=1)
    datamodule = LightningDataset(train_dataset, batch_size=5, num_workers=3)
    assert str(datamodule) == ('LightningDataset(train_dataset=MUTAG(50), '
                               'batch_size=5, num_workers=3, '
                               'pin_memory=True, persistent_workers=True)')
    trainer.fit(model, datamodule)
    assert not trainer._data_connector._val_dataloader_source.is_defined()
    assert not trainer._data_connector._test_dataloader_source.is_defined()
Example #14
def main():
    args = arg_parse()
    np.random.seed(1234)

    if args.dataset == 'enzymes':
        dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
        task = 'graph'
    elif args.dataset == 'cora':
        dataset = Planetoid(root='/tmp/Cora', name='Cora')
        task = 'node'
    if args.examine == True:
        utils.examine_dataset(dataset)

    train(dataset, task, args)
Example #15
def get_dataset(dataset,
                regression=False,
                multigregression=False,
                one_hot=False):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'datasets',
                    dataset)
    TUDataset(path, name=dataset)

    if multigregression:
        return read_multi_targets(dataset)
    if not regression:
        return read_classes(dataset)
    else:
        return read_targets(dataset)
Example #16
def main():
    args = arg_parse()

    if args.dataset == 'enzymes':
        dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
        print("# graphs: ", len(dataset))
        task = 'graph'
    elif args.dataset == 'cora':
        dataset = Planetoid(root='/tmp/Cora', name='Cora')
        print("# nodes: ", dataset[0].num_nodes)
        print("# edges: ", dataset[0].num_edges)
        task = 'node'

    train(dataset, task, args)
Example #17
def get_split_data(data_name, split_dir, config, fold):
    dataset = TUDataset(f'data/TUD/{data_name}', data_name, transform=preproc)

    train_idx, test_idx = get_idx(data_name, split_dir, fold)
    np.random.shuffle(train_idx)
    np.random.shuffle(test_idx)
    train_iter = CRaWlLoader(list(dataset[train_idx]),
                             batch_size=config['batch_size'],
                             shuffle=True)
    val_iter = CRaWlLoader(list(dataset[test_idx]),
                           batch_size=config['batch_size'] if 'eval_batch_size'
                           not in config.keys() else config['eval_batch_size'])

    return train_iter, val_iter, val_iter
Example #18
def run():
    dataset = TUDataset(os.path.join('data',args.dataset),name=args.dataset)
    args.num_classes = dataset.num_classes
    args.num_features = dataset.num_features

    num_training = int(len(dataset)*0.8)
    num_val = int(len(dataset)*0.1)
    num_test = len(dataset) - (num_training+num_val)
    training_set,validation_set,test_set = random_split(dataset,[num_training,num_val,num_test])

    train_loader = DataLoader(training_set, batch_size=args.batch_size, shuffle=True)
    val_loader = DataLoader(validation_set,batch_size=args.batch_size,shuffle=False)
    test_loader = DataLoader(test_set,batch_size=1,shuffle=False)
    
    model = Net(args).to(args.device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)


    min_loss = 1e10
    patience = 0

    for epoch in range(args.epochs):
        model.train()
        for i, data in enumerate(train_loader):
            data = data.to(args.device)
            out = model(data)
            loss = F.nll_loss(out, data.y)
            # print("Training loss:{}".format(loss.item()))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        val_acc,val_loss = test(model,val_loader)
        print("Validation loss:{}\taccuracy:{}".format(val_loss,val_acc))
        if val_loss < min_loss:
            torch.save(model.state_dict(),'latest.pth')
            print("Model saved at epoch{}".format(epoch))
            min_loss = val_loss
            patience = 0
        else:
            patience += 1
        if patience > args.patience:
            break

    model = Net(args).to(args.device)
    model.load_state_dict(torch.load('latest.pth'))
    test_acc,test_loss = test(model,test_loader)
    print("Test accuarcy:{}".format(test_acc))
    points2txt(str(test_acc))
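The test helper called above is not part of the snippet; a minimal sketch that returns (accuracy, average NLL loss) over a loader, reusing the module-level args, torch, and F that run() already depends on, could be:

def test(model, loader):
    model.eval()
    correct, total_loss = 0, 0.0
    with torch.no_grad():
        for data in loader:
            data = data.to(args.device)
            out = model(data)
            total_loss += F.nll_loss(out, data.y, reduction='sum').item()
            correct += int((out.argmax(dim=1) == data.y).sum())
    # Return accuracy and mean per-graph loss over the whole loader.
    return correct / len(loader.dataset), total_loss / len(loader.dataset)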
Example #19
def main():
    seed_everything(42)

    root = osp.join('data', 'TUDataset')
    dataset = TUDataset(root, 'IMDB-BINARY', pre_transform=T.OneHotDegree(135))

    dataset = dataset.shuffle()
    test_dataset = dataset[:len(dataset) // 10]
    val_dataset = dataset[len(dataset) // 10:2 * len(dataset) // 10]
    train_dataset = dataset[2 * len(dataset) // 10:]

    datamodule = LightningDataset(train_dataset, val_dataset, test_dataset,
                                  batch_size=64, num_workers=4)

    model = Model(dataset.num_node_features, dataset.num_classes)

    gpus = torch.cuda.device_count()
    strategy = pl.plugins.DDPSpawnPlugin(find_unused_parameters=False)
    checkpoint = pl.callbacks.ModelCheckpoint(monitor='val_acc', save_top_k=1)
    trainer = pl.Trainer(gpus=gpus, strategy=strategy, max_epochs=50,
                         log_every_n_steps=5, callbacks=[checkpoint])

    trainer.fit(model, datamodule)
    trainer.test(ckpt_path='best', datamodule=datamodule)
Example #20
File: datasets.py Project: hardikgw/HGCN
def get_dataset(name, sparse=True, feat_str="deg+ak3+reall", root=None):
    if root is None or root == '':
        path = osp.join(osp.expanduser('~'), 'pyG_data', name)
    else:
        path = osp.join(root, name)
    degree = feat_str.find("deg") >= 0
    onehot_maxdeg = re.findall("odeg(\d+)", feat_str)
    onehot_maxdeg = int(onehot_maxdeg[0]) if onehot_maxdeg else None
    k = re.findall("an{0,1}k(\d+)", feat_str)
    k = int(k[0]) if k else 0
    groupd = re.findall("groupd(\d+)", feat_str)
    groupd = int(groupd[0]) if groupd else 0
    remove_edges = re.findall("re(\w+)", feat_str)
    remove_edges = remove_edges[0] if remove_edges else 'none'
    edge_noises_add = re.findall("randa([\d\.]+)", feat_str)
    edge_noises_add = float(edge_noises_add[0]) if edge_noises_add else 0
    edge_noises_delete = re.findall("randd([\d\.]+)", feat_str)
    edge_noises_delete = float(
        edge_noises_delete[0]) if edge_noises_delete else 0
    centrality = feat_str.find("cent") >= 0
    coord = feat_str.find("coord") >= 0

    # Obtain max_node_num
    with open(osp.join(path, name, "raw", name + "_graph_indicator.txt")) as f:
        temp = list(csv.reader(f))
    out = [int(i[0]) for i in temp]
    max_node_num = Counter(out).most_common(1)[0][1]
    # only maintain the 1000 nodes with the maximum degree
    if max_node_num > 1000:
        max_node_num = 1000

    pre_transform = FeatureExpander(max_node_num=max_node_num,
                                    degree=degree,
                                    onehot_maxdeg=onehot_maxdeg,
                                    AK=k,
                                    centrality=centrality,
                                    remove_edges=remove_edges,
                                    edge_noises_add=edge_noises_add,
                                    edge_noises_delete=edge_noises_delete,
                                    group_degree=groupd).transform

    dataset = TUDataset(path,
                        name,
                        pre_transform=pre_transform,
                        use_node_attr=True)

    dataset.data.edge_attr = None
    return dataset
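For reference, the feat_str flags above are parsed with regular expressions; a usage sketch (flag values illustrative) showing how each piece maps onto the FeatureExpander arguments:

# 'deg'     -> degree=True
# 'odeg100' -> onehot_maxdeg=100
# 'ak3'     -> AK=3
# 'reall'   -> remove_edges='all'
dataset = get_dataset('MUTAG', feat_str='deg+odeg100+ak3+reall')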
Example #21
def get_clean_graph_indices(dataset_name, path_to_orbits):
    '''
    Return indices of the dataset that should be included for training.
    It gets graph orbits and keeps one graph from each orbit if the orbit
    contains a single label, or else removes the orbit entirely.
    :param dataset_name:
    :param path_to_orbits:
    :return:
    '''
    dataset = TUDataset(root=f'./Datasets/{dataset_name}', name=dataset_name)

    # get a list of lists, with graphs that belong to orbits
    with open(path_to_orbits + f'{dataset_name}_orbits.txt') as f:
        true_orbits = [
            list(map(int, ast.literal_eval(''.join(line.split()[2:]))))
            for line in f
        ]

    # get the target label for each graph
    graph_labels = dict()
    for i, graph in enumerate(dataset):
        graph_labels[i + 1] = graph.y.item()

    # get labels in each orbit
    orbit_labels = [[graph_labels[graph] for graph in orbit]
                    for orbit in true_orbits]

    # keep a representative of the orbit
    orbit_graphs = []
    for i, orbit in enumerate(true_orbits):
        assert len(orbit) == len(orbit_labels[i])
        if len(set(
                orbit_labels[i])) == 1:  # there is only one label in the orbit
            orbit_graphs.append(
                orbit[0])  # keep only first graph from the orbit

    # calculate all graphs needed to be removed
    iso_graphs = set()
    for orbit in true_orbits:
        iso_graphs = iso_graphs.union(orbit)

    iso_graphs = iso_graphs.difference(orbit_graphs)

    clean_graph_idx = [
        idx for idx in range(len(dataset)) if idx + 1 not in iso_graphs
    ]

    return clean_graph_idx, true_orbits
Example #22
def test_imdb_binary():
    root = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    dataset = TUDataset(root, 'IMDB-BINARY')

    assert len(dataset) == 1000
    assert dataset.num_features == 0
    assert dataset.num_classes == 2
    assert str(dataset) == 'IMDB-BINARY(1000)'

    data = dataset[0]
    assert len(data) == 3
    assert data.edge_index.size() == (2, 146)
    assert data.y.size() == (1, )
    assert data.num_nodes == 20

    shutil.rmtree(root)
Example #23
def mini_batches():
    dataset = 'ENZYMES'
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
                        'data', dataset)
    dataset = TUDataset(root=path, name=dataset, use_node_attr=True)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)

    for batch in loader:
        print(batch)  # batch: which graph each node belongs to; x: node features; edge_index: edges; y: labels
        print(batch.num_graphs)  # number of graphs in this batch

        x = scatter_mean(batch.x, batch.batch, dim=0)  # 每个图的平均节点特征
        print(x.size())
        pass

    pass
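The scatter_mean call above averages node features per graph; PyTorch Geometric's built-in global_mean_pool computes the same graph-level mean, for example:

from torch_geometric.nn import global_mean_pool

x = global_mean_pool(batch.x, batch.batch)  # shape: [num_graphs_in_batch, num_node_features]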
Example #24
def load_data():
    bs = 15
    path = osp.join(osp.dirname(osp.realpath(__file__)), '.', 'data', DATASET)
    print("loading path %s" % path)
    dataset = TUDataset(path, DATASET, use_node_attr=True)
    print(dataset)
    print("dataset----------------------------------------------\n")
#     dataset = dataset.shuffle()
#     dataset.data.x = dataset.data.x[:, :-3]
    # dataset.data.node_label = dataset.data.x[:, -3:]
    
    if 'MR' in DATASET:
        real_train_size = 6398
        train_size = 7108
    elif 'R8' in DATASET:
        real_train_size = 4937
        train_size = 5485
    elif '20ng' in DATASET:
        real_train_size = 10183
        train_size = 11314
    elif 'R52' in DATASET:
        real_train_size = 100  # 5879
        train_size = 200  # 6532
    elif 'ohsumed' in DATASET:
        real_train_size = 3022
        train_size = 3357
    else:
        real_train_size = 60
        train_size = 80
    
    train_loader = DataLoader(dataset[:real_train_size], batch_size=bs)
    print(train_loader)
    print("+++++++++++++++++++++++++++++++++++++++++++++++\n")
    # raise SystemExit
    #val_loader = DataLoader(dataset[:real_train_size], batch_size=bs)
    #test_loader = DataLoader(dataset[:real_train_size], batch_size=bs)
    val_loader = DataLoader(dataset[real_train_size:train_size], batch_size=bs)
    test_loader = DataLoader(dataset[train_size:], batch_size=bs)
    print((train_loader))
    print((val_loader))
    print((test_loader))
    print(len(train_loader))
    print(len(val_loader))
    print(len(test_loader))
    # raise SystemExit
    print('batch size is : ' + str(bs))
    return dataset, dataset, dataset, train_loader, val_loader, test_loader
Example #25
def main():
    np.random.seed(0)
    torch.manual_seed(0)
    # --------------------- PARSE ARGS -----------------------
    parser = argparse.ArgumentParser()

    parser.add_argument("--dataset", choices=['MUTAG', 'PTC_MR', 'REDDIT-BINARY', 'REDDIT-MULTI-5K', 'IMDB-BINARY', 'IMDB-MULTI'], default='MUTAG')
    parser.add_argument("--batch-size", type=int, default=128)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--num-epoch", type=int, default=20)
    parser.add_argument("--encoder-hidden-dim", type=int, default=32)
    parser.add_argument("--num-encoder-layers", type=int, default=4)

    args = parser.parse_args()

    print("- Args ----------------------")
    for k, v in vars(args).items():
        print(" - {}={}".format(k, v))
    print("-----------------------------")

    # --------------------- LOAD DATASET ---------------------
    print("Loading dataset...")
    dataset = TUDataset(DATA_DIR / args.dataset, name=args.dataset)
    try:
        dataset_num_features = dataset.num_features
        if dataset_num_features == 0:
            dataset_num_features = 1
    except:
        dataset_num_features = 1

    dataloader = DataLoader(dataset, args.batch_size, shuffle=True)

    # --------------------- TRAIN MODEL ----------------------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = InfoGraph(dataset_num_features, args.encoder_hidden_dim, args.num_encoder_layers).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    evaluate_downstream(model, dataloader, device)
    print("Starting training...")

    for epoch in range(1, args.num_epoch+1):
        train_loss = train(model, dataloader, optimizer, device)
        print("| Epoch: {:3} | Unsupervised Loss: {:10.4f} |".format(epoch, train_loss))

    print("Training finished!")
    evaluate_downstream(model, dataloader, device)
Example #26
def get_scores_profile(dataset_name=None,
                       x=None,
                       list_node_labels=None,
                       list_w_sizes=[3, 5, 7],
                       use_node_attr=False,
                       zero_sc_ends=False,
                       model=None,
                       device=None,
                       batch_size=None,
                       geometric_folder=None):
    """
    Calculate profile scores by outputting subgraphs in PyG format,
    read in and score them. Return mean profile scores in case of several
    window sizes given, otherwise single window size scores.

    """

    all_scores = []
    dataset_folder = geometric_folder + "/" + dataset_name
    save_dataset_name = dataset_name
    # Calculate profile scores for each window size.
    for w_size in list_w_sizes:
        # Create raw set containing window subgraphs.
        create_profile_dataset(save_dataset_name=save_dataset_name,
                               x=x,
                               list_node_labels=list_node_labels,
                               w_size=w_size,
                               dataset_folder=dataset_folder)
        # Read in dataset, containing one site split into windows.
        dataset_w = TUDataset(dataset_folder,
                              name=save_dataset_name,
                              use_node_attr=use_node_attr)
        # Load the set.
        loader = DataLoader(dataset_w, batch_size=batch_size, shuffle=False)
        # Score each subgraph.
        scores = get_scores(loader, device, model, min_max_norm=True)
        # Add scores at end to get full site length scores list.
        if zero_sc_ends:
            scores = [0] * int(w_size / 2) + scores + [0] * int(w_size / 2)
        else:
            scores = [scores[0]] * int(
                w_size / 2) + scores + [scores[-1]] * int(w_size / 2)
        all_scores.append(scores)
    scores_mean = list(np.mean(all_scores, axis=0))
    # Return mean scores (with a single window size this equals that window's scores).
    return scores_mean
Example #27
def load_dataset(name):
    """ Load real-world datasets, available in PyTorch Geometric.

    Used as a helper for DiskDataSource.
    """
    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "proteins":
        dataset = TUDataset(root="/tmp/PROTEINS", name="PROTEINS")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "aids":
        dataset = TUDataset(root="/tmp/AIDS", name="AIDS")
    elif name == "reddit-binary":
        dataset = TUDataset(root="/tmp/REDDIT-BINARY", name="REDDIT-BINARY")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")
    elif name == "firstmm_db":
        dataset = TUDataset(root="/tmp/FIRSTMM_DB", name="FIRSTMM_DB")
    elif name == "dblp":
        dataset = TUDataset(root="/tmp/DBLP_v1", name="DBLP_v1")
    elif name == "ppi":
        dataset = PPI(root="/tmp/PPI")
    elif name == "qm9":
        dataset = QM9(root="/tmp/QM9")
    elif name == "atlas":
        dataset = [g for g in nx.graph_atlas_g()[1:] if nx.is_connected(g)]
    if task == "graph":
        train_len = int(0.8 * len(dataset))
        train, test = [], []
        dataset = list(dataset)
        random.shuffle(dataset)
        has_name = hasattr(dataset[0], "name")
        for i, graph in tqdm(enumerate(dataset)):
            if not type(graph) == nx.Graph:
                if has_name: del graph.name
                graph = pyg_utils.to_networkx(graph).to_undirected()
            if i < train_len:
                train.append(graph)
            else:
                test.append(graph)
    return train, test, task
Example #28
    def test_filter(self):
        pyg_dataset = TUDataset('./enzymes', 'ENZYMES')
        graphs = GraphDataset.pyg_to_graphs(pyg_dataset)
        dataset = GraphDataset(graphs, task="graph")
        thresh = 90

        orig_dataset_size = len(dataset)
        num_graphs_large = 0
        for graph in dataset:
            if len(graph.G) >= thresh:
                num_graphs_large += 1

        dataset = dataset.filter(
            lambda graph: len(graph.G) < thresh, deep_copy=False)
        filtered_dataset_size = len(dataset)

        self.assertEqual(
            orig_dataset_size - filtered_dataset_size, num_graphs_large)
Example #29
def load_TU_dataset(name = 'MUTAG'):
    pickle_path = f'{ROOT_DIR}/data_fast/TUDataset/{name}/'
    pickle_file = f"{name}.p"
    pickle_full = f"{pickle_path}/{pickle_file}"
    if file_exists(pickle_full):
        print("path exists - load fast pickle")
        f = open(pickle_full, "rb")
        gc.disable()
        dataset = pickle.load(f)
        gc.enable()
        f.close()
    else:
        print("Did not load yet: download data and store")
        from torch_geometric.datasets import TUDataset
        set_proxy()
        dataset = TUDataset(root=f'{ROOT_DIR}/data/TUDataset', name=name)
        create_folder_safe(pickle_path)
        pickle.dump(dataset, open(pickle_full, "wb"), protocol=-1)
    return dataset
Example #30
def load_tugraphs(graph='mutag', labels_only = False):

    # load synthetic sbm graphs
    if graph[:3] == 'syn':
        scale = int(graph[3:])
        n = 750
        p, q = 0.5, 0.1
        p_, q_ = 0.4, 0.2
        gs1 = sbms(n=n, n1=100 * scale, n2=50 * scale, p=p, q=q)
        gs2 = sbms(n=n, n1=75 * scale, n2=75 * scale, p=p, q=q)
        graphs = gs2 + gs1
        labels = [1] * n + [2] * n
        return graphs, labels

    graph = name_conversion(graph)
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', graph)
    dataset = TUDataset(path, name=graph)
    graphs, labels = torch_geometric_2nx(dataset, labels_only)
    return graphs, labels
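torch_geometric_2nx is a project helper that is not shown; a hypothetical minimal version converting the TUDataset into networkx graphs plus integer labels (the labels_only behavior is an assumption) could be:

from torch_geometric.utils import to_networkx


def torch_geometric_2nx(dataset, labels_only=False):
    labels = [int(data.y.item()) for data in dataset]
    if labels_only:
        # Assumption: skip the (potentially slow) graph conversion when only labels are needed.
        return None, labels
    graphs = [to_networkx(data, to_undirected=True) for data in dataset]
    return graphs, labels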