Example #1
def load_dataset():
    '''
    Load raw datasets.
    :return: a list of networkx/deepsnap graphs, plus additional info if needed
    '''
    format = cfg.dataset.format
    name = cfg.dataset.name
    # dataset_dir = '{}/{}'.format(cfg.dataset.dir, name)
    dataset_dir = cfg.dataset.dir
    # Try to load customized data format
    for func in register.loader_dict.values():
        graphs = func(format, name, dataset_dir)
        if graphs is not None:
            return graphs
    # Load from Pytorch Geometric dataset
    if format == 'PyG':
        graphs = load_pyg(name, dataset_dir)
    # Load from networkx formatted data
    # todo: clean nx dataloader
    elif format == 'nx':
        graphs = load_nx(name, dataset_dir)
    # Load from OGB formatted data
    elif format == 'OGB':
        if name == 'ogbg-molhiv':
            dataset = PygGraphPropPredDataset(name=name)
            graphs = GraphDataset.pyg_to_graphs(dataset)
            # Note: get_idx_split() provides OGB's custom splits
            split_idx = dataset.get_idx_split()
            return graphs, split_idx
        raise ValueError('Unsupported OGB dataset: {}'.format(name))
    else:
        raise ValueError('Unknown data format: {}'.format(cfg.dataset.format))
    return graphs
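A hedged usage sketch for load_dataset() above; it assumes a GraphGym-style cfg object has already been loaded, and the field values are illustrative.

cfg.dataset.format = 'OGB'
cfg.dataset.name = 'ogbg-molhiv'
cfg.dataset.dir = 'datasets'
graphs, split_idx = load_dataset()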
Example #2
def get_molhiv():
    path = osp.dirname(osp.realpath(__file__))
    dataset = PygGraphPropPredDataset(name='ogbg-molhiv', root=path)
    split_idx = dataset.get_idx_split()
    max_num_nodes = torch.tensor(dataset.data.num_nodes).max().item()
    return (dataset[split_idx["train"]], dataset[split_idx["valid"]],
            dataset[split_idx["test"]], max_num_nodes)
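A minimal usage sketch for get_molhiv() above; the DataLoader import path assumes PyG >= 2.0, and the batch size is illustrative.

from torch_geometric.loader import DataLoader

train_set, valid_set, test_set, max_num_nodes = get_molhiv()
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
print('largest graph has', max_num_nodes, 'nodes')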
Example #3
def main():

    args = ArgsInit().args

    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device("cpu")

    dataset = PygGraphPropPredDataset(name=args.dataset)
    args.num_tasks = dataset.num_tasks
    print(args)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    model = DeeperGCN(args)

    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    train_result = eval(model, device, train_loader,
                        evaluator)[dataset.eval_metric]
    valid_result = eval(model, device, valid_loader,
                        evaluator)[dataset.eval_metric]
    test_result = eval(model, device, test_loader,
                       evaluator)[dataset.eval_metric]

    print({
        'Train': train_result,
        'Validation': valid_result,
        'Test': test_result
    })

    model.print_params(final=True)
Example #4
    def setup(self, stage: Optional[str] = None):
        """Load data. Set variables: self.data_train, self.data_val, self.data_test."""
        if not self.data_train and not self.data_val and not self.data_test:
            dataset = PygGraphPropPredDataset(name="ogbg-molpcba",
                                              root=self.data_dir,
                                              transform=self.transform)
            split_idx = dataset.get_idx_split()
            self.data_train = dataset[split_idx["train"]]
            self.data_val = dataset[split_idx["valid"]]
            self.data_test = dataset[split_idx["test"]]
Example #5
def mol_pred_GNN_prepare(batch_size=50):
    dataset_name = 'ogbg-molhiv'

    dataset = PygGraphPropPredDataset(name=dataset_name)
    evaluator = Evaluator(name=dataset_name)

    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=batch_size, shuffle=False)
    return train_loader, valid_loader, test_loader, evaluator
Example #6
    def __init__(self, train):
        super(Mol_pred_DNN_dataset, self).__init__()
        self.train = train
        dataset_name = 'ogbg-molhiv'
        mol_origin_dataset = PygGraphPropPredDataset(name=dataset_name)
        self.evaluator = Evaluator(name=dataset_name)  # kept for later scoring
        split_idx = mol_origin_dataset.get_idx_split()
        if self.train:
            self.mol_origin_dataset = mol_origin_dataset[split_idx["train"]]
        else:
            self.mol_origin_dataset = mol_origin_dataset[split_idx["test"]]
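A minimal sketch of the Dataset hooks the wrapper above still needs; the flat (x, y) return is hypothetical and should be adapted to the DNN's actual input format.

    def __len__(self):
        return len(self.mol_origin_dataset)

    def __getitem__(self, idx):
        # each item is a PyG Data object; expose features and label
        data = self.mol_origin_dataset[idx]
        return data.x, data.y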
Example #7
def main():

    args = ArgsInit().args

    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device("cpu")

    if args.not_extract_node_feature:
        dataset = PygGraphPropPredDataset(name=args.dataset,
                                          transform=add_zeros)
    else:
        extract_node_feature_func = partial(extract_node_feature,
                                            reduce=args.aggr)
        dataset = PygGraphPropPredDataset(name=args.dataset,
                                          transform=extract_node_feature_func)

    args.num_tasks = dataset.num_classes
    evaluator = Evaluator(args.dataset)

    split_idx = dataset.get_idx_split()

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    print(args)

    model = DeeperGCN(args)
    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    train_accuracy = eval(model, device, train_loader, evaluator)
    valid_accuracy = eval(model, device, valid_loader, evaluator)
    test_accuracy = eval(model, device, test_loader, evaluator)

    print({
        'Train': train_accuracy,
        'Validation': valid_accuracy,
        'Test': test_accuracy
    })
    model.print_params(final=True)
Example #8
    def train_dataloader(self):
        dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
        split_idx = dataset.get_idx_split()
        train_data = dataset[split_idx["train"]]
        train_loader = DataLoader(train_data,
                                  batch_size=self.configuration["batch_size"],
                                  shuffle=True,
                                  num_workers=self.configuration["num_workers"])

        self._train_data = train_data
        self._train_loader = train_loader
        
        return train_loader
Example #9
def mol_data(root, dataset, batch_size=32, num_workers=4):
    dataset = PygGraphPropPredDataset(name=f"ogbg-mol{dataset}", root=root)
    split_idx = dataset.get_idx_split()
    loaders = dict()
    for split in ["train", "valid", "test"]:
        loaders[split] = DataLoader(
            dataset[split_idx[split]],
            batch_size=batch_size,
            shuffle=(split == "train"),
            num_workers=num_workers,
        )
    return loaders
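Hedged usage of mol_data() above; "hiv" expands to ogbg-molhiv inside the function, and the root path and batch size are illustrative.

loaders = mol_data(root='data', dataset='hiv', batch_size=64)
for batch in loaders['train']:
    pass  # each batch is a PyG Batch of molecular graphs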
Example #10
    def val_dataloader(self):
        dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
        split_idx = dataset.get_idx_split()
        val_data = dataset[split_idx["valid"]]
        validation_loader = DataLoader(val_data,
                                       batch_size=self.configuration["batch_size"],
                                       shuffle=False,
                                       num_workers=self.configuration["num_workers"])

        self._validation_data = val_data
        self._validation_loader = validation_loader
        
        return validation_loader
Example #11
def load_graphs(ogb_name):

    dataset = PygGraphPropPredDataset(ogb_name, root='data', transform=preproc)
    out_dim = dataset[0].y.shape[1]

    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

    print("Preprocessing Graphs...")
    train_graphs = list(tqdm(dataset[train_idx]))
    train_graphs = [d for d in train_graphs if d.num_edges > 0]
    valid_graphs = list(dataset[valid_idx])
    test_graphs = list(dataset[test_idx])

    return out_dim, train_graphs, valid_graphs, test_graphs
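An illustrative call of load_graphs() above; the dataset name is an assumption, preproc must be defined in the surrounding module, and torch_geometric's DataLoader is assumed to be imported.

out_dim, train_graphs, valid_graphs, test_graphs = load_graphs('ogbg-molhiv')
train_loader = DataLoader(train_graphs, batch_size=32, shuffle=True)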
Example #12
def code_data(
    root,
    batch_size=128,
    num_vocab=VOCAB_SIZE,
    seq_len=SEQ_LEN,
    use_old_code_dataset=False,
):
    dataset = PygGraphPropPredDataset(
        "ogbg-code" if use_old_code_dataset else "ogbg-code2", root=root)
    split_idx = dataset.get_idx_split()
    vocab2idx, idx2vocab = get_vocab_mapping(
        [dataset.data.y[i] for i in split_idx["train"]], num_vocab)
    dataset.transform = transforms.Compose(
        [augment_edge, lambda data: encode_y_to_arr(data, vocab2idx, seq_len)])

    loaders = dict()
    for split in ["train", "valid", "test"]:
        loaders[split] = DataLoader(
            dataset[split_idx[split]],
            batch_size=batch_size,
            shuffle=(split == "train"),
            num_workers=2,
        )
    return loaders, idx2vocab
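A hypothetical call of code_data() above; the VOCAB_SIZE and SEQ_LEN defaults come from the surrounding module, and the root path is illustrative.

loaders, idx2vocab = code_data(root='data', batch_size=64)
print(len(idx2vocab), 'tokens in the output vocabulary')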
Example #13
def run(rank, world_size: int, dataset_name: str, root: str):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group('nccl', rank=rank, world_size=world_size)

    dataset = Dataset(dataset_name,
                      root,
                      pre_transform=T.ToSparseTensor(attr='edge_attr'))
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(dataset_name)

    train_dataset = dataset[split_idx['train']]
    train_sampler = DistributedSampler(train_dataset,
                                       num_replicas=world_size,
                                       rank=rank)
    train_loader = DataLoader(train_dataset,
                              batch_size=128,
                              sampler=train_sampler)

    torch.manual_seed(12345)
    model = GIN(128, dataset.num_tasks, num_layers=3, dropout=0.5).to(rank)
    model = DistributedDataParallel(model, device_ids=[rank])
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.BCEWithLogitsLoss()

    if rank == 0:
        val_loader = DataLoader(dataset[split_idx['valid']], batch_size=256)
        test_loader = DataLoader(dataset[split_idx['test']], batch_size=256)

    for epoch in range(1, 51):
        train_sampler.set_epoch(epoch)  # reshuffle each replica's shard every epoch
        model.train()

        total_loss = 0
        for data in train_loader:
            data = data.to(rank)
            optimizer.zero_grad()
            logits = model(data.x, data.adj_t, data.batch)
            loss = criterion(logits, data.y.to(torch.float))
            loss.backward()
            optimizer.step()
            total_loss += float(loss) * logits.size(0)
        loss = total_loss / len(train_loader.dataset)

        dist.barrier()

        if rank == 0:  # We evaluate on a single GPU for now.
            model.eval()

            y_pred, y_true = [], []
            for data in val_loader:
                data = data.to(rank)
                with torch.no_grad():
                    y_pred.append(model.module(data.x, data.adj_t, data.batch))
                    y_true.append(data.y)
            val_rocauc = evaluator.eval({
                'y_pred': torch.cat(y_pred, dim=0),
                'y_true': torch.cat(y_true, dim=0),
            })['rocauc']

            y_pred, y_true = [], []
            for data in test_loader:
                data = data.to(rank)
                with torch.no_grad():
                    y_pred.append(model.module(data.x, data.adj_t, data.batch))
                    y_true.append(data.y)
            test_rocauc = evaluator.eval({
                'y_pred': torch.cat(y_pred, dim=0),
                'y_true': torch.cat(y_true, dim=0),
            })['rocauc']

            print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
                  f'Val: {val_rocauc:.4f}, Test: {test_rocauc:.4f}')

        dist.barrier()

    dist.destroy_process_group()
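A minimal launcher sketch for run() above, assuming one process per visible CUDA device; the dataset name and root are illustrative.

import torch
import torch.multiprocessing as mp

if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn(run, args=(world_size, 'ogbg-molhiv', 'data'),
             nprocs=world_size, join=True)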
Example #14
def main():

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    write_file_name = 'results/result_'
    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)
    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()
    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0

        if args.gnn == 'gin':
            model = GNN(gnn_type='gin',
                        num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=False).to(device)
        elif args.gnn == 'gin-virtual':
            model = GNN(gnn_type='gin',
                        num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=True).to(device)
        elif args.gnn == 'gcn':
            model = GNN(gnn_type='gcn',
                        num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=False).to(device)
        elif args.gnn == 'gcn-virtual':
            model = GNN(gnn_type='gcn',
                        num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=True).to(device)
        elif args.gnn == 'randomgin':
            model = GNN(gnn_type='randomgin',
                        num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        drop_path_p=args.drop_path_p,
                        virtual_node=False).to(device)
        elif args.gnn == 'randomgin-virtual':
            model = GNN(gnn_type='randomgin',
                        num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        JK=args.JK,
                        drop_ratio=args.drop_ratio,
                        drop_path_p=args.drop_path_p,
                        virtual_node=True).to(device)
        else:
            raise ValueError('Invalid GNN type')

        tot_params = sum(p.numel() for p in model.parameters()
                         if p.requires_grad)
        print("No. params: %d" % (tot_params, ))

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            print("=====Epoch {}".format(epoch))
            print('Training...')
            loss = train(model, device, train_loader, optimizer,
                         dataset.task_type, args)
            if (epoch > args.epochs // 2 and epoch % args.test_freq == 0) or epoch == args.epochs:
                print('Evaluating...')
                train_perf = eval(model, device, train_loader, evaluator)
                valid_perf = eval(model, device, valid_loader, evaluator)
                test_perf = eval(model, device, test_loader, evaluator)
                print({
                    'Train': train_perf,
                    'Validation': valid_perf,
                    'Test': test_perf
                })
                result = (train_perf[dataset.eval_metric],
                          valid_perf[dataset.eval_metric],
                          test_perf[dataset.eval_metric])
                _, val, tst = result
                if val > best_val:
                    best_val = val
                    final_test = tst
            if epoch == 1:
                print('Evaluating...')
                train_perf = eval(model, device, train_loader, evaluator)
                valid_perf = eval(model, device, valid_loader, evaluator)
                test_perf = eval(model, device, test_loader, evaluator)
                print({
                    'Train': train_perf,
                    'Validation': valid_perf,
                    'Test': test_perf
                })

        print(f'Run{run} val:{best_val}, test:{final_test}')
        with open(write_file_name + '_' + args.JK + '_run' + str(run) + '.txt',
                  'w') as f:
            f.write("Run: {}\nVal: {:.4f}\nTest: {:.4f}\n\n\n".format(
                run, best_val, final_test))
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
Example #15
from ogb.graphproppred import PygGraphPropPredDataset
import os

root_folder = '/vol/deform/gbouritsas/datasets/'

datasets = ['ogbg-molpcba', 'ogbg-molhiv', 'ogbg-ppa']

for name in datasets:
    dataset = PygGraphPropPredDataset(name=name,
                                      root=os.path.join(
                                          root_folder, 'ogb',
                                          '{}'.format(name)))
    split_idx = dataset.get_idx_split()
    for split_name in ['train', 'valid', 'test']:
        idxs = split_idx[split_name]
        split_name = split_name if split_name != 'valid' else 'val'
        save_folder = os.path.join(root_folder, 'ogb', '{}'.format(name),
                                   '10fold_idx')
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        with open(os.path.join(save_folder, '{}_idx-0.txt'.format(split_name)),
                  'w') as handle:
            for idx in idxs:
                handle.write('{}\n'.format(idx))
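An illustrative round-trip for the export above, reading one split file back into Python ints; the ogbg-molhiv path is an assumption.

path = os.path.join(root_folder, 'ogb', 'ogbg-molhiv', '10fold_idx', 'train_idx-0.txt')
with open(path) as handle:
    train_idx = [int(line) for line in handle]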
Example #16
def main():

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting

    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0

        if args.gnn == 'gin':
            model = GNN(gnn_type='gin',
                        num_class=dataset.num_classes,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=False).to(device)
        elif args.gnn == 'gin-virtual':
            model = GNN(gnn_type='gin',
                        num_class=dataset.num_classes,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=True).to(device)
        elif args.gnn == 'gcn':
            model = GNN(gnn_type='gcn',
                        num_class=dataset.num_classes,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=False).to(device)
        elif args.gnn == 'gcn-virtual':
            model = GNN(gnn_type='gcn',
                        num_class=dataset.num_classes,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=True).to(device)
        else:
            raise ValueError('Invalid GNN type')

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            loss = train(model, device, train_loader, optimizer, args)
            if (epoch > args.epochs // 2 and epoch % args.test_freq == 0) or epoch == args.epochs:

                # evaluating all three splits is slow (~4 min)
                train_perf = eval(model, device, train_loader, evaluator)
                valid_perf = eval(model, device, valid_loader, evaluator)
                test_perf = eval(model, device, test_loader, evaluator)

                result = (train_perf[dataset.eval_metric],
                          valid_perf[dataset.eval_metric],
                          test_perf[dataset.eval_metric])
                _, val, tst = result
                if val > best_val:
                    best_val = val
                    final_test = tst

        print(f'Run{run} val:{best_val}, test:{final_test}')
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
Example #17
def main():

    args = ArgsInit().save_exp()

    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device("cpu")

    sub_dir = 'BS_{}-NF_{}'.format(args.batch_size, args.feature)

    dataset = PygGraphPropPredDataset(name=args.dataset)
    args.num_tasks = dataset.num_tasks
    logging.info('%s' % args)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    evaluator = Evaluator(args.dataset)
    split_idx = dataset.get_idx_split()

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    model = DeeperGCN(args).to(device)

    logging.info(model)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    results = {
        'highest_valid': 0,
        'final_train': 0,
        'final_test': 0,
        'highest_train': 0
    }

    start_time = time.time()

    for epoch in range(1, args.epochs + 1):
        logging.info("=====Epoch {}".format(epoch))
        logging.info('Training...')

        # epoch_loss = train(model, device, train_loader, optimizer, dataset.task_type)
        epoch_loss = train_flag(model, device, train_loader, optimizer,
                                dataset.task_type, args)

        logging.info('Evaluating...')
        train_result = eval(model, device, train_loader,
                            evaluator)[dataset.eval_metric]
        valid_result = eval(model, device, valid_loader,
                            evaluator)[dataset.eval_metric]
        test_result = eval(model, device, test_loader,
                           evaluator)[dataset.eval_metric]

        logging.info({
            'Train': train_result,
            'Validation': valid_result,
            'Test': test_result
        })

        model.print_params(epoch=epoch)

        if train_result > results['highest_train']:
            results['highest_train'] = train_result

        if valid_result > results['highest_valid']:
            results['highest_valid'] = valid_result
            results['final_train'] = train_result
            results['final_test'] = test_result

            # save_ckpt(model, optimizer,
            #           round(epoch_loss, 4), epoch,
            #           args.model_save_path,
            #           sub_dir, name_post='valid_best')

    logging.info("%s" % results)

    end_time = time.time()
    total_time = end_time - start_time
    logging.info('Total time: {}'.format(
        time.strftime('%H:%M:%S', time.gmtime(total_time))))
Example #18
def main():
    args = get_args()
    config = process_config(args)
    print(config)

    if config.get('seed') is not None:
        torch.manual_seed(config.seed)
        np.random.seed(config.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(config.seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=config.dataset_name)

    seq_len_list = np.array([len(seq) for seq in dataset.data.y])
    print('Target sequences of length <= {}: {:.2f}%'.format(config.max_seq_len, 100.0 * np.sum(seq_len_list <= config.max_seq_len) / len(seq_len_list)))

    split_idx = dataset.get_idx_split()

    # print(split_idx['train'])
    # print(split_idx['valid'])
    # print(split_idx['test'])

    # train_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['train']]
    # valid_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['valid']]
    # test_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['test']]
    # print('#train')
    # print(len(train_method_name))
    # print('#valid')
    # print(len(valid_method_name))
    # print('#test')
    # print(len(test_method_name))

    # train_method_name_set = set(train_method_name)
    # valid_method_name_set = set(valid_method_name)
    # test_method_name_set = set(test_method_name)

    # # unique method name
    # print('#unique train')
    # print(len(train_method_name_set))
    # print('#unique valid')
    # print(len(valid_method_name_set))
    # print('#unique test')
    # print(len(test_method_name_set))

    # # unique valid/test method name
    # print('#valid unseen during training')
    # print(len(valid_method_name_set - train_method_name_set))
    # print('#test unseen during training')
    # print(len(test_method_name_set - train_method_name_set))


    ### building vocabulary for sequence prediction. Only use training data.

    vocab2idx, idx2vocab = get_vocab_mapping([dataset.data.y[i] for i in split_idx['train']], config.num_vocab)

    # test encoder and decoder
    # for data in dataset:
    #     # PyG >= 1.5.0
    #     print(data.y)
    #
    #     # PyG 1.4.3
    #     # print(data.y[0])
    #     data = encode_y_to_arr(data, vocab2idx, config.max_seq_len)
    #     print(data.y_arr[0])
    #     decoded_seq = decode_arr_to_seq(data.y_arr[0], idx2vocab)
    #     print(decoded_seq)
    #     print('')

    ## test augment_edge
    # data = dataset[2]
    # print(data)
    # data_augmented = augment_edge(data)
    # print(data_augmented)

    ### set the transform function
    # augment_edge: add next-token edge as well as inverse edges. add edge attributes.
    # encode_y_to_arr: add y_arr to PyG data object, indicating the array representation of a sequence.
    dataset.transform = transforms.Compose([augment_edge, lambda data: encode_y_to_arr(data, vocab2idx, config.max_seq_len)])

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(config.dataset_name)

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=config.hyperparams.batch_size, shuffle=True, num_workers=config.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=config.hyperparams.batch_size, shuffle=False, num_workers=config.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=config.hyperparams.batch_size, shuffle=False, num_workers=config.num_workers)

    nodetypes_mapping = pd.read_csv(os.path.join(dataset.root, 'mapping', 'typeidx2type.csv.gz'))
    nodeattributes_mapping = pd.read_csv(os.path.join(dataset.root, 'mapping', 'attridx2attr.csv.gz'))

    ### Encoding node features into emb_dim vectors.
    ### The following three node features are used.
    # 1. node type
    # 2. node attribute
    # 3. node depth
    node_encoder = ASTNodeEncoder(config.architecture.hidden, num_nodetypes=len(nodetypes_mapping['type']), num_nodeattributes=len(nodeattributes_mapping['attr']), max_depth=20)

    model = Net(config.architecture,
                num_vocab=len(vocab2idx),
                max_seq_len=config.max_seq_len,
                node_encoder=node_encoder).to(device)

    # optimizer = optim.Adam(model.parameters(), lr=0.001)
    optimizer = optim.Adam(model.parameters(), lr=config.hyperparams.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config.hyperparams.step_size,
                                                gamma=config.hyperparams.decay_rate)

    valid_curve = []
    test_curve = []
    train_curve = []
    trainL_curve = []

    writer = SummaryWriter(config.directory)

    ts_fk_algo_hp = str(config.time_stamp) + '_' \
                    + str(config.commit_id[0:7]) + '_' \
                    + str(config.architecture.nonlinear_conv) + '_' \
                    + str(config.architecture.variants.fea_activation) + '_' \
                    + str(config.architecture.pooling) + '_' \
                    + str(config.architecture.JK) + '_' \
                    + str(config.architecture.layers) + '_' \
                    + str(config.architecture.hidden) + '_' \
                    + str(config.architecture.variants.BN) + '_' \
                    + str(config.architecture.dropout) + '_' \
                    + str(config.hyperparams.learning_rate) + '_' \
                    + str(config.hyperparams.step_size) + '_' \
                    + str(config.hyperparams.decay_rate) + '_' \
                    + 'B' + str(config.hyperparams.batch_size) + '_' \
                    + 'S' + str(config.seed)

    for epoch in range(1, config.hyperparams.epochs + 1):
        print("Epoch {} training...".format(epoch))
        train_loss = train(model, device, train_loader, optimizer)

        scheduler.step()

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator, arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))
        valid_perf = eval(model, device, valid_loader, evaluator, arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))
        test_perf = eval(model, device, test_loader, evaluator, arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))

        # print({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf})
        print('Train:', train_perf[dataset.eval_metric],
              'Validation:', valid_perf[dataset.eval_metric],
              'Test:', test_perf[dataset.eval_metric],
              'Train loss:', train_loss)

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])
        trainL_curve.append(train_loss)

        writer.add_scalars(config.dataset_name, {ts_fk_algo_hp + '/traP': train_perf[dataset.eval_metric]}, epoch)
        writer.add_scalars(config.dataset_name, {ts_fk_algo_hp + '/valP': valid_perf[dataset.eval_metric]}, epoch)
        writer.add_scalars(config.dataset_name, {ts_fk_algo_hp + '/tstP': test_perf[dataset.eval_metric]}, epoch)
        writer.add_scalars(config.dataset_name, {ts_fk_algo_hp + '/traL': train_loss}, epoch)
    writer.close()

    print('F1')
    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)
    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    print('Finished test: {}, Validation: {}, Train: {}, epoch: {}, best train: {}, best loss: {}'
          .format(test_curve[best_val_epoch], valid_curve[best_val_epoch], train_curve[best_val_epoch],
                  best_val_epoch, best_train, min(trainL_curve)))
Example #19
def main():
    seed = args.seed
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    model_save_dir = f'models/{args.name}'
    os.makedirs(model_save_dir, exist_ok=True)

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    print("Training")
    # writer = SummaryWriter(model_save_dir)

    with open(f'{model_save_dir}/arguments.txt', 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    ### automatic dataloading and splitting

    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0

        if args.gnn == 'gin':
            model = GNN(gnn_type='gin',
                        num_class=dataset.num_classes,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=False,
                        topological=args.topological).to(device)
        elif args.gnn == 'gin-virtual':
            model = GNN(gnn_type='gin',
                        num_class=dataset.num_classes,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=True,
                        topological=args.topological).to(device)
        elif args.gnn == 'gcn':
            model = GNN(gnn_type='gcn',
                        num_class=dataset.num_classes,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=False,
                        topological=args.topological).to(device)
        elif args.gnn == 'gcn-virtual':
            model = GNN(gnn_type='gcn',
                        num_class=dataset.num_classes,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=True,
                        topological=args.topological).to(device)
        elif args.gnn == 'controller':
            model = ControllerTransformer().to(device)
        else:
            raise ValueError('Invalid GNN type')

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            loss = train(model, device, train_loader, optimizer, args)
            if (epoch > args.epochs // 2 and epoch % args.test_freq == 0) or epoch == args.epochs:

                # evaluating all three splits is slow (~4 min)
                train_perf = eval(model, device, train_loader, evaluator)
                valid_perf = eval(model, device, valid_loader, evaluator)
                test_perf = eval(model, device, test_loader, evaluator)

                result = (train_perf[dataset.eval_metric],
                          valid_perf[dataset.eval_metric],
                          test_perf[dataset.eval_metric])
                _, val, tst = result
                if val > best_val:
                    torch.save(model.state_dict(),
                               os.path.join(model_save_dir, 'model-best.pth'))
                    best_val = val
                    final_test = tst

        print(f'Run{run} val:{best_val}, test:{final_test}')
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
Example #20
def main():

    device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name = args.dataset, root='/cmlscratch/kong/datasets/ogb')

    seq_len_list = np.array([len(seq) for seq in dataset.data.y])
    print('Target sequences of length <= {}: {:.2f}%'.format(args.max_seq_len, 100.0 * np.sum(seq_len_list <= args.max_seq_len) / len(seq_len_list)))

    split_idx = dataset.get_idx_split()

    # print(split_idx['train'])
    # print(split_idx['valid'])
    # print(split_idx['test'])

    # train_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['train']]
    # valid_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['valid']]
    # test_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['test']]
    # print('#train')
    # print(len(train_method_name))
    # print('#valid')
    # print(len(valid_method_name))
    # print('#test')
    # print(len(test_method_name))

    # train_method_name_set = set(train_method_name)
    # valid_method_name_set = set(valid_method_name)
    # test_method_name_set = set(test_method_name)

    # # unique method name
    # print('#unique train')
    # print(len(train_method_name_set))
    # print('#unique valid')
    # print(len(valid_method_name_set))
    # print('#unique test')
    # print(len(test_method_name_set))

    # # unique valid/test method name
    # print('#valid unseen during training')
    # print(len(valid_method_name_set - train_method_name_set))
    # print('#test unseen during training')
    # print(len(test_method_name_set - train_method_name_set))


    ### building vocabulary for sequence prediction. Only use training data.

    vocab2idx, idx2vocab = get_vocab_mapping([dataset.data.y[i] for i in split_idx['train']], args.num_vocab)

    # test encoder and decoder
    # for data in dataset:
    #     # PyG >= 1.5.0
    #     print(data.y)
    #
    #     # PyG 1.4.3
    #     # print(data.y[0])
    #     data = encode_y_to_arr(data, vocab2idx, args.max_seq_len)
    #     print(data.y_arr[0])
    #     decoded_seq = decode_arr_to_seq(data.y_arr[0], idx2vocab)
    #     print(decoded_seq)
    #     print('')

    ## test augment_edge
    # data = dataset[2]
    # print(data)
    # data_augmented = augment_edge(data)
    # print(data_augmented)

    ### set the transform function
    # augment_edge: add next-token edge as well as inverse edges. add edge attributes.
    # encode_y_to_arr: add y_arr to PyG data object, indicating the array representation of a sequence.
    dataset.transform = transforms.Compose([augment_edge, lambda data: encode_y_to_arr(data, vocab2idx, args.max_seq_len)])

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    nodetypes_mapping = pd.read_csv(os.path.join(dataset.root, 'mapping', 'typeidx2type.csv.gz'))
    nodeattributes_mapping = pd.read_csv(os.path.join(dataset.root, 'mapping', 'attridx2attr.csv.gz'))

    ### Encoding node features into emb_dim vectors.
    ### The following three node features are used.
    # 1. node type
    # 2. node attribute
    # 3. node depth
    node_encoder = ASTNodeEncoder(args.emb_dim, num_nodetypes = len(nodetypes_mapping['type']), num_nodeattributes = len(nodeattributes_mapping['attr']), max_depth = 20)


    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0

        if args.gnn == 'gin':
            model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len, node_encoder=node_encoder,
                        num_layer=args.num_layer, gnn_type='gin', emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                        virtual_node=False).to(device)
        elif args.gnn == 'gin-virtual':
            model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len, node_encoder=node_encoder,
                        num_layer=args.num_layer, gnn_type='gin', emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                        virtual_node=True).to(device)
        elif args.gnn == 'gcn':
            model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len, node_encoder=node_encoder,
                        num_layer=args.num_layer, gnn_type='gcn', emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                        virtual_node=False).to(device)
        elif args.gnn == 'gcn-virtual':
            model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len, node_encoder=node_encoder,
                        num_layer=args.num_layer, gnn_type='gcn', emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                        virtual_node=True).to(device)
        else:
            raise ValueError('Invalid GNN type')

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs+1):
            loss = train(model, device, train_loader, optimizer, args)
            if (epoch > args.epochs // 2 and epoch % args.test_freq == 0) or epoch == args.epochs:

                # evaluating all three splits is slow (~4 min)
                train_perf = eval(model, device, train_loader, evaluator,
                                  arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))
                valid_perf = eval(model, device, valid_loader, evaluator,
                                  arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))
                test_perf = eval(model, device, test_loader, evaluator,
                                 arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))

                result = (train_perf[dataset.eval_metric], valid_perf[dataset.eval_metric], test_perf[dataset.eval_metric])
                _, val, tst = result
                if val > best_val:
                    best_val = val
                    final_test = tst

        print(f'Run{run} val:{best_val}, test:{final_test}')
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
Example #21
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='GNN baselines on ogbg-mol* data with PyTorch Geometric')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--gnn', type=str, default='gin-virtual',
                        help='GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio', type=float, default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument('--num_layer', type=int, default=5,
                        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--pooling', type=str, default='mean',
                        help='Pooling technique for graph embedding')
    parser.add_argument('--laf', type=str, default='mean',
                        help='Init function if laf pooling is specified')
    parser.add_argument('--laf_layers', type=str, default='false',
                        help='If set to true, internal layers will be initialized with laf function')
    parser.add_argument('--emb_dim', type=int, default=300,
                        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-molhiv",
                        help='dataset name (default: ogbg-molhiv)')
    parser.add_argument('--feature', type=str, default="full",
                        help='full feature or simple feature')
    parser.add_argument('--filename', type=str, default="",
                        help='filename to output result (default: )')
    parser.add_argument('--seed', type=int, default=92,
                        help='torch seed')
    parser.add_argument('--alternate', type=str, default='false',
                        help='use alternate learning with laf')
    args = parser.parse_args()

    print(args)
    torch.manual_seed(args.seed)

    device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size, shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size, shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size, shuffle=False,
                             num_workers=args.num_workers)

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', num_tasks=dataset.num_tasks, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                    virtual_node=False, graph_pooling=args.pooling, laf_fun=args.laf, laf_layers=args.laf_layers,
                    device=device, lafgrad=True).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', num_tasks=dataset.num_tasks, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                    virtual_node=True, graph_pooling=args.pooling, laf_fun=args.laf, laf_layers=args.laf_layers,
                    device=device, lafgrad=True).to(device)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', num_tasks=dataset.num_tasks, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                    virtual_node=False, graph_pooling=args.pooling, laf_fun=args.laf, laf_layers=args.laf_layers,
                    device=device, lafgrad=True).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', num_tasks=dataset.num_tasks, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                    virtual_node=True, graph_pooling=args.pooling, laf_fun=args.laf, laf_layers=args.laf_layers,
                    device=device, lafgrad=True).to(device)
    elif args.gnn == 'gat':
        model = GNN(gnn_type='gat', num_tasks=dataset.num_tasks, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                    virtual_node=False, graph_pooling=args.pooling, laf_fun=args.laf, laf_layers=args.laf_layers,
                    device=device, lafgrad=True).to(device)
    else:
        raise ValueError('Invalid GNN type')

    #model.load_state_dict(torch.load("{}_fixed_training.mdl".format(args.filename)))
    model_params = []
    laf_params = []
    for n, p in model.named_parameters():
        if n == 'pool.weights' or n == 'pool.alpha' or n == 'pool.beta' or n == 'pool.N' or n == 'pool.M':
            laf_params.append(p)
        else:
            model_params.append(p)

    optimizer = optim.Adam(model_params, lr=0.001)
    if laf_params:
        optimizerlaf = optim.Adam(laf_params, lr=0.0001)
    else:
        optimizerlaf = None

    flog = open(args.filename + ".log", 'a')
    valid_curve = []
    test_curve = []
    train_curve = []

    if 'classification' in dataset.task_type:
        best_val = 0
    else:
        best_val = 1e12

    flog.write("{}\n".format(args))
    bflag = True
    for epoch in range(1, args.epochs + 1):
        start = time.time()
        print("=====Epoch {}".format(epoch))
        flog.write("=====Epoch {}\n".format(epoch))

        print('Training...')
        #if args.alternate == 'false':
        #    train_perf = train(model, device, train_loader, optimizer, optimizerlaf, dataset.task_type, evaluator)
        #else:
        #    train_perf = train(model, device, train_loader, optimizer, None, dataset.task_type, evaluator)
        #if args.alternate == 'false':
        train_perf = train(model, device, train_loader, optimizer, optimizerlaf, dataset.task_type, evaluator, alternate=args.alternate)

        print('Evaluating...')
        # train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf})
        print("Time {:.4f}s".format(time.time() - start))
        if laf_params != []:
            print("{}\n".format(torch.norm(model.pool.weights)))
        flog.write("{}\n".format({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf}))
        flog.write("Time: {}\n".format(time.time()-start))
        if laf_params != []:
            flog.write("Laf weights norm: {}\n".format(torch.norm(model.pool.weights, dim=0)))
        flog.flush()

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

        if 'classification' in dataset.task_type:
            if valid_perf[dataset.eval_metric] >= best_val:
                best_val = valid_perf[dataset.eval_metric]
                if not args.filename == '':
                    if args.alternate == 'true':
                        torch.save(model.state_dict(), '{}_fixed_training.mdl'.format(args.filename))
                    else:
                        torch.save(model.state_dict(), '{}.mdl'.format(args.filename))
        else:
            if valid_perf[dataset.eval_metric] <= best_val:
                best_val = valid_perf[dataset.eval_metric]
                if not args.filename == '':
                    if args.alternate == 'true':
                        torch.save(model.state_dict(), '{}_fixed_training.mdl'.format(args.filename))
                    else:
                        torch.save(model.state_dict(), '{}.mdl'.format(args.filename))

    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    flog.write('Finished training!\n')
    flog.write('Best validation score: {}\n'.format(valid_curve[best_val_epoch]))
    flog.write('Test score: {}\n'.format(test_curve[best_val_epoch]))
    flog.flush()

    if not args.filename == '':
        torch.save({'Val': valid_curve[best_val_epoch], 'Test': test_curve[best_val_epoch],
                    'Train': train_curve[best_val_epoch], 'BestTrain': best_train}, args.filename + "_fixed_training.res")

  #  if args.alternate == 'true'and optimizerlaf:
  #      args.alternate = 'false'
  #      flog.write("===================LAF TRAINING=================\n")
  #      valid_curve = []
  #      test_curve = []
  #      train_curve = []

  #      if 'classification' in dataset.task_type:
  #          best_val = 0
  #      else:
  #          best_val = 1e12
  #      for epoch in range(1, args.epochs + 1):
  #          start = time.time()
  #          print("=====Epoch {}".format(epoch))
  #          flog.write("=====Epoch {}\n".format(epoch))

  #          print('Training...')
  #          train_perf = train(model, device, train_loader, optimizerlaf, None, dataset.task_type, evaluator)

  #          print('Evaluating...')
  #          # train_perf = eval(model, device, train_loader, evaluator)
  #          valid_perf = eval(model, device, valid_loader, evaluator)
  #          test_perf = eval(model, device, test_loader, evaluator)

  #          print({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf})
  #          print("Time {:.4f}s".format(time.time() - start))
  #          #print("{}\n".format(torch.norm(model.pool.weights)))
  #          flog.write("{}\n".format({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf}))
  #          flog.write("Time: {}\n".format(time.time()-start))
  #          #flog.write("Laf weights norm: {}\n".format(torch.norm(model.pool.weights, dim=0)))
  #          flog.flush()

  #          train_curve.append(train_perf[dataset.eval_metric])
  #          valid_curve.append(valid_perf[dataset.eval_metric])
  #          test_curve.append(test_perf[dataset.eval_metric])

  #          if 'classification' in dataset.task_type:
  #              if valid_perf[dataset.eval_metric] >= best_val:
  #                  best_val = valid_perf[dataset.eval_metric]
  #                  if not args.filename == '':
  #                      torch.save(model.state_dict(), '{}_laf_training.mdl'.format(args.filename))
  #          else:
  #              if valid_perf[dataset.eval_metric] <= best_val:
  #                  best_val = valid_perf[dataset.eval_metric]
  #                  if not args.filename == '':
  #                      torch.save(model.state_dict(), '{}_laf_training.mdl'.format(args.filename))

  #      if 'classification' in dataset.task_type:
  #          best_val_epoch = np.argmax(np.array(valid_curve))
  #          best_train = max(train_curve)
  #      else:
  #          best_val_epoch = np.argmin(np.array(valid_curve))
  #          best_train = min(train_curve)

  #      print('Finished training!')
  #      print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
  #      print('Test score: {}'.format(test_curve[best_val_epoch]))

  #      flog.write('Finished training!\n')
  #      flog.write('Best validation score: {}\n'.format(valid_curve[best_val_epoch]))
  #      flog.write('Test score: {}\n'.format(test_curve[best_val_epoch]))
  #      flog.flush()

  #      if not args.filename == '':
  #          torch.save({'Val': valid_curve[best_val_epoch], 'Test': test_curve[best_val_epoch],
  #                      'Train': train_curve[best_val_epoch], 'BestTrain': best_train}, args.filename + "_laf_training.res")
    flog.close()
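
If args.filename is set, the loop above keeps only the best-on-validation checkpoint on disk. A minimal sketch for restoring it afterwards and re-running the test evaluation, assuming model, test_loader, evaluator, dataset and device are still in scope:

# Minimal sketch: reload the best checkpoint saved during the loop above.
# The file name mirrors the save paths used in the training loop.
if args.filename != '':
    ckpt = '{}_fixed_training.mdl'.format(args.filename) \
        if args.alternate == 'true' else '{}.mdl'.format(args.filename)
    model.load_state_dict(torch.load(ckpt, map_location=device))
    best_test = eval(model, device, test_loader, evaluator)
    print('Best-checkpoint test score: {}'.format(
        best_test[dataset.eval_metric]))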
Example #22
def main():
    args = get_args()
    config = process_config(args)
    print(config)

    if config.get('seed') is not None:
        random.seed(config.seed)
        torch.manual_seed(config.seed)
        np.random.seed(config.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(config.seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ### automatic dataloading and splitting

    sys.stdin = In()

    dataset = PygGraphPropPredDataset(name=config.dataset_name)

    if config.feature == 'full':
        pass
    elif config.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(config.dataset_name)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=config.hyperparams.batch_size,
                              shuffle=True,
                              num_workers=config.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=config.hyperparams.batch_size,
                              shuffle=False,
                              num_workers=config.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=config.hyperparams.batch_size,
                             shuffle=False,
                             num_workers=config.num_workers)

    model = Net(config.architecture, num_tasks=dataset.num_tasks).to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=config.hyperparams.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config.hyperparams.step_size,
        gamma=config.hyperparams.decay_rate)

    valid_curve = []
    test_curve = []
    train_curve = []
    trainL_curve = []

    writer = SummaryWriter(config.directory)

    ts_fk_algo_hp = str(config.time_stamp) + '_' \
                    + str(config.commit_id[0:7]) + '_' \
                    + str(config.architecture.methods) + '_' \
                    + str(config.architecture.pooling) + '_' \
                    + str(config.architecture.JK) + '_' \
                    + str(config.architecture.layers) + '_' \
                    + str(config.architecture.hidden) + '_' \
                    + str(config.architecture.variants.BN) + '_' \
                    + str(config.architecture.dropout) + '_' \
                    + str(config.hyperparams.learning_rate) + '_' \
                    + str(config.hyperparams.step_size) + '_' \
                    + str(config.hyperparams.decay_rate) + '_' \
                    + 'B' + str(config.hyperparams.batch_size) + '_' \
                    + 'S' + str(config.seed if config.get('seed') is not None else "na") + '_' \
                    + 'W' + str(config.num_workers if config.get('num_workers') is not None else "na")
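    # ts_fk_algo_hp is a unique TensorBoard tag for this run: timestamp,
    # commit id, architecture choices and hyperparameters are concatenated so
    # that curves from different configurations can be compared side by side.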

    for epoch in range(1, config.hyperparams.epochs + 1):
        print("Epoch {} training...".format(epoch))
        train_loss = train(model, device, train_loader, optimizer,
                           dataset.task_type)

        scheduler.step()

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print('Train:', train_perf[dataset.eval_metric], 'Validation:',
              valid_perf[dataset.eval_metric], 'Test:',
              test_perf[dataset.eval_metric], 'Train loss:', train_loss)

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])
        trainL_curve.append(train_loss)

        writer.add_scalars(
            config.dataset_name,
            {ts_fk_algo_hp + '/traP': train_perf[dataset.eval_metric]}, epoch)
        writer.add_scalars(
            config.dataset_name,
            {ts_fk_algo_hp + '/valP': valid_perf[dataset.eval_metric]}, epoch)
        writer.add_scalars(
            config.dataset_name,
            {ts_fk_algo_hp + '/tstP': test_perf[dataset.eval_metric]}, epoch)
        writer.add_scalars(config.dataset_name,
                           {ts_fk_algo_hp + '/traL': train_loss}, epoch)

    writer.close()

    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print(
        'Finished test: {}, Validation: {}, epoch: {}, best train: {}, best loss: {}'
        .format(test_curve[best_val_epoch], valid_curve[best_val_epoch],
                best_val_epoch, best_train, min(trainL_curve)))
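
The scalars logged above land in TensorBoard event files under config.directory. A minimal sketch for reading them back programmatically; note that add_scalars writes each sub-tag into its own subdirectory, so the path below is a hypothetical placeholder:

# Minimal sketch: read scalar curves back from a TensorBoard event directory.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator('runs/some_run_dir')  # hypothetical path
acc.Reload()
for tag in acc.Tags()['scalars']:
    values = [(e.step, e.value) for e in acc.Scalars(tag)]
    print(tag, values[:3])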
Example #23
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-code2 data with PyTorch Geometric')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gcn-virtual',
        help=
        'GNN type: gin, gin-virtual, gcn, or gcn-virtual (default: gcn-virtual)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument('--max_seq_len',
                        type=int,
                        default=5,
                        help='maximum sequence length to predict (default: 5)')
    parser.add_argument(
        '--num_vocab',
        type=int,
        default=5000,
        help=
        'the number of vocabulary used for sequence prediction (default: 5000)'
    )
    parser.add_argument(
        '--num_layer',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='input batch size for training (default: 128)')
    parser.add_argument('--epochs',
                        type=int,
                        default=25,
                        help='number of epochs to train (default: 25)')
    parser.add_argument('--random_split', action='store_true')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-code2",
                        help='dataset name (default: ogbg-code2)')

    parser.add_argument('--filename',
                        type=str,
                        default="",
                        help='filename to output result (default: )')
    args = parser.parse_args()
    print(args)

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)

    seq_len_list = np.array([len(seq) for seq in dataset.data.y])
    print('Target sequences of length <= {}: {:.2f}%.'.format(
        args.max_seq_len,
        100 * np.sum(seq_len_list <= args.max_seq_len) / len(seq_len_list)))

    split_idx = dataset.get_idx_split()

    if args.random_split:
        print('Using random split')
        perm = torch.randperm(len(dataset))
        num_train, num_valid, num_test = len(split_idx['train']), len(
            split_idx['valid']), len(split_idx['test'])
        split_idx['train'] = perm[:num_train]
        split_idx['valid'] = perm[num_train:num_train + num_valid]
        split_idx['test'] = perm[num_train + num_valid:]

        assert (len(split_idx['train']) == num_train)
        assert (len(split_idx['valid']) == num_valid)
        assert (len(split_idx['test']) == num_test)

    # print(split_idx['train'])
    # print(split_idx['valid'])
    # print(split_idx['test'])

    # train_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['train']]
    # valid_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['valid']]
    # test_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['test']]
    # print('#train')
    # print(len(train_method_name))
    # print('#valid')
    # print(len(valid_method_name))
    # print('#test')
    # print(len(test_method_name))

    # train_method_name_set = set(train_method_name)
    # valid_method_name_set = set(valid_method_name)
    # test_method_name_set = set(test_method_name)

    # # unique method name
    # print('#unique train')
    # print(len(train_method_name_set))
    # print('#unique valid')
    # print(len(valid_method_name_set))
    # print('#unique test')
    # print(len(test_method_name_set))

    # # unique valid/test method name
    # print('#valid unseen during training')
    # print(len(valid_method_name_set - train_method_name_set))
    # print('#test unseen during training')
    # print(len(test_method_name_set - train_method_name_set))

    ### building vocabulary for sequence prediction. Only use training data.

    vocab2idx, idx2vocab = get_vocab_mapping(
        [dataset.data.y[i] for i in split_idx['train']], args.num_vocab)

    # test encoder and decoder
    # for data in dataset:
    #     # PyG >= 1.5.0
    #     print(data.y)
    #
    #     # PyG 1.4.3
    #     # print(data.y[0])
    #     data = encode_y_to_arr(data, vocab2idx, args.max_seq_len)
    #     print(data.y_arr[0])
    #     decoded_seq = decode_arr_to_seq(data.y_arr[0], idx2vocab)
    #     print(decoded_seq)
    #     print('')

    ## test augment_edge
    # data = dataset[2]
    # print(data)
    # data_augmented = augment_edge(data)
    # print(data_augmented)

    ### set the transform function
    # augment_edge: add next-token edge as well as inverse edges. add edge attributes.
    # encode_y_to_arr: add y_arr to PyG data object, indicating the array representation of a sequence.
    dataset.transform = transforms.Compose([
        augment_edge,
        lambda data: encode_y_to_arr(data, vocab2idx, args.max_seq_len)
    ])

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    nodetypes_mapping = pd.read_csv(
        os.path.join(dataset.root, 'mapping', 'typeidx2type.csv.gz'))
    nodeattributes_mapping = pd.read_csv(
        os.path.join(dataset.root, 'mapping', 'attridx2attr.csv.gz'))

    print(nodeattributes_mapping)

    ### Encoding node features into emb_dim vectors.
    ### The following three node features are used.
    # 1. node type
    # 2. node attribute
    # 3. node depth
    node_encoder = ASTNodeEncoder(args.emb_dim,
                                  num_nodetypes=len(nodetypes_mapping['type']),
                                  num_nodeattributes=len(
                                      nodeattributes_mapping['attr']),
                                  max_depth=20)

    if args.gnn == 'gin':
        model = GNN(num_vocab=len(vocab2idx),
                    max_seq_len=args.max_seq_len,
                    node_encoder=node_encoder,
                    num_layer=args.num_layer,
                    gnn_type='gin',
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=False).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(num_vocab=len(vocab2idx),
                    max_seq_len=args.max_seq_len,
                    node_encoder=node_encoder,
                    num_layer=args.num_layer,
                    gnn_type='gin',
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=True).to(device)
    elif args.gnn == 'gcn':
        model = GNN(num_vocab=len(vocab2idx),
                    max_seq_len=args.max_seq_len,
                    node_encoder=node_encoder,
                    num_layer=args.num_layer,
                    gnn_type='gcn',
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=False).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(num_vocab=len(vocab2idx),
                    max_seq_len=args.max_seq_len,
                    node_encoder=node_encoder,
                    num_layer=args.num_layer,
                    gnn_type='gcn',
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=True).to(device)
    else:
        raise ValueError('Invalid GNN type')

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    print(f'#Params: {sum(p.numel() for p in model.parameters())}')

    valid_curve = []
    test_curve = []
    train_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer)

        print('Evaluating...')
        train_perf = eval(
            model,
            device,
            train_loader,
            evaluator,
            arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))
        valid_perf = eval(
            model,
            device,
            valid_loader,
            evaluator,
            arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))
        test_perf = eval(
            model,
            device,
            test_loader,
            evaluator,
            arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))

        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

    print('F1')
    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)
    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if not args.filename == '':
        result_dict = {
            'Val': valid_curve[best_val_epoch],
            'Test': test_curve[best_val_epoch],
            'Train': train_curve[best_val_epoch],
            'BestTrain': best_train
        }
        torch.save(result_dict, args.filename)
Example #24
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbgmol* data with PyTorch Geometric')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help=
        'GNN type: gin, gin-virtual, gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-molhiv",
                        help='dataset name (default: ogbg-molhiv)')

    parser.add_argument('--feature',
                        type=str,
                        default="full",
                        help='full feature or simple feature')
    parser.add_argument("--verbose", "-v", action="store_true")
    parser.add_argument('--mu', type=float, default=0.5, help='hyperparameter')
    parser.add_argument('--num_seeds',
                        type=int,
                        default=10,
                        help='number of seeds (default: 10)')
    args = parser.parse_args()

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    mu = args.mu
    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)
    num_seeds = args.num_seeds
    seeds = list(range(num_seeds))

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    result = np.zeros((4, num_seeds))
    for seed in seeds:
        torch.manual_seed(seed)
        if args.gnn == 'gin':
            model = PREGGNN(gnn_type='gin',
                            num_tasks=dataset.num_tasks,
                            emb_dim=args.emb_dim,
                            drop_ratio=args.drop_ratio,
                            virtual_node=False).to(device)
        elif args.gnn == 'gin-virtual':
            model = PREGGNN(gnn_type='gin',
                            num_tasks=dataset.num_tasks,
                            emb_dim=args.emb_dim,
                            drop_ratio=args.drop_ratio,
                            virtual_node=True).to(device)
        elif args.gnn == 'gcn':
            model = PREGGNN(gnn_type='gcn',
                            num_tasks=dataset.num_tasks,
                            emb_dim=args.emb_dim,
                            drop_ratio=args.drop_ratio,
                            virtual_node=False).to(device)
        elif args.gnn == 'gcn-virtual':
            model = PREGGNN(gnn_type='gcn',
                            num_tasks=dataset.num_tasks,
                            emb_dim=args.emb_dim,
                            drop_ratio=args.drop_ratio,
                            virtual_node=True).to(device)
        else:
            raise ValueError('Invalid GNN type')

        optimizer = optim.Adam(model.parameters(), lr=0.001)

        valid_curve = []
        test_curve = []
        train_curve = []

        for epoch in range(1, args.epochs + 1):
            print("=====Epoch {}".format(epoch))
            print('Training...')
            train(model, device, train_loader, optimizer, dataset.task_type,
                  mu)

            print('Evaluating...')
            train_perf = eval(model, device, train_loader, evaluator)
            valid_perf = eval(model, device, valid_loader, evaluator)
            test_perf = eval(model, device, test_loader, evaluator)

            print({
                'Train': train_perf,
                'Validation': valid_perf,
                'Test': test_perf
            })

            train_curve.append(train_perf[dataset.eval_metric])
            valid_curve.append(valid_perf[dataset.eval_metric])
            test_curve.append(test_perf[dataset.eval_metric])

        if 'classification' in dataset.task_type:
            best_val_epoch = np.argmax(np.array(valid_curve))
            best_train = max(train_curve)
        else:
            best_val_epoch = np.argmin(np.array(valid_curve))
            best_train = min(train_curve)

        print('Finished training!')
        print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
        print('Test score: {}'.format(test_curve[best_val_epoch]))

        result[0][seed] = best_val_epoch
        result[1][seed] = train_curve[best_val_epoch]
        result[2][seed] = valid_curve[best_val_epoch]
        result[3][seed] = test_curve[best_val_epoch]

        if not args.verbose:
            if not os.path.exists("result"):
                os.makedirs("result")

            torch.save(
                {
                    'Val': valid_curve[best_val_epoch],
                    'Test': test_curve[best_val_epoch],
                    'Train': train_curve[best_val_epoch],
                    'BestTrain': best_train,
                    'mu': mu,
                    'valid_curve': valid_curve,
                    'test_curve': test_curve,
                    "train_curve": train_curve,
                    'dataset': args.dataset,
                    "model": "iad" + args.gnn,
                    'epochs': args.epochs
                }, 'result/' + args.dataset + "_preg" + args.gnn + "_" +
                str(mu) + "_" + str(args.epochs) + "_" + str(seed) + "_" +
                str(num_seeds) + ".pth")
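
Since one .pth result file is written per seed, the runs can be summarized afterwards. A minimal sketch, assuming the file-naming scheme used in the torch.save call above and the same variables in scope:

# Minimal sketch: aggregate per-seed results into a mean/std summary.
test_scores = []
for seed in range(num_seeds):
    path = 'result/{}_preg{}_{}_{}_{}_{}.pth'.format(
        args.dataset, args.gnn, mu, args.epochs, seed, num_seeds)
    test_scores.append(torch.load(path)['Test'])
print('Test: {:.4f} +/- {:.4f}'.format(
    np.mean(test_scores), np.std(test_scores)))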
Example #25
def main():
    args = ArgsInit().save_exp()

    if args.use_gpu:
        device = torch.device("cuda:" +
                              str(args.device)) if torch.cuda.is_available(
                              ) else torch.device("cpu")
    else:
        device = torch.device('cpu')

    sub_dir = 'BS_{}'.format(args.batch_size)

    if args.not_extract_node_feature:
        dataset = PygGraphPropPredDataset(name=args.dataset,
                                          transform=add_zeros)
    else:
        if args.aggr == 'add':
            dataset = PygGraphPropPredDataset(
                name=args.dataset, transform=extract_node_feature_add)
        elif args.aggr == 'mean':
            dataset = PygGraphPropPredDataset(
                name=args.dataset, transform=extract_node_feature_mean)
        elif args.aggr == 'max':
            dataset = PygGraphPropPredDataset(
                name=args.dataset, transform=extract_node_feature_max)
        else:
            raise Exception('Unknown Aggregation Type')

        sub_dir = sub_dir + '-NF_{}'.format(args.aggr)

    args.num_tasks = dataset.num_classes
    evaluator = Evaluator(args.dataset)

    logging.info('%s' % args)

    split_idx = dataset.get_idx_split()

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    model = DeeperGCN(args).to(device)

    logging.info(model)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    criterion = torch.nn.CrossEntropyLoss()

    results = {
        'highest_valid': 0,
        'final_train': 0,
        'final_test': 0,
        'highest_train': 0
    }

    start_time = time.time()

    evaluate = True

    for epoch in range(1, args.epochs + 1):
        logging.info("=====Epoch {}".format(epoch))
        logging.info('Training...')

        epoch_loss = train(model, device, train_loader, optimizer, criterion)

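        # For very deep models, evaluating all three splits every epoch is
        # costly, so evaluation is throttled to every `eval_steps` epochs once
        # the depth exceeds `num_layers_threshold`.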
        if args.num_layers > args.num_layers_threshold:
            if epoch % args.eval_steps != 0:
                evaluate = False
            else:
                evaluate = True

        model.print_params(epoch=epoch)

        if evaluate:

            logging.info('Evaluating...')

            train_accuracy = eval(model, device, train_loader, evaluator)
            valid_accuracy = eval(model, device, valid_loader, evaluator)
            test_accuracy = eval(model, device, test_loader, evaluator)

            logging.info({
                'Train': train_accuracy,
                'Validation': valid_accuracy,
                'Test': test_accuracy
            })

            if train_accuracy > results['highest_train']:

                results['highest_train'] = train_accuracy

            if valid_accuracy > results['highest_valid']:
                results['highest_valid'] = valid_accuracy
                results['final_train'] = train_accuracy
                results['final_test'] = test_accuracy

                save_ckpt(model,
                          optimizer,
                          round(epoch_loss, 4),
                          epoch,
                          args.model_save_path,
                          sub_dir,
                          name_post='valid_best')

    logging.info("%s" % results)

    end_time = time.time()
    total_time = end_time - start_time
    logging.info('Total time: {}'.format(
        time.strftime('%H:%M:%S', time.gmtime(total_time))))
Example #26
class OGBPCBADataset(WILDSDataset):
    """
    The OGB-molpcba dataset.
    This dataset is directly adopted from Open Graph Benchmark, and originally curated by MoleculeNet.

    Supported `split_scheme`:
        'official' or 'scaffold', which are equivalent

    Input (x):
        Molecular graphs represented as Pytorch Geometric data objects

    Label (y):
        y represents 128-class binary labels.

    Metadata:
        - scaffold
            Each molecule is annotated with the scaffold ID that the molecule is assigned to.

    Website:
        https://ogb.stanford.edu/docs/graphprop/#ogbg-mol

    Original publication:
        @article{hu2020ogb,
            title={Open Graph Benchmark: Datasets for Machine Learning on Graphs},
            author={W. {Hu}, M. {Fey}, M. {Zitnik}, Y. {Dong}, H. {Ren}, B. {Liu}, M. {Catasta}, J. {Leskovec}},
            journal={arXiv preprint arXiv:2005.00687},
            year={2020}
        }

        @article{wu2018moleculenet,
            title={MoleculeNet: a benchmark for molecular machine learning},
            author={Z. {Wu}, B. {Ramsundar}, E. V {Feinberg}, J. {Gomes}, C. {Geniesse}, A. S {Pappu}, K. {Leswing}, V. {Pande}},
            journal={Chemical science},
            volume={9},
            number={2},
            pages={513--530},
            year={2018},
            publisher={Royal Society of Chemistry}
        }

    License:
        This dataset is distributed under the MIT license.
        https://github.com/snap-stanford/ogb/blob/master/LICENSE
    """

    _dataset_name = 'ogbg-molpcba'
    _versions_dict = {'1.0': {'download_url': None, 'compressed_size': None}}

    def __init__(self,
                 version=None,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):
        self._version = version
        if version is not None:
            raise ValueError(
                'Versioning for OGB-MolPCBA is handled through the OGB package. Please set version=None.'
            )
        # internally call ogb package
        self.ogb_dataset = PygGraphPropPredDataset(name='ogbg-molpcba',
                                                   root=root_dir)

        # set variables
        self._data_dir = self.ogb_dataset.root
        if split_scheme == 'official':
            split_scheme = 'scaffold'
        self._split_scheme = split_scheme
        self._y_type = 'float'  # although the task is binary classification, the targets contain NaN values, so float is needed
        self._y_size = self.ogb_dataset.num_tasks
        self._n_classes = self.ogb_dataset.__num_classes__

        self._split_array = torch.zeros(len(self.ogb_dataset)).long()
        split_idx = self.ogb_dataset.get_idx_split()
        self._split_array[split_idx['train']] = 0
        self._split_array[split_idx['valid']] = 1
        self._split_array[split_idx['test']] = 2

        self._y_array = self.ogb_dataset.data.y

        self._metadata_fields = ['scaffold']

        metadata_file_path = os.path.join(self.ogb_dataset.root, 'raw',
                                          'scaffold_group.npy')
        if not os.path.exists(metadata_file_path):
            download_url(
                'https://snap.stanford.edu/ogb/data/misc/ogbg_molpcba/scaffold_group.npy',
                os.path.join(self.ogb_dataset.root, 'raw'))
        self._metadata_array = torch.from_numpy(
            np.load(metadata_file_path)).reshape(-1, 1).long()

        if torch_geometric.__version__ >= '1.7.0':
            self._collate = PyGCollater(follow_batch=[], exclude_keys=[])
        else:
            self._collate = PyGCollater(follow_batch=[])

        self._metric = Evaluator('ogbg-molpcba')

        super().__init__(root_dir, download, split_scheme)

    def get_input(self, idx):
        return self.ogb_dataset[int(idx)]

    def eval(self, y_pred, y_true, metadata, prediction_fn=None):
        """
        Computes all evaluation metrics.
        Args:
            - y_pred (FloatTensor): Binary logits from a model
            - y_true (LongTensor): Ground-truth labels
            - metadata (Tensor): Metadata
            - prediction_fn (function): A function that turns y_pred into predicted labels. 
                                        Only None is supported because OGB Evaluators accept binary logits
        Output:
            - results (dictionary): Dictionary of evaluation metrics
            - results_str (str): String summarizing the evaluation metrics
        """
        assert prediction_fn is None, "OGBPCBADataset.eval() does not support prediction_fn. Only binary logits accepted"
        input_dict = {"y_true": y_true, "y_pred": y_pred}
        results = self._metric.eval(input_dict)

        return results, f"Average precision: {results['ap']:.3f}\n"
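
A minimal usage sketch for the class above, assuming the standard WILDS data-loading helpers:

# Minimal usage sketch; download=True fetches the data via the OGB package.
from wilds.common.data_loaders import get_train_loader

dataset = OGBPCBADataset(root_dir='data', download=True)
train_data = dataset.get_subset('train')
train_loader = get_train_loader('standard', train_data, batch_size=32)
for x, y_true, metadata in train_loader:
    break  # x is a batched PyG graph, y_true holds 128 binary (or NaN) labels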
Example #27
def main():
    parser = argparse.ArgumentParser(description='OGBG-MolHiv')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_layers', type=int, default=5)
    parser.add_argument('--emb_dim', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--eval',
                        action='store_true',
                        help='If not set, we will only do the training part.')
    parser.add_argument('--eval_batch_size', type=int, default=2048)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    print(f"Running on {device}")

    dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(name='ogbg-molhiv')
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    val_loader = DataLoader(dataset[split_idx["valid"]],
                            batch_size=args.eval_batch_size,
                            shuffle=False,
                            num_workers=0)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.eval_batch_size,
                             shuffle=False,
                             num_workers=0)

    model = GCN(args.emb_dim,
                num_classes=dataset.num_tasks,
                num_layers=args.num_layers,
                dropout=args.dropout).to(device)

    logger = Logger(args.runs, args)
    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            t0 = time.time()
            loss = train(model, device, train_loader, optimizer)
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch: {:.4f} s'.format(np.mean(dur)))

            if not args.eval:
                continue

            val_rocauc = test(model, device, val_loader,
                              evaluator)[dataset.eval_metric]
            test_rocauc = test(model, device, test_loader,
                               evaluator)[dataset.eval_metric]
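            # The train-split metric is skipped here to save time; a 0.0
            # placeholder is logged alongside the validation/test ROC-AUC.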
            logger.add_result(run, (0.0, val_rocauc, test_rocauc))

            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Valid: {val_rocauc:.2f} '
                      f'Test: {test_rocauc:.2f}')

        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
Example #28
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-ppa data with PyTorch Geometric')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help=
        'GNN type: gin, gin-virtual, gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--pooling',
                        type=str,
                        default='mean',
                        help='Pooling technique for graph embedding')
    parser.add_argument('--laf',
                        type=str,
                        default='mean',
                        help='Init function if laf pooling is specified')
    parser.add_argument(
        '--laf_layers',
        type=str,
        default='false',
        help=
        'If set to true, internal layers will be initialized with laf function'
    )
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-ppa",
                        help='dataset name (default: ogbg-ppa)')
    parser.add_argument('--filename',
                        type=str,
                        default="",
                        help='filename to output result (default: )')
    parser.add_argument('--seed', type=int, default=92, help='torch seed')
    args = parser.parse_args()

    print(args)

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting

    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin',
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=False,
                    graph_pooling=args.pooling,
                    laf_fun=args.laf,
                    device=args.device).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin',
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=True,
                    graph_pooling=args.pooling,
                    laf_fun=args.laf,
                    device=args.device).to(device)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn',
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=False,
                    graph_pooling=args.pooling,
                    laf_fun=args.laf,
                    device=args.device).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn',
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=True,
                    graph_pooling=args.pooling,
                    laf_fun=args.laf,
                    device=args.device).to(device)
    else:
        raise ValueError('Invalid GNN type')

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []

    best_val = 0

    flog = open(args.filename + ".log", 'w')
    flog.write("{}\n".format(args))
    for epoch in range(1, args.epochs + 1):
        start = time.time()
        print("=====Epoch {}".format(epoch))
        flog.write("=====Epoch {}\n".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })
        print("Time {:.4f}s".format(time.time() - start))
        flog.write("{}\tTime: {}s\n".format(
            {
                'Train': train_perf,
                'Validation': valid_perf,
                'Test': test_perf
            },
            time.time() - start))
        flog.flush()

        train_curve.append(train_perf['acc'])
        valid_curve.append(valid_perf['acc'])
        test_curve.append(test_perf['acc'])

        if valid_perf[dataset.eval_metric] >= best_val:
            best_val = valid_perf[dataset.eval_metric]
            if not args.filename == '':
                torch.save(model.state_dict(), '{}.mdl'.format(args.filename))

    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    flog.write('Finished training!\n')
    flog.write('Best validation score: {}\n'.format(
        valid_curve[best_val_epoch]))
    flog.write('Test score: {}\n'.format(test_curve[best_val_epoch]))
    flog.flush()

    if not args.filename == '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename + ".res")
Example #29
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-ppi data with PyTorch Geometric')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help=
        'GNN type: gin, gin-virtual, gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-ppi",
                        help='dataset name (default: ogbg-ppi)')

    parser.add_argument('--filename',
                        type=str,
                        default="",
                        help='filename to output result (default: )')
    args = parser.parse_args()

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting

    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin',
                    num_class=37,
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=False).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin',
                    num_class=37,
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=True).to(device)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn',
                    num_class=37,
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=False).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn',
                    num_class=37,
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=True).to(device)
    else:
        raise ValueError('Invalid GNN type')

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        train_curve.append(train_perf['acc'])
        valid_curve.append(valid_perf['acc'])
        test_curve.append(test_perf['acc'])

    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if not args.filename == '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)
Example #30
    'Transferred from pretrained Mol-BBBP model (Damaged features)',
    'self-transfer':
    'Transferred from Mol-HIV source split',
    'self-transfer-damaged':
    'Transferred from Mol-HIV source split (Damaged features)'
}

BATCH_SIZE = 64

# ---------------------------------------------------
# Data
# ---------------------------------------------------

# Mol-BBBP
bbbp_dataset = PygGraphPropPredDataset(name='ogbg-molbbbp')
bbbp_split_idx = bbbp_dataset.get_idx_split()

train_loader = DataLoader(bbbp_dataset[bbbp_split_idx["train"]],
                          batch_size=BATCH_SIZE,
                          shuffle=True)
valid_loader = DataLoader(bbbp_dataset[bbbp_split_idx["valid"]],
                          batch_size=BATCH_SIZE,
                          shuffle=False)
test_loader = DataLoader(bbbp_dataset[bbbp_split_idx["test"]],
                         batch_size=BATCH_SIZE,
                         shuffle=False)

bbbp_evaluator = Evaluator('ogbg-molbbbp')

# Mol-HIV
dataset = PygGraphPropPredDataset(name='ogbg-molhiv')