Example #1
import os

import dgl
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from ray import tune

# Project-local helpers assumed importable in the original repo:
# get_dataset, get_optimizer, average_test, train, test, test_OGB, GAT, AGNN.


def train_ray(opt, checkpoint_dir=None, data_dir="../data"):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data = get_dataset(opt)
    g = data[0]
    # move the graph to the requested GPU; opt['gpu'] < 0 keeps it on CPU
    if opt['gpu'] >= 0:
        g = g.int().to(opt['gpu'])

    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    num_feats = features.shape[1]
    n_classes = data.num_classes
    n_edges = data.graph.number_of_edges()
    print("""----Data statistics------'
    #Edges %d
    #Classes %d
    #Train samples %d
    #Val samples %d
    #Test samples %d""" %
          (n_edges, n_classes, train_mask.int().sum().item(),
           val_mask.int().sum().item(), test_mask.int().sum().item()))

    # add self loop
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)
    n_edges = g.number_of_edges()
    # create model
    heads = ([opt['num_heads']] * opt['num_layers']) + [opt['num_out_heads']]

    models = []
    optimizers = []
    datas = [g for _ in range(opt['num_init'])]

    for split in range(opt['num_init']):
        if opt['model'] == 'GAT':
            model = GAT(g, opt['num_layers'], num_feats, opt['num_hidden'],
                        n_classes, heads, F.elu, opt['in_drop'],
                        opt['attn_drop'], opt['negative_slope'],
                        opt['residual'], opt)
        elif opt['model'] == 'AGNN':
            model = AGNN(g, opt['num_layers'], num_feats, opt['num_hidden'],
                         n_classes, opt['in_drop'], opt)
        else:
            raise ValueError("unknown model: %s" % opt['model'])

        train_this = train
        model = model.to(device)
        # wrap before storing, so the saved/loaded state dicts stay consistent
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
        models.append(model)

        parameters = [p for p in model.parameters() if p.requires_grad]

        optimizer = get_optimizer(opt['optimizer'],
                                  parameters,
                                  lr=opt['lr'],
                                  weight_decay=opt['weight_decay'])
        optimizers.append(optimizer)

        # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint
        # should be restored.
        if checkpoint_dir:
            checkpoint = os.path.join(checkpoint_dir, "checkpoint")
            model_state, optimizer_state = torch.load(checkpoint)
            model.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    for epoch in range(1, opt['epochs']):
        loss = np.mean([
            train_this(model, optimizer, features, train_mask,
                       labels)[0].item()
            for model, optimizer in zip(models, optimizers)
        ])
        train_accs, val_accs, tmp_test_accs = average_test(models, datas)
        with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            best = np.argmax(val_accs)
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save(
                (models[best].state_dict(), optimizers[best].state_dict()),
                path)
        tune.report(loss=loss,
                    accuracy=np.mean(val_accs),
                    test_acc=np.mean(tmp_test_accs),
                    train_acc=np.mean(train_accs))
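
A minimal launch sketch for the older Ray Tune function API this trainable targets; the search-space keys and num_samples below are illustrative assumptions, not values from the source:

from functools import partial
from ray import tune

# Illustrative config; the real keys come from the project's option parser.
config = {
    'model': 'GAT', 'dataset': 'Cora', 'gpu': -1, 'num_init': 4, 'epochs': 200,
    'num_layers': 1, 'num_hidden': 8, 'num_heads': 8, 'num_out_heads': 1,
    'in_drop': 0.6, 'attn_drop': 0.6, 'negative_slope': 0.2, 'residual': False,
    'optimizer': 'adam', 'lr': tune.loguniform(1e-3, 1e-1), 'weight_decay': 5e-4,
}

analysis = tune.run(
    partial(train_ray, data_dir="../data"),  # Tune supplies config and checkpoint_dir
    config=config,
    num_samples=8,
    metric="accuracy",
    mode="max",
)
print(analysis.best_config)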
Example #2
def train_ray_int(opt, checkpoint_dir=None, data_dir="../data"):
    # (imports as in Example #1)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data = get_dataset(opt)
    g = data[0]
    # move the graph to the requested GPU; opt['gpu'] < 0 keeps it on CPU
    if opt['gpu'] >= 0:
        g = g.int().to(opt['gpu'])

    # if opt["num_splits"] > 0:
    #   dataset.data = set_train_val_test_split(
    #     23 * np.random.randint(0, opt["num_splits"]),  # random prime 23 to make the splits 'more' random. Could remove
    #     dataset.data,
    #     num_development=5000 if opt["dataset"] == "CoauthorCS" else 1500)

    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    num_feats = features.shape[1]
    n_classes = data.num_classes
    n_edges = data.graph.number_of_edges()
    print("""----Data statistics------'
  #Edges %d
  #Classes %d
  #Train samples %d
  #Val samples %d
  #Test samples %d""" %
          (n_edges, n_classes, train_mask.int().sum().item(),
           val_mask.int().sum().item(), test_mask.int().sum().item()))

    # add self loop
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)
    n_edges = g.number_of_edges()
    # create model
    heads = ([opt['num_heads']] * opt['num_layers']) + [opt['num_out_heads']]
    if opt['model'] == 'GAT':
        model = GAT(g, opt['num_layers'], num_feats, opt['num_hidden'],
                    n_classes, heads, F.elu, opt['in_drop'], opt['attn_drop'],
                    opt['negative_slope'], opt['residual'], opt)
    elif opt['model'] == 'AGNN':
        model = AGNN(g, opt['num_layers'], num_feats, opt['num_hidden'],
                     n_classes, opt['in_drop'], opt)
    else:
        raise ValueError("unknown model: %s" % opt['model'])

    model = model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = get_optimizer(opt["optimizer"],
                              parameters,
                              lr=opt["lr"],
                              weight_decay=opt["weight_decay"])

    if checkpoint_dir:
        checkpoint = os.path.join(checkpoint_dir, "checkpoint")
        model_state, optimizer_state = torch.load(checkpoint)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
    train_this = train
    this_test = test_OGB if opt['dataset'] == 'ogbn-arxiv' else test
    best_time = best_epoch = train_acc = val_acc = test_acc = 0
    for epoch in range(1, opt["epoch"]):
        loss = train_this(model, optimizer, features, train_mask,
                          labels)[0].item()
        if opt["no_early"]:
            tmp_train_acc, tmp_val_acc, tmp_test_acc = this_test(model, g)
            best_time = opt['time']
        else:
            tmp_train_acc, tmp_val_acc, tmp_test_acc = this_test(model, g)
        if tmp_val_acc > val_acc:
            best_epoch = epoch
            train_acc = tmp_train_acc
            val_acc = tmp_val_acc
            test_acc = tmp_test_acc
        with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(loss=loss,
                    accuracy=val_acc,
                    test_acc=test_acc,
                    train_acc=train_acc,
                    best_time=best_time,
                    best_epoch=best_epoch)
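
Since both trainables save each checkpoint as a (model.state_dict(), optimizer.state_dict()) tuple, a checkpoint written by a trial can be reloaded outside of Tune; trial_logdir and the checkpoint directory name below are illustrative assumptions:

import os
import torch

ckpt = os.path.join(trial_logdir, "checkpoint_000050", "checkpoint")  # illustrative path
model_state, optimizer_state = torch.load(ckpt, map_location="cpu")
model.load_state_dict(model_state)          # model rebuilt with the trial's opt
optimizer.load_state_dict(optimizer_state)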