Example #1
def train(args):
    set_random_seed(args.seed)
    device = torch.device(
        f'cuda:{args.device}'
        if torch.cuda.is_available() and args.device >= 0 else 'cpu')

    g, labels, num_classes, train_idx, val_idx, test_idx = load_data(
        args.dataset, args.ogb_root, args.seed, device)
    features = g.ndata['feat']

    model = SuperGAT(features.shape[1], args.num_hidden, num_classes,
                     args.num_heads, args.attn_type, args.neg_sample_ratio,
                     args.dropout, args.dropout).to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)
    for epoch in range(args.epochs):
        model.train()
        logits, attn_loss = model(g, features)
        loss = F.cross_entropy(logits[train_idx], labels[train_idx])
        loss += args.attn_loss_weight * attn_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_acc = accuracy(logits[train_idx], labels[train_idx])
        val_acc = evaluate(model, g, features, labels, val_idx)
        print('Epoch {:04d} | Loss {:.4f} | Train Acc {:.4f} | Val Acc {:.4f}'.
              format(epoch, loss.item(), train_acc, val_acc))
    acc = evaluate(model, g, features, labels, test_idx)
    print('Test Accuracy {:.4f}'.format(acc))
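As a usage sketch for this example, the Namespace below lists exactly the attributes this train() reads; the field names come from the code above, while the values (including attn_type='MX') are hypothetical placeholders.

import argparse

# Hypothetical invocation: field names match the args accessed above,
# values are illustrative defaults only.
args = argparse.Namespace(
    seed=0, device=0, dataset='cora', ogb_root='./ogb',
    num_hidden=8, num_heads=8, attn_type='MX', neg_sample_ratio=0.5,
    dropout=0.6, lr=0.005, weight_decay=5e-4,
    epochs=200, attn_loss_weight=2.0,
)
train(args)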
Example #2
def train(args):
    set_random_seed(args.seed)
    g = load_graphs(os.path.join(args.data_path, 'neighbor_graph.bin'))[0][0]
    feats = load_info(os.path.join(args.data_path, 'in_feats.pkl'))

    model = HetGNN(feats['author'].shape[-1], args.num_hidden, g.ntypes)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)
    neg_sampler = RatioNegativeSampler()
    for epoch in range(args.epochs):
        model.train()
        embeds = model(g, feats)
        score = model.calc_score(g, embeds)
        neg_g = construct_neg_graph(g, neg_sampler)
        neg_score = model.calc_score(neg_g, embeds)
        logits = torch.cat([score, neg_score])  # (2A*E,)
        labels = torch.cat(
            [torch.ones(score.shape[0]),
             torch.zeros(neg_score.shape[0])])
        loss = F.binary_cross_entropy_with_logits(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print('Epoch {:d} | Loss {:.4f}'.format(epoch, loss.item()))
    model.eval()
    with torch.no_grad():
        final_embeds = model(g, feats)
        with open(args.save_node_embed_path, 'wb') as f:
            pickle.dump(final_embeds, f)
        print('Final node embeddings saved to', args.save_node_embed_path)
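A possible downstream step, shown only as a sketch: reload the pickled embeddings saved above. The file name stands in for args.save_node_embed_path, and the dict-of-node-type layout is an assumption based on the heterogeneous model output.

import pickle

# Hypothetical follow-up: read the embeddings back for downstream tasks.
# 'node_embed.pkl' is a placeholder for args.save_node_embed_path; the value
# is assumed to be a dict keyed by node type, as produced by model(g, feats).
with open('node_embed.pkl', 'rb') as f:
    embeds = pickle.load(f)
print({ntype: emb.shape for ntype, emb in embeds.items()})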
Example #3
def train(args):
    set_random_seed(args.seed)
    device = get_device(args.device)
    data, g, feats, labels, predict_ntype, relations, neighbor_sizes, \
        pos, pos_threshold, train_mask, val_mask, test_mask = load_data(args.dataset, device)
    bgs = [g[rel] for rel in relations]  # neighbor-target node bipartite graphs
    mgs = [
        dgl.add_self_loop(dgl.remove_self_loop(
            dgl.metapath_reachable_graph(g, mp))).to(device)
        for mp in data.metapaths
    ]  # metapath-based homogeneous neighbor graphs

    model = HeCo([feat.shape[1] for feat in feats], args.num_hidden,
                 args.feat_drop, args.attn_drop, neighbor_sizes,
                 len(data.metapaths), args.tau, args.lambda_).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    for epoch in range(args.epochs):
        model.train()
        loss = model(bgs, mgs, feats, pos)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print('Epoch {:d} | Train Loss {:.4f}'.format(epoch, loss.item()))
    evaluate(model, mgs, feats[0], labels, data.num_classes, train_mask,
             test_mask, args.seed)
Example #4
def train(args):
    set_random_seed(args.seed)
    g, labels, num_classes, train_idx, val_idx, test_idx = load_data(args.dataset, args.ogb_root)
    print('Precomputing aggregated neighbor features...')
    features = preprocess(g, g.ndata['feat'], args.num_hops)  # List[tensor(N, d_in)] of length num_hops + 1
    train_feats = [feat[train_idx] for feat in features]
    val_feats = [feat[val_idx] for feat in features]
    test_feats = [feat[test_idx] for feat in features]

    model = SIGN(
        g.ndata['feat'].shape[1], args.num_hidden, num_classes, args.num_hops,
        args.num_layers, args.dropout
    )
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    for epoch in range(args.epochs):
        model.train()
        logits = model(train_feats)
        loss = F.cross_entropy(logits, labels[train_idx])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_acc = accuracy(logits, labels[train_idx])
        val_acc = evaluate(model, val_feats, labels[val_idx])
        print('Epoch {:d} | Train Loss {:.4f} | Train Acc {:.4f} | Val Acc {:.4f}'.format(
            epoch, loss.item(), train_acc, val_acc
        ))
    test_acc = evaluate(model, test_feats, labels[test_idx])
    print('Test Acc {:.4f}'.format(test_acc))
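The preprocess() call above is what lets SIGN train without touching the graph afterwards; below is a minimal sketch of that kind of hop-wise precomputation, assuming a homogeneous DGL graph and mean aggregation. The repository's own preprocess may differ (e.g., in the normalization it applies).

import dgl.function as fn
import torch

def precompute_hop_feats(g, feat, num_hops):
    # Sketch only: return [X, A X, ..., A^r X] using mean-over-neighbors
    # aggregation, so the training loop never needs the graph again.
    feats = [feat]
    with torch.no_grad():
        for _ in range(num_hops):
            g.ndata['h'] = feats[-1]
            g.update_all(fn.copy_u('h', 'm'), fn.mean('m', 'h'))
            feats.append(g.ndata.pop('h'))
    return feats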
Example #5
def train(args):
    set_random_seed(args.seed)
    if args.hetero:
        data = HETERO_DATASET[args.dataset]()
        g = data[0]
        gs = [
            dgl.metapath_reachable_graph(g, metapath)
            for metapath in data.metapaths
        ]
        for i in range(len(gs)):
            gs[i] = dgl.add_self_loop(dgl.remove_self_loop(gs[i]))
        ntype = data.predict_ntype
        num_classes = data.num_classes
        features = g.nodes[ntype].data['feat']
        labels = g.nodes[ntype].data['label']
        train_mask = g.nodes[ntype].data['train_mask']
        val_mask = g.nodes[ntype].data['val_mask']
        test_mask = g.nodes[ntype].data['test_mask']
    else:
        data = DATASET[args.dataset]()
        gs = data[0]
        num_classes = data.num_classes
        features = gs[0].ndata['feat']
        labels = gs[0].ndata['label']
        train_mask = gs[0].ndata['train_mask']
        val_mask = gs[0].ndata['val_mask']
        test_mask = gs[0].ndata['test_mask']

    model = HAN(len(gs), features.shape[1], args.num_hidden, num_classes,
                args.num_heads, args.dropout)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)

    score = micro_macro_f1_score if args.task == 'clf' else nmi_ari_score
    if args.task == 'clf':
        metrics = 'Epoch {:d} | Train Loss {:.4f} | Train Micro-F1 {:.4f} | Train Macro-F1 {:.4f}' \
                  ' | Val Micro-F1 {:.4f} | Val Macro-F1 {:.4f}'
    else:
        metrics = 'Epoch {:d} | Train Loss {:.4f} | Train NMI {:.4f} | Train ARI {:.4f}' \
                  ' | Val NMI {:.4f} | Val ARI {:.4f}'
    for epoch in range(args.epochs):
        model.train()
        logits = model(gs, features)
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_metrics = score(logits[train_mask], labels[train_mask])
        val_metrics = score(logits[val_mask], labels[val_mask])
        print(metrics.format(epoch, loss.item(), *train_metrics, *val_metrics))

    test_metrics = evaluate(model, gs, features, labels, test_mask, score)
    if args.task == 'clf':
        print('Test Micro-F1 {:.4f} | Test Macro-F1 {:.4f}'.format(
            *test_metrics))
    else:
        print('Test NMI {:.4f} | Test ARI {:.4f}'.format(*test_metrics))
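To illustrate the dgl.metapath_reachable_graph calls used in Example #5, here is a toy sketch on a tiny author-paper heterograph (the graph and metapath names are made up for illustration): the metapath ['ap', 'pa'] yields a homogeneous graph over 'author' nodes connecting authors who share a paper.

import dgl
import torch

# Toy illustration only: both authors wrote paper 0, so the metapath
# ['ap', 'pa'] connects every author pair reachable through a shared paper.
toy_g = dgl.heterograph({
    ('author', 'ap', 'paper'): (torch.tensor([0, 1]), torch.tensor([0, 0])),
    ('paper', 'pa', 'author'): (torch.tensor([0, 0]), torch.tensor([0, 1])),
})
aa = dgl.metapath_reachable_graph(toy_g, ['ap', 'pa'])
print(aa.num_nodes(), aa.edges())  # homogeneous graph over 'author' nodes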
Example #6
def train(args):
    set_random_seed(args.seed)
    data = IMDbDataset()
    g = data[0]
    predict_ntype = data.predict_ntype
    features = g.ndata['feat']  # Dict[str, tensor(N_i, d_i)]
    labels = g.nodes[predict_ntype].data['label']
    train_mask = g.nodes[predict_ntype].data['train_mask']
    val_mask = g.nodes[predict_ntype].data['val_mask']
    test_mask = g.nodes[predict_ntype].data['test_mask']

    print('Generating metapath-based graphs...')
    mgs = {
        ntype:
        [metapath_based_graph(g, metapath) for metapath in METAPATHS[ntype]]
        for ntype in METAPATHS
    }
    for ntype in mgs:
        mgs[ntype][0].ndata['feat'] = g.nodes[ntype].data['feat']
    metapaths_ntype = {
        ntype: [to_ntype_list(g, metapath) for metapath in METAPATHS[ntype]]
        for ntype in METAPATHS
    }

    model = MAGNNMultiLayer(
        args.num_layers, metapaths_ntype,
        {ntype: feat.shape[1]
         for ntype, feat in features.items()}, args.num_hidden,
        data.num_classes, args.num_heads, args.encoder, args.dropout)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)
    for epoch in range(args.epochs):
        model.train()
        logits = model(mgs, features)[predict_ntype]
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_metrics = micro_macro_f1_score(logits[train_mask],
                                             labels[train_mask])
        print(
            'Epoch {:d} | Train Loss {:.4f} | Train Micro-F1 {:.4f} | Train Macro-F1 {:.4f}'
            .format(epoch, loss.item(), *train_metrics))
        if (epoch + 1) % 10 == 0:
            val_metrics = evaluate(model, mgs, features, predict_ntype, labels,
                                   val_mask)
            print('Val Micro-F1 {:.4f} | Val Macro-F1 {:.4f}'.format(
                *val_metrics))

    test_metrics = evaluate(model, mgs, features, predict_ntype, labels,
                            test_mask)
    print('Test Micro-F1 {:.4f} | Test Macro-F1 {:.4f}'.format(*test_metrics))
Example #7
def train(args):
    set_random_seed(args.seed)
    device = get_device(args.device)
    g, labels, num_classes, train_idx, val_idx, test_idx, evaluator = \
        load_data(args.ogb_path, device)
    load_pretrained_node_embed(g, args.node_embed_path)
    g = g.to(device)

    sampler = MultiLayerNeighborSampler(
        list(range(args.neighbor_size, args.neighbor_size + args.num_layers))
    )
    train_loader = NodeDataLoader(g, {'paper': train_idx}, sampler, device=device, batch_size=args.batch_size)
    val_loader = NodeDataLoader(g, {'paper': val_idx}, sampler, device=device, batch_size=args.batch_size)
    test_loader = NodeDataLoader(g, {'paper': test_idx}, sampler, device=device, batch_size=args.batch_size)

    model = RHGNN(
        {ntype: g.nodes[ntype].data['feat'].shape[1] for ntype in g.ntypes},
        args.num_hidden, num_classes, args.num_rel_hidden, args.num_rel_hidden, args.num_heads,
        g.ntypes, g.canonical_etypes, 'paper', args.num_layers, args.dropout, residual=args.residual
    ).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=len(train_loader) * args.epochs, eta_min=args.lr / 100
    )
    warnings.filterwarnings('ignore', 'Setting attributes on ParameterDict is not supported')
    for epoch in range(args.epochs):
        model.train()
        logits, train_labels, losses = [], [], []
        for input_nodes, output_nodes, blocks in tqdm(train_loader):
            batch_labels = labels[output_nodes['paper']]
            batch_logits = model(blocks, blocks[0].srcdata['feat'])
            loss = F.cross_entropy(batch_logits, batch_labels.squeeze(dim=1))

            logits.append(batch_logits.detach().cpu())
            train_labels.append(batch_labels.detach().cpu())
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            torch.cuda.empty_cache()

        train_acc = accuracy(torch.cat(logits, dim=0), torch.cat(train_labels, dim=0), evaluator)
        val_acc = evaluate(val_loader, device, model, labels, evaluator)
        test_acc = evaluate(test_loader, device, model, labels, evaluator)
        print('Epoch {:d} | Train Loss {:.4f} | Train Acc {:.4f} | Val Acc {:.4f} | Test Acc {:.4f}'.format(
            epoch, torch.tensor(losses).mean().item(), train_acc, val_acc, test_acc
        ))
    # embed = model.inference(g, g.ndata['feat'], device, args.batch_size)
    # test_acc = accuracy(embed[test_idx], labels[test_idx], evaluator)
    test_acc = evaluate(test_loader, device, model, labels, evaluator)
    print('Test Acc {:.4f}'.format(test_acc))
Example #8
def train(args):
    set_random_seed(args.seed)
    data = DATASET[args.dataset]()
    g = data[0]
    predict_ntype = data.predict_ntype
    features = {ntype: g.nodes[ntype].data['feat'] for ntype in g.ntypes}
    labels = g.nodes[predict_ntype].data['label']
    train_mask = g.nodes[predict_ntype].data['train_mask']
    val_mask = g.nodes[predict_ntype].data['val_mask']
    test_mask = g.nodes[predict_ntype].data['test_mask']

    model = HGConv(
        {ntype: g.nodes[ntype].data['feat'].shape[1]
         for ntype in g.ntypes}, args.num_hidden, data.num_classes,
        args.num_heads, g.ntypes, g.canonical_etypes, predict_ntype,
        args.num_layers, args.dropout, args.residual)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)

    score = micro_macro_f1_score if args.task == 'clf' else nmi_ari_score
    if args.task == 'clf':
        metrics = 'Epoch {:d} | Train Loss {:.4f} | Train Micro-F1 {:.4f} | Train Macro-F1 {:.4f}' \
                  ' | Val Micro-F1 {:.4f} | Val Macro-F1 {:.4f}' \
                  ' | Test Micro-F1 {:.4f} | Test Macro-F1 {:.4f}'
    else:
        metrics = 'Epoch {:d} | Train Loss {:.4f} | Train NMI {:.4f} | Train ARI {:.4f}' \
                  ' | Val NMI {:.4f} | Val ARI {:.4f}' \
                  ' | Test NMI {:.4f} | Test ARI {:.4f}'
    warnings.filterwarnings(
        'ignore', 'Setting attributes on ParameterDict is not supported')
    for epoch in range(args.epochs):
        model.train()
        logits = model(g, features)
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_metrics = score(logits[train_mask], labels[train_mask])
        val_metrics = evaluate(model, g, features, labels, val_mask, score)
        test_metrics = evaluate(model, g, features, labels, test_mask, score)
        print(
            metrics.format(epoch, loss.item(), *train_metrics, *val_metrics,
                           *test_metrics))

    test_metrics = evaluate(model, g, features, labels, test_mask, score)
    if args.task == 'clf':
        print('Test Micro-F1 {:.4f} | Test Macro-F1 {:.4f}'.format(
            *test_metrics))
    else:
        print('Test NMI {:.4f} | Test ARI {:.4f}'.format(*test_metrics))
Example #9
def train(args):
    set_random_seed(args.seed)
    data = DATASET[args.dataset]()
    g = data[0]
    predict_ntype = data.predict_ntype
    features = {ntype: g.nodes[ntype].data['feat'] for ntype in g.ntypes}
    labels = g.nodes[predict_ntype].data['label']
    train_mask = g.nodes[predict_ntype].data['train_mask']
    val_mask = g.nodes[predict_ntype].data['val_mask']
    test_mask = g.nodes[predict_ntype].data['test_mask']

    model = HGT(
        {ntype: g.nodes[ntype].data['feat'].shape[1]
         for ntype in g.ntypes}, args.num_hidden, data.num_classes,
        args.num_heads, g.ntypes, g.canonical_etypes, predict_ntype,
        args.num_layers, args.dropout)
    optimizer = optim.AdamW(model.parameters())
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer,
                                              args.max_lr,
                                              total_steps=args.epochs)
    metrics = 'Epoch {:d} | Train Loss {:.4f} | Train Micro-F1 {:.4f} | Train Macro-F1 {:.4f}' \
              ' | Val Micro-F1 {:.4f} | Val Macro-F1 {:.4f}' \
              ' | Test Micro-F1 {:.4f} | Test Macro-F1 {:.4f}'
    warnings.filterwarnings(
        'ignore', 'Setting attributes on ParameterDict is not supported')
    for epoch in range(args.epochs):
        model.train()
        logits = model(g, features)
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        scheduler.step()

        train_scores = micro_macro_f1_score(logits[train_mask],
                                            labels[train_mask])
        val_scores = evaluate(model, g, features, labels, val_mask,
                              micro_macro_f1_score)
        test_scores = evaluate(model, g, features, labels, test_mask,
                               micro_macro_f1_score)
        print(
            metrics.format(epoch, loss.item(), *train_scores, *val_scores,
                           *test_scores))
    test_scores = evaluate(model, g, features, labels, test_mask,
                           micro_macro_f1_score)
    print('Test Micro-F1 {:.4f} | Test Macro-F1 {:.4f}'.format(*test_scores))
Example #10
def train(args):
    set_random_seed(args.seed)
    device = get_device(args.device)
    g, labels, num_classes, train_idx, val_idx, test_idx = \
        load_data(args.dataset, args.ogb_root, device)
    feats = g.ndata['feat']

    if args.base_model == 'Linear':
        base_model = nn.Linear(feats.shape[1], num_classes)
    else:
        base_model = MLP(feats.shape[1], args.num_hidden, num_classes,
                         args.num_layers, args.dropout)
    base_model = base_model.to(device)
    train_base_model(base_model, feats, labels, train_idx, val_idx, test_idx,
                     args)
    correct_and_smooth(base_model, g, feats, labels, train_idx, val_idx,
                       test_idx, args)
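correct_and_smooth() is the post-processing stage of Correct & Smooth; as a rough sketch of the "smooth" half only, the snippet below propagates soft labels over the symmetrically normalized adjacency for a few iterations. It assumes 1-D integer labels and is not the repository's implementation.

import dgl.function as fn
import torch
import torch.nn.functional as F

def smooth_labels(g, soft_labels, labels, train_idx, num_iters=10, alpha=0.8):
    # Sketch only: clamp training nodes to their one-hot ground truth, then
    # repeatedly mix each node's prediction with D^-1/2 A D^-1/2 times its
    # neighbors' predictions (assumes labels is a 1-D LongTensor).
    with torch.no_grad():
        norm = g.in_degrees().float().clamp(min=1).pow(-0.5).unsqueeze(1)
        y = soft_labels.clone()
        y[train_idx] = F.one_hot(labels[train_idx], soft_labels.shape[1]).float()
        for _ in range(num_iters):
            g.ndata['h'] = y * norm
            g.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
            y = (1 - alpha) * y + alpha * norm * g.ndata.pop('h')
    return y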
Example #11
def train(args):
    set_random_seed(args.seed)
    device = get_device(args.device)
    data = RatingKnowledgeGraphDataset(args.dataset)
    user_item_graph = data.user_item_graph
    knowledge_graph = dgl.sampling.sample_neighbors(
        data.knowledge_graph, data.knowledge_graph.nodes(), args.neighbor_size, replace=True
    )

    train_eids, test_eids = train_test_split(
        torch.arange(user_item_graph.num_edges()), train_size=args.train_size,
        random_state=args.seed
    )
    sampler = MultiLayerNeighborSampler([args.neighbor_size] * args.num_hops)
    train_loader = KGCNEdgeDataLoader(
        user_item_graph, train_eids, sampler, knowledge_graph,
        device=device, batch_size=args.batch_size
    )
    test_loader = KGCNEdgeDataLoader(
        user_item_graph, test_eids, sampler, knowledge_graph,
        device=device, batch_size=args.batch_size
    )

    model = KGCN(args.num_hidden, args.neighbor_size, args.aggregator, args.num_hops, *data.get_num()).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    for epoch in range(args.epochs):
        model.train()
        losses = []
        for _, pair_graph, blocks in train_loader:
            scores = model(pair_graph, blocks)
            loss = F.binary_cross_entropy(scores, pair_graph.edata['label'])
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Epoch {:d} | Train Loss {:.4f} | Train AUC {:.4f} | Train F1 {:.4f} | Test AUC {:.4f} | Test F1 {:.4f}'.format(
            epoch, sum(losses) / len(losses), *evaluate(model, train_loader), *evaluate(model, test_loader)
        ))
Example #12
def train(args):
    set_random_seed(args.seed)
    data = DBLPFourAreaDataset()
    g = data[0]
    metapaths = data.metapaths
    predict_ntype = data.predict_ntype
    generate_one_hot_id(g)
    features = g.ndata['feat']  # Dict[str, tensor(N_i, d_i)]
    labels = g.nodes[predict_ntype].data['label']
    train_idx = g.nodes[predict_ntype].data['train_mask'].nonzero(
        as_tuple=True)[0]
    val_idx = g.nodes[predict_ntype].data['val_mask'].nonzero(as_tuple=True)[0]
    test_idx = g.nodes[predict_ntype].data['test_mask'].nonzero(
        as_tuple=True)[0]
    out_shape = (g.num_nodes(predict_ntype), data.num_classes)

    print('Generating metapath-based graphs (this is a bit slow)...')
    mgs = [metapath_based_graph(g, metapath) for metapath in metapaths]
    mgs[0].ndata['feat'] = features[predict_ntype]
    sampler = MultiLayerNeighborSampler([args.neighbor_size])
    collators = [NodeCollator(mg, None, sampler) for mg in mgs]
    train_dataloader = DataLoader(train_idx, batch_size=args.batch_size)
    val_dataloader = DataLoader(val_idx, batch_size=args.batch_size)
    test_dataloader = DataLoader(test_idx, batch_size=args.batch_size)

    metapaths_ntype = [to_ntype_list(g, metapath) for metapath in metapaths]
    model = MAGNNMinibatch(
        predict_ntype, metapaths_ntype,
        {ntype: feat.shape[1]
         for ntype, feat in features.items()}, args.num_hidden,
        data.num_classes, args.num_heads, args.encoder, args.dropout)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)
    for epoch in range(args.epochs):
        model.train()
        losses = []
        train_logits = torch.zeros(out_shape)
        for batch in train_dataloader:
            gs = [collator.collate(batch)[2][0] for collator in collators]
            train_logits[batch] = logits = model(gs, features)
            loss = F.cross_entropy(logits, labels[batch])
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        train_metrics = micro_macro_f1_score(train_logits[train_idx],
                                             labels[train_idx])
        print(
            'Epoch {:d} | Train Loss {:.4f} | Train Micro-F1 {:.4f} | Train Macro-F1 {:.4f}'
            .format(epoch,
                    torch.tensor(losses).mean().item(), *train_metrics))
        if (epoch + 1) % 10 == 0:
            val_metrics = evaluate(out_shape, collators, val_dataloader, model,
                                   features, labels)
            print('Val Micro-F1 {:.4f} | Val Macro-F1 {:.4f}'.format(
                *val_metrics))

    test_metrics = evaluate(out_shape, collators, test_dataloader, model,
                            features, labels)
    print('Test Micro-F1 {:.4f} | Test Macro-F1 {:.4f}'.format(*test_metrics))