Example #1
def test_set_batch_info(idtype):
    ctx = F.ctx()

    g1 = dgl.rand_graph(30, 100).astype(idtype).to(F.ctx())
    g2 = dgl.rand_graph(40, 200).astype(idtype).to(F.ctx())
    bg = dgl.batch([g1, g2])
    batch_num_nodes = F.astype(bg.batch_num_nodes(), idtype)
    batch_num_edges = F.astype(bg.batch_num_edges(), idtype)

    # test homogeneous node subgraph
    sg_n = dgl.node_subgraph(bg, list(range(10, 20)) + list(range(50, 60)))
    induced_nodes = sg_n.ndata['_ID']
    induced_edges = sg_n.edata['_ID']
    new_batch_num_nodes = _get_subgraph_batch_info(bg.ntypes, [induced_nodes],
                                                   batch_num_nodes)
    new_batch_num_edges = _get_subgraph_batch_info(bg.canonical_etypes,
                                                   [induced_edges],
                                                   batch_num_edges)
    sg_n.set_batch_num_nodes(new_batch_num_nodes)
    sg_n.set_batch_num_edges(new_batch_num_edges)
    subg_n1, subg_n2 = dgl.unbatch(sg_n)
    subg1 = dgl.node_subgraph(g1, list(range(10, 20)))
    subg2 = dgl.node_subgraph(g2, list(range(20, 30)))
    assert subg_n1.num_edges() == subg1.num_edges()
    assert subg_n2.num_edges() == subg2.num_edges()

    # test homogeneous edge subgraph
    sg_e = dgl.edge_subgraph(bg,
                             list(range(40, 70)) + list(range(150, 200)),
                             relabel_nodes=False)
    induced_nodes = F.arange(0, bg.num_nodes(), idtype)
    induced_edges = sg_e.edata['_ID']
    new_batch_num_nodes = _get_subgraph_batch_info(bg.ntypes, [induced_nodes],
                                                   batch_num_nodes)
    new_batch_num_edges = _get_subgraph_batch_info(bg.canonical_etypes,
                                                   [induced_edges],
                                                   batch_num_edges)
    sg_e.set_batch_num_nodes(new_batch_num_nodes)
    sg_e.set_batch_num_edges(new_batch_num_edges)
    subg_e1, subg_e2 = dgl.unbatch(sg_e)
    subg1 = dgl.edge_subgraph(g1, list(range(40, 70)), relabel_nodes=False)
    subg2 = dgl.edge_subgraph(g2, list(range(50, 100)), relabel_nodes=False)
    assert subg_e1.num_nodes() == subg1.num_nodes()
    assert subg_e2.num_nodes() == subg2.num_nodes()
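A minimal standalone sketch (not from the test suite above) of the induced-ID mappings that dgl.node_subgraph stores on the returned graph:

import dgl
import torch

g = dgl.rand_graph(10, 30)                        # 10 nodes, 30 random edges
sg = dgl.node_subgraph(g, torch.tensor([2, 5, 7]))

# Nodes are relabeled to 0..N-1; the original IDs are recorded in ndata/edata,
# and any node/edge features of g are copied onto sg automatically.
print(sg.ndata[dgl.NID])   # original node IDs of the kept nodes
print(sg.edata[dgl.EID])   # original edge IDs of the induced edges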
Example #2
def prepare_data_for_year(graph,
                          features,
                          labels,
                          years,
                          current_year,
                          history,
                          exclude_class=None,
                          device=None,
                          backend='dgl',
                          num_hops=None):
    print("Preparing data for year", current_year)
    # Prepare subgraph
    subg_node_mask = ((years <= current_year) & (years >=
                                                 (current_year - history)))
    subg_nodes = torch.arange(features.size(0))[subg_node_mask]

    subg_num_nodes = subg_nodes.size(0)

    if backend == 'dgl':
        print("Creating dgl subgraph")
        subg = dgl.node_subgraph(graph, subg_nodes)
        print("Subgraph type:", type(subg))
        subg.set_n_initializer(dgl.init.zero_initializer)
    elif backend == 'geometric':
        print("Creating geometric subgraph")
        subg, __edge_attr = tg.utils.subgraph(subg_node_mask,
                                              graph,
                                              relabel_nodes=True,
                                              num_nodes=subg_num_nodes)

    else:
        raise ValueError("Unkown backend: " + backend)

    subg_features = features[subg_nodes]
    subg_labels = labels[subg_nodes]
    subg_years = years[subg_nodes]

    # Prepare masks wrt *subgraph*
    # train_nid = torch.arange(subg_num_nodes)[subg_years < current_year]
    # test_nid = torch.arange(subg_num_nodes)[subg_years == current_year]
    # print("[{}] #Training: {}".format(current_year, train_nid.size(0)))
    # print("[{}] #Test    : {}".format(current_year, test_nid.size(0)))

    train_nid = subg_years < current_year
    test_nid = subg_years == current_year
    print("[{}] #Training: {}".format(current_year, train_nid.sum()))
    print("[{}] #Test    : {}".format(current_year, test_nid.sum()))

    if device is not None:
        subg = subg.to(device)
        subg_features = subg_features.to(device)
        subg_labels = subg_labels.to(device)
        # train_nid = train_nid.to(device)
        # test_nid = test_nid.to(device)
    return subg, subg_features, subg_labels, subg_years, train_nid, test_nid
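A hypothetical driver loop for this helper; start_year, end_year, and evaluate_one_year are placeholders, not from the source:

for year in range(start_year, end_year + 1):
    subg, subg_feat, subg_labels, subg_years, train_nid, test_nid = \
        prepare_data_for_year(graph, features, labels, years,
                              current_year=year, history=5,
                              device=device, backend='dgl')
    # train on nodes from earlier years, evaluate on nodes from the current year
    evaluate_one_year(subg, subg_feat, subg_labels, train_nid, test_nid)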
Example #3
    def sample_subgraph(self, target_nodes):
        """
        Args:
            target_nodes(Tensor): Tensor of two target nodes
        Returns:
            subgraph(DGLGraph): subgraph
        """
        sample_nodes = [target_nodes]
        frontiers = target_nodes

        for i in range(self.hop):
            frontiers = self.graph.out_edges(frontiers)[1]
            frontiers = torch.unique(frontiers)
            sample_nodes.append(frontiers)

        sample_nodes = torch.cat(sample_nodes)
        sample_nodes = torch.unique(sample_nodes)
        subgraph = dgl.node_subgraph(self.graph, sample_nodes)

        # Each node should have unique node id in the new subgraph
        u_id = int(
            torch.nonzero(subgraph.ndata[NID] == int(target_nodes[0]),
                          as_tuple=False))
        v_id = int(
            torch.nonzero(subgraph.ndata[NID] == int(target_nodes[1]),
                          as_tuple=False))
        # Remove the link between the target nodes in positive subgraphs.
        # Removing edges rearranges NID and EID, which would lose the original NID and EID.

        # if dgl.__version__[:5] < '0.6.0':
        #     nids = subgraph.ndata[NID]
        #     eids = subgraph.edata[EID]
        #     if subgraph.has_edges_between(u_id, v_id):
        #         link_id = subgraph.edge_ids(u_id, v_id, return_uv=True)[2]
        #         subgraph.remove_edges(link_id)
        #         eids = eids[subgraph.edata[EID]]
        #     if subgraph.has_edges_between(v_id, u_id):
        #         link_id = subgraph.edge_ids(v_id, u_id, return_uv=True)[2]
        #         subgraph.remove_edges(link_id)
        #         eids = eids[subgraph.edata[EID]]
        #     subgraph.ndata[NID] = nids
        #     subgraph.edata[EID] = eids

        if subgraph.has_edges_between(u_id, v_id):
            link_id = subgraph.edge_ids(u_id, v_id, return_uv=True)[2]
            subgraph.remove_edges(link_id)
        if subgraph.has_edges_between(v_id, u_id):
            link_id = subgraph.edge_ids(v_id, u_id, return_uv=True)[2]
            subgraph.remove_edges(link_id)

        z = drnl_node_labeling(subgraph, u_id, v_id)
        subgraph.ndata['z'] = z

        return subgraph
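A hypothetical way to drive this SEAL-style sampler for link prediction; the sampler instance and the loop over edges are placeholders, not from the source:

u, v = sampler.graph.edges()
for eid in range(sampler.graph.num_edges()):
    pair = torch.stack([u[eid], v[eid]])     # the two target nodes
    subg = sampler.sample_subgraph(pair)     # enclosing subgraph around them
    # subg.ndata['z'] now holds the DRNL structural labels computed above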
Example #4
 def sample_subgraph_dgl(whole_data):
     data = whole_data[0]  # dgl data
     # find data with different labels
     # random walk
     start = [
         random.randint(0, data.num_nodes() - 1)
         for i in range(self.subgraphs)
     ]
     traces, _ = dgl.sampling.random_walk_with_restart(
         data,
         start,
         length=self.sample_batch_size,
         restart_prob=1 / self.sample_walk_length)
     # Build one node-induced subgraph per walk, dropping any -1 padding.
     subgraphs = [
         dgl.node_subgraph(data, traces[i][traces[i] != -1])
         for i in range(traces.size(0))
     ]
     return subgraphs
Example #5
    def forward(self, graph: dgl.DGLGraph, feature: torch.Tensor):
        score = self.score_layer(graph, feature).squeeze()
        perm, next_batch_num_nodes = topk(
            score, self.ratio, get_batch_id(graph.batch_num_nodes()),
            graph.batch_num_nodes())
        feature = feature[perm] * self.non_linearity(score[perm]).view(-1, 1)
        graph = dgl.node_subgraph(graph, perm)

        # node_subgraph currently does not support batched graphs:
        # the 'batch_num_nodes' of the resulting subgraph is None,
        # so we set 'batch_num_nodes' manually here.
        # Since global pooling does not use 'batch_num_edges',
        # we can leave it as None / unchanged.
        graph.set_batch_num_nodes(next_batch_num_nodes)

        return graph, feature, perm
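The comment above is the key point: dgl.node_subgraph returns a graph without batch information, so per-graph metadata must be restored by hand. A minimal sketch outside the layer, assuming two toy graphs and a fixed set of kept nodes:

import dgl
import torch

g1 = dgl.rand_graph(5, 10)
g2 = dgl.rand_graph(6, 12)
bg = dgl.batch([g1, g2])

# Keep three nodes of each graph (after batching, nodes 0-4 belong to g1
# and nodes 5-10 to g2).
sub = dgl.node_subgraph(bg, torch.tensor([0, 1, 2, 5, 6, 7]))

# Restore the per-graph node counts so that batched readout still works.
sub.set_batch_num_nodes(torch.tensor([3, 3]))
sub.ndata['h'] = torch.ones(sub.num_nodes(), 1)
print(dgl.readout_nodes(sub, 'h', op='sum'))   # tensor([[3.], [3.]])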
Example #6
    def forward(self,
                graph: DGLGraph,
                feat: Tensor,
                select_idx: Tensor,
                non_select_idx: Optional[Tensor] = None,
                scores: Optional[Tensor] = None,
                pool_graph=False):
        """
        Description
        -----------
        Perform graph pooling.

        Parameters
        ----------
        graph : dgl.DGLGraph
            The input graph
        feat : torch.Tensor
            The input node feature
        select_idx : torch.Tensor
            The indices in the fine graph of the nodes kept in the
            coarse graph, obtained from previous graph pooling layers.
        non_select_idx : torch.Tensor, optional
            The indices not included in the output graph.
            default: :obj:`None`
        scores : torch.Tensor, optional
            Scores for nodes, used for pooling and scaling.
            default: :obj:`None`
        pool_graph : bool, optional
            Whether to perform graph pooling on the graph topology.
            default: :obj:`False`
        """
        if self.use_gcn:
            feat = self.down_sample_gcn(graph, feat)

        feat = feat[select_idx]
        if scores is not None:
            feat = feat * scores.unsqueeze(-1)

        if pool_graph:
            num_node_batch = graph.batch_num_nodes()
            graph = dgl.node_subgraph(graph, select_idx)
            graph.set_batch_num_nodes(num_node_batch)
            return feat, graph
        else:
            return feat
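A hypothetical call site for this pooling module; pool, scores, and select_idx are assumed to come from a preceding scoring/top-k step and are not defined in this snippet:

feat_coarse, graph_coarse = pool(graph, feat, select_idx,
                                 scores=scores, pool_graph=True)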
Example #7
 def extract_graph_new(self, G, u_id, v_id):
     v_id += self.num_user
     static_u = torch.zeros(len(self.class_values))
     static_v = torch.zeros(len(self.class_values))
     start0 = time.time()
     u_nodes, v, e_ids_1 = G.in_edges(v_id, "all")
     u, v_nodes, e_ids_2 = G.out_edges(u_id, "all")
     e_ids = []
     nodes = torch.cat([u_nodes, v_nodes])
     for i in range(u_nodes.shape[0]):
         if u_nodes[i] == u_id:
             e_ids.append(e_ids_1[i])
     for i in range(v_nodes.shape[0]):
         if v_nodes[i] == v_id:
             e_ids.append(e_ids_2[i])
     #start1 = time.time()
     #print(start1 - start0)
     subg = dgl.node_subgraph(G, nodes)
     #start2 = time.time()
     #print(start2 - start1)
     subg.ndata['node_label'] = torch.zeros([subg.num_nodes(), 4])
     pid = subg.ndata[dgl.NID]
     #start3 = time.time()
     #print(start3 - start2)
     for i in range(pid.shape[0]):
         if pid[i] == u_id:
             e_u = i
             subg.ndata['node_label'][i, 0] = 1
         elif pid[i] == v_id:
             e_v = i
             subg.ndata['node_label'][i, 1] = 1
         elif pid[i] in u:
             subg.ndata['node_label'][i, 2] = 1
         elif pid[i] in v:
             subg.ndata['node_label'][i, 3] = 1
     subg = dgl.remove_edges(subg, e_ids)
     start6 = time.time()
     print(start6 - start0)
     print()
     return subg
Example #9
    def step(self, pick_nodes, kick_nodes=[]):
        # Keep the picked nodes sorted so the subgraph's node order matches
        # the order of the stacked feature/label tensors below.
        self.picked_nodes = sorted(set(self.picked_nodes + pick_nodes.tolist()))
        self.picked_nodes = [
            x for x in self.picked_nodes if x not in kick_nodes
        ]
        self.graph = dgl.node_subgraph(self.dataset, self.picked_nodes)
        features = torch.stack([
            x for i, x in enumerate(self.dataset.ndata['feat'])
            if i in self.picked_nodes
        ])
        labels = torch.stack([
            x for i, x in enumerate(self.dataset.ndata['label'])
            if i in self.picked_nodes
        ])
        loss_fcn = torch.nn.CrossEntropyLoss()

        logits = self.GNN(self.graph, features)
        loss = loss_fcn(logits, labels)

        if len(self.picked_nodes) >= 2000:
            self.done = True

        return -loss, self.done
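A hypothetical interaction loop for this environment; env and policy are placeholders (reset() appears to be the companion method shown in Example #16 below):

env.reset()
done = False
while not done:
    pick = policy(env.graph)          # e.g. a tensor of node IDs to add
    reward, done = env.step(pick)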
Example #10
                              lr=args.lr1,
                              weight_decay=args.wd1)
    loss_fn = nn.BCEWithLogitsLoss()

    node_list = list(range(n_node))

    # Step 4: Training epochs ================================================================ #
    best = float('inf')
    cnt_wait = 0
    for epoch in range(args.epochs):
        model.train()
        optimizer.zero_grad()

        sample_idx = random.sample(node_list, sample_size)

        g = dgl.node_subgraph(graph, sample_idx)
        dg = dgl.node_subgraph(diff_graph, sample_idx)

        f = g.ndata.pop('feat')
        ew = dg.edata.pop('edge_weight')

        shuf_idx = np.random.permutation(sample_size)
        sf = f[shuf_idx, :]

        g = g.to(args.device)
        dg = dg.to(args.device)
        f = f.to(args.device)
        ew = ew.to(args.device)

        sf = sf.to(args.device)
Example #11
File: train.py  Project: yuk12/dgl
def train(args, device):
    elliptic_dataset = EllipticDataset(raw_dir=args.raw_dir,
                                       processed_dir=args.processed_dir,
                                       self_loop=True,
                                       reverse_edge=True)

    g, node_mask_by_time = elliptic_dataset.process()
    num_classes = elliptic_dataset.num_classes

    cached_subgraph = []
    cached_labeled_node_mask = []
    for i in range(len(node_mask_by_time)):
        # we add self loop edge when we construct full graph, not here
        node_subgraph = dgl.node_subgraph(graph=g, nodes=node_mask_by_time[i])
        cached_subgraph.append(node_subgraph.to(device))
        valid_node_mask = node_subgraph.ndata['label'] >= 0
        cached_labeled_node_mask.append(valid_node_mask)

    if args.model == 'EvolveGCN-O':
        model = EvolveGCNO(in_feats=int(g.ndata['feat'].shape[1]),
                           n_hidden=args.n_hidden,
                           num_layers=args.n_layers)
    elif args.model == 'EvolveGCN-H':
        model = EvolveGCNH(in_feats=int(g.ndata['feat'].shape[1]),
                           num_layers=args.n_layers)
    else:
        raise NotImplementedError('Unsupported model {}'.format(args.model))
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # split train, valid, test(0-30,31-35,36-48)
    # train/valid/test split follow the paper.
    train_max_index = 30
    valid_max_index = 35
    test_max_index = 48
    time_window_size = args.n_hist_steps
    loss_class_weight = [float(w) for w in args.loss_class_weight.split(',')]
    loss_class_weight = torch.Tensor(loss_class_weight).to(device)

    train_measure = Measure(num_classes=num_classes,
                            target_class=args.eval_class_id)
    valid_measure = Measure(num_classes=num_classes,
                            target_class=args.eval_class_id)
    test_measure = Measure(num_classes=num_classes,
                           target_class=args.eval_class_id)

    test_res_f1 = 0
    for epoch in range(args.num_epochs):
        model.train()
        for i in range(time_window_size, train_max_index + 1):
            g_list = cached_subgraph[i - time_window_size:i + 1]
            predictions = model(g_list)
            # keep only the predictions that have labels
            predictions = predictions[cached_labeled_node_mask[i]]
            labels = cached_subgraph[i].ndata['label'][
                cached_labeled_node_mask[i]].long()
            loss = F.cross_entropy(predictions,
                                   labels,
                                   weight=loss_class_weight)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_measure.append_measures(predictions, labels)

        # get each epoch measures during training.
        cl_precision, cl_recall, cl_f1 = train_measure.get_total_measure()
        train_measure.update_best_f1(cl_f1, epoch)
        # reset measures for next epoch
        train_measure.reset_info()

        print(
            "Train Epoch {} | class {} | precision:{:.4f} | recall: {:.4f} | f1: {:.4f}"
            .format(epoch, args.eval_class_id, cl_precision, cl_recall, cl_f1))

        # eval
        model.eval()
        for i in range(train_max_index + 1, valid_max_index + 1):
            g_list = cached_subgraph[i - time_window_size:i + 1]
            predictions = model(g_list)
            # keep only the node predictions that have labels
            predictions = predictions[cached_labeled_node_mask[i]]
            labels = cached_subgraph[i].ndata['label'][
                cached_labeled_node_mask[i]].long()

            valid_measure.append_measures(predictions, labels)

        # get each epoch measure during eval.
        cl_precision, cl_recall, cl_f1 = valid_measure.get_total_measure()
        valid_measure.update_best_f1(cl_f1, epoch)
        # reset measures for next epoch
        valid_measure.reset_info()

        print(
            "Eval Epoch {} | class {} | precision:{:.4f} | recall: {:.4f} | f1: {:.4f}"
            .format(epoch, args.eval_class_id, cl_precision, cl_recall, cl_f1))

        # early stop
        if epoch - valid_measure.target_best_f1_epoch >= args.patience:
            print("Best eval Epoch {}, Cur Epoch {}".format(
                valid_measure.target_best_f1_epoch, epoch))
            break
        # if cur valid f1 score is best, do test
        if epoch == valid_measure.target_best_f1_epoch:
            print("###################Epoch {} Test###################".format(
                epoch))
            for i in range(valid_max_index + 1, test_max_index + 1):
                g_list = cached_subgraph[i - time_window_size:i + 1]
                predictions = model(g_list)
                # keep only the predictions that have labels
                predictions = predictions[cached_labeled_node_mask[i]]
                labels = cached_subgraph[i].ndata['label'][
                    cached_labeled_node_mask[i]].long()

                test_measure.append_measures(predictions, labels)

            # we get each subgraph measure when testing to match fig 4 in EvolveGCN paper.
            cl_precisions, cl_recalls, cl_f1s = test_measure.get_each_timestamp_measure(
            )
            for index, (sub_p, sub_r, sub_f1) in enumerate(
                    zip(cl_precisions, cl_recalls, cl_f1s)):
                print(
                    "  Test | Time {} | precision:{:.4f} | recall: {:.4f} | f1: {:.4f}"
                    .format(valid_max_index + index + 2, sub_p, sub_r, sub_f1))

            # get each epoch measure during test.
            cl_precision, cl_recall, cl_f1 = test_measure.get_total_measure()
            test_measure.update_best_f1(cl_f1, epoch)
            # reset measures for next test
            test_measure.reset_info()

            test_res_f1 = cl_f1

            print(
                "  Test | Epoch {} | class {} | precision:{:.4f} | recall: {:.4f} | f1: {:.4f}"
                .format(epoch, args.eval_class_id, cl_precision, cl_recall,
                        cl_f1))

    print("Best test f1 is {}, in Epoch {}".format(
        test_measure.target_best_f1, test_measure.target_best_f1_epoch))
    if test_measure.target_best_f1_epoch != valid_measure.target_best_f1_epoch:
        print(
            "The epoch with the best validation measure did not give the best test measure; "
            "please check the test result of Epoch {}, whose f1 is {}".format(
                valid_measure.target_best_f1_epoch, test_res_f1))
Example #12
File: cluster_gcn.py  Project: yifeim/dgl
def main(args):
    torch.manual_seed(args.rnd_seed)
    np.random.seed(args.rnd_seed)
    random.seed(args.rnd_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    multitask_data = set(['ppi'])
    multitask = args.dataset in multitask_data

    # load and preprocess dataset
    data = load_data(args)
    g = data.g
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    labels = g.ndata['label']

    train_nid = np.nonzero(train_mask.data.numpy())[0].astype(np.int64)

    # Normalize features
    if args.normalize:
        feats = g.ndata['feat']
        train_feats = feats[train_mask]
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(train_feats.data.numpy())
        features = scaler.transform(feats.data.numpy())
        g.ndata['feat'] = torch.FloatTensor(features)

    in_feats = g.ndata['feat'].shape[1]
    n_classes = data.num_classes
    n_edges = g.number_of_edges()

    n_train_samples = train_mask.int().sum().item()
    n_val_samples = val_mask.int().sum().item()
    n_test_samples = test_mask.int().sum().item()

    print("""----Data statistics------'
    #Edges %d
    #Classes %d
    #Train samples %d
    #Val samples %d
    #Test samples %d""" %
            (n_edges, n_classes,
            n_train_samples,
            n_val_samples,
            n_test_samples))
    # create GCN model
    if args.self_loop and not args.dataset.startswith('reddit'):
        g = dgl.remove_self_loop(g)
        g = dgl.add_self_loop(g)
        print("adding self-loop edges")
    # metis only support int64 graph
    g = g.long()

    if args.use_pp:
        g.update_all(fn.copy_u('feat', 'm'), fn.sum('m', 'feat_agg'))
        g.ndata['feat'] = torch.cat([g.ndata['feat'], g.ndata['feat_agg']], 1)
        del g.ndata['feat_agg']

    cluster_iterator = dgl.dataloading.GraphDataLoader(
        dgl.dataloading.ClusterGCNSubgraphIterator(
            dgl.node_subgraph(g, train_nid), args.psize, './cache'),
        batch_size=args.batch_size, num_workers=4)
    #cluster_iterator = ClusterIter(
    #    args.dataset, g, args.psize, args.batch_size, train_nid, use_pp=args.use_pp)

    # set device for dataset tensors
    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()
        g = g.int().to(args.gpu)

    print('labels shape:', g.ndata['label'].shape)
    print("features shape, ", g.ndata['feat'].shape)

    model = GraphSAGE(in_feats,
                      args.n_hidden,
                      n_classes,
                      args.n_layers,
                      F.relu,
                      args.dropout,
                      args.use_pp)

    if cuda:
        model.cuda()

    # logger and so on
    log_dir = save_log_dir(args)
    logger = Logger(os.path.join(log_dir, 'loggings'))
    logger.write(args)

    # Loss function
    if multitask:
        print('Using multi-label loss')
        loss_f = nn.BCEWithLogitsLoss()
    else:
        print('Using multi-class loss')
        loss_f = nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # set train_nids to cuda tensor
    if cuda:
        train_nid = torch.from_numpy(train_nid).cuda()
        print("current memory after model before training",
              torch.cuda.memory_allocated(device=train_nid.device) / 1024 / 1024)
    start_time = time.time()
    best_f1 = -1

    for epoch in range(args.n_epochs):
        for j, cluster in enumerate(cluster_iterator):
            # sync with upper level training graph
            if cuda:
                cluster = cluster.to(torch.cuda.current_device())
            model.train()
            # forward
            batch_labels = cluster.ndata['label']
            batch_train_mask = cluster.ndata['train_mask']
            if batch_train_mask.sum().item() == 0:
                continue
            pred = model(cluster)
            loss = loss_f(pred[batch_train_mask],
                          batch_labels[batch_train_mask])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # in PPI case, `log_every` is chosen to log one time per epoch. 
            # Choose your log freq dynamically when you want more info within one epoch
            if j % args.log_every == 0:
                print(f"epoch:{epoch}/{args.n_epochs}, Iteration {j}/"
                      f"{len(cluster_iterator)}:training loss", loss.item())
        print("current memory:",
              torch.cuda.memory_allocated(device=pred.device) / 1024 / 1024)

        # evaluate
        if epoch % args.val_every == 0:
            val_f1_mic, val_f1_mac = evaluate(
                model, g, labels, val_mask, multitask)
            print(
                "Val F1-mic{:.4f}, Val F1-mac{:.4f}". format(val_f1_mic, val_f1_mac))
            if val_f1_mic > best_f1:
                best_f1 = val_f1_mic
                print('new best val f1:', best_f1)
                torch.save(model.state_dict(), os.path.join(
                    log_dir, 'best_model.pkl'))

    end_time = time.time()
    print(f'training time: {end_time - start_time}')

    # test
    if args.use_val:
        model.load_state_dict(torch.load(os.path.join(
            log_dir, 'best_model.pkl')))
    test_f1_mic, test_f1_mac = evaluate(
        model, g, labels, test_mask, multitask)
    print("Test F1-mic{:.4f}, Test F1-mac{:.4f}". format(test_f1_mic, test_f1_mac))
Example #13
    compact_g.ndata['orig_id'] = th.as_tensor(per_type_ids)
    compact_g.ndata[dgl.NTYPE] = th.as_tensor(ntype)
    compact_g.ndata[dgl.NID] = th.as_tensor(uniq_ids)
    compact_g.ndata['inner_node'] = th.as_tensor(np.logical_and(
        uniq_ids >= nid_range[0], uniq_ids <= nid_range[1]))
    local_nids = compact_g.ndata[dgl.NID][compact_g.ndata['inner_node'].bool()]
    assert np.all((local_nids == th.arange(
        local_nids[0], local_nids[-1] + 1)).numpy())
    print('|V|={}'.format(compact_g.number_of_nodes()))
    print('|E|={}'.format(compact_g.number_of_edges()))

    # We need to reshuffle nodes in a partition so that all local nodes are labelled starting from 0.
    reshuffle_nodes = th.arange(compact_g.number_of_nodes())
    reshuffle_nodes = th.cat([reshuffle_nodes[compact_g.ndata['inner_node'].bool()],
                              reshuffle_nodes[compact_g.ndata['inner_node'] == 0]])
    compact_g1 = dgl.node_subgraph(compact_g, reshuffle_nodes)
    compact_g1.ndata['orig_id'] = compact_g.ndata['orig_id'][reshuffle_nodes]
    compact_g1.ndata[dgl.NTYPE] = compact_g.ndata[dgl.NTYPE][reshuffle_nodes]
    compact_g1.ndata[dgl.NID] = compact_g.ndata[dgl.NID][reshuffle_nodes]
    compact_g1.ndata['inner_node'] = compact_g.ndata['inner_node'][reshuffle_nodes]
    compact_g1.edata['orig_id'] = compact_g.edata['orig_id'][compact_g1.edata[dgl.EID]]
    compact_g1.edata[dgl.ETYPE] = compact_g.edata[dgl.ETYPE][compact_g1.edata[dgl.EID]]
    compact_g1.edata['inner_edge'] = compact_g.edata['inner_edge'][compact_g1.edata[dgl.EID]]

    # reshuffle edges on ETYPE as node_subgraph relabels edges
    idx = th.argsort(compact_g1.edata[dgl.ETYPE])
    u, v = compact_g1.edges()
    u = u[idx]
    v = v[idx]
    compact_g2 = dgl.graph((u, v))
    compact_g2.ndata['orig_id'] = compact_g1.ndata['orig_id']
Example #14
    def forward(self, graph: DGLGraph, feat: Tensor, e_feat=None):
        # top-k pool first
        if e_feat is None:
            e_feat = torch.ones((graph.number_of_edges(), ),
                                dtype=feat.dtype,
                                device=feat.device)
        batch_num_nodes = graph.batch_num_nodes()
        x_score = self.calc_info_score(graph, feat, e_feat)
        perm, next_batch_num_nodes = topk(x_score, self.ratio,
                                          get_batch_id(batch_num_nodes),
                                          batch_num_nodes)
        feat = feat[perm]
        pool_graph = None
        if not self.sample or not self.sl:
            # pool graph
            graph.edata["e"] = e_feat
            pool_graph = dgl.node_subgraph(graph, perm)
            e_feat = pool_graph.edata.pop("e")
            pool_graph.set_batch_num_nodes(next_batch_num_nodes)

        # no structure learning layer, directly return.
        if not self.sl:
            return pool_graph, feat, e_feat, perm

        # Structure Learning
        if self.sample:
            # A fast mode for large graphs.
            # In large graphs, learning the possible edge weights between each
            # pair of nodes is time-consuming. To accelerate this process,
            # we sample the k-hop neighbors of each node and then learn the
            # edge weights between them.

            # first build multi-hop graph
            row, col = graph.all_edges()
            num_nodes = graph.num_nodes()

            scipy_adj = scipy.sparse.coo_matrix(
                (e_feat.detach().cpu(),
                 (row.detach().cpu(), col.detach().cpu())),
                shape=(num_nodes, num_nodes))
            for _ in range(self.k_hop):
                two_hop = scipy_adj**2
                two_hop = two_hop * (1e-5 / two_hop.max())
                scipy_adj = two_hop + scipy_adj
            row, col = scipy_adj.nonzero()
            row = torch.tensor(row, dtype=torch.long, device=graph.device)
            col = torch.tensor(col, dtype=torch.long, device=graph.device)
            e_feat = torch.tensor(scipy_adj.data,
                                  dtype=torch.float,
                                  device=feat.device)

            # perform pooling on multi-hop graph
            mask = perm.new_full((num_nodes, ), -1)
            i = torch.arange(perm.size(0),
                             dtype=torch.long,
                             device=perm.device)
            mask[perm] = i
            row, col = mask[row], mask[col]
            mask = (row >= 0) & (col >= 0)
            row, col = row[mask], col[mask]
            e_feat = e_feat[mask]

            # add remaining self loops
            mask = row != col
            num_nodes = perm.size(0)  # num nodes after pool
            loop_index = torch.arange(0,
                                      num_nodes,
                                      dtype=row.dtype,
                                      device=row.device)
            inv_mask = ~mask
            loop_weight = torch.full((num_nodes, ),
                                     0,
                                     dtype=e_feat.dtype,
                                     device=e_feat.device)
            remaining_e_feat = e_feat[inv_mask]
            if remaining_e_feat.numel() > 0:
                loop_weight[row[inv_mask]] = remaining_e_feat
            e_feat = torch.cat([e_feat[mask], loop_weight], dim=0)
            row, col = row[mask], col[mask]
            row = torch.cat([row, loop_index], dim=0)
            col = torch.cat([col, loop_index], dim=0)

            # attention scores
            weights = (torch.cat([feat[row], feat[col]], dim=1) *
                       self.att).sum(dim=-1)
            weights = F.leaky_relu(weights,
                                   self.negative_slop) + e_feat * self.lamb

            # sl and normalization
            sl_graph = dgl.graph((row, col))
            if self.sparse:
                weights = edge_sparsemax(sl_graph, weights)
            else:
                weights = edge_softmax(sl_graph, weights)

            # get final graph
            mask = torch.abs(weights) > 0
            row, col, weights = row[mask], col[mask], weights[mask]
            pool_graph = dgl.graph((row, col))
            pool_graph.set_batch_num_nodes(next_batch_num_nodes)
            e_feat = weights

        else:
            # Learning the possible edge weights between each pair of
            # nodes in the pooled subgraph, relatively slower.

            # Construct complete graphs for all graphs in the batch:
            # use a dense matrix to build them, then convert to sparse.
            # Maybe there is a more efficient way?
            batch_num_nodes = next_batch_num_nodes
            block_begin_idx = torch.cat([
                batch_num_nodes.new_zeros(1),
                batch_num_nodes.cumsum(dim=0)[:-1]
            ],
                                        dim=0)
            block_end_idx = batch_num_nodes.cumsum(dim=0)
            dense_adj = torch.zeros(
                (pool_graph.num_nodes(), pool_graph.num_nodes()),
                dtype=torch.float,
                device=feat.device)
            for idx_b, idx_e in zip(block_begin_idx, block_end_idx):
                dense_adj[idx_b:idx_e, idx_b:idx_e] = 1.
            row, col = torch.nonzero(dense_adj).t().contiguous()

            # compute weights for node-pairs
            weights = (torch.cat([feat[row], feat[col]], dim=1) *
                       self.att).sum(dim=-1)
            weights = F.leaky_relu(weights, self.negative_slop)
            dense_adj[row, col] = weights

            # add pooled graph structure to weight matrix
            pool_row, pool_col = pool_graph.all_edges()
            dense_adj[pool_row, pool_col] += self.lamb * e_feat
            weights = dense_adj[row, col]
            del dense_adj
            torch.cuda.empty_cache()

            # edge softmax/sparsemax
            complete_graph = dgl.graph((row, col))
            if self.sparse:
                weights = edge_sparsemax(complete_graph, weights)
            else:
                weights = edge_softmax(complete_graph, weights)

            # get new e_feat and graph structure, clean up.
            mask = torch.abs(weights) > 1e-9
            row, col, weights = row[mask], col[mask], weights[mask]
            e_feat = weights
            pool_graph = dgl.graph((row, col))
            pool_graph.set_batch_num_nodes(next_batch_num_nodes)

        return pool_graph, feat, e_feat, perm
Example #15
import numpy as np
import networkx as nx
from karateclub.node_embedding.attributed import TENE

import dgl
from dgl.data import CoraGraphDataset

data = CoraGraphDataset()
g = data[0]
g = dgl.add_self_loop(
    dgl.node_subgraph(g, list(set(np.random.choice(len(g.nodes()), 5)))))
X = np.array(g.ndata['feat'])
g = dgl.to_networkx(g).to_undirected()

# g = nx.newman_watts_strogatz_graph(200, 20, 0.05)
#
# X = np.random.uniform(0, 1, (200, 200))

model = TENE()

model.fit(g, X)
embedding = model.get_embedding()
embedding = np.sum(embedding, axis=0)
# print(np.concatenate((embedding, embedding), axis=0).shape)
print(embedding.shape)
Example #16
 def reset(self):
     self.picked_nodes = list(
         set(np.random.choice(self.dataset.nodes().numpy(), 1)))
     self.graph = dgl.node_subgraph(self.dataset, self.picked_nodes)
Example #17
import argparse
import dgl
from dgl.data import CoraGraphDataset, CiteseerGraphDataset, PubmedGraphDataset
import networkx as nx
import torch
import torch.nn.functional as F
import numpy as np
from GAT.model import GAT

data = CoraGraphDataset()
dataset = data[0]
picked_nodes = list(set(np.random.choice(dataset.nodes().numpy(), 200)))
graph = dgl.node_subgraph(dataset, picked_nodes)
graph = dgl.add_self_loop(graph)
features = torch.stack(
    [x for i, x in enumerate(dataset.ndata['feat']) if i in picked_nodes])
g = dgl.to_networkx(graph)
f = np.array(dataset.ndata['feat'])
print("*************")
print(graph.edata['_ID'])

parser = argparse.ArgumentParser(description='GAT')
parser.add_argument("--gpu",
                    type=int,
                    default=0,
                    help="which GPU to use. Set -1 to use CPU.")
parser.add_argument("--epochs",
                    type=int,
                    default=500,
                    help="number of training epochs")
parser.add_argument("--num-heads",