Example #1
def test_negative_sampling():
    edge_index = torch.as_tensor([[0, 0, 1, 2], [0, 1, 2, 3]])

    neg_edge_index = negative_sampling(edge_index)
    assert neg_edge_index.size(1) == edge_index.size(1)

    adj = torch.zeros(4, 4, dtype=torch.bool)
    adj[edge_index[0], edge_index[1]] = 1

    neg_adj = torch.zeros(4, 4, dtype=torch.bool)
    neg_adj[neg_edge_index[0], neg_edge_index[1]] = 1
    assert (adj & neg_adj).sum() == 0

    neg_edge_index = negative_sampling(edge_index, num_neg_samples=2)
    assert neg_edge_index.size(1) == 2

    undirected_edge_index = to_undirected(edge_index)
    undirected_neg_edge_index = negative_sampling(undirected_edge_index,
                                                  force_undirected=True)
    assert is_undirected(undirected_neg_edge_index)
    assert undirected_neg_edge_index.size(1) <= undirected_edge_index.size(1)

    undirected_adj = torch.zeros(4, 4, dtype=torch.bool)
    undirected_adj[undirected_edge_index[0], undirected_edge_index[1]] = 1

    undirected_neg_adj = torch.zeros(4, 4, dtype=torch.bool)
    undirected_neg_adj[undirected_neg_edge_index[0],
                       undirected_neg_edge_index[1]] = 1
    assert (undirected_adj & undirected_neg_adj).sum() == 0
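
These tests rely on utilities from torch_geometric.utils. A minimal header for running them (an assumption; the original test files may import more):

import torch
from torch_geometric.utils import (negative_sampling, to_undirected,
                                   is_undirected)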
Example #2
def test_bipartite_negative_sampling():
    edge_index = torch.as_tensor([[0, 0, 1, 2], [0, 1, 2, 3]])

    neg_edge_index = negative_sampling(edge_index, num_nodes=(3, 4))
    assert neg_edge_index.size(1) == edge_index.size(1)
    assert is_negative(edge_index, neg_edge_index, (3, 4), bipartite=True)

    neg_edge_index = negative_sampling(edge_index, num_nodes=(3, 4),
                                       num_neg_samples=2)
    assert neg_edge_index.size(1) == 2
    assert is_negative(edge_index, neg_edge_index, (3, 4), bipartite=True)
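
The is_negative helper used by these tests is not shown. A minimal sketch consistent with how it is called here (the upstream implementation may differ):

def is_negative(edge_index, neg_edge_index, size, bipartite):
    # No sampled pair may coincide with an existing edge; in the
    # non-bipartite case, self-loops are also forbidden as negatives.
    adj = torch.zeros(size, dtype=torch.bool)
    adj[edge_index[0], edge_index[1]] = True
    if not bipartite:
        arange = torch.arange(size[0])
        adj[arange, arange] = True
    neg_adj = torch.zeros(size, dtype=torch.bool)
    neg_adj[neg_edge_index[0], neg_edge_index[1]] = True
    return not bool((adj & neg_adj).any())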
Example #3
def test_negative_sampling():
    edge_index = torch.as_tensor([[0, 0, 1, 2], [0, 1, 2, 3]])

    neg_edge_index = negative_sampling(edge_index)
    assert neg_edge_index.size(1) == edge_index.size(1)

    adj = torch.zeros(4, 4, dtype=torch.uint8)
    adj[edge_index[0], edge_index[1]] = 1

    neg_adj = torch.zeros(4, 4, dtype=torch.uint8)
    neg_adj[neg_edge_index[0], neg_edge_index[1]] = 1
    assert (adj & neg_adj).sum() == 0

    neg_edge_index = negative_sampling(edge_index, num_neg_samples=2)
    assert neg_edge_index.size(1) == 2
Example #4
def train():
    alpha = 0.7
    model.train()

    # Negative sampling:
    neg_row, neg_col = negative_sampling(
        data.train_pos_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1))

    # Exclude sampled mesh-mesh edges (both endpoints beyond the paper
    # nodes); keep everything else:
    to_keep = ~torch.logical_and(neg_row >= data.x_paper.size(0),
                                 neg_col >= data.x_paper.size(0))
    neg_row, neg_col = neg_row[to_keep], neg_col[to_keep]
    train_neg_edge_index = torch.stack([neg_row, neg_col], dim=0)

    # Type 1 marks paper-mesh edges (exactly one endpoint is a paper node):
    train_neg_edge_type = torch.logical_or(
        torch.logical_and(neg_row < data.x_paper.size(0),
                          neg_col >= data.x_paper.size(0)),
        torch.logical_and(neg_row >= data.x_paper.size(0),
                          neg_col < data.x_paper.size(0))).to(torch.float32)
    sort_indices = torch.argsort(train_neg_edge_type)
    train_neg_edge_index = train_neg_edge_index[:, sort_indices]
    train_neg_edge_type = train_neg_edge_type[sort_indices]

    optimizer.zero_grad()
    z = model.encode()
    link_logits = model.decode(z, data.train_pos_edge_index,
                               data.train_pos_edge_type,
                               train_neg_edge_index, train_neg_edge_type)
    link_labels = get_link_labels(data.train_pos_edge_index,
                                  train_neg_edge_index)
    link_logits_paper_paper = model.decode(
        z, data.train_pos_edge_index[:, data.train_pos_edge_type == 0],
        data.train_pos_edge_type[data.train_pos_edge_type == 0],
        train_neg_edge_index[:, train_neg_edge_type == 0],
        train_neg_edge_type[train_neg_edge_type == 0])
    link_logits_paper_mesh = model.decode(
        z, data.train_pos_edge_index[:, data.train_pos_edge_type == 1],
        data.train_pos_edge_type[data.train_pos_edge_type == 1],
        train_neg_edge_index[:, train_neg_edge_type == 1],
        train_neg_edge_type[train_neg_edge_type == 1])
    link_labels_paper_paper = get_link_labels(
        data.train_pos_edge_index[:, data.train_pos_edge_type == 0],
        train_neg_edge_index[:, train_neg_edge_type == 0])
    link_labels_paper_mesh = get_link_labels(
        data.train_pos_edge_index[:, data.train_pos_edge_type == 1],
        train_neg_edge_index[:, train_neg_edge_type == 1])
    loss_paper_paper = F.binary_cross_entropy_with_logits(
        link_logits_paper_paper, link_labels_paper_paper)
    loss_paper_mesh = F.binary_cross_entropy_with_logits(
        link_logits_paper_mesh, link_labels_paper_mesh)
    loss = (1 / 2) * ((1 - alpha) * loss_paper_paper + alpha * loss_paper_mesh)
    # loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()

    link_probs = link_logits.sigmoid()
    link_probs_paper_paper = link_logits_paper_paper.sigmoid()
    link_probs_paper_mesh = link_logits_paper_mesh.sigmoid()
    rocauc = roc_auc_score(link_labels.detach().cpu().numpy(),
                           link_probs.detach().cpu().numpy())
    roc_auc_pp = roc_auc_score(link_labels_paper_paper.detach().cpu().numpy(),
                               link_probs_paper_paper.detach().cpu().numpy())
    roc_auc_pm = roc_auc_score(link_labels_paper_mesh.detach().cpu().numpy(),
                               link_probs_paper_mesh.detach().cpu().numpy())
    return loss, rocauc, roc_auc_pp, roc_auc_pm
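
get_link_labels is not defined in these snippets. A minimal sketch consistent with its usage throughout this page (positives first, then sampled negatives):

def get_link_labels(pos_edge_index, neg_edge_index):
    # One binary label per candidate edge: 1 for positives, 0 for negatives.
    num_links = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_labels = torch.zeros(num_links, dtype=torch.float)
    link_labels[:pos_edge_index.size(1)] = 1.
    return link_labels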
Example #5
    def recon_loss1(self, z, edge_index, batch):
        r"""Given latent variables :obj:`z`, computes the binary cross
        entropy loss for positive edges :obj:`edge_index` and negatively
        sampled edges.

        Args:
            z (Tensor): The latent space :math:`\mathbf{Z}`.
            edge_index (LongTensor): The positive edges to train against.
        """
        EPS = 1e-15
        MAX_LOGSTD = 10

        recon_adj = self.edge_recon(z, edge_index)

        pos_loss = -torch.log(recon_adj + EPS).mean()

        # Do not include self-loops in negative samples:
        pos_edge_index, _ = remove_self_loops(edge_index)
        pos_edge_index, _ = add_self_loops(pos_edge_index)

        # Sample one random non-edge per positive edge:
        neg_edge_index = negative_sampling(pos_edge_index, z.size(0))
        neg_loss = -torch.log(1 - self.edge_recon(z, neg_edge_index) +
                              EPS).mean()

        return pos_loss + neg_loss
Example #6
    def __collate__(self, data_list):
        assert len(data_list) == 1
        node_idx, adj = data_list[0]

        data = self.data.__class__()
        data.num_nodes = node_idx.size(0)
        row, col, edge_idx = adj.coo()
        data.edge_index = torch.stack([row, col], dim=0)

        if self.use_negative_sampling:
            data.neg_edge_index = negative_sampling(
                edge_index=data.edge_index,
                num_nodes=data.num_nodes,
                num_neg_samples=int(data.edge_index.size(1) * self.neg_sample_ratio),
            )

        for key, item in self.data:
            if isinstance(item, torch.Tensor) and item.size(0) == self.N:
                data[key] = item[node_idx]
            elif isinstance(item, torch.Tensor) and item.size(0) == self.E:
                data[key] = item[edge_idx]
            else:
                data[key] = item

        # if self.sample_coverage > 0:
        #     data.node_norm = self.node_norm[node_idx]
        #     data.edge_norm = self.edge_norm[edge_idx]
        return data
Example #7
def test_negative_sampling():
    edge_index = torch.as_tensor([[0, 0, 1, 2], [0, 1, 2, 3]])

    neg_edge_index = negative_sampling(edge_index)
    assert neg_edge_index.size(1) == edge_index.size(1)
    assert is_negative(edge_index, neg_edge_index, (4, 4), bipartite=False)

    neg_edge_index = negative_sampling(edge_index, num_neg_samples=2)
    assert neg_edge_index.size(1) == 2
    assert is_negative(edge_index, neg_edge_index, (4, 4), bipartite=False)

    edge_index = to_undirected(edge_index)
    neg_edge_index = negative_sampling(edge_index, force_undirected=True)
    assert neg_edge_index.size(1) == edge_index.size(1) - 1
    assert is_undirected(neg_edge_index)
    assert is_negative(edge_index, neg_edge_index, (4, 4), bipartite=False)
Example #8
def train(model, x, adj_t, split_edge, optimizer, batch_size):
    model.train()

    row, col, _ = adj_t.coo()
    edge_index = torch.stack([col, row], dim=0)

    pos_train_edge = split_edge.train_pos_edge_index.to(x.device)

    total_loss = total_examples = 0
    for perm in DataLoader(range(pos_train_edge.size(1)), batch_size,
                           shuffle=True):

        optimizer.zero_grad()

        pos_edge = pos_train_edge[:, perm]

        neg_edge = negative_sampling(pos_train_edge, num_nodes=x.size(0),
                                     num_neg_samples=perm.size(0),
                                     method='dense')

        loss = model.loss(x, pos_train_edge, pos_edge, neg_edge)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        num_examples = perm.size(0)
        total_loss += loss.item() * num_examples
        total_examples += num_examples

    return total_loss / total_examples
Example #9
def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)

    # We perform a new round of negative sampling for every training epoch:
    neg_edge_index = negative_sampling(
        edge_index=train_data.edge_index,
        num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.edge_label_index.size(1),
        method='sparse')

    edge_label_index = torch.cat(
        [train_data.edge_label_index, neg_edge_index],
        dim=-1,
    )
    edge_label = torch.cat([
        train_data.edge_label,
        train_data.edge_label.new_zeros(neg_edge_index.size(1))
    ], dim=0)

    out = model.decode(z, edge_label_index).view(-1)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    return loss
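
criterion is defined outside this snippet. Since model.decode returns raw logits and edge_label holds 0/1 targets, a consistent choice (an assumption) is:

criterion = torch.nn.BCEWithLogitsLoss()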
Example #10
def train(data, model, optimizer):
    model.train()

    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1))

    train_neg_edge_set = set(map(tuple, neg_edge_index.T.tolist()))
    val_pos_edge_set = set(map(tuple, data.val_pos_edge_index.T.tolist()))
    test_pos_edge_set = set(map(tuple, data.test_pos_edge_index.T.tolist()))
    if (len(train_neg_edge_set & val_pos_edge_set) >
            0) or (len(train_neg_edge_set & test_pos_edge_set) > 0):
        # Sampled training negatives overlap the validation or test
        # positive edges:
        print('wrong!')

    optimizer.zero_grad()
    z = model.encode(data.x, data.train_pos_edge_index)
    link_logits = model.decode(z, data.train_pos_edge_index, neg_edge_index)
    link_labels = get_link_labels(data.train_pos_edge_index,
                                  neg_edge_index).to(data.x.device)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()

    return loss
Example #11
def train_emb(model, edge_index, data, optimizer, batch_size, device):
    model.train()
    data_loader = DataLoader(range(edge_index.shape[1]),
                             batch_size,
                             shuffle=True)

    total_loss = total_examples = 0
    for perm in data_loader:
        model.zero_grad()

        pos_edge = edge_index[:, perm].to(device)  # 2 x batch_size
        pos_out = model.forward_emb(*pos_edge)
        # add 1e-15 to avoid exploding gradients
        pos_loss = -torch.log(pos_out + 1e-15).mean()

        # negative sampling on the graph
        neg_edge = negative_sampling(edge_index,
                                     num_nodes=data.num_nodes,
                                     num_neg_samples=perm.size(0),
                                     method='sparse').to(device)  # 2 x batch_size
        neg_out = model.forward_emb(*neg_edge)
        neg_loss = -torch.log(1 - neg_out + 1e-15).mean()

        loss = pos_loss + neg_loss
        loss.backward()
        optimizer.step()

        num_examples = pos_out.size(0)
        total_loss += loss.item() * num_examples
        total_examples += num_examples

    return total_loss / total_examples
Example #12
    def nll_loss(self, z, pos_edge_index, neg_edge_index):
        """Computes the discriminator loss based on node embeddings :obj:`z`,
        and positive edges :obj:`pos_edge_index` and negative nedges
        :obj:`neg_edge_index`.

        Args:
            z (Tensor): The node embeddings.
            pos_edge_index (LongTensor): The positive edge indices.
            neg_edge_index (LongTensor): The negative edge indices.
        """

        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=1)
        none_edge_index = negative_sampling(edge_index, z.size(0))

        nll_loss = 0
        nll_loss += F.nll_loss(
            self.discriminate(z, pos_edge_index),
            pos_edge_index.new_full((pos_edge_index.size(1), ), 0))
        nll_loss += F.nll_loss(
            self.discriminate(z, neg_edge_index),
            neg_edge_index.new_full((neg_edge_index.size(1), ), 1))
        nll_loss += F.nll_loss(
            self.discriminate(z, none_edge_index),
            none_edge_index.new_full((none_edge_index.size(1), ), 2))
        return nll_loss / 3.0
Example #13
            def train():
                model.train()
                optimizer.zero_grad()

                x, pos_edge_index = data.x, data.train_pos_edge_index

                _edge_index, _ = remove_self_loops(pos_edge_index)
                pos_edge_index_with_self_loops, _ = add_self_loops(
                    _edge_index, num_nodes=x.size(0))

                neg_edge_index = negative_sampling(
                    edge_index=pos_edge_index_with_self_loops,
                    num_nodes=x.size(0),
                    num_neg_samples=pos_edge_index.size(1))

                link_logits, attr_prediction, attack_prediction = model(
                    pos_edge_index, neg_edge_index)
                link_labels = get_link_labels(pos_edge_index, neg_edge_index)

                loss = F.binary_cross_entropy_with_logits(
                    link_logits, link_labels)
                loss.backward(retain_graph=True)
                optimizer.step()

                optimizer_att.zero_grad()
                loss2 = F.nll_loss(attack_prediction, labels)
                loss2.backward()
                optimizer_att.step()

                return loss
Example #14
def test(model):

    global all_data, best_embeddings, best_hyperparameters, all_losses

    model.load_state_dict(
        torch.load(str(config.DATASET_DIR / "best_model.pth")))
    model.to(device)
    model.eval()

    test_pos = all_data.edge_index[:, all_data.test_mask]
    test_neg = negative_sampling(test_pos,
                                 num_neg_samples=test_pos.size(1) // 4)
    test_total = torch.cat([test_pos, test_neg], dim=-1)
    test_pos_edges = torch.zeros(test_total.size(1)).bool()
    test_pos_edges[:test_pos.size(1)] = 1
    test_neg_edges = (test_pos_edges == 0)

    dot_embed = utils.el_dot(best_embeddings, test_total, test=True)
    roc_score, ap_score, test_acc, test_f1 = utils.calc_roc_score(
        pred_all=dot_embed,
        pos_edges=test_pos_edges.flatten(),
        neg_edges=test_neg_edges.flatten(),
        loss=all_losses,
        save_plots=config.DATASET_DIR / "train_plots.pdf")
    print('Test ROC score: {:.5f}'.format(roc_score))
    print('Test AP score: {:.5f}'.format(ap_score))
    print('Test Accuracy: {:.5f}'.format(test_acc))
    print('Test F1 score: {:.5f}'.format(test_f1))
    log_f.write('Test ROC score: {:.5f}\n'.format(roc_score))
    log_f.write('Test AP score: {:.5f}\n'.format(ap_score))
    log_f.write('Test Accuracy: {:.5f}\n'.format(test_acc))
    log_f.write('Test F1 score: {:.5f}\n'.format(test_f1))
Example #15
    def recon_loss(self, z, pos_edge_index, neg_edge_index=None):
        r"""Given latent variables :obj:`z`, computes the binary cross
        entropy loss for positive edges :obj:`pos_edge_index` and negative
        sampled edges.

        Args:
            z (Tensor): The latent space :math:`\mathbf{Z}`.
            pos_edge_index (LongTensor): The positive edges to train against.
            neg_edge_index (LongTensor, optional): The negative edges to train
                against. If not given, uses negative sampling to calculate
                negative edges. (default: :obj:`None`)
        """

        pos_loss = -torch.log(
            self.decoder(z, pos_edge_index, sigmoid=True) + EPS).mean()

        # Do not include self-loops in negative samples
        pos_edge_index, _ = remove_self_loops(pos_edge_index)
        pos_edge_index, _ = add_self_loops(pos_edge_index)
        if neg_edge_index is None:
            neg_edge_index = negative_sampling(pos_edge_index, z.size(0))
        neg_loss = -torch.log(1 -
                              self.decoder(z, neg_edge_index, sigmoid=True) +
                              EPS).mean()

        return pos_loss + neg_loss
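
A sketch of how recon_loss is typically driven, assuming a GAE-style model (as in torch_geometric.nn.GAE) whose encode wraps the encoder:

# Minimal training step; `model`, `optimizer`, and a `data` object with
# `x` and `train_pos_edge_index` are assumed to exist.
model.train()
optimizer.zero_grad()
z = model.encode(data.x, data.train_pos_edge_index)
loss = model.recon_loss(z, data.train_pos_edge_index)
loss.backward()
optimizer.step()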
Example #16
def test_attr(model, data):
    model.eval()
    accs = []
    m = ['train_mask', 'val_mask', 'test_mask']
    i = 0
    for _, mask in data('train_mask', 'val_mask', 'test_mask'):

        if m[i] == 'train_mask':
            x, pos_edge_index = data.x, data.train_pos_edge_index

            _edge_index, _ = remove_self_loops(pos_edge_index)
            pos_edge_index_with_self_loops, _ = add_self_loops(_edge_index,
                                                               num_nodes=x.size(0))

            neg_edge_index = negative_sampling(
                edge_index=pos_edge_index_with_self_loops, num_nodes=x.size(0),
                num_neg_samples=pos_edge_index.size(1))
        else:
            pos_edge_index, neg_edge_index = [
                index for _, index in data("{}_pos_edge_index".format(m[i].split("_")[0]),
                                           "{}_neg_edge_index".format(m[i].split("_")[0]))
            ]
        neg_edge_index = neg_edge_index.to(pos_edge_index.device)
        _, logits, _, _ = model(pos_edge_index, neg_edge_index)

        pred = logits[mask].max(1)[1]

        macro = f1_score((data.y[mask]).cpu().numpy(), pred.cpu().numpy(), average='macro')
        accs.append(macro)

        i += 1
    return accs
Example #17
    def get_batch(self, splt: str) -> Tuple[torch.Tensor, torch.Tensor]:
        from torch_geometric.utils import (negative_sampling,
                                           remove_self_loops, add_self_loops)
        n = self.x.shape[0]

        if splt == 'train':
            pos_edge_index = self.train_edge_index
            num_neg_edges = pos_edge_index.shape[1]

            pos_edge_clean, _ = remove_self_loops(pos_edge_index)
            pos_edge_w_self_loop, _ = add_self_loops(pos_edge_clean,
                                                     num_nodes=n)

            neg_edge_index = negative_sampling(edge_index=pos_edge_w_self_loop,
                                               num_nodes=n,
                                               num_neg_samples=num_neg_edges)
        elif splt == 'val':
            pos_edge_index, neg_edge_index = self.val_edge_index
        elif splt == 'test':
            pos_edge_index, neg_edge_index = self.test_edge_index
        else:
            raise ValueError(f'Unknown splt: {splt}')

        query_edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)
        link_y = torch.zeros_like(query_edge_index[0], dtype=torch.float)
        link_y[:pos_edge_index.shape[1]] = 1

        return query_edge_index, link_y
Example #18
def train(predictor, x, edge_index, split_edge, optimizer, batch_size):
    predictor.train()

    pos_train_edge = split_edge['train']['edge'].to(x.device)

    total_loss = total_examples = 0
    for perm in DataLoader(range(pos_train_edge.size(0)),
                           batch_size,
                           shuffle=True):
        optimizer.zero_grad()

        edge = pos_train_edge[perm].t()

        pos_out = predictor(x[edge[0]], x[edge[1]])
        pos_loss = -torch.log(pos_out + 1e-15).mean()

        edge = negative_sampling(edge_index,
                                 num_nodes=x.size(0),
                                 num_neg_samples=perm.size(0),
                                 method='dense')

        neg_out = predictor(x[edge[0]], x[edge[1]])
        neg_loss = -torch.log(1 - neg_out + 1e-15).mean()

        loss = pos_loss + neg_loss
        loss.backward()
        optimizer.step()

        num_examples = pos_out.size(0)
        total_loss += loss.item() * num_examples
        total_examples += num_examples

    return total_loss / total_examples
Example #19
    def __call__(self, data: Data) -> Tuple[Data, Data, Data]:
        perm = torch.randperm(data.num_edges, device=data.edge_index.device)
        if self.is_undirected:
            perm = perm[data.edge_index[0] <= data.edge_index[1]]

        num_val, num_test = self.num_val, self.num_test
        if isinstance(num_val, float):
            num_val = int(num_val * perm.numel())
        if isinstance(num_test, float):
            num_test = int(num_test * perm.numel())

        num_train = perm.numel() - num_val - num_test
        if num_train <= 0:
            raise ValueError("Insufficient number of edges for training.")

        train_edges = perm[:num_train]
        val_edges = perm[num_train:num_train + num_val]
        test_edges = perm[num_train + num_val:]
        train_val_edges = perm[:num_train + num_val]

        # Create data splits. Validation decodes against the training graph,
        # and testing against the training + validation graph:
        train_data = self._split_data(data, train_edges)
        val_data = self._split_data(data, train_edges)
        test_data = self._split_data(data, train_val_edges)

        # Create negative samples:
        num_neg_train = 0
        if self.add_negative_train_samples:
            num_neg_train = int(num_train * self.neg_sampling_ratio)
        num_neg_val = int(num_val * self.neg_sampling_ratio)
        num_neg_test = int(num_test * self.neg_sampling_ratio)

        num_neg = num_neg_train + num_neg_val + num_neg_test
        neg_edge_index = negative_sampling(add_self_loops(data.edge_index)[0],
                                           num_nodes=data.num_nodes,
                                           num_neg_samples=num_neg,
                                           method='sparse')

        # Create labels:
        self._create_label(
            data,
            train_edges,
            neg_edge_index[:, num_neg_val + num_neg_test:],
            out=train_data,
        )
        self._create_label(
            data,
            val_edges,
            neg_edge_index[:, :num_neg_val],
            out=val_data,
        )
        self._create_label(
            data,
            test_edges,
            neg_edge_index[:, num_neg_val:num_neg_val + num_neg_test],
            out=test_data,
        )

        return train_data, val_data, test_data
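
Usage sketch, assuming __call__ belongs to a RandomLinkSplit-style transform whose constructor sets the attributes read above (num_val, num_test, neg_sampling_ratio, add_negative_train_samples):

# `transform` is an instance of the (unnamed) class defining __call__ above.
train_data, val_data, test_data = transform(data)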
Example #20
def load_data(args, datapath):
    if args.dataset in ['arxiv'] and args.task == 'lp':
        data = {}
        dataset = PygNodePropPredDataset(name='ogbn-{}'.format(args.dataset),
                                         root='/pasteur/u/jeffgu/hgcn/data')
        split_idx = dataset.get_idx_split()
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
            "valid"], split_idx["test"]
        induced_edges_train, _ = subgraph(train_idx, dataset[0].edge_index)
        induced_edges_valid, _ = subgraph(valid_idx, dataset[0].edge_index)
        induced_edges_test, _ = subgraph(test_idx, dataset[0].edge_index)
        neg_edges_train = negative_sampling(induced_edges_train)
        neg_edges_valid = negative_sampling(induced_edges_valid)
        neg_edges_test = negative_sampling(induced_edges_test)
        data['adj_train'] = to_scipy_sparse_matrix(
            dataset[0].edge_index).tocsr()
        data['features'] = dataset[0].x
        data['train_edges'], data[
            'train_edges_false'] = induced_edges_train, neg_edges_train
        data['val_edges'], data[
            'val_edges_false'] = induced_edges_valid, neg_edges_valid
        data['test_edges'], data[
            'test_edges_false'] = induced_edges_test, neg_edges_test
    elif args.task == 'nc':
        data = load_data_nc(args.dataset, args.use_feats, datapath,
                            args.split_seed)
    else:
        data = load_data_lp(args.dataset, args.use_feats, datapath)
        adj = data['adj_train']
        if args.task == 'lp':
            adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = mask_edges(
                adj, args.val_prop, args.test_prop, args.split_seed)
            data['adj_train'] = adj_train
            data['train_edges'], data[
                'train_edges_false'] = train_edges, train_edges_false
            data['val_edges'], data[
                'val_edges_false'] = val_edges, val_edges_false
            data['test_edges'], data[
                'test_edges_false'] = test_edges, test_edges_false
    data['adj_train_norm'], data['features'] = process(data['adj_train'],
                                                       data['features'],
                                                       args.normalize_adj,
                                                       args.normalize_feats)
    if args.dataset == 'airport':
        data['features'] = augment(data['adj_train'], data['features'])
    return data
Example #21
def do_edge_split(dataset, fast_split=False, val_ratio=0.05, test_ratio=0.1):
    data = dataset[0]
    random.seed(234)
    torch.manual_seed(234)

    if not fast_split:
        data = train_test_split_edges(data, val_ratio, test_ratio)
        edge_index, _ = add_self_loops(data.train_pos_edge_index)
        data.train_neg_edge_index = negative_sampling(
            edge_index,
            num_nodes=data.num_nodes,
            num_neg_samples=data.train_pos_edge_index.size(1))
    else:
        num_nodes = data.num_nodes
        row, col = data.edge_index
        # Return upper triangular portion.
        mask = row < col
        row, col = row[mask], col[mask]
        n_v = int(math.floor(val_ratio * row.size(0)))
        n_t = int(math.floor(test_ratio * row.size(0)))
        # Positive edges.
        perm = torch.randperm(row.size(0))
        row, col = row[perm], col[perm]
        r, c = row[:n_v], col[:n_v]
        data.val_pos_edge_index = torch.stack([r, c], dim=0)
        r, c = row[n_v:n_v + n_t], col[n_v:n_v + n_t]
        data.test_pos_edge_index = torch.stack([r, c], dim=0)
        r, c = row[n_v + n_t:], col[n_v + n_t:]
        data.train_pos_edge_index = torch.stack([r, c], dim=0)
        # Negative edges (cannot guarantee (i,j) and (j,i) won't both appear)
        neg_edge_index = negative_sampling(data.edge_index,
                                           num_nodes=num_nodes,
                                           num_neg_samples=row.size(0))
        data.val_neg_edge_index = neg_edge_index[:, :n_v]
        data.test_neg_edge_index = neg_edge_index[:, n_v:n_v + n_t]
        data.train_neg_edge_index = neg_edge_index[:, n_v + n_t:]

    split_edge = {'train': {}, 'valid': {}, 'test': {}}
    split_edge['train']['edge'] = data.train_pos_edge_index.t()
    split_edge['train']['edge_neg'] = data.train_neg_edge_index.t()
    split_edge['valid']['edge'] = data.val_pos_edge_index.t()
    split_edge['valid']['edge_neg'] = data.val_neg_edge_index.t()
    split_edge['test']['edge'] = data.test_pos_edge_index.t()
    split_edge['test']['edge_neg'] = data.test_neg_edge_index.t()
    return split_edge
Example #22
    def recont_loss(self, z, edge_index):
        pos_edge_index = edge_index
        neg_edge_index = negative_sampling(pos_edge_index, z.size(0))
        pos_recont_loss = -torch.log(self.decoder(z, pos_edge_index) +
                                     1e-7).mean()
        neg_recont_loss = -torch.log(1 - self.decoder(z, neg_edge_index) +
                                     1e-7).mean()
        recont_loss = pos_recont_loss + neg_recont_loss
        return recont_loss
Example #23
def load_ogb(name, dataset_dir):
    if name[:4] == 'ogbn':
        dataset = PygNodePropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = ['train_mask', 'val_mask', 'test_mask']
        for i, key in enumerate(splits.keys()):
            mask = index2mask(splits[key], size=dataset.data.y.shape[0])
            set_dataset_attr(dataset, split_names[i], mask, len(mask))
        edge_index = to_undirected(dataset.data.edge_index)
        set_dataset_attr(dataset, 'edge_index', edge_index,
                         edge_index.shape[1])

    elif name[:4] == 'ogbg':
        dataset = PygGraphPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = [
            'train_graph_index', 'val_graph_index', 'test_graph_index'
        ]
        for i, key in enumerate(splits.keys()):
            id = splits[key]
            set_dataset_attr(dataset, split_names[i], id, len(id))

    elif name[:4] == "ogbl":
        dataset = PygLinkPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_edge_split()

        id = splits['train']['edge'].T
        if cfg.dataset.resample_negative:
            set_dataset_attr(dataset, 'train_pos_edge_index', id, id.shape[1])
            # todo: applying transform for negative sampling is very slow
            dataset.transform = neg_sampling_transform
        else:
            id_neg = negative_sampling(edge_index=id,
                                       num_nodes=dataset.data.num_nodes[0],
                                       num_neg_samples=id.shape[1])
            id_all = torch.cat([id, id_neg], dim=-1)
            label = get_link_label(id, id_neg)
            set_dataset_attr(dataset, 'train_edge_index', id_all,
                             id_all.shape[1])
            set_dataset_attr(dataset, 'train_edge_label', label, len(label))

        id, id_neg = splits['valid']['edge'].T, splits['valid']['edge_neg'].T
        id_all = torch.cat([id, id_neg], dim=-1)
        label = get_link_label(id, id_neg)
        set_dataset_attr(dataset, 'val_edge_index', id_all, id_all.shape[1])
        set_dataset_attr(dataset, 'val_edge_label', label, len(label))

        id, id_neg = splits['test']['edge'].T, splits['test']['edge_neg'].T
        id_all = torch.cat([id, id_neg], dim=-1)
        label = get_link_label(id, id_neg)
        set_dataset_attr(dataset, 'test_edge_index', id_all, id_all.shape[1])
        set_dataset_attr(dataset, 'test_edge_label', label, len(label))

    else:
        raise ValueError(f'OGB dataset {name} does not exist')

    return dataset
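
get_link_label (note the singular) is not shown here. A minimal sketch consistent with its calls in this loader and in neg_sampling_transform below:

def get_link_label(pos_edge_index, neg_edge_index):
    # Binary labels for the concatenation [positives, negatives].
    labels = torch.zeros(pos_edge_index.shape[1] + neg_edge_index.shape[1])
    labels[:pos_edge_index.shape[1]] = 1.
    return labels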
Example #24
    def _sample_train_neg_edge_index(self, is_undirected_edges=True):
        num_pos_samples = self.train_pos_edge_index.size(1)
        num_neg_samples = (num_pos_samples // 2
                           if is_undirected_edges else num_pos_samples)
        neg_edge_index = negative_sampling(
            edge_index=self.train_pos_edge_index,
            num_nodes=self.data.x.size(0),
            num_neg_samples=num_neg_samples,
        )
        return (to_undirected(neg_edge_index)
                if is_undirected_edges else neg_edge_index)
Example #25
def create_dataset(path):
    # [discrete_x_matrix, continous_x_matrix, edges, edge_attr_matrix, churn_x
    #            label_matrix]

    data_samples = np.load(path, allow_pickle=True)

    discrete_x = torch.tensor(data_samples[0], dtype=torch.float)
    discrete_x = discrete_x[:, :16]
    continous_x = torch.tensor(data_samples[1], dtype=torch.float)
    edge_index = torch.tensor(data_samples[2], dtype=torch.long)
    edge_index = edge_index.t().contiguous()
    print('edge_index shape: ', edge_index.size())
    edge_attr = torch.tensor(data_samples[3], dtype=torch.float)
    print('edge_attr shape: ', edge_attr.size())
    labels = torch.tensor([list(u) for u in data_samples[4]],
                          dtype=torch.float)
    y = (labels[:, 1] > 0).float().view(-1, 1)
    t = (labels[:, 0] > 0).float().view(-1, 1)
    print('y shape: ', y.size())
    # treatment = torch.tensor(data_samples[5], dtype=torch.float)
    # churn_date = torch.tensor(data_samples[6], dtype=torch.float)
    tem = torch.tensor(data_samples[5], dtype=torch.float)
    treatment = tem[:, :1]
    churn_date = tem[:, 1:]

    churn_date = churn_date - 364
    churn_date[churn_date < 0] = 183
    churn_date[churn_date > 183] = 183
    churn_date = 1 - churn_date / 183

    y_cf = ((y == 1) * (t == 0)).float().view(-1, 1)
    id_cf = ((y == 1) * (t == 0) + (y == 0) * (t == 1)).float().view(-1, 1)

    # This adds many would-churn users to t = 1 and many non-churn users to
    # t = 0; the catch is that t = 1 users are actually less likely to churn.
    print('#cf_t1 & t0_y1: ', torch.sum((y == 1) * (t == 0)))
    print('#cf_t0 & t1_y0: ', torch.sum((y == 0) * (t == 1)))
    print('#t0: ', torch.sum(t == 0))
    print('#t1: ', torch.sum(t == 1))

    pos_edge_index, _ = remove_self_loops(edge_index)
    pos_edge_index, _ = add_self_loops(pos_edge_index)
    neg_edge_index = negative_sampling(pos_edge_index, discrete_x.size(0))

    dataset = Data(discrete_x=discrete_x,
                   continous_x=continous_x,
                   edge_index=edge_index,
                   edge_attr=edge_attr,
                   treatment=treatment,
                   y=y,
                   t=t,
                   y_cf=y_cf,
                   id_cf=id_cf,
                   neg_edge_index=neg_edge_index,
                   churn_date=churn_date)
    return dataset
Example #26
def neg_sampling_transform(data):
    train_neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1))
    data.train_edge_index = torch.cat(
        [data.train_pos_edge_index, train_neg_edge_index], dim=-1)
    data.train_edge_label = get_link_label(data.train_pos_edge_index,
                                           train_neg_edge_index)

    return data
Example #27
def train(model, optimizer, evaluator, graph, x_feature, edge_index, adj_t, split_idx, device,
          batch_size=1024*64, num_epochs=200, save_model=False):
    best_val_score = 0
    best_epoch = 0
    best_test_score = 0
    best_model = model

    all_pos_edges = split_idx['train']['edge'].transpose(0,1).to(device)

    for epoch in range(1, num_epochs+1):
        sum_loss = 0
        count = 0 
        for batch in DataLoader(list(range(all_pos_edges.shape[1])), batch_size=batch_size, shuffle=True):
            model.train()
            batch_pos_edges = all_pos_edges[:, batch]
            batch_neg_edges = negative_sampling(edge_index=edge_index, 
                                            num_nodes=graph.num_nodes,
                                            num_neg_samples=batch_pos_edges.shape[1], 
                                            method='dense').to(device)
            edge_label_index = torch.cat([batch_pos_edges, batch_neg_edges], dim=1).to(device)
          
            pos_label = torch.ones(batch_pos_edges.shape[1], )
            neg_label = torch.zeros(batch_neg_edges.shape[1], )
            edge_label = torch.cat([pos_label, neg_label], dim=0).to(device)

            optimizer.zero_grad()  
            pred = model(x_feature, adj_t, edge_label_index)
            loss = model.loss(pred, edge_label.type_as(pred))
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
            optimizer.step()

            sum_loss += loss.item() * edge_label.shape[0]
            count += edge_label.shape[0]

        val_score, test_score = evaluate(model, x_feature, adj_t, split_idx, evaluator)
        if best_val_score < val_score:
            best_val_score = val_score
            best_epoch = epoch
            best_test_score = test_score
            if save_model:
                best_model = copy.deepcopy(model)

        log = 'Epoch: {:03d}, Loss: {:.4f}, Val Hits: {:.2f}%, Test Hits: {:.2f}%'
        print(log.format(epoch, sum_loss/count, 100*val_score, 100*test_score))

    print('Final model:')
    log = 'Epoch: {:03d}, Val Hits: {:.2f}%, Test Hits: {:.2f}%'
    print(log.format(best_epoch, 100*best_val_score, 100*best_test_score))
    return best_model, best_val_score, best_test_score
Example #28
    def loss(self, z, edge_index):
        # Reconstruction loss:
        pos_loss = -torch.log(self.decoder(z, edge_index) + 1e-7).mean()
        neg_edge_index = negative_sampling(edge_index, z.size(0))
        neg_loss = -torch.log(1 - self.decoder(z, neg_edge_index) +
                              1e-7).mean()
        recont_loss = pos_loss + neg_loss

        # kl loss
        kl_loss = -0.5 * torch.mean(
            torch.sum(1 + 2 * self.logstd - self.mu**2 - self.logstd.exp()**2,
                      dim=1))
        return recont_loss + (1 / z.size(0)) * kl_loss
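
The kl_loss term above is the closed-form KL divergence between the approximate posterior N(mu, sigma^2) and a standard normal prior, written with logstd = log(sigma):

\mathrm{KL}\left(\mathcal{N}(\mu, \sigma^{2}) \,\|\, \mathcal{N}(0, I)\right)
    = -\frac{1}{2} \sum_{j} \left(1 + 2\,\mathrm{logstd}_{j}
      - \mu_{j}^{2} - e^{2\,\mathrm{logstd}_{j}}\right)

The code takes the mean of this per-node sum and additionally scales it by 1 / z.size(0) when adding it to the reconstruction loss, a common VGAE normalization.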
Example #29
    def recon_loss_without_reduction(self,
                                     z,
                                     pos_edge_index,
                                     neg_edge_index=None):

        pos_loss = -torch.log(
            self.decoder(z, pos_edge_index, sigmoid=True) + 1e-15)
        if neg_edge_index is None:
            neg_edge_index = negative_sampling(pos_edge_index, z.size(0))
        neg_loss = -torch.log(1 -
                              self.decoder(z, neg_edge_index, sigmoid=True) +
                              1e-15)

        return pos_loss, neg_loss
Example #30
def train(model, predictor, x, adj_t, split_edge, optimizer, batch_size):

    # adj_t stores the transposed adjacency, so stack (col, row) to recover
    # the original edge orientation:
    row, col, _ = adj_t.coo()
    edge_index = torch.stack([col, row], dim=0)

    # set training mode
    model.train()
    predictor.train()

    pos_train_edge = split_edge['train']['edge'].to(x.device)

    total_loss = total_examples = 0

    for perm in DataLoader(range(pos_train_edge.size(0)),
                           batch_size,
                           shuffle=True):

        optimizer.zero_grad()

        h = model(x, adj_t)  # output node embedding

        edge = pos_train_edge[perm].t()

        pos_out = predictor(h[edge[0]], h[edge[1]])
        pos_loss = -torch.log(pos_out + 1e-15).mean()

        edge = negative_sampling(edge_index,
                                 num_nodes=x.size(0),
                                 num_neg_samples=perm.size(0),
                                 method='dense')

        neg_out = predictor(h[edge[0]], h[edge[1]])
        neg_loss = -torch.log(1 - neg_out + 1e-15).mean()

        loss = pos_loss + neg_loss
        loss.backward()

        torch.nn.utils.clip_grad_norm_(x, 1.0)  # clip_grad_norm is deprecated
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        torch.nn.utils.clip_grad_norm_(predictor.parameters(), 1.0)

        optimizer.step()

        num_examples = pos_out.size(0)

        total_loss += loss.item() * num_examples
        total_examples += num_examples

    return total_loss / total_examples