예제 #1
0
def train(args):
    """Train a knowledge-graph link-prediction model with negative sampling.

    Positive triplets come from the training edges; for each positive edge,
    ``args.num_neg_samples`` corrupted edges are drawn uniformly, and the model
    is optimized with binary cross-entropy over the concatenated scores.
    """
    dataset = load_kg_dataset(args.dataset)
    g = dataset[0]
    # Edge-split indices are stored as boolean masks on the full graph.
    train_idx, val_idx, test_idx = (
        g.edata[mask].nonzero(as_tuple=False).squeeze()
        for mask in ('train_mask', 'val_mask', 'test_mask')
    )

    # preserve_nodes=True keeps node IDs aligned with the full graph,
    # so endpoints looked up on g match the subgraph's embedding table.
    train_g = dgl.edge_subgraph(g, train_idx, preserve_nodes=True)
    src, dst = g.find_edges(train_idx)
    train_triplets = (src, dst, train_g.edata['etype'])
    model = LinkPrediction(
        dataset.num_nodes, args.num_hidden, dataset.num_rels * 2,
        args.num_layers, args.regularizer, args.num_bases, args.dropout
    )
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    neg_sampler = Uniform(args.num_neg_samples)
    # Fixed label layout: all positives first, then all negatives.
    num_pos = train_g.num_edges()
    labels = torch.cat([
        torch.ones(num_pos),
        torch.zeros(num_pos * args.num_neg_samples),
    ])
    all_eids = torch.arange(train_g.num_edges())
    for epoch in range(args.epochs):
        model.train()
        node_embed = model(train_g, train_g.edata['etype'])

        # Each positive edge contributes num_neg_samples corrupted edges,
        # which reuse the positive edge's relation type.
        neg_src, neg_dst = neg_sampler(train_g, all_eids)
        neg_triplets = (
            neg_src, neg_dst,
            train_g.edata['etype'].repeat_interleave(args.num_neg_samples),
        )
        pos_score = model.calc_score(node_embed, train_triplets)
        neg_score = model.calc_score(node_embed, neg_triplets)
        loss = F.binary_cross_entropy_with_logits(
            torch.cat([pos_score, neg_score]), labels
        )
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # TODO compute MRR
        # Backpropagation seems very slow on FB-15k and FB15k-237?
        print('Epoch {:04d} | Loss {:.4f}'.format(epoch, loss.item()))
def main():
    """Train a heterogeneous GNN for link prediction on 'click' edges.

    Uses full-neighbor sampling with uniform negative sampling and a
    margin-based ranking loss over positive/negative edge scores.
    """
    g = UserItemDataset()[0]
    in_feats = g.nodes['user'].data['feat'].shape[1]
    train_eids = g.edges['click'].data['train_mask'].nonzero(as_tuple=True)[0]

    dataloader = EdgeDataLoader(
        g, {'click': train_eids},
        MultiLayerFullNeighborSampler(2),
        negative_sampler=Uniform(5),
        batch_size=256
    )

    model = Model(in_feats, 20, 5, g.etypes)
    optimizer = optim.Adam(model.parameters())
    loss_func = MarginLoss()

    for epoch in range(10):
        model.train()
        epoch_losses = []
        for _, pos_g, neg_g, blocks in dataloader:
            pos_score, neg_score = model(
                pos_g, neg_g, blocks, blocks[0].srcdata['feat'], 'click'
            )
            loss = loss_func(pos_score, neg_score)
            epoch_losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Epoch {:d} | Loss {:.4f}'.format(
            epoch, sum(epoch_losses) / len(epoch_losses)
        ))
예제 #3
0
def main():
    """Train a link-prediction GNN on a randomly generated homogeneous graph.

    ~60% of edges are sampled (Bernoulli mask) as the training split; the
    model is optimized with a margin loss over sampled negative edges.
    """
    g = RandomGraphDataset(100, 500, 10)[0]
    train_mask = torch.zeros(g.num_edges(), dtype=torch.bool).bernoulli(0.6)
    train_idx = train_mask.nonzero(as_tuple=True)[0]

    dataloader = EdgeDataLoader(
        g, train_idx,
        MultiLayerFullNeighborSampler(2),
        negative_sampler=Uniform(5),
        batch_size=32
    )

    model = Model(10, 100, 10)
    optimizer = optim.Adam(model.parameters())
    loss_func = MarginLoss()

    for epoch in range(10):
        model.train()
        batch_losses = []
        for _, pos_g, neg_g, blocks in dataloader:
            pos_score, neg_score = model(
                pos_g, neg_g, blocks, blocks[0].srcdata['feat']
            )
            loss = loss_func(pos_score, neg_score)
            batch_losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Epoch {:d} | Loss {:.4f}'.format(
            epoch, sum(batch_losses) / len(batch_losses)
        ))
예제 #4
0
    def __init__(self, dataset):
        """Build the user-item interaction graph and the knowledge graph.

        :param dataset: str, dataset name used as a key into ``self.CONFIG``
            to locate the rating file, KG triplet file, and item-to-entity map
        """
        self.dataset = dataset
        cfg = self.CONFIG[dataset]

        rating = pd.read_csv(
            cfg['rating_path'], sep=cfg['rating_sep'], names=['user_id', 'item_id', 'rating'],
            usecols=[0, 1, 2], skiprows=1
        )
        kg = pd.read_csv(cfg['kg_path'], sep='\t', names=['head', 'relation', 'tail'])
        item2entity = pd.read_csv(cfg['item2id_path'], sep='\t', names=['item_id', 'entity_id'])

        # Keep only ratings whose item can be mapped to a KG entity.
        rating = rating[rating['item_id'].isin(item2entity['item_id'])]
        rating.reset_index(drop=True, inplace=True)
        # Re-encode user ids to a dense 0..N-1 range for graph node ids.
        rating['user_id'] = LabelEncoder().fit_transform(rating['user_id'])
        item2entity = dict(zip(item2entity['item_id'], item2entity['entity_id']))
        # Item ids are replaced by their KG entity ids so both graphs share one id space.
        rating['item_id'] = rating['item_id'].apply(item2entity.__getitem__)
        # Binarize explicit ratings into implicit feedback, then keep positives only.
        rating['label'] = rating['rating'].apply(lambda r: int(r >= cfg['threshold']))
        rating = rating[rating['label'] == 1]
        user_item_graph = dgl.heterograph({
            ('user', 'rate', 'item'): (rating['user_id'].to_numpy(), rating['item_id'].to_numpy())
        })

        # Negative sampling: one uniformly-sampled negative edge per positive edge.
        neg_sampler = Uniform(1)
        nu, nv = neg_sampler(user_item_graph, torch.arange(user_item_graph.num_edges()))
        u, v = user_item_graph.edges()
        self.user_item_graph = dgl.heterograph({('user', 'rate', 'item'): (torch.cat([u, nu]), torch.cat([v, nv]))})
        self.user_item_graph.edata['label'] = torch.cat([torch.ones(u.shape[0]), torch.zeros(nu.shape[0])])

        kg['relation'] = LabelEncoder().fit_transform(kg['relation'])
        # The KG has parallel edges: a pair of entities may be connected by
        # multiple edges with different relation types.
        knowledge_graph = dgl.graph((kg['head'], kg['tail']))
        knowledge_graph.edata['relation'] = torch.tensor(kg['relation'].tolist())
        self.knowledge_graph = dgl.add_reverse_edges(knowledge_graph, copy_edata=True)
예제 #5
0
    def __init__(self):
        """Build the user-item graph (from author rankings) and the knowledge graph.

        Users are ranking queries (fields), items are authors; each
        (query, ranked author) pair becomes a positive 'rate' edge, augmented
        with one uniformly-sampled negative edge per positive edge.
        """
        g = OAGCoreDataset()[0]
        author_rank = load_author_rank()
        # One row per (query index, ranked author id) pair.
        rating = pd.DataFrame(
            [[i, a] for i, (f, r) in enumerate(author_rank.items()) for a in r],
            columns=['user_id', 'item_id']
        )
        user_item_graph = dgl.heterograph(
            {('user', 'rate', 'item'): (rating['user_id'], rating['item_id'])},
            num_nodes_dict={'user': len(author_rank), 'item': g.num_nodes('author')}
        )

        # Negative sampling: one uniformly-sampled negative edge per positive edge.
        neg_sampler = Uniform(1)
        nu, nv = neg_sampler(user_item_graph, torch.arange(user_item_graph.num_edges()))
        u, v = user_item_graph.edges()
        self.user_item_graph = dgl.heterograph(
            {('user', 'rate', 'item'): (torch.cat([u, nu]), torch.cat([v, nv]))},
            num_nodes_dict={ntype: user_item_graph.num_nodes(ntype) for ntype in user_item_graph.ntypes}
        )
        self.user_item_graph.edata['label'] = torch.cat([torch.ones(u.shape[0]), torch.zeros(nu.shape[0])])

        knowledge_graph = dgl.to_homogeneous(dgl.node_type_subgraph(g, ['author', 'institution', 'paper']))
        # BUGFIX: edge data written by to_homogeneous is keyed by dgl.ETYPE;
        # the original used dgl.NTYPE, which only worked because DGL defines
        # both constants as the same string ('_TYPE').
        knowledge_graph.edata['relation'] = knowledge_graph.edata[dgl.ETYPE]
        self.knowledge_graph = dgl.add_reverse_edges(knowledge_graph, copy_edata=True)
예제 #6
0
 def __init__(self,
              g,
              split_edge,
              neg_samples=1,
              subsample_ratio=0.1,
              shuffle=True):
     """Hold the graph, edge splits and sampling settings for later use.

     :param g: the input graph
     :param split_edge: edge split specification (train/valid/test)
     :param neg_samples: int, negative edges drawn per positive edge
     :param subsample_ratio: float, fraction of edges to subsample
     :param shuffle: bool, whether to shuffle edges
     """
     self.g = g
     self.split_edge = split_edge
     self.shuffle = shuffle
     self.subsample_ratio = subsample_ratio
     self.neg_sampler = Uniform(neg_samples)