# Shared imports for the snippets below. Project-local helpers
# (load_kg_dataset, LinkPrediction, Model, MarginLoss, UserItemDataset,
# RandomGraphDataset, OAGCoreDataset, load_author_rank) are assumed to be
# defined elsewhere in the repo.
import dgl
import torch
import torch.nn.functional as F
import torch.optim as optim
from dgl.dataloading import EdgeDataLoader, MultiLayerFullNeighborSampler
from dgl.dataloading.negative_sampler import Uniform


def train(args):
    data = load_kg_dataset(args.dataset)
    g = data[0]
    train_idx = g.edata['train_mask'].nonzero(as_tuple=False).squeeze()
    val_idx = g.edata['val_mask'].nonzero(as_tuple=False).squeeze()
    test_idx = g.edata['test_mask'].nonzero(as_tuple=False).squeeze()
    train_g = dgl.edge_subgraph(g, train_idx, preserve_nodes=True)
    # (src, dst, etype) triplets of the training edges
    train_triplets = g.find_edges(train_idx) + (train_g.edata['etype'],)
    model = LinkPrediction(
        data.num_nodes, args.num_hidden, data.num_rels * 2, args.num_layers,
        args.regularizer, args.num_bases, args.dropout
    )
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    neg_sampler = Uniform(args.num_neg_samples)
    labels = torch.cat([
        torch.ones(train_g.num_edges()),
        torch.zeros(train_g.num_edges() * args.num_neg_samples)
    ])
    for epoch in range(args.epochs):
        model.train()
        embed = model(train_g, train_g.edata['etype'])
        # corrupt each positive edge into num_neg_samples negative triplets
        neg_triplets = neg_sampler(train_g, torch.arange(train_g.num_edges())) \
            + (train_g.edata['etype'].repeat_interleave(args.num_neg_samples),)
        pos_score = model.calc_score(embed, train_triplets)
        neg_score = model.calc_score(embed, neg_triplets)
        loss = F.binary_cross_entropy_with_logits(torch.cat([pos_score, neg_score]), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # TODO compute MRR (val_idx/test_idx above are reserved for this)
        # Backpropagation is very slow on FB15k and FB15k-237?
        print('Epoch {:04d} | Loss {:.4f}'.format(epoch, loss.item()))
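# The TODO above asks for MRR. A minimal raw-MRR sketch (an assumption, not
# the repo's evaluation code): for each held-out triplet (s, o, r), score
# every entity as a candidate tail, using the same (src, dst, etype)
# argument order that calc_score receives above, and average the
# reciprocal ranks of the true tails.
@torch.no_grad()
def calc_mrr(model, embed, triplets, num_nodes):
    src, dst, etype = triplets
    reciprocal_ranks = []
    for s, o, r in zip(src.tolist(), dst.tolist(), etype.tolist()):
        candidates = torch.arange(num_nodes)
        s_rep = torch.full((num_nodes,), s, dtype=torch.long)
        r_rep = torch.full((num_nodes,), r, dtype=torch.long)
        scores = model.calc_score(embed, (s_rep, candidates, r_rep))
        # rank of the true tail o among all candidates (1 = best)
        rank = (scores > scores[o]).sum().item() + 1
        reciprocal_ranks.append(1.0 / rank)
    return sum(reciprocal_ranks) / len(reciprocal_ranks)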
def main():
    data = UserItemDataset()
    g = data[0]
    in_feats = g.nodes['user'].data['feat'].shape[1]
    train_eids = g.edges['click'].data['train_mask'].nonzero(as_tuple=True)[0]
    sampler = MultiLayerFullNeighborSampler(2)
    dataloader = EdgeDataLoader(
        g, {'click': train_eids}, sampler,
        negative_sampler=Uniform(5), batch_size=256
    )
    model = Model(in_feats, 20, 5, g.etypes)
    optimizer = optim.Adam(model.parameters())
    loss_func = MarginLoss()
    for epoch in range(10):
        model.train()
        losses = []
        for input_nodes, pos_g, neg_g, blocks in dataloader:
            pos_score, neg_score = model(pos_g, neg_g, blocks, blocks[0].srcdata['feat'], 'click')
            loss = loss_func(pos_score, neg_score)
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Epoch {:d} | Loss {:.4f}'.format(epoch, sum(losses) / len(losses)))
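# MarginLoss is used here and in the homogeneous variant below but not
# defined in these snippets. A minimal sketch of a standard max-margin
# ranking loss (an assumed definition, not necessarily the repo's): push
# each positive edge's score above its negatives' scores by a fixed margin.
import torch.nn as nn

class MarginLoss(nn.Module):
    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, pos_score, neg_score):
        # pos_score: (E,); neg_score: (E * k,) with k negatives per positive
        neg_score = neg_score.view(pos_score.shape[0], -1)
        return (self.margin - pos_score.unsqueeze(1) + neg_score).clamp(min=0).mean()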
def main():
    data = RandomGraphDataset(100, 500, 10)
    g = data[0]
    train_mask = torch.zeros(g.num_edges(), dtype=torch.bool).bernoulli(0.6)
    train_idx = train_mask.nonzero(as_tuple=True)[0]
    sampler = MultiLayerFullNeighborSampler(2)
    dataloader = EdgeDataLoader(
        g, train_idx, sampler,
        negative_sampler=Uniform(5), batch_size=32
    )
    model = Model(10, 100, 10)
    optimizer = optim.Adam(model.parameters())
    loss_func = MarginLoss()
    for epoch in range(10):
        model.train()
        losses = []
        for input_nodes, pos_g, neg_g, blocks in dataloader:
            pos_score, neg_score = model(pos_g, neg_g, blocks, blocks[0].srcdata['feat'])
            loss = loss_func(pos_score, neg_score)
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Epoch {:d} | Loss {:.4f}'.format(epoch, sum(losses) / len(losses)))
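# `Model` is not shown in these snippets. A minimal sketch consistent with
# the homogeneous call signature model(pos_g, neg_g, blocks, x) used above
# (an assumption; the heterogeneous Model in the previous snippet
# additionally takes an etype): a two-layer GraphSAGE encoder over the
# sampled blocks plus a dot-product score on the positive/negative graphs.
import dgl.function as fn
from dgl.nn import SAGEConv

class DotProductPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # score each edge as the dot product of its endpoint embeddings
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return g.edata['score'][:, 0]

class Model(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, hidden_feats, 'mean')
        self.conv2 = SAGEConv(hidden_feats, out_feats, 'mean')
        self.pred = DotProductPredictor()

    def forward(self, pos_g, neg_g, blocks, x):
        h = F.relu(self.conv1(blocks[0], x))
        h = self.conv2(blocks[1], h)
        return self.pred(pos_g, h), self.pred(neg_g, h)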
def __init__(self, dataset):
    self.dataset = dataset
    cfg = self.CONFIG[dataset]
    rating = pd.read_csv(
        cfg['rating_path'], sep=cfg['rating_sep'],
        names=['user_id', 'item_id', 'rating'], usecols=[0, 1, 2], skiprows=1
    )
    kg = pd.read_csv(cfg['kg_path'], sep='\t', names=['head', 'relation', 'tail'])
    item2entity = pd.read_csv(cfg['item2id_path'], sep='\t', names=['item_id', 'entity_id'])
    # keep only ratings whose item has a mapped knowledge-graph entity
    rating = rating[rating['item_id'].isin(item2entity['item_id'])]
    rating.reset_index(drop=True, inplace=True)
    rating['user_id'] = LabelEncoder().fit_transform(rating['user_id'])
    item2entity = dict(zip(item2entity['item_id'], item2entity['entity_id']))
    rating['item_id'] = rating['item_id'].map(item2entity)
    rating['label'] = rating['rating'].apply(lambda r: int(r >= cfg['threshold']))
    rating = rating[rating['label'] == 1]
    user_item_graph = dgl.heterograph({
        ('user', 'rate', 'item'): (rating['user_id'].to_numpy(), rating['item_id'].to_numpy())
    })
    # negative sampling: one corrupted (user, item) pair per positive edge
    neg_sampler = Uniform(1)
    nu, nv = neg_sampler(user_item_graph, torch.arange(user_item_graph.num_edges()))
    u, v = user_item_graph.edges()
    self.user_item_graph = dgl.heterograph(
        {('user', 'rate', 'item'): (torch.cat([u, nu]), torch.cat([v, nv]))}
    )
    self.user_item_graph.edata['label'] = torch.cat([torch.ones(u.shape[0]), torch.zeros(nu.shape[0])])
    kg['relation'] = LabelEncoder().fit_transform(kg['relation'])
    # the KG is a multigraph: two entities may be connected by several
    # parallel edges with different relation types
    knowledge_graph = dgl.graph((kg['head'].to_numpy(), kg['tail'].to_numpy()))
    knowledge_graph.edata['relation'] = torch.tensor(kg['relation'].tolist())
    self.knowledge_graph = dgl.add_reverse_edges(knowledge_graph, copy_edata=True)
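# dgl.add_reverse_edges(g, copy_edata=True), used above and below, appends a
# reversed copy of every edge after the originals and duplicates their
# features, so each relation also gets an edge in the opposite direction
# (with the same relation id). A tiny standalone demonstration:
def _demo_add_reverse_edges():
    g = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])))
    g.edata['relation'] = torch.tensor([3, 7])
    bg = dgl.add_reverse_edges(g, copy_edata=True)
    print(bg.edges())            # (tensor([0, 1, 1, 2]), tensor([1, 2, 0, 1]))
    print(bg.edata['relation'])  # tensor([3, 7, 3, 7])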
def __init__(self):
    g = OAGCoreDataset()[0]
    author_rank = load_author_rank()
    rating = pd.DataFrame(
        [[i, a] for i, (f, r) in enumerate(author_rank.items()) for a in r],
        columns=['user_id', 'item_id']
    )
    user_item_graph = dgl.heterograph(
        {('user', 'rate', 'item'): (rating['user_id'], rating['item_id'])},
        num_nodes_dict={'user': len(author_rank), 'item': g.num_nodes('author')}
    )
    # negative sampling: one corrupted (user, item) pair per positive edge
    neg_sampler = Uniform(1)
    nu, nv = neg_sampler(user_item_graph, torch.arange(user_item_graph.num_edges()))
    u, v = user_item_graph.edges()
    self.user_item_graph = dgl.heterograph(
        {('user', 'rate', 'item'): (torch.cat([u, nu]), torch.cat([v, nv]))},
        num_nodes_dict={ntype: user_item_graph.num_nodes(ntype) for ntype in user_item_graph.ntypes}
    )
    self.user_item_graph.edata['label'] = torch.cat([torch.ones(u.shape[0]), torch.zeros(nu.shape[0])])
    knowledge_graph = dgl.to_homogeneous(dgl.node_type_subgraph(g, ['author', 'institution', 'paper']))
    # after to_homogeneous, the original edge type id serves as the relation id
    knowledge_graph.edata['relation'] = knowledge_graph.edata[dgl.ETYPE]
    self.knowledge_graph = dgl.add_reverse_edges(knowledge_graph, copy_edata=True)
def __init__(self, g, split_edge, neg_samples=1, subsample_ratio=0.1, shuffle=True):
    self.neg_sampler = Uniform(neg_samples)
    self.subsample_ratio = subsample_ratio
    self.split_edge = split_edge
    self.g = g
    self.shuffle = shuffle
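# For reference, what the wrapped Uniform sampler produces when called
# directly: for each given edge id it draws neg_samples corrupted
# destination nodes uniformly at random, returning (neg_src, neg_dst)
# tensors of length len(eids) * neg_samples. A minimal demonstration:
def _demo_uniform_sampler():
    g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 3])))
    neg_sampler = Uniform(2)  # 2 negatives per positive edge
    neg_src, neg_dst = neg_sampler(g, torch.arange(g.num_edges()))
    print(neg_src.shape, neg_dst.shape)  # torch.Size([6]) torch.Size([6])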