def train(args):
    """Run full-batch training of ``Net`` and report the best validation/test RMSE.

    Every iteration performs one forward/backward pass over the whole
    training graph.  Every ``args.train_log_interval`` iterations the running
    loss and training error are logged; every ``args.train_valid_interval``
    iterations the model is evaluated on the ``'valid'`` segment, and on the
    ``'test'`` segment whenever the validation RMSE improves.  When validation
    stops improving, the learning rate is decayed by
    ``args.train_lr_decay_factor`` (floored at ``args.train_min_lr``), and
    training stops once the early-stopping patience is exceeded at the
    minimum learning rate.

    Args:
        args: Configuration namespace.  ``src_in_units``, ``dst_in_units`` and
            ``rating_vals`` are written onto it from the loaded dataset before
            the network is built.

    Returns:
        None.  Metrics are handed to ``MetricLogger`` instances whose paths
        live under ``args.save_dir`` and are printed to stdout.
    """
    print(args)
    # dataset = MovieLens(args.data_name, args.device, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
    #                     test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
    dataset = DataSetLoader(args.data_name,
                            args.device,
                            use_one_hot_fea=args.use_one_hot_fea,
                            symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio,
                            valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    # try/finally so the loggers are closed even if training raises.
    try:
        ### declare the loss information
        best_valid_rmse = np.inf
        # Initialised so the final summary cannot hit an unbound name when no
        # validation round ever improves (or none runs at all).
        best_test_rmse = np.inf
        no_better_valid = 0
        best_iter = -1
        count_rmse = 0
        count_num = 0
        count_loss = 0

        dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
        dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
        # Validation reuses the (already converted) training encoder graph.
        dataset.valid_enc_graph = dataset.train_enc_graph
        dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
        dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
        dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

        print("Start training ...")
        dur = []
        for iter_idx in range(1, args.train_max_iter):
            is_log_iter = iter_idx % args.train_log_interval == 0
            # Reset per iteration: validation may fire on a non-log iteration
            # and must not append to an unbound or stale string.
            logging_str = ""

            # The first three iterations are excluded from the timing average.
            if iter_idx > 3:
                t0 = time.time()
            net.train()
            pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
                               dataset.user_feature, dataset.movie_feature)
            loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
            count_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
            optimizer.step()

            if iter_idx > 3:
                dur.append(time.time() - t0)

            if iter_idx == 1:
                print("Total #Param of net: %d" % (torch_total_param_num(net)))
                print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

            # Expected rating = softmax probabilities weighted by rating values.
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(dim=1)
            # NOTE: this accumulates a *sum of squared errors*; the logged
            # "rmse" is count_rmse / count_num with no square root taken.
            rmse = ((real_pred_ratings - train_gt_ratings) ** 2).sum()
            count_rmse += rmse.item()
            count_num += pred_ratings.shape[0]

            if is_log_iter:
                # count_loss is never reset and iter_idx starts at 1, so the
                # running mean is count_loss / iter_idx (the CSV previously
                # divided by iter_idx + 1, disagreeing with the console).
                avg_loss = count_loss / iter_idx
                avg_time = np.average(dur) if dur else float("nan")
                train_loss_logger.log(iter=iter_idx, loss=avg_loss, rmse=count_rmse / count_num)
                logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                    iter_idx, avg_loss, count_rmse / count_num, avg_time)
                count_rmse = 0
                count_num = 0

            if iter_idx % args.train_valid_interval == 0:
                valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
                valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
                logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_iter = iter_idx
                    test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                    best_test_rmse = test_rmse
                    test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    if no_better_valid > args.train_early_stopping_patience \
                            and learning_rate <= args.train_min_lr:
                        logging.info("Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                        if new_lr < learning_rate:
                            learning_rate = new_lr
                            logging.info("\tChange the LR to %g" % new_lr)
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0

            if is_log_iter:
                print(logging_str)

        print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
            best_iter, best_valid_rmse, best_test_rmse))
    finally:
        train_loss_logger.close()
        valid_loss_logger.close()
        test_loss_logger.close()
def train(args):
    """Run full-batch MXNet/Gluon training of ``Net`` and report the best RMSE.

    Every iteration performs one recorded forward/backward pass over the
    whole training graph, clips gradients by global norm and takes one
    trainer step.  Every ``args.train_log_interval`` iterations the running
    loss, gradient norm and training error are logged; every
    ``args.train_valid_interval`` iterations the model is evaluated on the
    ``"valid"`` segment.  On a validation improvement the parameters are
    saved and the ``"test"`` segment is evaluated.  When validation stalls
    the learning rate is decayed, and training stops once the early-stopping
    patience is exceeded at ``args.train_min_lr``.

    Args:
        args: Configuration namespace.  ``src_in_units``, ``dst_in_units`` and
            ``rating_vals`` are written onto it from the loaded dataset before
            the network is built.

    Returns:
        None.  Metrics are handed to ``MetricLogger`` instances whose paths
        live under ``args.save_dir`` and are printed to stdout.
    """
    print(args)
    dataset = MovieLens(
        args.data_name,
        args.ctx,
        use_one_hot_fea=args.use_one_hot_fea,
        symm=args.gcn_agg_norm_symm,
        test_ratio=args.data_test_ratio,
        valid_ratio=args.data_valid_ratio,
    )
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type="in"), ctx=args.ctx)
    net.hybridize()
    nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                            ctx=args.ctx,
                                            dtype=np.float32)
    rating_loss_net = gluon.loss.SoftmaxCELoss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {"learning_rate": args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ["iter", "loss", "rmse"],
        ["%d", "%.4f", "%.4f"],
        os.path.join(args.save_dir, "train_loss%d.csv" % args.save_id),
    )
    valid_loss_logger = MetricLogger(
        ["iter", "rmse"],
        ["%d", "%.4f"],
        os.path.join(args.save_dir, "valid_loss%d.csv" % args.save_id),
    )
    test_loss_logger = MetricLogger(
        ["iter", "rmse"],
        ["%d", "%.4f"],
        os.path.join(args.save_dir, "test_loss%d.csv" % args.save_id),
    )

    # try/finally so the loggers are closed even if training raises.
    try:
        ### declare the loss information
        best_valid_rmse = np.inf
        # Initialised so the final summary cannot hit an unbound name when no
        # validation round ever improves (or none runs at all).
        best_test_rmse = np.inf
        no_better_valid = 0
        best_iter = -1
        avg_gnorm = 0
        count_rmse = 0
        count_num = 0
        count_loss = 0

        print("Start training ...")
        dur = []
        for iter_idx in range(1, args.train_max_iter):
            is_log_iter = iter_idx % args.train_log_interval == 0
            # Reset per iteration: validation may fire on a non-log iteration
            # and must not append to an unbound or stale string.
            logging_str = ""

            # The first three iterations are excluded from the timing average.
            if iter_idx > 3:
                t0 = time.time()
            with mx.autograd.record():
                pred_ratings = net(
                    dataset.train_enc_graph,
                    dataset.train_dec_graph,
                    dataset.user_feature,
                    dataset.movie_feature,
                )
                loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
                loss.backward()

            count_loss += loss.asscalar()
            gnorm = params_clip_global_norm(net.collect_params(),
                                            args.train_grad_clip, args.ctx)
            avg_gnorm += gnorm
            trainer.step(1.0)
            if iter_idx > 3:
                dur.append(time.time() - t0)

            if iter_idx == 1:
                print("Total #Param of net: %d" % (gluon_total_param_num(net)))
                print(gluon_net_info(net, save_path=os.path.join(
                    args.save_dir, "net%d.txt" % args.save_id)))

            # Expected rating = softmax probabilities weighted by rating values.
            real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                                 nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
            # NOTE: this accumulates a *sum of squared errors*; the logged
            # "rmse" is count_rmse / count_num with no square root taken.
            rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
            count_rmse += rmse.asscalar()
            count_num += pred_ratings.shape[0]

            if is_log_iter:
                # count_loss is never reset and iter_idx starts at 1, so the
                # running mean is count_loss / iter_idx (the CSV previously
                # divided by iter_idx + 1, disagreeing with the console).
                avg_loss = count_loss / iter_idx
                avg_time = np.average(dur) if dur else float("nan")
                train_loss_logger.log(iter=iter_idx,
                                      loss=avg_loss,
                                      rmse=count_rmse / count_num)
                logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                    iter_idx,
                    avg_gnorm / args.train_log_interval,
                    avg_loss,
                    count_rmse / count_num,
                    avg_time,
                )
                avg_gnorm = 0
                count_rmse = 0
                count_num = 0

            if iter_idx % args.train_valid_interval == 0:
                valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment="valid")
                valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
                logging_str += ",\tVal RMSE={:.4f}".format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_iter = iter_idx
                    net.save_parameters(filename=os.path.join(
                        args.save_dir,
                        "best_valid_net{}.params".format(args.save_id)))
                    test_rmse = evaluate(args=args, net=net, dataset=dataset, segment="test")
                    best_test_rmse = test_rmse
                    test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                    logging_str += ", Test RMSE={:.4f}".format(test_rmse)
                else:
                    no_better_valid += 1
                    if (no_better_valid > args.train_early_stopping_patience
                            and trainer.learning_rate <= args.train_min_lr):
                        logging.info("Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(
                            trainer.learning_rate * args.train_lr_decay_factor,
                            args.train_min_lr)
                        if new_lr < trainer.learning_rate:
                            logging.info("\tChange the LR to %g" % new_lr)
                            trainer.set_learning_rate(new_lr)
                            no_better_valid = 0

            if is_log_iter:
                print(logging_str)

        print("Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}".format(
            best_iter, best_valid_rmse, best_test_rmse))
    finally:
        train_loss_logger.close()
        valid_loss_logger.close()
        test_loss_logger.close()
def train(args):
    """Run full-batch MXNet/Gluon training with a classification or regression head.

    When ``args.gen_r_use_classification`` is true the network is trained
    with softmax cross-entropy over the discrete rating values; otherwise it
    is trained with L2 loss against ratings standardised by the training
    mean and standard deviation.  Every ``args.train_log_interval``
    iterations the running loss, gradient norm and training error are
    logged; every ``args.train_valid_interval`` iterations the ``'valid'``
    segment is evaluated, and the ``'test'`` segment on improvement.  When
    validation stalls the learning rate is decayed, and training stops once
    the early-stopping patience is exceeded at ``args.train_min_lr``.

    Args:
        args: Configuration namespace.  ``src_key``, ``dst_key``,
            ``src_in_units``, ``dst_in_units`` and ``nratings`` are written
            onto it from the loaded dataset before the network is built.

    Returns:
        None.  Metrics are handed to ``MetricLogger`` instances whose paths
        live under ``args.save_dir`` and are printed to stdout.
    """
    dataset = MovieLens(args.data_name,
                        args.ctx,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm)
    print("Loading data finished ...\n")

    args.src_key = dataset.name_user
    args.dst_key = dataset.name_movie
    args.src_in_units = dataset.user_feature.shape[1]
    args.dst_in_units = dataset.movie_feature.shape[1]
    args.nratings = dataset.possible_rating_values.size

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
    net.hybridize()
    if args.gen_r_use_classification:
        nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                                ctx=args.ctx,
                                                dtype=np.float32)
        rating_loss_net = gluon.loss.SoftmaxCELoss()
    else:
        rating_mean = dataset.train_rating_values.mean()
        rating_std = dataset.train_rating_values.std()
        rating_loss_net = gluon.loss.L2Loss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {'learning_rate': args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_rating_pairs = mx.nd.array(dataset.train_rating_pairs,
                                     ctx=args.ctx,
                                     dtype=np.int64)
    train_gt_ratings = mx.nd.array(dataset.train_rating_values,
                                   ctx=args.ctx,
                                   dtype=np.float32)
    if args.gen_r_use_classification:
        # Class index of every training rating.  Its inputs never change, so
        # it is built once instead of on every iteration.
        train_gt_label = mx.nd.array(np.searchsorted(
            dataset.possible_rating_values, dataset.train_rating_values),
                                     ctx=args.ctx,
                                     dtype=np.int32)

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(
        ['iter', 'rmse'], ['%d', '%.4f'],
        os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(
        ['iter', 'rmse'], ['%d', '%.4f'],
        os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    # try/finally so the loggers are closed even if training raises.
    try:
        ### declare the loss information
        best_valid_rmse = np.inf
        # Initialised so the final summary cannot hit an unbound name when no
        # validation round ever improves (or none runs at all).
        best_test_rmse = np.inf
        no_better_valid = 0
        best_iter = -1
        avg_gnorm = 0
        count_rmse = 0
        count_num = 0
        count_loss = 0

        print("Start training ...")
        for iter_idx in range(1, args.train_max_iter):
            is_log_iter = iter_idx % args.train_log_interval == 0
            # Reset per iteration: validation may fire on a non-log iteration
            # and must not append to an unbound or stale string.
            logging_str = ""

            with mx.autograd.record():
                pred_ratings = net(dataset.train_graph, train_rating_pairs)
                if args.gen_r_use_classification:
                    loss = rating_loss_net(pred_ratings, train_gt_label).mean()
                else:
                    # Regress against standardised ratings.
                    loss = rating_loss_net(
                        mx.nd.reshape(pred_ratings, shape=(-1, )),
                        (train_gt_ratings - rating_mean) / rating_std).mean()
                #loss.wait_to_read()
                loss.backward()

            count_loss += loss.asscalar()
            gnorm = params_clip_global_norm(net.collect_params(),
                                            args.train_grad_clip, args.ctx)
            avg_gnorm += gnorm
            trainer.step(1.0)  #, ignore_stale_grad=True)

            if iter_idx == 1:
                print("Total #Param of net: %d" % (gluon_total_param_num(net)))
                print(gluon_net_info(net, save_path=os.path.join(
                    args.save_dir, 'net%d.txt' % args.save_id)))

            # NOTE: both branches accumulate a *sum of squared errors*; the
            # logged "rmse" is count_rmse / count_num with no square root.
            if args.gen_r_use_classification:
                real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                                     nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
                rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
            else:
                # Undo the standardisation before comparing with raw ratings.
                rmse = mx.nd.square(
                    pred_ratings.reshape((-1, )) * rating_std + rating_mean -
                    train_gt_ratings).sum()
            count_rmse += rmse.asscalar()
            count_num += pred_ratings.shape[0]

            if is_log_iter:
                # count_loss is never reset and iter_idx starts at 1, so the
                # running mean is count_loss / iter_idx (the CSV previously
                # divided by iter_idx + 1, disagreeing with the console).
                avg_loss = count_loss / iter_idx
                train_loss_logger.log(iter=iter_idx,
                                      loss=avg_loss,
                                      rmse=count_rmse / count_num)
                logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}".format(
                    iter_idx, avg_gnorm / args.train_log_interval,
                    avg_loss, count_rmse / count_num)
                avg_gnorm = 0
                count_rmse = 0
                count_num = 0

            if iter_idx % args.train_valid_interval == 0:
                valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
                valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
                logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_iter = iter_idx
                    #net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
                    test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                    best_test_rmse = test_rmse
                    test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    if no_better_valid > args.train_early_stopping_patience \
                            and trainer.learning_rate <= args.train_min_lr:
                        logging.info("Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(
                            trainer.learning_rate * args.train_lr_decay_factor,
                            args.train_min_lr)
                        if new_lr < trainer.learning_rate:
                            logging.info("\tChange the LR to %g" % new_lr)
                            trainer.set_learning_rate(new_lr)
                            no_better_valid = 0

            if is_log_iter:
                print(logging_str)

        print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
            best_iter, best_valid_rmse, best_test_rmse))
    finally:
        train_loss_logger.close()
        valid_loss_logger.close()
        test_loss_logger.close()
def train(args):
    """Run minibatch MXNet/Gluon training of ``Net`` over sampled edge subgraphs.

    Each outer iteration shuffles all training edges and walks them in
    chunks of ``args.minibatch_size``.  For every chunk, the in-edges of the
    touched head and tail nodes are collected per rating type, two edge
    subgraphs are built from the encoder graph, and one clipped
    gradient step is taken.  Every ``args.train_valid_interval`` iterations
    the ``'valid'`` segment is evaluated, and the ``'test'`` segment on
    improvement.  When validation stalls the learning rate is decayed, and
    training stops once the early-stopping patience is exceeded at
    ``args.train_min_lr``.

    Args:
        args: Configuration namespace.  ``src_in_units``, ``dst_in_units`` and
            ``rating_vals`` are written onto it from the loaded dataset before
            the network is built.

    Returns:
        None.  Metrics are handed to ``MetricLogger`` instances whose paths
        live under ``args.save_dir`` and are printed to stdout.
    """
    print(args)
    dataset = MovieLens(args.data_name,
                        args.ctx,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio,
                        valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
    net.hybridize()
    rating_loss_net = gluon.loss.SoftmaxCELoss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {'learning_rate': args.train_lr})
    print("Loading network finished ...\n")

    ### prepare the logger
    train_loss_logger = MetricLogger(['iter', 'idx', 'loss', 'rmse'], ['%d', '%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    # try/finally so the loggers are closed even if training raises.
    try:
        ### declare the loss information
        best_valid_rmse = np.inf
        # Initialised so the final summary cannot hit an unbound name when no
        # validation round ever improves (or none runs at all).
        best_test_rmse = np.inf
        no_better_valid = 0
        best_iter = -1

        enc_graph = dataset.train_enc_graph
        nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                                ctx=args.ctx,
                                                dtype=np.float32)
        # Scratch buffers indexed by global node id; each minibatch writes the
        # position of every subgraph node into them and then reads them back
        # at the endpoints of the sampled edges.
        g_user_fea = mx.nd.zeros((dataset.num_user,))
        g_movie_fea = mx.nd.zeros((dataset.num_movie,))
        train_truths = dataset.train_truths
        train_labels = dataset.train_labels

        print("Start training ...")
        dur = []
        for iter_idx in range(1, args.train_max_iter):
            is_log_iter = iter_idx % args.train_log_interval == 0
            # Reset per iteration: validation may fire on a non-log iteration
            # and must not append to an unbound or stale string.
            logging_str = ""

            # The first three iterations are excluded from the timing average.
            if iter_idx > 3:
                t0 = time.time()
            num_edges = dataset.train_truths.shape[0]
            seed = mx.nd.arange(num_edges, dtype='int64')
            edges = mx.nd.shuffle(seed)
            # each iteration will go through all edges
            num_batches = (num_edges + args.minibatch_size - 1) // args.minibatch_size
            for sample_idx in range(0, num_batches):
                batch_start = sample_idx * args.minibatch_size
                # Clamp the last (possibly short) batch to the edge count.
                batch_end = min(batch_start + args.minibatch_size, num_edges)
                edge_ids = edges[batch_start:batch_end]
                head_ids, tail_ids = dataset.train_dec_graph.find_edges(edge_ids.asnumpy())

                head_subgraphs = {}
                tail_subgraphs = {}
                head_node_ids = np.unique(head_ids.asnumpy())
                tail_node_ids = np.unique(tail_ids.asnumpy())
                for i, _ in enumerate(args.rating_vals):
                    # Edge types are read in pairs: index 2*i and 2*i + 1.
                    t = enc_graph.canonical_etypes[i * 2]
                    rev_t = enc_graph.canonical_etypes[i * 2 + 1]

                    head_in_edges = enc_graph.in_edges(head_node_ids, 'eid', etype=rev_t)
                    tail_in_edges = enc_graph.in_edges(tail_node_ids, 'eid', etype=t)

                    # Only edge types that actually have edges are kept.
                    if head_in_edges.shape[0] > 0:
                        head_subgraphs[rev_t] = head_in_edges
                    if tail_in_edges.shape[0] > 0:
                        tail_subgraphs[t] = tail_in_edges

                head_subgraph = enc_graph.edge_subgraph(head_subgraphs, preserve_nodes=True)
                tail_subgraph = enc_graph.edge_subgraph(tail_subgraphs, preserve_nodes=True)
                edge_ids = edge_ids.as_in_context(args.ctx)
                true_relation_ratings = train_truths[edge_ids]
                true_relation_labels = train_labels[edge_ids]

                head_NID = head_subgraph.nodes['user'].data[dgl.NID]
                tail_NID = tail_subgraph.nodes['movie'].data[dgl.NID]

                g_user_fea[head_NID] = mx.nd.arange(head_NID.shape[0], dtype='int32')
                g_movie_fea[tail_NID] = mx.nd.arange(tail_NID.shape[0], dtype='int32')

                true_head_idx = g_user_fea[head_ids].as_in_context(args.ctx)
                true_tail_idx = g_movie_fea[tail_ids].as_in_context(args.ctx)

                with mx.autograd.record():
                    pred_ratings = net(head_subgraph, tail_subgraph,
                                       true_head_idx, true_tail_idx)
                    loss = rating_loss_net(pred_ratings, true_relation_labels).mean()
                    loss.backward()
                gnorm = params_clip_global_norm(net.collect_params(),
                                                args.train_grad_clip, args.ctx)
                trainer.step(1.0, ignore_stale_grad=True)

                # Expected rating = softmax probabilities weighted by rating values.
                real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                                     nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
                rmse = mx.nd.square(real_pred_ratings - true_relation_ratings).mean().asscalar()
                rmse = np.sqrt(rmse)
                loss = loss.asscalar()
                if sample_idx % 100 == 0:
                    train_loss_logger.log(iter=iter_idx, idx=sample_idx,
                                          loss=loss, rmse=rmse)
                    print("Iter={}, sample_idx={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}".format(
                        iter_idx, sample_idx, gnorm, loss, rmse))
                gc.collect()

            if iter_idx > 3:
                dur.append(time.time() - t0)

            if iter_idx == 1:
                print("Total #Param of net: %d" % (gluon_total_param_num(net)))
                print(gluon_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

            if is_log_iter:
                avg_time = np.average(dur) if dur else float("nan")
                logging_str = "Iter={}, time={:.4f}".format(iter_idx, avg_time)

            if iter_idx % args.train_valid_interval == 0:
                valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
                valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
                logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_iter = iter_idx
                    #net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
                    test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                    best_test_rmse = test_rmse
                    test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    if no_better_valid > args.train_early_stopping_patience \
                            and trainer.learning_rate <= args.train_min_lr:
                        logging.info("Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(trainer.learning_rate * args.train_lr_decay_factor,
                                     args.train_min_lr)
                        if new_lr < trainer.learning_rate:
                            logging.info("\tChange the LR to %g" % new_lr)
                            trainer.set_learning_rate(new_lr)
                            no_better_valid = 0

            if is_log_iter:
                print(logging_str)

        print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
            best_iter, best_valid_rmse, best_test_rmse))
    finally:
        train_loss_logger.close()
        valid_loss_logger.close()
        test_loss_logger.close()
def train_eval(args):
    """Alternately train the KG embedding and the GNN recommender, evaluating periodically.

    Each epoch runs two phases: first a pass over the knowledge-graph
    sampler optimising ``model.transR``, then (after optionally recomputing
    attention weights on the training graph) a pass over the
    collaborative-filtering pair sampler optimising ``model.get_loss``.
    Every ``args.evaluate_every`` epochs validation recall/NDCG are
    computed; the test graph is evaluated as well, and logged to the test
    metric logger when validation recall improves.

    Args:
        args: Configuration namespace (data name, seed, model sizes, batch
            sizes, learning rate, ``save_dir``/``save_id``, ``gpu``, ...).

    Returns:
        None.  Progress goes to the ``logging`` module, stdout and the two
        ``MetricLogger`` instances created under ``args.save_dir``.
    """
    logging_config(folder=args.save_dir,
                   name='log{:d}'.format(args.save_id),
                   no_console=False)
    logging.info(args)

    ### check context
    use_cuda = args.gpu >= 0 and th.cuda.is_available()
    if use_cuda:
        th.cuda.set_device(args.gpu)

    ### load data
    dataset = DataLoader(data_name=args.data_name, seed=args.seed)
    print(dataset)
    model = Model(use_KG=True,
                  input_node_dim=args.entity_embed_dim,
                  gnn_model=args.gnn_model,
                  num_gnn_layers=args.gnn_num_layer,
                  n_hidden=args.gnn_hidden_size,
                  dropout=args.dropout_rate,
                  n_entities=dataset.n_KG_entity,
                  n_relations=dataset.n_KG_relation,
                  relation_dim=args.relation_embed_dim,
                  reg_lambda_kg=args.regs,
                  reg_lambda_gnn=args.regs)
    if use_cuda:
        model.cuda()
    logging.info(model)

    ### optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    valid_metric_logger = MetricLogger(
        ['epoch', 'recall', 'ndcg', 'is_best'], ['%d', '%.5f', '%.5f', '%d'],
        os.path.join(args.save_dir, 'valid{:d}.csv'.format(args.save_id)))
    test_metric_logger = MetricLogger(
        ['epoch', 'recall', 'ndcg'], ['%d', '%.5f', '%.5f'],
        os.path.join(args.save_dir, 'test{:d}.csv'.format(args.save_id)))
    best_epoch = -1
    best_recall = 0.0
    # Initialised so the final summary cannot hit an unbound name when no
    # evaluation ever beats the initial best_recall (or none runs at all).
    test_recall = float("nan")
    test_ndcg = float("nan")

    # Convert node ids / edge types of both graphs to LongTensors on the
    # selected device.
    train_g = dataset.train_g
    test_g = dataset.test_g
    for graph in (train_g, test_g):
        nid_th = th.LongTensor(graph.ndata["id"])
        etype_th = th.LongTensor(graph.edata["type"])
        if use_cuda:
            nid_th, etype_th = nid_th.cuda(), etype_th.cuda()
        graph.ndata['id'] = nid_th
        graph.edata['type'] = etype_th

    item_id_range = th.LongTensor(dataset.item_id_range)
    if use_cuda:
        item_id_range = item_id_range.cuda()

    for epoch in range(1, args.max_epoch + 1):
        ### train kg
        time1 = time()
        kg_sampler = dataset.KG_sampler(batch_size=args.batch_size_kg)
        batch_count = 0  # was named `iter`, which shadowed the builtin
        total_loss = 0.0
        for h, r, pos_t, neg_t, _ in kg_sampler:
            batch_count += 1
            model.train()
            h_th = th.LongTensor(h)
            r_th = th.LongTensor(r)
            pos_t_th = th.LongTensor(pos_t)
            neg_t_th = th.LongTensor(neg_t)
            if use_cuda:
                h_th, r_th, pos_t_th, neg_t_th = \
                    h_th.cuda(), r_th.cuda(), pos_t_th.cuda(), neg_t_th.cuda()
            loss = model.transR(h_th, r_th, pos_t_th, neg_t_th)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            if (batch_count % args.print_every) == 0 or batch_count == 1:
                logging.info("Epoch {:04d} Iter {:04d} | Loss {:.4f} ".format(
                    epoch, batch_count, total_loss / batch_count))
        # max(..., 1): an empty sampler must not raise ZeroDivisionError.
        logging.info('Time for KGE: {:.1f}s, loss {:.4f}'.format(
            time() - time1, total_loss / max(batch_count, 1)))

        ### train GNN
        if args.use_attention:
            time1 = time()
            print("Compute attention weight in train ...")
            with th.no_grad():
                A_w = model.compute_attention(train_g)
            train_g.edata['w'] = A_w
            print("Time: {:.2f}s".format(time() - time1))
        time1 = time()
        cf_sampler = dataset.CF_pair_sampler(batch_size=args.batch_size)
        batch_count = 0
        total_loss = 0.0
        for user_ids, item_pos_ids, item_neg_ids, _ in cf_sampler:
            batch_count += 1
            model.train()
            user_ids_th = th.LongTensor(user_ids)
            item_pos_ids_th = th.LongTensor(item_pos_ids)
            item_neg_ids_th = th.LongTensor(item_neg_ids)
            if use_cuda:
                user_ids_th, item_pos_ids_th, item_neg_ids_th = \
                    user_ids_th.cuda(), item_pos_ids_th.cuda(), item_neg_ids_th.cuda()
            embedding = model.gnn(train_g, train_g.ndata['id'])
            loss = model.get_loss(embedding, user_ids_th, item_pos_ids_th, item_neg_ids_th)
            loss.backward()
            # th.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm)  # clip gradients
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            if (batch_count % args.print_every) == 0 or batch_count == 1:
                logging.info("Epoch {:04d} Iter {:04d} | Loss {:.4f} ".format(
                    epoch, batch_count, total_loss / batch_count))
        logging.info('Time for GNN: {:.1f}s, loss {:.4f}'.format(
            time() - time1, total_loss / max(batch_count, 1)))

        if epoch % args.evaluate_every == 0:
            time1 = time()
            val_recall, val_ndcg = eval(model, train_g, dataset.train_user_dict,
                                        dataset.valid_user_dict, item_id_range,
                                        use_cuda, args.use_attention)
            info = "Epoch{}, [{:.1f}s] val recall:{:.5f}, val ndcg:{:.5f}".format(
                epoch, time() - time1, val_recall, val_ndcg)
            # save best model
            if val_recall > best_recall:
                valid_metric_logger.log(epoch=epoch, recall=val_recall,
                                        ndcg=val_ndcg, is_best=1)
                best_recall = val_recall
                #best_ndcg = val_ndcg
                best_epoch = epoch
                time1 = time()
                test_recall, test_ndcg = eval(model, test_g,
                                              dataset.train_valid_user_dict,
                                              dataset.test_user_dict, item_id_range,
                                              use_cuda, args.use_attention)
                test_metric_logger.log(epoch=epoch, recall=test_recall, ndcg=test_ndcg)
                info += "\t[{:.1f}s] test recall:{:.5f}, test ndcg:{:.5f}".format(
                    time() - time1, test_recall, test_ndcg)
                #th.save({'state_dict': model.state_dict(), 'epoch': epoch}, model_state_file)
            else:
                valid_metric_logger.log(epoch=epoch, recall=val_recall,
                                        ndcg=val_ndcg, is_best=0)
                # Informational only: these values are printed but do not
                # update the "best" test metrics reported at the end.
                recall, ndcg = eval(model, test_g, dataset.train_valid_user_dict,
                                    dataset.test_user_dict, item_id_range,
                                    use_cuda, args.use_attention)
                print("test recall:{}, test_ndcg: {}".format(recall, ndcg))
            logging.info(info)

    logging.info(
        "Final test recall:{:.5f}, test ndcg:{:.5f}, best epoch:{}".format(
            test_recall, test_ndcg, best_epoch))
def train(args):
    """Run full-batch training of ``Net`` and select the model by validation NDCG@20.

    Every iteration performs one forward/backward pass over the whole
    training graph using the loss selected by ``args.loss_func`` (``"CE"``,
    ``"Hinge"`` or ``"MSE"``).  Every ``args.train_log_interval`` iterations
    the running loss and training error are logged; every
    ``args.train_valid_interval`` iterations validation RMSE and NDCG are
    computed, and the test segment is evaluated whenever the first
    validation NDCG value improves.  When validation stalls the learning
    rate is decayed, and training stops once the early-stopping patience is
    exceeded at ``args.train_min_lr``.

    Args:
        args: Configuration namespace.  ``src_in_units``, ``dst_in_units`` and
            ``rating_vals`` are written onto it from the loaded dataset before
            the network is built.

    Returns:
        None.  Metrics are handed to ``MetricLogger`` instances whose paths
        live under ``args.save_dir`` and are printed to stdout.

    Raises:
        ValueError: If ``args.loss_func`` is not ``"CE"``, ``"Hinge"`` or
            ``"MSE"``.
    """
    print(args)
    dataset = DataSetLoader(args.data_name,
                            args.device,
                            use_one_hot_fea=args.use_one_hot_fea,
                            symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio,
                            valid_ratio=args.data_valid_ratio,
                            sample_rate=args.sample_rate)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(
        dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(
        ['iter', 'rmse', "ndcg_20", "ndcg_40", "ndcg_80"],
        ['%d', '%.4f', '%.4f', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(
        ['iter', 'rmse', "ndcg_20", "ndcg_40", "ndcg_80"],
        ['%d', '%.4f', '%.4f', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    # try/finally so the loggers are closed even if training raises.
    try:
        ### declare the loss information
        best_valid_ndcg = -np.inf
        # A three-element list (not a scalar) because the final summary
        # indexes [0], [1] and [2] even if no test evaluation ever ran.
        best_test_ndcg = [-np.inf, -np.inf, -np.inf]
        no_better_valid = 0
        best_iter = -1
        count_rmse = 0
        count_num = 0
        count_loss = 0

        dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
        dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
        # Validation reuses the (already converted) training encoder graph.
        dataset.valid_enc_graph = dataset.train_enc_graph
        dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
        dataset.valid_recall_dec_graph = dataset.valid_recall_dec_graph.to(args.device)
        dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
        dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)
        dataset.test_recall_dec_graph = dataset.test_recall_dec_graph.to(args.device)

        print("Start training ...")
        dur = []
        for iter_idx in range(1, args.train_max_iter):
            is_log_iter = iter_idx % args.train_log_interval == 0
            # Reset per iteration: validation may fire on a non-log iteration
            # and must not append to an unbound or stale string.
            logging_str = ""

            # The first three iterations are excluded from the timing average.
            if iter_idx > 3:
                t0 = time.time()
            net.train()
            if iter_idx > 250:
                Two_Stage = True
            else:
                Two_Stage = False
            # NOTE(review): kept as an unconditional override, so two-stage
            # mode is effectively disabled. Confirm this was the intended
            # nesting of the original statement.
            Two_Stage = False
            pred_ratings, reg_loss, user_out, movie_out, W = net(
                dataset.train_enc_graph, dataset.train_dec_graph,
                dataset.user_feature, dataset.movie_feature, Two_Stage)

            if args.loss_func == "CE":
                loss = rating_loss_net(pred_ratings, train_gt_labels).mean() \
                    + args.ARR * reg_loss
            elif args.loss_func == "Hinge":
                real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                     nd_possible_rating_values.view(1, -1)).sum(dim=1)
                gap = (real_pred_ratings - train_gt_labels) ** 2
                # Squared gaps above 1.0 are squared again.
                hinge_loss = th.where(gap > 1.0, gap * gap, gap).mean()
                loss = hinge_loss
            elif args.loss_func == "MSE":
                loss = th.mean(
                    (pred_ratings[:, 0] -
                     nd_possible_rating_values[train_gt_labels]) ** 2) \
                    + args.ARR * reg_loss
            else:
                # Previously fell through and failed later on an unbound `loss`.
                raise ValueError("Unsupported loss_func: {!r}".format(args.loss_func))

            count_loss += loss.item()
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
            optimizer.step()

            if iter_idx > 3:
                dur.append(time.time() - t0)

            if iter_idx == 1:
                print("Total #Param of net: %d" % (torch_total_param_num(net)))
                print(torch_net_info(net, save_path=os.path.join(
                    args.save_dir, 'net%d.txt' % args.save_id)))

            # For "Hinge", real_pred_ratings computed in the loss branch is reused.
            if args.loss_func == "CE":
                real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                     nd_possible_rating_values.view(1, -1)).sum(dim=1)
            elif args.loss_func == "MSE":
                real_pred_ratings = pred_ratings[:, 0]
            # NOTE: this accumulates a *sum of squared errors*; the logged
            # "rmse" is count_rmse / count_num with no square root taken.
            rmse = ((real_pred_ratings - train_gt_ratings) ** 2).sum()
            count_rmse += rmse.item()
            count_num += pred_ratings.shape[0]

            if is_log_iter:
                # count_loss is never reset and iter_idx starts at 1, so the
                # running mean is count_loss / iter_idx (the CSV previously
                # divided by iter_idx + 1, disagreeing with the console).
                avg_loss = count_loss / iter_idx
                avg_time = np.average(dur) if dur else float("nan")
                train_loss_logger.log(iter=iter_idx,
                                      loss=avg_loss,
                                      rmse=count_rmse / count_num)
                logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                    iter_idx, avg_loss, count_rmse / count_num, avg_time)
                count_rmse = 0
                count_num = 0

            if iter_idx % args.train_valid_interval == 0:
                valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
                ndcg_valid = evaluate_metric(args=args, net=net, dataset=dataset,
                                             segment='valid', debug=False)
                print("ndcg_valid:", ndcg_valid)
                valid_loss_logger.log(iter=iter_idx,
                                      rmse=valid_rmse,
                                      ndcg_20=ndcg_valid[0],
                                      ndcg_40=ndcg_valid[1],
                                      ndcg_80=ndcg_valid[2])
                print("-" * 80)

                logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
                logging_str += ',\tndcg_valid_20={:.4f}'.format(ndcg_valid[0])
                logging_str += ',\tndcg_valid_40={:.4f}'.format(ndcg_valid[1])
                logging_str += ',\tndcg_valid_80={:.4f}'.format(ndcg_valid[2])

                # Model selection is driven by the first NDCG value only.
                ndcg_valid_20 = ndcg_valid[0]
                if ndcg_valid_20 > best_valid_ndcg:
                    best_valid_ndcg = ndcg_valid_20
                    print("************best_valid_ndcg:", best_valid_ndcg)
                    print("************ndcg_valid_20:", ndcg_valid_20)
                    no_better_valid = 0
                    best_iter = iter_idx
                    test_rmse = evaluate(args=args, net=net, dataset=dataset,
                                         segment='test', debug=True, idx=iter_idx)
                    ndcg_test = evaluate_metric(args=args, net=net, dataset=dataset,
                                                segment='test', debug=False)
                    logging_str += ',\tbest ndcg_test={:.4f}'.format(ndcg_test[0])
                    logging_str += ',\tbest ndcg_test={:.4f}'.format(ndcg_test[1])
                    logging_str += ',\tbest ndcg_test={:.4f}'.format(ndcg_test[2])
                    best_test_ndcg = ndcg_test
                    test_loss_logger.log(iter=iter_idx,
                                         rmse=test_rmse,
                                         ndcg_20=ndcg_test[0],
                                         ndcg_40=ndcg_test[1],
                                         ndcg_80=ndcg_test[2])
                else:
                    no_better_valid += 1
                    if no_better_valid > args.train_early_stopping_patience \
                            and learning_rate <= args.train_min_lr:
                        logging.info("Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(learning_rate * args.train_lr_decay_factor,
                                     args.train_min_lr)
                        if new_lr < learning_rate:
                            learning_rate = new_lr
                            logging.info("\tChange the LR to %g" % new_lr)
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0

            if is_log_iter:
                print(logging_str)

        print('Best Iter Idx={}, best ndcg_20={:.4f}, best ndcg_40={:.4f}, best ndcg_80={:.4f}'
              .format(best_iter, best_test_ndcg[0], best_test_ndcg[1], best_test_ndcg[2]))
    finally:
        train_loss_logger.close()
        valid_loss_logger.close()
        test_loss_logger.close()
def train(args):
    """Train the graph encoder/decoder with a pairwise ranking objective.

    Loads the dataset selected by ``args.data_name``, builds ``Net``, samples
    one negative item per training interaction (once, on the first
    iteration) and then repeatedly optimises over those fixed
    (user, positive item, negative item) triples in chunks.  Validation RMSE
    drives learning-rate decay and early stopping.

    Args:
        args: Run configuration.  Read here: ``data_name``, ``device``,
            ``use_one_hot_fea``, ``gcn_agg_norm_symm``, ``data_test_ratio``,
            ``data_valid_ratio``, ``train_lr``, ``train_optimizer``,
            ``save_dir``, ``save_id``, ``train_max_iter``,
            ``train_grad_clip``, ``train_log_interval``,
            ``train_valid_interval``, ``train_early_stopping_patience``,
            ``train_min_lr``, ``train_decay_patience`` and
            ``train_lr_decay_factor``.  ``src_in_units``, ``dst_in_units``
            and ``rating_vals`` are written onto it before the net is built.
    """
    # Imported locally (as the original block did), but only once instead of
    # on every training iteration.
    from tqdm import tqdm

    def batch(iterable, n=1):
        """Yield consecutive lists of at most ``n`` items from ``iterable``."""
        current_batch = []
        for item in iterable:
            current_batch.append(item)
            if len(current_batch) == n:
                yield current_batch
                current_batch = []
        if current_batch:
            yield current_batch

    print(args)
    if args.data_name == 'jukebox':
        dataset = JukeboxDataset('dataset/listen_count.txt')
    else:
        dataset = MovieLens(args.data_name, args.device,
                            use_one_hot_fea=args.use_one_hot_fea,
                            symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio,
                            valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare the logger
    train_loss_logger = MetricLogger(['iter', 'loss'], ['%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'ndcg', 'precision', 'recall', 'fscore', 'support'],
                                     ['%d', '%.4f', '%.4f', '%s', '%s', '%s', '%s'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    # Pre-initialised so the final summary cannot hit an unbound name when no
    # validation pass ever ran (e.g. train_max_iter <= train_valid_interval).
    best_test_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    # No training RMSE is computed in this variant; these placeholders keep
    # the "rmse=" field of the log line at 1.0, as before.
    count_rmse = 1
    count_num = 1
    count_loss = 0
    count_step = 0
    # Pre-initialised so a validation pass that happens before the first
    # logging interval can append to it.
    logging_str = ''

    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
    # Validation re-uses the training encoder graph.
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

    lmbd = 1e-5  # weight of the embedding regulariser
    batches = []  # (user index, positive item index, negative item index)
    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        net.train()

        ufeat, ifeat = net.encoder(dataset.train_enc_graph, dataset.user_feature, dataset.movie_feature)

        if iter_idx == 1:
            # Build the training triples once; every later iteration re-uses
            # the same negatives.
            unique_item_list = dataset.train['item_id'].unique().tolist()
            # One pass to collect each user's observed items, instead of
            # re-filtering the whole frame for every row.
            observed_by_user = {}
            for row in dataset.train.itertuples():
                observed_by_user.setdefault(row.user_id, set()).add(row.item_id)

            for row in tqdm(dataset.train.itertuples(), total=len(dataset.train)):
                userid = dataset.global_user_id_map[row.user_id]
                observed = observed_by_user[row.user_id]
                if len(observed) >= len(unique_item_list):
                    # The user interacted with every item: no negative exists
                    # and rejection sampling would never terminate.
                    continue
                while True:
                    sample = random.choice(unique_item_list)
                    if sample not in observed:
                        break
                batches.append((userid,
                                dataset.global_item_id_map[row.item_id],
                                dataset.global_item_id_map[sample]))

        for bt in tqdm(list(batch(batches, 2**14))):
            uidfeat = ufeat[[e[0] for e in bt]]
            posfeat = ifeat[[e[1] for e in bt]]
            negfeat = ifeat[[e[2] for e in bt]]

            # NOTE(review): if the features are (chunk, dim) these products
            # are chunk x chunk matrices that score every user against every
            # item of the chunk, not only its paired item -- confirm intended.
            pos_scores = uidfeat @ net.decoder.Q @ posfeat.T
            neg_scores = uidfeat @ net.decoder.Q @ negfeat.T

            mf_loss = -nn.BCELoss()(th.sigmoid(pos_scores), th.ones_like(pos_scores)) \
                + nn.LogSigmoid()(pos_scores - neg_scores).mean()
            mf_loss = -1 * mf_loss

            regularizer = (th.norm(uidfeat, dim=1)**2).mean() \
                + (th.norm(posfeat, dim=1)**2).mean() \
                + (th.norm(negfeat, dim=1)**2).mean() \
                + (th.norm(net.decoder.Q))
            emb_loss = lmbd * regularizer
            print('mf_loss', mf_loss)
            print('emb_loss', emb_loss)

            optimizer.zero_grad()
            loss = mf_loss + emb_loss
            count_loss += loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
            optimizer.step()
            # The parameters changed, so re-encode before the next chunk.
            ufeat, ifeat = net.encoder(dataset.train_enc_graph, dataset.user_feature, dataset.movie_feature)
            count_step += 1
        print('train done')

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

        if iter_idx % args.train_log_interval == 0:
            # Mean loss over the optimisation steps actually taken.
            mean_loss = count_loss / max(count_step, 1)
            # Timing only starts after iteration 3, so `dur` may be empty.
            mean_dur = np.average(dur) if dur else float('nan')
            train_loss_logger.log(iter=iter_idx, loss=mean_loss)
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, mean_loss, count_rmse/count_num, mean_dur)
            count_rmse = 1
            count_num = 1

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            precision, recall, fscore, support = evaluate_others(args=args, net=net, dataset=dataset, segment='valid')
            ndcg = evaluate_ndcg(args=args, net=net, dataset=dataset, segment='valid')
            print('ndcg', ndcg, 'precision', precision, 'recall', recall, 'fscore', fscore, 'support', support)
            valid_loss_logger.log(iter=iter_idx, ndcg=ndcg, precision=precision, recall=recall, fscore=fscore, support=support)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if (no_better_valid > args.train_early_stopping_patience
                        and learning_rate <= args.train_min_lr):
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)

    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
def train(proc_id, n_gpus, args, devices, movielens):
    """Train IGMC in one worker process of a (possibly multi-GPU) run.

    Each process takes its own slice of the training ratings.  Process 0 is
    additionally responsible for evaluation, logging and the final summary
    appended to ``log.txt`` in ``args.save_dir``.

    Args:
        proc_id: Index of this worker; also used as its distributed rank.
        n_gpus: Number of workers.  Distributed training (NCCL) is only
            initialised when this is greater than 1.
        args: Run configuration (``seed``, ``n_gpus``, ``hop``,
            ``sample_ratio``, ``max_nodes_per_hop``, ``batch_size``,
            ``num_workers``, ``testing``, ``edge_dropout``, ``train_lr``,
            ``save_dir``, ``valid_log_interval``, ``train_epochs``,
            ``arr_lambda``, ``train_log_interval``, ``train_lr_decay_step``,
            ``train_lr_decay_factor``).
        devices: Sequence mapping worker index to CUDA device id.
        movielens: Loaded dataset exposing the ``train``/``valid``/``test``
            ``*_rating_pairs`` and ``*_rating_values`` plus ``train_graph``.
    """
    # Start up distributed training, if enabled.
    dev_id = devices[proc_id]
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=n_gpus,
                                          rank=proc_id)
    th.cuda.set_device(dev_id)

    # set random seed in each gpu
    th.manual_seed(args.seed)
    if th.cuda.is_available():
        th.cuda.manual_seed_all(args.seed)

    # Split train_dataset and set dataloader: every worker keeps the
    # proc_id-th equally sized chunk of the training ratings.
    chunk_size = len(movielens.train_rating_values) // args.n_gpus
    train_rating_pairs = th.split(th.stack(movielens.train_rating_pairs),
                                  chunk_size, dim=1)[proc_id]
    train_rating_values = th.split(movielens.train_rating_values,
                                   chunk_size, dim=0)[proc_id]

    train_dataset = MovieLensDataset(train_rating_pairs, train_rating_values,
                                     movielens.train_graph, args.hop,
                                     args.sample_ratio, args.max_nodes_per_hop)
    train_loader = th.utils.data.DataLoader(train_dataset,
                                            batch_size=args.batch_size,
                                            shuffle=True,
                                            num_workers=args.num_workers,
                                            collate_fn=collate_movielens)

    if proc_id == 0:
        # Only worker 0 evaluates, on the test or the validation split.
        if args.testing:
            eval_pairs = movielens.test_rating_pairs
            eval_values = movielens.test_rating_values
        else:
            eval_pairs = movielens.valid_rating_pairs
            # The rating *values* are the targets here; the original passed
            # the pairs a second time.
            eval_values = movielens.valid_rating_values
        test_dataset = MovieLensDataset(eval_pairs, eval_values,
                                        movielens.train_graph, args.hop,
                                        args.sample_ratio, args.max_nodes_per_hop)
        test_loader = th.utils.data.DataLoader(test_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.num_workers,
                                               collate_fn=collate_movielens)

    model = IGMC(
        in_feats=(args.hop + 1) * 2,
        latent_dim=[32, 32, 32, 32],
        num_relations=5,  # dataset_base.num_rating,
        num_bases=4,
        regression=True,
        edge_dropout=args.edge_dropout,
        # side_features=args.use_features,
        # n_side_features=n_features,
        # multiply_by=args.multiply_by
    ).to(dev_id)
    if n_gpus > 1:
        model = DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id)
    loss_fn = nn.MSELoss().to(dev_id)
    optimizer = optim.Adam(model.parameters(), lr=args.train_lr, weight_decay=0)

    if proc_id == 0:
        print("Loading network finished ...\n")
        # prepare the logger
        logger = MetricLogger(args.save_dir, args.valid_log_interval)

    best_epoch = 0
    best_rmse = np.inf
    print("Start training ...")
    for epoch_idx in range(1, args.train_epochs + 1):
        if proc_id == 0:
            print('Epoch', epoch_idx)

        train_loss = train_epoch(proc_id, n_gpus, model, loss_fn, optimizer,
                                 args.arr_lambda, train_loader, dev_id,
                                 args.train_log_interval)

        if n_gpus > 1:
            th.distributed.barrier()
        if proc_id == 0:
            test_rmse = evaluate(model, test_loader, dev_id)
            eval_info = {
                'epoch': epoch_idx,
                'train_loss': train_loss,
                'test_rmse': test_rmse,
            }
            print(
                '=== Epoch {}, train loss {:.6f}, test rmse {:.6f} ==='.format(
                    *eval_info.values()))

        # Every worker owns its own optimizer, so the step decay has to be
        # applied on all of them to keep the learning rates identical.
        if epoch_idx % args.train_lr_decay_step == 0:
            for param in optimizer.param_groups:
                param['lr'] = args.train_lr_decay_factor * param['lr']

        if proc_id == 0:
            logger.log(eval_info, model, optimizer)
            if best_rmse > test_rmse:
                best_rmse = test_rmse
                best_epoch = epoch_idx
        # Let the other workers wait for worker 0's evaluation.
        if n_gpus > 1:
            th.distributed.barrier()

    if proc_id == 0:
        eval_info = "Training ends. The best testing rmse is {:.6f} at epoch {}".format(
            best_rmse, best_epoch)
        print(eval_info)
        with open(os.path.join(args.save_dir, 'log.txt'), 'a') as f:
            f.write(eval_info)
def run(proc_id, n_gpus, args, devices, dataset):
    """Run mini-batch (edge-sampled) training in one worker process.

    Builds DGL edge data loaders over the training/validation/test graphs,
    trains ``Net`` with cross-entropy over rating classes and, on worker 0,
    periodically evaluates to drive learning-rate decay and early stopping.
    Metrics go to CSV loggers in ``args.save_dir`` and the wall-clock time is
    appended to ``duration_<save_id>.txt``.

    Args:
        proc_id: Index of this worker; used as its distributed rank.
        n_gpus: Number of workers; NCCL is initialised only when > 1 and the
            CUDA device is selected only when > 0.
        args: Run configuration (``minibatch_size``, ``train_lr``,
            ``train_optimizer``, ``save_dir``, ``save_id``,
            ``train_max_epoch``, ``train_grad_clip``, ``train_log_interval``,
            ``train_valid_interval``, ``train_early_stopping_patience``,
            ``train_min_lr``, ``train_decay_patience``,
            ``train_lr_decay_factor``).
        devices: Sequence mapping worker index to device id.
        dataset: Loaded dataset providing the encoder/decoder graphs and
            ``possible_rating_values``.
    """
    dev_id = devices[proc_id]

    sampler = dgl.dataloading.MultiLayerNeighborSampler([None], return_eids=True)
    dataloader = dgl.dataloading.EdgeDataLoader(
        dataset.train_enc_graph,
        {
            to_etype_name(k): th.arange(
                dataset.train_enc_graph.number_of_edges(etype=to_etype_name(k)))
            for k in dataset.possible_rating_values
        },
        sampler,
        batch_size=args.minibatch_size,
        shuffle=True,
        drop_last=False)

    if proc_id == 0:
        # Only worker 0 evaluates.
        valid_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.valid_dec_graph,
            th.arange(dataset.valid_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.valid_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)
        test_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.test_dec_graph,
            th.arange(dataset.test_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.test_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)

    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        # The rank must lie in [0, world_size); the device id does not when
        # `devices` is anything other than 0..n_gpus-1, so use the worker
        # index instead.
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=n_gpus,
                                          rank=proc_id)
    if n_gpus > 0:
        th.cuda.set_device(dev_id)

    nd_possible_rating_values = \
        th.FloatTensor(dataset.possible_rating_values)
    nd_possible_rating_values = nd_possible_rating_values.to(dev_id)

    start = time.time()
    net = Net(args=args, dev_id=dev_id)
    net = net.to(dev_id)
    if n_gpus > 1:
        net = DistributedDataParallel(net, device_ids=[dev_id], output_device=dev_id)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### declare the loss information
    best_valid_rmse = np.inf
    # Pre-initialised so the final summary cannot hit an unbound name when no
    # validation pass ever ran.
    best_test_rmse = np.inf
    no_better_valid = 0
    best_epoch = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0
    print("Start training ...")
    iter_idx = 1
    logging_str = None

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(
        ['iter', 'rmse'], ['%d', '%.4f'],
        os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(
        ['iter', 'rmse'], ['%d', '%.4f'],
        os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    for epoch in range(1, args.train_max_epoch):
        if epoch == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(
                torch_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, 'net%d.txt' % args.save_id)))
        if epoch > 1:
            t0 = time.time()
        net.train()
        with tqdm.tqdm(dataloader) as tq:
            for input_nodes, pair_graph, blocks in tq:
                head_feat, tail_feat, blocks = load_subtensor(
                    input_nodes, pair_graph, blocks, dataset,
                    dataset.train_enc_graph)
                frontier = blocks[0]
                compact_g = flatten_etypes(pair_graph, dataset, 'train').to(dev_id)
                true_relation_labels = compact_g.edata['label']
                true_relation_ratings = compact_g.edata['rating']

                head_feat = head_feat.to(dev_id)
                tail_feat = tail_feat.to(dev_id)
                frontier = frontier.to(dev_id)

                pred_ratings = net(compact_g, frontier, head_feat, tail_feat,
                                   dataset.possible_rating_values)
                loss = rating_loss_net(pred_ratings,
                                       true_relation_labels.to(dev_id)).mean()
                count_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
                optimizer.step()

                if proc_id == 0 and iter_idx == 1:
                    print("Total #Param of net: %d" % (torch_total_param_num(net)))

                # Expected rating under the predicted class distribution.
                real_pred_ratings = (
                    th.softmax(pred_ratings, dim=1) *
                    nd_possible_rating_values.view(1, -1)).sum(dim=1)
                rmse = ((real_pred_ratings -
                         true_relation_ratings.to(dev_id))**2).sum()
                count_rmse += rmse.item()
                count_num += pred_ratings.shape[0]

                if iter_idx % args.train_log_interval == 0:
                    # iter_idx steps are complete at this point, so it is the
                    # right denominator for both the CSV and the progress bar.
                    mean_loss = count_loss / iter_idx
                    train_loss_logger.log(iter=iter_idx,
                                          loss=mean_loss,
                                          rmse=count_rmse / count_num)
                    tq.set_postfix(
                        {
                            'loss': '{:.4f}'.format(mean_loss),
                            'rmse': '{:.4f}'.format(count_rmse / count_num)
                        },
                        refresh=False)

                iter_idx += 1

        if epoch > 1:
            epoch_time = time.time() - t0
            print("Epoch {} time {}".format(epoch, epoch_time))

        if epoch % args.train_valid_interval == 0:
            if n_gpus > 1:
                th.distributed.barrier()
            if proc_id == 0:
                valid_rmse = evaluate(args=args,
                                      dev_id=dev_id,
                                      net=net,
                                      dataset=dataset,
                                      dataloader=valid_dataloader,
                                      segment='valid')
                valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
                logging_str = 'Val RMSE={:.4f}'.format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_epoch = epoch
                    test_rmse = evaluate(args=args,
                                         dev_id=dev_id,
                                         net=net,
                                         dataset=dataset,
                                         dataloader=test_dataloader,
                                         segment='test')
                    best_test_rmse = test_rmse
                    test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    if (no_better_valid > args.train_early_stopping_patience
                            and learning_rate <= args.train_min_lr):
                        logging.info(
                            "Early stopping threshold reached. Stop training.")
                        # NOTE(review): only worker 0 leaves the loop here;
                        # with n_gpus > 1 the other workers presumably keep
                        # waiting at the next barrier -- confirm.
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(
                            learning_rate * args.train_lr_decay_factor,
                            args.train_min_lr)
                        if new_lr < learning_rate:
                            logging.info("\tChange the LR to %g" % new_lr)
                            learning_rate = new_lr
                            # NOTE(review): this only touches worker 0's
                            # optimizer -- confirm for multi-GPU runs.
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0
                            print("Change the LR to %g" % new_lr)
            # sync on evaluation
            if n_gpus > 1:
                th.distributed.barrier()

        if logging_str is not None:
            print(logging_str)

    if proc_id == 0:
        print(
            'Best epoch Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
            format(best_epoch, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()

    with open(
            os.path.join(args.save_dir, f'duration_{args.save_id:d}.txt'),
            'a') as f:
        print(f'wall: {time.time() - start}')
        f.write(f'wall: {time.time() - start}')
def train(args):
    """Full-graph training with a selectable loss (``CE``/``Hinge``/``MSE``).

    Loads the data set, builds ``Net`` and trains on the whole training graph
    each iteration.  Training loss/RMSE, validation RMSE and test RMSE are
    written to CSV loggers in ``args.save_dir``; validation RMSE drives
    learning-rate decay and early stopping.

    Args:
        args: Run configuration.  Read here: ``data_name``, ``device``,
            ``use_one_hot_fea``, ``gcn_agg_norm_symm``, ``data_test_ratio``,
            ``data_valid_ratio``, ``sample_rate``, ``train_lr``,
            ``train_optimizer``, ``save_dir``, ``save_id``,
            ``train_max_iter``, ``loss_func``, ``ARR``, ``train_grad_clip``,
            ``train_log_interval``, ``train_valid_interval``,
            ``train_early_stopping_patience``, ``train_min_lr``,
            ``train_decay_patience`` and ``train_lr_decay_factor``.

    Raises:
        ValueError: If ``args.loss_func`` is not ``"CE"``, ``"Hinge"`` or
            ``"MSE"``.
    """
    print(args)
    # Fail early with a clear message; previously an unknown loss name only
    # showed up as a NameError on `loss` inside the first iteration.
    if args.loss_func not in ("CE", "Hinge", "MSE"):
        raise ValueError("Unsupported loss_func: %r" % (args.loss_func,))

    dataset = DataSetLoader(args.data_name, args.device,
                            use_one_hot_fea=args.use_one_hot_fea,
                            symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio,
                            valid_ratio=args.data_valid_ratio,
                            sample_rate=args.sample_rate)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss.csv'))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'valid_loss.csv'))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss.csv'))

    ### declare the loss information
    best_valid_rmse = np.inf
    # Pre-initialised so the final summary cannot hit an unbound name when no
    # validation pass ever ran.
    best_test_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0
    # Pre-initialised so a validation pass that happens before the first
    # logging interval can append to it.
    logging_str = ''

    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
    # Validation re-uses the training encoder graph.
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

    print("Start training ...")
    dur = []

    train_rating_pairs, train_rating_values = dataset._generate_pair_value(dataset.train_rating_info)

    # Negative-sampling experiment (currently disabled; the two helpers below
    # are not called in this function).  Original design notes, translated:
    #   - First sample the zero-rated ("negative") items of every user; this
    #     only has to be done once.
    #   - Then, each time, randomly draw some of those samples as rating-0
    #     edges: start from the basic encoder graph and add the sampled edges.
    #   - Sampling once is cheap; for per-iteration sampling keep a table of
    #     all negatives and draw from it, then rebuild train_enc_graph.
    # To enable it, build the table with dataset.negative_all(...), call
    # sample_data(...) and pass its result to update_encode_graph(...).
    def update_encode_graph(dataset, train_rating_pairs, train_rating_values, sampled_data):
        """Rebuild the train/valid encoder graph with the sampled zero edges added."""
        train_rating_pairs_zeros, train_rating_values_zeros = dataset._generate_pair_value_for_zero(
            dataset.train_rating_info, sampled_data)
        train_rating_pairs = (np.append(train_rating_pairs[0], train_rating_pairs_zeros[0]),
                              np.append(train_rating_pairs[1], train_rating_pairs_zeros[1]))
        train_rating_values = np.append(train_rating_values, train_rating_values_zeros)
        dataset.train_enc_graph = dataset._generate_enc_graph(train_rating_pairs, train_rating_values, add_support=True)
        dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
        dataset.valid_enc_graph = dataset.train_enc_graph
        return dataset.train_enc_graph

    def sample_data(interact_status, random_number, sample_rate):
        """Draw ``sample_rate`` negatives per row, seeded with ``random_number``."""
        random.seed(random_number)
        interact_status['negative_samples'] = interact_status['negative_items'].apply(
            lambda x: random.sample(x, sample_rate))
        return interact_status[['user_id', 'negative_items', 'negative_samples']]

    # One seed per iteration for the disabled sampling experiment; still drawn
    # so the global NumPy random stream is consumed exactly as before.
    seed_list = np.random.randint(0, 10000, (args.train_max_iter,))

    # The two-stage mode was switched on after iteration 250 at some point,
    # but the flag was then unconditionally reset, so it is always off.
    Two_Stage = False

    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        net.train()
        pred_ratings, reg_loss, user_out, movie_out, W = net(dataset.train_enc_graph, dataset.train_dec_graph,
                                                             dataset.user_feature, dataset.movie_feature, Two_Stage)

        if args.loss_func == "CE":
            loss = rating_loss_net(pred_ratings, train_gt_labels).mean() + args.ARR * reg_loss
        elif args.loss_func == "Hinge":
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(dim=1)
            # NOTE(review): this compares expected ratings against the class
            # labels, whereas the RMSE below uses train_gt_ratings -- confirm
            # which target is intended.
            gap = (real_pred_ratings - train_gt_labels) ** 2
            hinge_loss = th.where(gap > 1.0, gap*gap, gap).mean()
            loss = hinge_loss
        else:  # "MSE": column 0 of the prediction is regressed onto the rating value
            loss = th.mean((pred_ratings[:, 0] - nd_possible_rating_values[train_gt_labels]) ** 2) + args.ARR * reg_loss

        count_loss += loss.item()
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

        # For "Hinge", real_pred_ratings was already computed with the loss.
        if args.loss_func == "CE":
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(dim=1)
        elif args.loss_func == "MSE":
            real_pred_ratings = pred_ratings[:, 0]
        rmse = ((real_pred_ratings - train_gt_ratings) ** 2).sum()
        count_rmse += rmse.item()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            # iter_idx iterations are complete, so it is the denominator for
            # both the CSV and the console line.
            mean_loss = count_loss / iter_idx
            # Timing only starts after iteration 3, so `dur` may be empty.
            mean_dur = np.average(dur) if dur else float('nan')
            train_loss_logger.log(iter=iter_idx, loss=mean_loss, rmse=count_rmse/count_num)
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, mean_loss, count_rmse/count_num, mean_dur)
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
            logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
            dev_step(args, net, dataset=dataset, segment='test', debug=False)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                # Evaluated again with debug=True for the best model so far.
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test',
                                     debug=True, idx=iter_idx)
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if (no_better_valid > args.train_early_stopping_patience
                        and learning_rate <= args.train_min_lr):
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)

    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
def train(args):
    """Train with sampled negative edges and log recall/NDCG to ``NDCG.csv``.

    Negative items are sampled once per user, added to the encoder graph as
    extra edges, and the net is then trained with cross-entropy plus the
    regulariser returned by the net.  Ranking metrics from ``dev_step`` are
    logged every 10th iteration during the first 100 iterations and on every
    iteration from iteration 500 onwards.

    Args:
        args: Run configuration.  Read here: ``data_name``, ``device``,
            ``use_one_hot_fea``, ``gcn_agg_norm_symm``, ``data_test_ratio``,
            ``data_valid_ratio``, ``sample_rate``, ``train_lr``,
            ``train_optimizer``, ``save_dir``, ``train_max_iter``, ``ARR``
            and ``train_grad_clip``.
    """
    print(args)
    dataset = DataSetLoader(args.data_name,
                            args.device,
                            use_one_hot_fea=args.use_one_hot_fea,
                            symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio,
                            valid_ratio=args.data_valid_ratio,
                            sample_rate=args.sample_rate)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(
        dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels

    ### prepare the logger
    NDCG_logger = MetricLogger(
        ['recall50', 'recall100', 'recall200', 'ndcg50', 'ndcg100', 'ndcg200'],
        ['%.4f', '%.4f', '%.4f', '%.4f', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'NDCG.csv'))

    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
    # Validation re-uses the training encoder graph.
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

    train_m = dataset.train_m
    test_m = dataset.test_m
    tset = dataset.tset

    print("Start training ...")

    train_rating_pairs, train_rating_values = dataset._generate_pair_value(
        dataset.train_rating_info)

    def update_encode_graph(dataset, train_rating_pairs, train_rating_values,
                            sampled_data):
        """Rebuild the train/valid encoder graph with the sampled zero edges added."""
        train_rating_pairs_zeros, train_rating_values_zeros = dataset._generate_pair_value_for_zero(
            dataset.train_rating_info, sampled_data)
        train_rating_pairs = (np.append(train_rating_pairs[0],
                                        train_rating_pairs_zeros[0]),
                              np.append(train_rating_pairs[1],
                                        train_rating_pairs_zeros[1]))
        train_rating_values = np.append(train_rating_values,
                                        train_rating_values_zeros)
        dataset.train_enc_graph = dataset._generate_enc_graph(
            train_rating_pairs, train_rating_values, add_support=True)
        dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
        dataset.valid_enc_graph = dataset.train_enc_graph
        return dataset.train_enc_graph

    def sample_data(interact_status, random_number, sample_rate):
        """Draw ``sample_rate`` negatives per row, seeded with ``random_number``."""
        random.seed(random_number)
        interact_status['negative_samples'] = interact_status[
            'negative_items'].apply(lambda x: random.sample(x, sample_rate))
        return interact_status[[
            'user_id', 'negative_items', 'negative_samples'
        ]]

    # Per-iteration seeds for re-sampling; currently unused because the
    # negatives are sampled only once below, but still drawn so the global
    # NumPy random stream is consumed exactly as before.
    seed_list = np.random.randint(0, 10000, (args.train_max_iter, ))
    Two_Stage = False

    # Sample the negatives once (fixed seed) and add them to the encoder graph.
    negitive_all = dataset.negative_all(dataset.train_rating_info)
    sampled_data = sample_data(negitive_all, random_number=1, sample_rate=99)
    dataset.train_enc_graph = update_encode_graph(dataset, train_rating_pairs,
                                                  train_rating_values,
                                                  sampled_data)
    dataset.valid_enc_graph = dataset.train_enc_graph

    for iter_idx in range(1, args.train_max_iter):
        print("iter:", iter_idx)
        net.train()
        pred_ratings, reg_loss, user_out, movie_out, W = net(
            dataset.train_enc_graph, dataset.train_dec_graph,
            dataset.user_feature, dataset.movie_feature, Two_Stage)
        loss = rating_loss_net(pred_ratings,
                               train_gt_labels).mean() + args.ARR * reg_loss
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()

        # Evaluate every 10th iteration early on, then on every iteration
        # from 500 onwards (the two conditions never overlap).
        early_eval = iter_idx < 100 and iter_idx % 10 == 0
        late_eval = iter_idx >= 500
        if early_eval or late_eval:
            recall50, recall100, recall200, ndcg50, ndcg100, ndcg200 = \
                dev_step(tset, train_m, test_m, net, dataset, args, nd_possible_rating_values)
            # Always log the metrics just computed; the late branch used to
            # re-log the stale values left over from the last early evaluation.
            NDCG_logger.log(recall50=recall50,
                            recall100=recall100,
                            recall200=recall200,
                            ndcg50=ndcg50,
                            ndcg100=ndcg100,
                            ndcg200=ndcg200)

    NDCG_logger.close()
def train(args):
    """Train an IGMC model on MovieLens and report the best evaluation RMSE.

    The function builds the training and evaluation datasets/loaders, trains
    for ``args.train_epochs`` epochs, multiplies the learning rate by
    ``args.train_lr_decay_factor`` every ``args.train_lr_decay_step`` epochs,
    hands every epoch's metrics to ``MetricLogger`` and finally prints and
    appends a one-line summary to ``log.txt`` inside ``args.save_dir``.

    Args:
        args: Namespace-like object. Attributes read here: ``data_name``,
            ``testing``, ``data_test_ratio``, ``data_valid_ratio``, ``hop``,
            ``sample_ratio``, ``max_nodes_per_hop``, ``batch_size``,
            ``num_workers``, ``edge_dropout``, ``device``, ``train_lr``,
            ``save_dir``, ``valid_log_interval``, ``train_epochs``,
            ``arr_lambda``, ``train_log_interval``, ``train_lr_decay_step``
            and ``train_lr_decay_factor``.

    Returns:
        None. Results are reported through stdout, ``MetricLogger`` and
        ``log.txt``.
    """
    ### prepare data and set model
    movielens = MovieLens(args.data_name, testing=args.testing,
                          test_ratio=args.data_test_ratio,
                          valid_ratio=args.data_valid_ratio)

    # Evaluate on the test split when args.testing is set, otherwise on the
    # validation split. Only the selected split's attributes are accessed.
    if args.testing:
        eval_pairs = movielens.test_rating_pairs
        eval_values = movielens.test_rating_values
    else:
        eval_pairs = movielens.valid_rating_pairs
        eval_values = movielens.valid_rating_values

    # Both datasets are built from the training graph with the same
    # extraction settings; only the rating pairs/values differ.
    subgraph_args = (movielens.train_graph, args.hop, args.sample_ratio,
                     args.max_nodes_per_hop)
    test_dataset = MovieLensDataset(eval_pairs, eval_values, *subgraph_args)
    train_dataset = MovieLensDataset(movielens.train_rating_pairs,
                                     movielens.train_rating_values,
                                     *subgraph_args)

    train_loader = th.utils.data.DataLoader(train_dataset,
                                            batch_size=args.batch_size,
                                            shuffle=True,
                                            num_workers=args.num_workers,
                                            collate_fn=collate_movielens)
    test_loader = th.utils.data.DataLoader(test_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=False,
                                           num_workers=args.num_workers,
                                           collate_fn=collate_movielens)

    model = IGMC(
        in_feats=(args.hop + 1) * 2,
        latent_dim=[32, 32, 32, 32],
        num_relations=5,  # movielens.num_rating,
        num_bases=4,
        regression=True,
        edge_dropout=args.edge_dropout,
        # side_features=args.use_features,
        # n_side_features=n_features,
        # multiply_by=args.multiply_by
    ).to(args.device)
    loss_fn = nn.MSELoss().to(args.device)
    optimizer = optim.Adam(model.parameters(), lr=args.train_lr, weight_decay=0)
    print("Loading network finished ...\n")

    ### prepare the logger
    logger = MetricLogger(args.save_dir, args.valid_log_interval)

    best_epoch = 0
    best_rmse = np.inf

    ### declare the loss information
    print("Start training ...")
    for epoch_idx in range(1, args.train_epochs + 1):
        print('Epoch', epoch_idx)

        train_loss = train_epoch(model, loss_fn, optimizer, args.arr_lambda,
                                 train_loader, args.device,
                                 args.train_log_interval)
        test_rmse = evaluate(model, test_loader, args.device)
        eval_info = {
            'epoch': epoch_idx,
            'train_loss': train_loss,
            'test_rmse': test_rmse,
        }
        # Format by key so the banner does not depend on dict value order.
        print('=== Epoch {epoch}, train loss {train_loss:.6f}, '
              'test rmse {test_rmse:.6f} ==='.format(**eval_info))

        # Step decay: scale every param group's learning rate in place.
        if epoch_idx % args.train_lr_decay_step == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] = args.train_lr_decay_factor * param_group['lr']

        # Logged after the decay, so the logger sees the updated optimizer.
        logger.log(eval_info, model, optimizer)
        if best_rmse > test_rmse:
            best_rmse = test_rmse
            best_epoch = epoch_idx

    summary = "Training ends. The best testing rmse is {:.6f} at epoch {}".format(
        best_rmse, best_epoch)
    print(summary)
    with open(os.path.join(args.save_dir, 'log.txt'), 'a') as f:
        # The file is opened in append mode; terminate the line so anything
        # written to log.txt afterwards starts on a fresh line.
        f.write(summary + '\n')