# Imports implied by this script; repo-local helpers (MovieLens, Net,
# MetricLogger, evaluate, params_clip_global_norm, gluon_total_param_num,
# gluon_net_info) come from the example's own modules.
import os
import logging
import numpy as np
import mxnet as mx
from mxnet import gluon


def train(args):
    dataset = MovieLens(args.data_name, args.ctx,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm)
    print("Loading data finished ...\n")
    args.src_key = dataset.name_user
    args.dst_key = dataset.name_movie
    args.src_in_units = dataset.user_feature.shape[1]
    args.dst_in_units = dataset.movie_feature.shape[1]
    args.nratings = dataset.possible_rating_values.size

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
    net.hybridize()
    if args.gen_r_use_classification:
        nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                                ctx=args.ctx, dtype=np.float32)
        rating_loss_net = gluon.loss.SoftmaxCELoss()
    else:
        rating_mean = dataset.train_rating_values.mean()
        rating_std = dataset.train_rating_values.std()
        rating_loss_net = gluon.loss.L2Loss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {'learning_rate': args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_rating_pairs = mx.nd.array(dataset.train_rating_pairs,
                                     ctx=args.ctx, dtype=np.int64)
    train_gt_ratings = mx.nd.array(dataset.train_rating_values,
                                   ctx=args.ctx, dtype=np.float32)

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(
        ['iter', 'rmse'], ['%d', '%.4f'],
        os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(
        ['iter', 'rmse'], ['%d', '%.4f'],
        os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    best_test_rmse = np.inf  # initialized up front so the final print never fails
    no_better_valid = 0
    best_iter = -1
    avg_gnorm = 0
    count_rmse = 0
    count_num = 0
    count_loss = 0

    print("Start training ...")
    for iter_idx in range(1, args.train_max_iter):
        if args.gen_r_use_classification:
            train_gt_label = mx.nd.array(
                np.searchsorted(dataset.possible_rating_values,
                                dataset.train_rating_values),
                ctx=args.ctx, dtype=np.int32)
        with mx.autograd.record():
            pred_ratings = net(dataset.train_graph, train_rating_pairs)
            if args.gen_r_use_classification:
                loss = rating_loss_net(pred_ratings, train_gt_label).mean()
            else:
                loss = rating_loss_net(
                    mx.nd.reshape(pred_ratings, shape=(-1,)),
                    (train_gt_ratings - rating_mean) / rating_std).mean()
            loss.backward()
        count_loss += loss.asscalar()
        gnorm = params_clip_global_norm(net.collect_params(),
                                        args.train_grad_clip, args.ctx)
        avg_gnorm += gnorm
        trainer.step(1.0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(gluon_net_info(net, save_path=os.path.join(
                args.save_dir, 'net%d.txt' % args.save_id)))

        if args.gen_r_use_classification:
            real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                                 nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
            rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
        else:
            rmse = mx.nd.square(pred_ratings.reshape((-1,)) * rating_std +
                                rating_mean - train_gt_ratings).sum()
        count_rmse += rmse.asscalar()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            # use the same denominator in the CSV and the console line
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / iter_idx,
                                  rmse=count_rmse / count_num)
            logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}".format(
                iter_idx, avg_gnorm / args.train_log_interval,
                count_loss / iter_idx, count_rmse / count_num)
            avg_gnorm = 0
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                # net.save_parameters(filename=os.path.join(
                #     args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience \
                        and trainer.learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(trainer.learning_rate * args.train_lr_decay_factor,
                                 args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        no_better_valid = 0

        if iter_idx % args.train_log_interval == 0:
            print(logging_str)

    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
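# --- Illustration (not part of the original script) ---
# When args.gen_r_use_classification is set, the training loop above turns
# class logits over the discrete rating levels into a scalar rating by
# taking the softmax expectation. A minimal NumPy sketch of that decode;
# names here are illustrative, not from the repo.
def expected_rating(logits, possible_rating_values):
    """logits: (N, R) array; possible_rating_values: (R,) array of levels."""
    e = np.exp(logits - logits.max(axis=1, keepdims=True))  # stable softmax
    probs = e / e.sum(axis=1, keepdims=True)
    return probs @ possible_rating_values  # (N,) expected ratings

# e.g. expected_rating(np.zeros((2, 5)), np.arange(1.0, 6.0)) -> [3., 3.]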
    print('Best epoch Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_epoch, best_valid_rmse, best_test_rmse))


if __name__ == '__main__':
    args = config()
    devices = list(map(int, args.gpu.split(',')))
    n_gpus = len(devices)
    # For sampling-based GCMC, we require each node to have its own features.
    # Otherwise (when node_id is the feature), the model cannot scale.
    dataset = MovieLens(args.data_name, 'cpu',
                        mix_cpu_gpu=args.mix_cpu_gpu,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio,
                        valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")
    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values
    # cpu
    if devices[0] == -1:
        run(0, 0, args, ['cpu'], dataset)
    # gpu
    elif n_gpus == 1:
        run(0, n_gpus, args, devices, dataset)
def train(args):
    print(args)
    dataset = MovieLens(
        args.data_name,
        args.ctx,
        use_one_hot_fea=args.use_one_hot_fea,
        symm=args.gcn_agg_norm_symm,
        test_ratio=args.data_test_ratio,
        valid_ratio=args.data_valid_ratio,
    )
    print("Loading data finished ...\n")
    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type="in"), ctx=args.ctx)
    net.hybridize()
    nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                            ctx=args.ctx, dtype=np.float32)
    rating_loss_net = gluon.loss.SoftmaxCELoss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {"learning_rate": args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ["iter", "loss", "rmse"],
        ["%d", "%.4f", "%.4f"],
        os.path.join(args.save_dir, "train_loss%d.csv" % args.save_id),
    )
    valid_loss_logger = MetricLogger(
        ["iter", "rmse"],
        ["%d", "%.4f"],
        os.path.join(args.save_dir, "valid_loss%d.csv" % args.save_id),
    )
    test_loss_logger = MetricLogger(
        ["iter", "rmse"],
        ["%d", "%.4f"],
        os.path.join(args.save_dir, "test_loss%d.csv" % args.save_id),
    )

    ### declare the loss information
    best_valid_rmse = np.inf
    best_test_rmse = np.inf  # initialized up front so the final print never fails
    no_better_valid = 0
    best_iter = -1
    avg_gnorm = 0
    count_rmse = 0
    count_num = 0
    count_loss = 0

    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        with mx.autograd.record():
            pred_ratings = net(
                dataset.train_enc_graph,
                dataset.train_dec_graph,
                dataset.user_feature,
                dataset.movie_feature,
            )
            loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
            loss.backward()
        count_loss += loss.asscalar()
        gnorm = params_clip_global_norm(net.collect_params(),
                                        args.train_grad_clip, args.ctx)
        avg_gnorm += gnorm
        trainer.step(1.0)
        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(gluon_net_info(net, save_path=os.path.join(
                args.save_dir, "net%d.txt" % args.save_id)))

        real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                             nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
        rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
        count_rmse += rmse.asscalar()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            # use the same denominator in the CSV and the console line
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / iter_idx,
                                  rmse=count_rmse / count_num)
            logging_str = ("Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}, "
                           "time={:.4f}".format(
                               iter_idx,
                               avg_gnorm / args.train_log_interval,
                               count_loss / iter_idx,
                               count_rmse / count_num,
                               np.average(dur),
                           ))
            avg_gnorm = 0
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset,
                                  segment="valid")
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ",\tVal RMSE={:.4f}".format(valid_rmse)
            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                net.save_parameters(filename=os.path.join(
                    args.save_dir,
                    "best_valid_net{}.params".format(args.save_id)))
                test_rmse = evaluate(args=args, net=net, dataset=dataset,
                                     segment="test")
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ", Test RMSE={:.4f}".format(test_rmse)
            else:
                no_better_valid += 1
                if (no_better_valid > args.train_early_stopping_patience
                        and trainer.learning_rate <= args.train_min_lr):
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(trainer.learning_rate * args.train_lr_decay_factor,
                                 args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        no_better_valid = 0

        if iter_idx % args.train_log_interval == 0:
            print(logging_str)

    print("Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}".format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
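# --- Illustration (not part of the original script) ---
# params_clip_global_norm is a repo-local helper; a plausible minimal
# equivalent built on MXNet's own gluon.utils.clip_global_norm is sketched
# below (an assumption, not the repo's actual implementation). It rescales
# all gradients in place so their joint L2 norm does not exceed `clip`,
# and returns the pre-clipping norm for logging, matching how the loops
# above report `gnorm`.
def clip_gradients(params, clip, ctx):
    grads = [p.grad(ctx) for p in params.values() if p.grad_req != 'null']
    return gluon.utils.clip_global_norm(grads, clip)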
# Imports implied by this PyTorch variant; repo-local helpers (MovieLens,
# Net, MetricLogger, evaluate, get_optimizer, torch_total_param_num,
# torch_net_info) come from the example's own modules.
import os
import time
import logging
import numpy as np
import torch as th
import torch.nn as nn


def train(args):
    print(args)
    dataset = MovieLens(args.data_name, args.device,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio,
                        valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")
    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(
        dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(
        ['iter', 'rmse'], ['%d', '%.4f'],
        os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(
        ['iter', 'rmse'], ['%d', '%.4f'],
        os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    best_test_rmse = np.inf  # initialized up front so the final print never fails
    no_better_valid = 0
    best_iter = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0

    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        net.train()
        pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
                           dataset.user_feature, dataset.movie_feature)
        loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
        count_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()
        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(torch_net_info(net, save_path=os.path.join(
                args.save_dir, 'net%d.txt' % args.save_id)))

        real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                             nd_possible_rating_values.view(1, -1)).sum(dim=1)
        rmse = ((real_pred_ratings - train_gt_ratings) ** 2).sum()
        count_rmse += rmse.item()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            # use the same denominator in the CSV and the console line
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / iter_idx,
                                  rmse=count_rmse / count_num)
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, count_loss / iter_idx, count_rmse / count_num,
                np.average(dur))
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset,
                                  segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                test_rmse = evaluate(args=args, net=net, dataset=dataset,
                                     segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience \
                        and learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor,
                                 args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0

        if iter_idx % args.train_log_interval == 0:
            print(logging_str)

    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
def train(args):
    print(args)
    # hoisted out of the training loop
    from tqdm import tqdm

    if args.data_name == 'jukebox':
        dataset = JukeboxDataset('dataset/listen_count.txt')
    else:
        dataset = MovieLens(args.data_name, args.device,
                            use_one_hot_fea=args.use_one_hot_fea,
                            symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio,
                            valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")
    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(
        dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.MSELoss()  # kept from the original; unused by the BPR loss below
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger (field and format lists now have matching lengths)
    train_loss_logger = MetricLogger(
        ['iter', 'loss'], ['%d', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(
        ['iter', 'ndcg', 'precision', 'recall', 'fscore', 'support'],
        ['%d', '%.4f', '%s', '%s', '%s', '%s'],
        os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(
        ['iter'], ['%d'],
        os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    best_test_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    count_rmse = 1
    count_num = 1
    count_loss = 0
    count_step = 0

    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

    def batch(iterable, n=1):
        current_batch = []
        for item in iterable:
            current_batch.append(item)
            if len(current_batch) == n:
                yield current_batch
                current_batch = []
        if current_batch:
            yield current_batch

    batches = []
    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        net.train()
        unique_item_list = dataset.train['item_id'].unique().tolist()
        ufeat, ifeat = net.encoder(dataset.train_enc_graph,
                                   dataset.user_feature, dataset.movie_feature)
        # Build (user, positive item, negative item) triplets once, on the
        # first iteration; each negative is sampled uniformly from the items
        # the user has not interacted with.
        if iter_idx == 1:
            for row in tqdm(list(dataset.train.itertuples())):
                user, item, rating = row.user_id, row.item_id, row.rating
                userid = dataset.global_user_id_map[user]
                observed = dataset.train[
                    dataset.train['user_id'] == user]['item_id'].unique().tolist()
                negatives = set()
                while len(negatives) < 1:
                    sample = random.choice(unique_item_list)
                    if sample not in observed:
                        negatives.add(sample)
                batches.append((userid,
                                dataset.global_item_id_map[item],
                                dataset.global_item_id_map[sample]))
        for bt in tqdm(list(batch(batches, 2 ** 14))):
            uidfeat = ufeat[[e[0] for e in bt]]
            posfeat = ifeat[[e[1] for e in bt]]
            negfeat = ifeat[[e[2] for e in bt]]
            # Note: these score full batch-by-batch matrices; only the
            # diagonal corresponds to the sampled (user, item) pairs.
            pos_scores = uidfeat @ net.decoder.Q @ posfeat.T
            neg_scores = uidfeat @ net.decoder.Q @ negfeat.T
            lmbd = 1e-5
            # BPR-style objective: push positive scores up and rank them
            # above the sampled negatives.
            mf_loss = -nn.BCELoss()(th.sigmoid(pos_scores),
                                    th.ones_like(pos_scores)) \
                      + nn.LogSigmoid()(pos_scores - neg_scores).mean()
            mf_loss = -1 * mf_loss
            regularizer = ((th.norm(uidfeat, dim=1) ** 2).mean()
                           + (th.norm(posfeat, dim=1) ** 2).mean()
                           + (th.norm(negfeat, dim=1) ** 2).mean()
                           + th.norm(net.decoder.Q))
            emb_loss = lmbd * regularizer
            print('mf_loss', mf_loss)
            print('emb_loss', emb_loss)
            optimizer.zero_grad()
            loss = mf_loss + emb_loss
            count_loss += loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
            optimizer.step()
            # Re-encode so the next minibatch scores use the updated weights.
            ufeat, ifeat = net.encoder(dataset.train_enc_graph,
                                       dataset.user_feature,
                                       dataset.movie_feature)
            count_step += 1
        print('train done')
        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(torch_net_info(net, save_path=os.path.join(
                args.save_dir, 'net%d.txt' % args.save_id)))

        if iter_idx % args.train_log_interval == 0:
            train_loss_logger.log(iter=iter_idx,
                                  loss=count_loss / (count_step + 1))
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, count_loss / (count_step + 1),
                count_rmse / count_num, np.average(dur))
            count_rmse = 1
            count_num = 1

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset,
                                  segment='valid')
            precision, recall, fscore, support = evaluate_others(
                args=args, net=net, dataset=dataset, segment='valid')
            ndcg = evaluate_ndcg(args=args, net=net, dataset=dataset,
                                 segment='valid')
            print('ndcg', ndcg, 'precision', precision, 'recall', recall,
                  'fscore', fscore, 'support', support)
            valid_loss_logger.log(iter=iter_idx, ndcg=ndcg, precision=precision,
                                  recall=recall, fscore=fscore, support=support)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                test_rmse = evaluate(args=args, net=net, dataset=dataset,
                                     segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience \
                        and learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor,
                                 args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0

        if iter_idx % args.train_log_interval == 0:
            print(logging_str)

    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
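# --- Illustration (not part of the original script) ---
# A minimal per-pair BPR-style objective like the one assembled above,
# written elementwise for clarity (the loop above scores full B x B
# matrices; here each user is matched only with its own sampled positive
# and negative item). Names are illustrative.
import torch.nn.functional as F

def bpr_loss(user_emb, pos_emb, neg_emb, Q, lmbd=1e-5):
    pos = (user_emb @ Q * pos_emb).sum(dim=1)   # scores of observed items
    neg = (user_emb @ Q * neg_emb).sum(dim=1)   # scores of sampled negatives
    mf = -F.logsigmoid(pos - neg).mean()        # rank positives above negatives
    reg = lmbd * ((th.norm(user_emb, dim=1) ** 2).mean()
                  + (th.norm(pos_emb, dim=1) ** 2).mean()
                  + (th.norm(neg_emb, dim=1) ** 2).mean())
    return mf + reg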
def train(args):
    print(args)
    dataset = MovieLens(args.data_name, args.ctx,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio,
                        valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")
    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
    net.hybridize()
    rating_loss_net = gluon.loss.SoftmaxCELoss()
    rating_loss_net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer,
                            {'learning_rate': args.train_lr})
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'idx', 'loss', 'rmse'], ['%d', '%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(
        ['iter', 'rmse'], ['%d', '%.4f'],
        os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(
        ['iter', 'rmse'], ['%d', '%.4f'],
        os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_rmse = np.inf
    best_test_rmse = np.inf  # initialized up front so the final print never fails
    no_better_valid = 0
    best_iter = -1

    enc_graph = dataset.train_enc_graph
    nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values,
                                            ctx=args.ctx, dtype=np.float32)
    # Lookup tables mapping global node ids to their positions inside the
    # sampled subgraphs.
    g_user_fea = mx.nd.zeros((dataset.num_user,))
    g_movie_fea = mx.nd.zeros((dataset.num_movie,))
    train_truths = dataset.train_truths
    train_labels = dataset.train_labels

    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:
            t0 = time.time()
        num_edges = dataset.train_truths.shape[0]
        seed = mx.nd.arange(num_edges, dtype='int64')
        edges = mx.nd.shuffle(seed)
        # each iteration will go through all edges
        for sample_idx in range(0, (num_edges + args.minibatch_size - 1) //
                                args.minibatch_size):
            edge_ids = edges[sample_idx * args.minibatch_size:
                             (sample_idx + 1) * args.minibatch_size
                             if (sample_idx + 1) * args.minibatch_size < num_edges
                             else num_edges]
            head_ids, tail_ids = dataset.train_dec_graph.find_edges(
                edge_ids.asnumpy())
            head_subgraphs = {}
            tail_subgraphs = {}
            head_node_ids = np.unique(head_ids.asnumpy())
            tail_node_ids = np.unique(tail_ids.asnumpy())
            # Collect, per rating relation, the in-edges of the minibatch
            # nodes, then materialize the two encoder subgraphs.
            for i, _ in enumerate(args.rating_vals):
                t = enc_graph.canonical_etypes[i * 2]
                rev_t = enc_graph.canonical_etypes[i * 2 + 1]
                head_in_edges = enc_graph.in_edges(head_node_ids, 'eid',
                                                   etype=rev_t)
                tail_in_edges = enc_graph.in_edges(tail_node_ids, 'eid',
                                                   etype=t)
                if head_in_edges.shape[0] > 0:
                    head_subgraphs[rev_t] = head_in_edges
                if tail_in_edges.shape[0] > 0:
                    tail_subgraphs[t] = tail_in_edges
            head_subgraph = enc_graph.edge_subgraph(head_subgraphs,
                                                    preserve_nodes=True)
            tail_subgraph = enc_graph.edge_subgraph(tail_subgraphs,
                                                    preserve_nodes=True)
            edge_ids = edge_ids.as_in_context(args.ctx)
            true_relation_ratings = train_truths[edge_ids]
            true_relation_labels = train_labels[edge_ids]
            head_NID = head_subgraph.nodes['user'].data[dgl.NID]
            tail_NID = tail_subgraph.nodes['movie'].data[dgl.NID]
            g_user_fea[head_NID] = mx.nd.arange(head_NID.shape[0], dtype='int32')
            g_movie_fea[tail_NID] = mx.nd.arange(tail_NID.shape[0], dtype='int32')
            true_head_idx = g_user_fea[head_ids].as_in_context(args.ctx)
            true_tail_idx = g_movie_fea[tail_ids].as_in_context(args.ctx)
            with mx.autograd.record():
                pred_ratings = net(head_subgraph, tail_subgraph,
                                   true_head_idx, true_tail_idx)
                loss = rating_loss_net(pred_ratings, true_relation_labels).mean()
                loss.backward()
            gnorm = params_clip_global_norm(net.collect_params(),
                                            args.train_grad_clip, args.ctx)
            trainer.step(1.0, ignore_stale_grad=True)
            real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
                                 nd_possible_rating_values.reshape((1, -1))
                                 ).sum(axis=1)
            rmse = mx.nd.square(real_pred_ratings -
                                true_relation_ratings).mean().asscalar()
            rmse = np.sqrt(rmse)
            loss = loss.asscalar()
            if sample_idx % 100 == 0:
                train_loss_logger.log(iter=iter_idx, idx=sample_idx,
                                      loss=loss, rmse=rmse)
                print("Iter={}, sample_idx={}, gnorm={:.3f}, loss={:.4f}, "
                      "rmse={:.4f}".format(iter_idx, sample_idx, gnorm,
                                           loss, rmse))
            gc.collect()
        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
            print(gluon_net_info(net, save_path=os.path.join(
                args.save_dir, 'net%d.txt' % args.save_id)))

        if iter_idx % args.train_log_interval == 0:
            logging_str = "Iter={}, time={:.4f}".format(iter_idx, np.average(dur))

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args, net=net, dataset=dataset,
                                  segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                # net.save_parameters(filename=os.path.join(
                #     args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
                test_rmse = evaluate(args=args, net=net, dataset=dataset,
                                     segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience \
                        and trainer.learning_rate <= args.train_min_lr:
                    logging.info("Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(trainer.learning_rate * args.train_lr_decay_factor,
                                 args.train_min_lr)
                    if new_lr < trainer.learning_rate:
                        logging.info("\tChange the LR to %g" % new_lr)
                        trainer.set_learning_rate(new_lr)
                        no_better_valid = 0

        if iter_idx % args.train_log_interval == 0:
            print(logging_str)

    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
        best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
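# --- Illustration (not part of the original script) ---
# The minibatch loop above iterates shuffled edge ids in fixed-size slices.
# A minimal standalone sketch of that batching pattern with NumPy; names
# are illustrative.
def edge_batches(num_edges, batch_size, rng=np.random):
    order = rng.permutation(num_edges)
    for start in range(0, num_edges, batch_size):
        yield order[start:start + batch_size]

# e.g. for eids in edge_batches(100000, 8192): train_on(eids)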
    cmd_input = 'python3 ' + ' '.join(sys.argv)
    with open(os.path.join(args.save_dir, 'cmd_input.txt'), 'a') as f:
        f.write(cmd_input)
        f.write("\n")
    print('Command line input: ' + cmd_input + ' is saved.')
    return args


if __name__ == '__main__':
    args = config()
    random.seed(args.seed)
    np.random.seed(args.seed)
    movielens = MovieLens(args.data_name, testing=args.testing,
                          test_ratio=args.data_test_ratio,
                          valid_ratio=args.data_valid_ratio)
    if args.n_gpus == 1:
        train(0, args.n_gpus, args, args.devices, movielens)
    else:
        procs = []
        for proc_id in range(args.n_gpus):
            p = mp.Process(target=train,
                           args=(proc_id, args.n_gpus, args,
                                 args.devices, movielens))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
if config.data == 'CDs':
    from data import Amazon
    data_set = Amazon.CDs()
elif config.data == 'Books':
    from data import Amazon
    data_set = Amazon.Books()
elif config.data == 'Children':
    from data import GoodReads
    data_set = GoodReads.Children()
elif config.data == 'Comics':
    from data import GoodReads
    data_set = GoodReads.Comics()
elif config.data == 'ML20M':
    from data import MovieLens
    data_set = MovieLens.ML20M()
elif config.data == 'ML1M':
    from data import MovieLens
    data_set = MovieLens.ML1M()

# generate datasets in the 80-20-CUT setting
train_set, val_set, train_val_set, test_set, num_users, num_items = \
    data_set.generate_dataset(index_shift=1)

# generate datasets in the 3-LOS setting
if config.setting == 'LOS':
    assert len(train_set) == len(val_set) and len(test_set) == len(train_set)
    for i in range(len(train_set)):
        user = train_set[i] + val_set[i] + test_set[i]
def train(args):
    ### prepare data and set model
    movielens = MovieLens(args.data_name, testing=args.testing,
                          test_ratio=args.data_test_ratio,
                          valid_ratio=args.data_valid_ratio)
    if args.testing:
        test_dataset = MovieLensDataset(
            movielens.test_rating_pairs, movielens.test_rating_values,
            movielens.train_graph, args.hop, args.sample_ratio,
            args.max_nodes_per_hop)
    else:
        test_dataset = MovieLensDataset(
            movielens.valid_rating_pairs, movielens.valid_rating_values,
            movielens.train_graph, args.hop, args.sample_ratio,
            args.max_nodes_per_hop)
    train_dataset = MovieLensDataset(
        movielens.train_rating_pairs, movielens.train_rating_values,
        movielens.train_graph, args.hop, args.sample_ratio,
        args.max_nodes_per_hop)
    train_loader = th.utils.data.DataLoader(train_dataset,
                                            batch_size=args.batch_size,
                                            shuffle=True,
                                            num_workers=args.num_workers,
                                            collate_fn=collate_movielens)
    test_loader = th.utils.data.DataLoader(test_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=False,
                                           num_workers=args.num_workers,
                                           collate_fn=collate_movielens)
    model = IGMC(
        in_feats=(args.hop + 1) * 2,
        latent_dim=[32, 32, 32, 32],
        num_relations=5,  # movielens.num_rating,
        num_bases=4,
        regression=True,
        edge_dropout=args.edge_dropout,
        # side_features=args.use_features,
        # n_side_features=n_features,
        # multiply_by=args.multiply_by
    ).to(args.device)
    loss_fn = nn.MSELoss().to(args.device)
    optimizer = optim.Adam(model.parameters(), lr=args.train_lr, weight_decay=0)
    print("Loading network finished ...\n")

    ### prepare the logger
    logger = MetricLogger(args.save_dir, args.valid_log_interval)
    best_epoch = 0
    best_rmse = np.inf

    ### declare the loss information
    print("Start training ...")
    for epoch_idx in range(1, args.train_epochs + 1):
        print('Epoch', epoch_idx)
        train_loss = train_epoch(model, loss_fn, optimizer, args.arr_lambda,
                                 train_loader, args.device,
                                 args.train_log_interval)
        test_rmse = evaluate(model, test_loader, args.device)
        eval_info = {
            'epoch': epoch_idx,
            'train_loss': train_loss,
            'test_rmse': test_rmse,
        }
        print('=== Epoch {}, train loss {:.6f}, test rmse {:.6f} ==='.format(
            *eval_info.values()))
        if epoch_idx % args.train_lr_decay_step == 0:
            for param in optimizer.param_groups:
                param['lr'] = args.train_lr_decay_factor * param['lr']
        logger.log(eval_info, model, optimizer)
        if best_rmse > test_rmse:
            best_rmse = test_rmse
            best_epoch = epoch_idx
    eval_info = "Training ends. The best testing rmse is {:.6f} at epoch {}".format(
        best_rmse, best_epoch)
    print(eval_info)
    with open(os.path.join(args.save_dir, 'log.txt'), 'a') as f:
        f.write(eval_info)
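# --- Illustration (not part of the original script) ---
# The in-loop decay above (multiply the LR by train_lr_decay_factor every
# train_lr_decay_step epochs) matches PyTorch's StepLR; a sketch of the
# equivalent wiring, assuming the same args namespace:
def make_step_scheduler(optimizer, args):
    return optim.lr_scheduler.StepLR(optimizer,
                                     step_size=args.train_lr_decay_step,
                                     gamma=args.train_lr_decay_factor)

# call scheduler.step() once per epoch, after the optimizer updates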
def train(args):
    dataset = MovieLens(args.data_name, args.ctx,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio,
                        valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    graph = dataset.train_enc_graph
    msg_in_unit = args.gcn_agg_units // len(dataset.possible_rating_values)
    movie_in_unit = dataset.movie_feature_shape[1]
    users_in_unit = dataset.user_feature_shape[1]
    print((movie_in_unit, users_in_unit, msg_in_unit))
    num_u = graph.number_of_nodes('user')
    dgl.kernel.tvm_enabled = args.tvm != 0
    print("TVM enabled?", dgl.kernel.tvm_enabled)

    def get_copy_reduce():
        allfeat = [mx.ndarray.random.randn(movie_in_unit, msg_in_unit,
                                           dtype=np.float32)
                   for i in dataset.possible_rating_values]

        def do_copy_reduce():
            funcs = OrderedDict()
            for i, rating in enumerate(dataset.possible_rating_values):
                rating = str(rating)
                graph.nodes['movie'].data['h%d' % i] = allfeat[i]
                funcs['rev-%s' % rating] = (fn.copy_u('h%d' % i, 'm'),
                                            fn.sum('m', 'h'))
            # message passing
            graph.multi_update_all(funcs, "stack")
            return graph.nodes['user'].data.pop('h').reshape(num_u, -1)

        return do_copy_reduce

    # shape hints from a run: (943, 75) (1682, 75) (72000,) (72000,) (943, 75)
    def get_copy_reduce_bwd():
        x_mx = mx.ndarray.random.randn(movie_in_unit, msg_in_unit,
                                       dtype=np.float32)
        X = zerocopy_to_dgl_ndarray(x_mx)
        import dgl
        out = dgl.ndarray.empty((users_in_unit, msg_in_unit))
        grad_out = zerocopy_to_dgl_ndarray(
            mx.ndarray.random.randn(users_in_unit, msg_in_unit,
                                    dtype=np.float32))
        outgrad_x = mx.ndarray.zeros((movie_in_unit, msg_in_unit),
                                     dtype=np.float32)
        grad_x = zerocopy_to_dgl_ndarray_for_write(outgrad_x)
        etid = graph.get_etype_id('rev-1')
        stid, dtid = getattr(graph, "_graph").metagraph.find_edge(etid)
        gidx = AdaptedHeteroGraph(graph, stid, dtid,
                                  etid).get_immutable_gidx(MyContext())
        print(grad_out.shape)

        def do_copy_reduce_bwd():
            dgl.kernel.backward_copy_reduce("sum", gidx, 0, X, out,
                                            grad_out, grad_x)
            return outgrad_x

        return do_copy_reduce_bwd

    def get_binary_op_dot_bwd(islhs):
        dot_lhs = mx.ndarray.random.randn(users_in_unit, 75, dtype=np.float32)
        dot_rhs = mx.ndarray.random.randn(movie_in_unit, 75, dtype=np.float32)
        dot_out = mx.ndarray.random.randn(45450, dtype=np.float32)
        dot_outgrad = mx.ndarray.random.randn(45450, dtype=np.float32)
        dot_lhsgrad = mx.ndarray.zeros((users_in_unit, 75), dtype=np.float32)
        dot_rhsgrad = mx.ndarray.zeros((movie_in_unit, 75), dtype=np.float32)
        import dgl
        A = zerocopy_to_dgl_ndarray(dot_lhs)
        B = zerocopy_to_dgl_ndarray(dot_rhs)
        out = zerocopy_to_dgl_ndarray(dot_out)
        grad_out = zerocopy_to_dgl_ndarray(dot_outgrad)
        G = graph.local_var()
        etid = 0
        stid, dtid = getattr(G, "_graph").metagraph.find_edge(etid)
        gidx = AdaptedHeteroGraph(graph, stid, dtid,
                                  etid).get_immutable_gidx(MyContext())

        def do_binary_op_dot_bwd():
            if islhs:
                grad_A = zerocopy_to_dgl_ndarray_for_write(dot_lhsgrad)
                dgl.kernel.backward_lhs_binary_op_reduce(
                    "none", "dot", gidx, 0, 1, A, B, out, grad_out, grad_A)
                return dot_lhsgrad
            else:
                grad_B = zerocopy_to_dgl_ndarray_for_write(dot_rhsgrad)
                dgl.kernel.backward_rhs_binary_op_reduce(
                    "none", "dot", gidx, 0, 1, A, B, out, grad_out, grad_B)
                return dot_rhsgrad

        return do_binary_op_dot_bwd

    workloads = {
        'copyreduce': (get_copy_reduce, 100),
        'copyreduce_bwd': (get_copy_reduce_bwd, 10000),
        'binary_dot_bwd_lhs': (lambda: get_binary_op_dot_bwd(True), 10000),
        'binary_dot_bwd_rhs': (lambda: get_binary_op_dot_bwd(False), 10000),
    }
    workload = workloads[args.workload][0]
    times = workloads[args.workload][1]
    if args.mode == "save":
        ret = workload()()
        mx.nd.save(args.workload + ".mxnd", ret)
    elif args.mode == "compare":
        r = workload()()
        loaded = mx.nd.load(args.workload + ".mxnd")[0]
        print(loaded)
        print(r.shape, loaded.shape)
        for idx, row in enumerate(r):
            lrow = loaded[idx]
            for j in range(len(row)):
                rv = row[j].asscalar()   # renamed from `r`/`l` to avoid
                lv = lrow[j].asscalar()  # shadowing the result matrix
                if abs((rv - lv) / (rv + lv + 1e-11)) > 1e-3:
                    print(idx, j, rv, lv)
    else:
        workload = workload()
        # warm up before timing
        for i in range(3):
            workload()
        t0 = time.time()
        for i in range(times):
            workload()
        print(time.time() - t0)
    print("DONE")
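# --- Illustration (not part of the original script) ---
# The element-by-element comparison loop above amounts to a relative-
# tolerance check. A compact NumPy sketch (assumption: same 1e-3 tolerance;
# call .asnumpy() on MXNet NDArrays before passing them in):
def report_mismatches(a, b, rtol=1e-3):
    a, b = np.asarray(a), np.asarray(b)
    bad = np.argwhere(np.abs((a - b) / (a + b + 1e-11)) > rtol)
    for idx in bad:
        print(tuple(idx), a[tuple(idx)], b[tuple(idx)])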
            sample_ratio=self.sample_ratio,
            max_nodes_per_hop=self.max_nodes_per_hop)
        return subgraph, g_label


def collate_movielens(data):
    g_list, label_list = map(list, zip(*data))
    g = dgl.batch_hetero(g_list)
    g_label = th.stack(label_list)
    return g, g_label


if __name__ == "__main__":
    from data import MovieLens

    movielens = MovieLens("ml-100k", testing=True)
    train_dataset = MovieLensDataset(movielens.train_rating_pairs,
                                     movielens.train_rating_values,
                                     movielens.train_graph,
                                     hop=1, sample_ratio=1.0,
                                     max_nodes_per_hop=200)
    train_loader = th.utils.data.DataLoader(train_dataset, batch_size=4,
                                            shuffle=True, num_workers=0,
                                            collate_fn=collate_movielens)
    batch = next(iter(train_loader))
    inputs = batch[0].to(th.device('cuda:0'))