def main():
    # Note: The run start is in convert.py
    exp = Experiment(__file__)
    args = exp.get_arguments(parse_args(), show=True)
    device = exp.get_device()
    chrono = exp.chrono()

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    tmp = os.environ['TEMP_DIRECTORY']
    run_dir = "{}/run/neumf/{}".format(tmp, config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if run_dir != '':
        os.makedirs(run_dir, exist_ok=True)
    utils.save_config(config, run_dir)

    # Load Data
    # ------------------------------------------------------------------------
    print('Loading data')
    with chrono.time('loading_data', skip_obs=0):
        t1 = time.time()
        train_dataset = CFTrainDataset(
            os.path.join(args.data, TRAIN_RATINGS_FILENAME),
            args.negative_samples)

        # mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)
        # mlperf_log.ncf_print(key=mlperf_log.INPUT_ORDER)  # set shuffle=True in DataLoader
        train_dataloader = torch.utils.data.DataLoader(
            dataset=train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            pin_memory=True)

        nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
        print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d' %
              (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz))
    # ------------------------------------------------------------------------

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers]).to(device)
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    # mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8
    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(beta1, beta2),
                                 lr=args.learning_rate,
                                 eps=epsilon)

    # mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)
    criterion = nn.BCEWithLogitsLoss().to(device)

    model.train()
    for epoch in range(args.repeat):
        losses = utils.AverageMeter()

        with chrono.time('train') as t:
            for batch_index, (user, item, label) in enumerate(train_dataloader):
                if batch_index > args.number:
                    break

                user = torch.autograd.Variable(user, requires_grad=False).to(device)
                item = torch.autograd.Variable(item, requires_grad=False).to(device)
                label = torch.autograd.Variable(label, requires_grad=False).to(device)

                outputs = model(user, item)
                loss = criterion(outputs, label)
                exp.log_batch_loss(loss.item())
                losses.update(loss.item(), user.size(0))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        exp.log_epoch_loss(losses.sum)

        # Save stats to file
        exp.show_eta(epoch, t)

    exp.report()
def main():
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            use_cuda=use_cuda,
                            processes=args.processes)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    for epoch in range(args.epochs):
        model.train()
        losses = utils.AverageMeter()
        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        length = len(loader)
        if length < 101:
            print('Exiting, cannot profile the required 100 iterations. '
                  'Please re-run with a smaller batch size.')
            cuda.profile_stop()
            exit()
        for batch_index, (user, item, label) in enumerate(loader):
            # Profile 100 iterations starting from the middle of the first epoch.
            if batch_index == length // 2 and epoch == 0:
                print('Starting profiling for 100 iterations.')
                cuda.profile_start()
            if batch_index == length // 2 + 100 and epoch == 0:
                print('Profiling completed, stopping profiling and '
                      'continuing training.')
                cuda.profile_stop()

            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save stats to file
            description = ('Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                  epoch=epoch,
                  K=args.topk,
                  hit_rate=np.mean(hits),
                  ndcg=np.mean(ndcgs),
                  train_time=train_time,
                  val_time=val_time))

        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                return 0
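# The `cuda.profile_start()` / `cuda.profile_stop()` calls above come from a
# helper module that is not included in this section. A minimal stand-in,
# assuming the helper simply toggles the CUDA profiler so that nvprof/Nsight
# only records the 100 marked iterations, could look like this (the function
# names follow the usage above; the body is an assumption):

import torch.cuda.profiler as _profiler


def profile_start():
    # Emits cudaProfilerStart(); the external profiler begins capturing here.
    _profiler.start()


def profile_stop():
    # Emits cudaProfilerStop(); capture ends here.
    _profiler.stop()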
def main():
    # Note: The run start is in convert.py
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    mlperf_log.ncf_print(
        key=mlperf_log.INPUT_ORDER)  # set shuffle=True in DataLoader
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LEARN_RATE,
                         value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=epsilon)
    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(beta1, beta2),
                                 lr=args.learning_rate,
                                 eps=epsilon)

    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            use_cuda=use_cuda,
                            processes=args.processes)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    success = False
    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        model.train()
        losses = utils.AverageMeter()

        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=train_dataset.nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)

        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, label) in enumerate(loader):
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                # Asynchronous host-to-device copies (the DataLoader uses
                # pinned memory).
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save stats to file
            description = ('Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY,
                             value={
                                 "epoch": epoch,
                                 "value": float(np.mean(hits))
                             })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_TARGET, value=args.threshold)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)
        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                  epoch=epoch,
                  K=args.topk,
                  hit_rate=np.mean(hits),
                  ndcg=np.mean(ndcgs),
                  train_time=train_time,
                  val_time=val_time))

        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP, value={"success": success})
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
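# The training loops above track their loss with `utils.AverageMeter`
# (`update(value, n)`, then `.val`, `.sum`, `.avg`). The helper itself lives in
# utils.py and is not shown in this section; a minimal sketch consistent with
# that usage:

class AverageMeter:
    """Keeps the last value plus a running, sample-weighted average."""

    def __init__(self):
        self.val = 0.0    # most recent value passed to update()
        self.sum = 0.0    # weighted sum of all values
        self.count = 0    # total number of samples seen
        self.avg = 0.0    # running weighted average

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count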
import os
from argparse import ArgumentParser

import torch

from dataset import CFTrainDataset, load_test_ratings, load_test_negs
from convert import (TEST_NEG_FILENAME, TEST_RATINGS_FILENAME,
                     TRAIN_RATINGS_FILENAME)
from neumf import NeuMF  # model definition module (assumed location)


def parse_args():
    parser = ArgumentParser(description="Load a Neural Collaborative"
                            " Filtering model")
    parser.add_argument('--path', type=str, help='Path to pretrained model')
    return parser.parse_args()


args = parse_args()

print('Loading data')
train_dataset = CFTrainDataset(os.path.join('ml-20m', TRAIN_RATINGS_FILENAME), 4)
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=2048,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)
test_ratings = load_test_ratings(os.path.join(
    'ml-20m', TEST_RATINGS_FILENAME))  # noqa: E501
test_negs = load_test_negs(os.path.join('ml-20m', TEST_NEG_FILENAME))
nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items

# Create model
layers = [256, 256, 128, 64]
model = NeuMF(nb_users, nb_items,
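# The NeuMF(...) call above is truncated in the source. Based on the
# constructor signature used by the other training scripts in this section, it
# presumably continues roughly as sketched below; mf_dim=64 and the
# state_dict loading are placeholder assumptions, not part of the original.
#
# model = NeuMF(nb_users, nb_items,
#               mf_dim=64,                  # placeholder value
#               mf_reg=0.,
#               mlp_layer_sizes=layers,
#               mlp_layer_regs=[0. for i in layers])
# model.load_state_dict(torch.load(args.path))  # --path: pretrained weights
# model.eval()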
def main():
    # Note: The run start is in data_preprocess.py
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/MGPM/{}/{}".format(
        os.path.basename(os.path.normpath(args.data)), config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    if use_cuda:
        print("Using cuda ...")
    else:
        print("Using CPU ...")

    t1 = time.time()
    best_hit, best_ndcg = 0., 0.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Load Data
    print('Loading data')
    print(os.path.join(args.data, TRAIN_RATINGS_FILENAME))
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME),
        os.path.join(args.data, DATA_SUMMARY_FILENAME), args.negative_samples)
    mlperf_log.ncf_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)
    mlperf_log.ncf_print(
        key=mlperf_log.INPUT_ORDER)  # set shuffle=True in DataLoader
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = Multi_Preference_Model(nb_users=nb_users,
                                   nb_items=nb_items,
                                   embed_dim=32,
                                   history_size=9)
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    mlperf_log.ncf_print(key=mlperf_log.OPT_LR, value=args.learning_rate)
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8
    mlperf_log.ncf_print(key=mlperf_log.OPT_NAME, value="Adam")
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=beta1)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=beta2)
    mlperf_log.ncf_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=epsilon)
    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(beta1, beta2),
                                 lr=args.learning_rate,
                                 eps=epsilon)
    mlperf_log.ncf_print(key=mlperf_log.MODEL_HP_LOSS_FN, value=mlperf_log.BCE)
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(
            'checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load('./checkpoint/' + model._get_name() + '.pd')
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        best_hit = checkpoint['hit']
        best_ndcg = checkpoint['ndcg']

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    if start_epoch == 0:
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                processes=args.processes)
        print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
            K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    mlperf_log.ncf_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(start_epoch, args.epochs):
        mlperf_log.ncf_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        model.train()
        losses = utils.AverageMeter()

        mlperf_log.ncf_print(key=mlperf_log.INPUT_HP_NUM_NEG,
                             value=train_dataset.nb_neg)
        mlperf_log.ncf_print(key=mlperf_log.INPUT_STEP_TRAIN_NEG_GEN)

        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, history, label) in enumerate(loader):
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            history = torch.autograd.Variable(history, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda()
                item = item.cuda()
                history = history.cuda()
                label = label.cuda()

            # outputs, _ = model(user, item, history)
            outputs = model(user, item, history)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save stats to file
            description = ('Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        mlperf_log.ncf_print(key=mlperf_log.EVAL_ACCURACY,
                             value={
                                 "epoch": epoch,
                                 "value": float(np.mean(hits))
                             })
        mlperf_log.ncf_print(key=mlperf_log.EVAL_STOP)
        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                  epoch=epoch,
                  K=args.topk,
                  hit_rate=np.mean(hits),
                  ndcg=np.mean(ndcgs),
                  train_time=train_time,
                  val_time=val_time))

        if np.mean(hits) >= best_hit or np.mean(ndcgs) >= best_ndcg:
            best_hit = np.mean(hits)
            best_ndcg = np.mean(ndcgs)
            # Save checkpoint.
            print('Saving checkpoint..')
            state = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'hit': best_hit,
                'ndcg': best_ndcg,
            }
            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(state, './checkpoint/' + model._get_name() + '.pd')

    print("Best hit: ", best_hit)
    print("Best_ndcg: ", best_ndcg)
    mlperf_log.ncf_print(key=mlperf_log.RUN_STOP)
    mlperf_log.ncf_print(key=mlperf_log.RUN_FINAL)
def main():
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    # TODO: Reading CSVs is slow. Could use HDF or Apache Arrow
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    print('batchsize=%d' % args.batch_size)
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=8,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk,
                            use_cuda=use_cuda)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    for epoch in range(args.epochs):
        model.train()
        losses = utils.AverageMeter()
        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)

        # Accumulated wall-clock time per training phase (seconds).
        counting_data = 0
        counting_forward = 0
        counting_zerograd = 0
        counting_backward = 0
        counting_updateweight = 0
        counting_des = 0

        for batch_index, (user, item, label) in enumerate(loader):
            start0 = time.time()
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)
            start1 = time.time()

            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))
            start2 = time.time()

            optimizer.zero_grad()
            start3 = time.time()
            loss.backward()
            start4 = time.time()
            optimizer.step()
            start5 = time.time()

            # Save stats to file
            description = ('Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                epoch, loss=losses))
            loader.set_description(description)
            start6 = time.time()

            counting_data += start1 - start0
            counting_forward += start2 - start1
            counting_zerograd += start3 - start2
            counting_backward += start4 - start3
            counting_updateweight += start5 - start4
            counting_des += start6 - start5

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch)
        val_time = time.time() - begin
        print('data: {data:.4f}, forward: {ft:.4f}, zerograd: {zg:.4f}, backward: {bw:.4f},'
              ' adam: {adam:.4f}, description: {des:.4f}'.format(
                  data=counting_data,
                  ft=counting_forward,
                  zg=counting_zerograd,
                  bw=counting_backward,
                  adam=counting_updateweight,
                  des=counting_des))
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                  epoch=epoch,
                  K=args.topk,
                  hit_rate=np.mean(hits),
                  ndcg=np.mean(ndcgs),
                  train_time=train_time,
                  val_time=val_time))

        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                return 0
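# A caveat on the per-phase timers above: CUDA kernels launch asynchronously,
# so plain time.time() deltas around the forward and backward calls mostly
# measure launch overhead rather than GPU execution time. A sketch of a
# synchronized timestamp helper (an addition, not part of the original
# script), reusing the same variable names:

def phase_time(use_cuda):
    # Wait for all queued GPU work before reading the host clock, so the
    # interval between two calls covers the actual kernel execution time.
    if use_cuda:
        torch.cuda.synchronize()
    return time.time()

# Usage inside the batch loop, e.g.:
#   start1 = phase_time(use_cuda)
#   outputs = model(user, item)
#   loss = criterion(outputs, label)
#   start2 = phase_time(use_cuda)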
def main():
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        # torch.manual_seed(args.seed)
        mx.random.seed(seed_state=args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf_" + args.data + "/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)  # defined in utils.py

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and mx.test_utils.list_gpus()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    # The original file uses 8 workers by default; shuffle=True randomizes
    # the sample order.
    train_dataloader = mx.gluon.data.DataLoader(dataset=train_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=True,
                                                num_workers=args.workers)
    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz,
           len(test_ratings)))

    if use_cuda:
        ctx = mx.gpu(0)  # default to the first GPU; a specific NVIDIA device
                         # can be selected e.g. via Docker
    else:
        ctx = mx.cpu(0)

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers],
                  ctx=ctx)
    model.initialize(ctx=ctx)
    model.hybridize()
    print(model)
    # todo 9: to change the function in utils
    # print("{} parameters".format(utils.count_parameters(model)))
    # model.collect_params()

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))
    # model.save_parameters(os.path.join("/home/net.params", 'net.params'))

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model,
                            test_ratings,
                            test_negs,
                            args.topk,
                            processes=args.processes,
                            ctx=ctx)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    # Hyperparameters
    # Add optimizer and loss to graph
    lr = args.learning_rate
    bs = args.batch_size
    trainer = mx.gluon.Trainer(model.collect_params(), 'adam',
                               {'learning_rate': lr})
    mxnet_criterion = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
    )  # equivalent to the PyTorch BCEWithLogitsLoss

    # Training
    for epoch in range(args.epochs):
        begin = time.time()
        # tqdm shows training progress
        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, label) in enumerate(loader):
            # TODO 7: search the autograd in mxnet
            # todo: place the batch on the GPU context
            user = nd.array(user, ctx=ctx)
            item = nd.array(item, ctx=ctx)
            label = nd.array(label, ctx=ctx)

            # Compute the gradient automatically
            with autograd.record():
                outputs = model(user, item)
                loss = mxnet_criterion(outputs, label.T)
            loss.backward()
            trainer.step(bs)

            for x in loss.mean().asnumpy().tolist():
                loss_number = x
            description = ('Epoch {} Loss {:.4f}'.format(epoch, loss_number))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes,
                                ctx=ctx)
        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                  epoch=epoch,
                  K=args.topk,
                  hit_rate=np.mean(hits),
                  ndcg=np.mean(ndcgs),
                  train_time=train_time,
                  val_time=val_time))

        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                # Save model text description after modelling
                with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
                    file.write(str(model))
                # model.save_parameters(os.path.join("/home/net.params", 'net.params'))
                return 0
def main():
    global msglogger

    script_dir = os.path.dirname(__file__)
    args = parse_args()

    # Distiller loggers
    msglogger = apputils.config_pylogger('logging.conf',
                                         args.name,
                                         output_dir=args.output_dir)
    tflogger = TensorBoardLogger(msglogger.logdir)
    # tflogger.log_gradients = True
    # pylogger = PythonLogger(msglogger)

    if args.seed is not None:
        msglogger.info("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    args.qe_mode = str(args.qe_mode).split('.')[1]
    args.qe_clip_acts = str(args.qe_clip_acts).split('.')[1]

    apputils.log_execution_env_state(sys.argv)

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error(
                'ERROR: Argument --gpus must be a comma-separated list of integers only')
            exit(1)
        if len(args.gpus) > 1:
            msglogger.error('ERROR: Only single GPU supported for NCF')
            exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error(
                    'ERROR: GPU device ID {0} requested, but only {1} devices available'
                    .format(dev_id, available_gpus))
                exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = msglogger.logdir
    msglogger.info("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()

    # Load Data
    training = not (args.eval or args.qe_calibration
                    or args.activation_histograms)
    msglogger.info('Loading data')
    if training:
        train_dataset = CFTrainDataset(
            os.path.join(args.data, TRAIN_RATINGS_FILENAME),
            args.negative_samples)
        train_dataloader = torch.utils.data.DataLoader(
            dataset=train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            pin_memory=True)
        nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    else:
        train_dataset = None
        train_dataloader = None
        nb_users, nb_items = (138493, 26744)

    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))

    msglogger.info(
        'Load data done [%.1f s]. #user=%d, #item=%d, #train=%s, #test=%d' %
        (time.time() - t1, nb_users, nb_items,
         str(train_dataset.mat.nnz) if training else 'N/A', len(test_ratings)))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers],
                  split_final=args.split_final)
    if use_cuda:
        model = model.cuda()
    msglogger.info(model)
    msglogger.info("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    compression_scheduler = None
    start_epoch = 0
    optimizer = None
    if args.load:
        if training:
            model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
                model, args.load)
            if args.reset_optimizer:
                start_epoch = 0
                optimizer = None
        else:
            model = apputils.load_lean_checkpoint(model, args.load)

    # Add loss to graph
    criterion = nn.BCEWithLogitsLoss()
    if use_cuda:
        criterion = criterion.cuda()

    if training and optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.compress:
        compression_scheduler = distiller.file_config(model, optimizer,
                                                      args.compress)
        model.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    if args.qe_calibration or args.activation_histograms:
        calib = {
            'portion': args.qe_calibration,
            'desc_str': 'quantization calibration stats',
            'collect_func': partial(distiller.data_loggers.collect_quant_stats,
                                    inplace_runtime_check=True,
                                    disable_inplace_attrs=True)
        }
        hists = {
            'portion': args.activation_histograms,
            'desc_str': 'activation histograms',
            'collect_func': partial(distiller.data_loggers.collect_histograms,
                                    activation_stats=None,
                                    nbins=2048,
                                    save_hist_imgs=True)
        }
        d = calib if args.qe_calibration else hists

        distiller.utils.assign_layer_fq_names(model)
        num_users = int(np.floor(len(test_ratings) * d['portion']))
        msglogger.info(
            "Generating {} based on {:.1%} of the test-set ({} users)".format(
                d['desc_str'], d['portion'], num_users))

        test_fn = partial(val_epoch,
                          ratings=test_ratings,
                          negs=test_negs,
                          K=args.topk,
                          use_cuda=use_cuda,
                          processes=args.processes,
                          num_users=num_users)
        d['collect_func'](model=model,
                          test_fn=test_fn,
                          save_dir=run_dir,
                          classes=None)
        return 0

    if args.eval:
        if args.quantize_eval and args.qe_calibration is None:
            model.cpu()
            quantizer = quantization.PostTrainLinearQuantizer.from_args(
                model, args)
            dummy_input = (torch.tensor([1]), torch.tensor([1]),
                           torch.tensor([True], dtype=torch.bool))
            quantizer.prepare_model(dummy_input)
            model.cuda()

        distiller.utils.assign_layer_fq_names(model)

        if args.eval_fp16:
            model = model.half()

        # Calculate initial Hit Ratio and NDCG
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                processes=args.processes)
        val_time = time.time() - begin
        hit_rate = np.mean(hits)
        msglogger.info(
            'Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, val_time = {val_time:.2f}'
            .format(K=args.topk,
                    hit_rate=hit_rate,
                    ndcg=np.mean(ndcgs),
                    val_time=val_time))
        hit_rate = 0

        if args.quantize_eval:
            checkpoint_name = 'quantized'
            apputils.save_checkpoint(0,
                                     'NCF',
                                     model,
                                     optimizer=None,
                                     extras={'quantized_hr@10': hit_rate},
                                     name='_'.join([args.name, 'quantized'])
                                     if args.name else checkpoint_name,
                                     dir=msglogger.logdir)
        return 0

    total_samples = len(train_dataloader.sampler)
    steps_per_epoch = math.ceil(total_samples / args.batch_size)
    best_hit_rate = 0
    best_epoch = 0
    for epoch in range(start_epoch, args.epochs):
        msglogger.info('')
        model.train()
        losses = utils.AverageMeter()
        begin = time.time()

        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch, optimizer)

        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, label) in enumerate(loader):
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(
                    epoch, batch_index, steps_per_epoch, optimizer)

            outputs = model(user, item, torch.tensor([False], dtype=torch.bool))
            loss = criterion(outputs, label)

            if compression_scheduler:
                compression_scheduler.before_backward_pass(
                    epoch,
                    batch_index,
                    steps_per_epoch,
                    loss,
                    optimizer,
                    return_loss_components=False)

            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if compression_scheduler:
                compression_scheduler.on_minibatch_end(epoch, batch_index,
                                                       steps_per_epoch,
                                                       optimizer)

            # Save stats to file
            description = ('Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                epoch, loss=losses))
            loader.set_description(description)

            steps_completed = batch_index + 1
            if steps_completed % args.log_freq == 0:
                stats_dict = OrderedDict()
                stats_dict['Loss'] = losses.avg
                stats = ('Performance/Training/', stats_dict)
                params = model.named_parameters() if args.log_params_histograms else None
                distiller.log_training_progress(stats, params, epoch,
                                                steps_completed,
                                                steps_per_epoch, args.log_freq,
                                                [tflogger])
                tflogger.log_model_buffers(model,
                                           ['tracked_min', 'tracked_max'],
                                           'Quant/Train/Acts/TrackedMinMax',
                                           epoch, steps_completed,
                                           steps_per_epoch, args.log_freq)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        val_time = time.time() - begin

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        hit_rate = np.mean(hits)
        mean_ndcgs = np.mean(ndcgs)

        stats_dict = OrderedDict()
        stats_dict['HR@{0}'.format(args.topk)] = hit_rate
        stats_dict['NDCG@{0}'.format(args.topk)] = mean_ndcgs
        stats = ('Performance/Validation/', stats_dict)
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        msglogger.info(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, AvgTrainLoss = {loss.avg:.4f}, '
            'train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=hit_rate,
                ndcg=mean_ndcgs,
                loss=losses,
                train_time=train_time,
                val_time=val_time))

        is_best = False
        if hit_rate > best_hit_rate:
            best_hit_rate = hit_rate
            is_best = True
            best_epoch = epoch
        extras = {
            'current_hr@10': hit_rate,
            'best_hr@10': best_hit_rate,
            'best_epoch': best_epoch
        }
        apputils.save_checkpoint(epoch, 'NCF', model, optimizer,
                                 compression_scheduler, extras, is_best,
                                 dir=run_dir)

        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                msglogger.info("Hit threshold of {}".format(args.threshold))
                break