def main():
    log_hardware()
    args = parse_args()
    args.distributed, args.world_size = init_distributed(args.local_rank)
    log_args(args)

    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Saving results to {}".format(args.checkpoint_dir))
    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # The default of np.random.choice is replace=True, as is pytorch random_()
    LOGGER.log(key=tags.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
    LOGGER.log(key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)
    LOGGER.log(key=tags.INPUT_STEP_EVAL_NEG_GEN)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()

    main_start_time = time.time()
    LOGGER.log(key=tags.RUN_START)

    # load preprocessed ratings and negatives onto this worker's GPU
    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cuda:{}'.format(args.local_rank)))

    valid_negative = test_negs.shape[1]
    LOGGER.log(key=tags.PREPROC_HP_NUM_EVAL, value=valid_negative)

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1
    LOGGER.log(key=tags.INPUT_SIZE, value=len(train_ratings))

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    LOGGER.log(key=tags.INPUT_BATCH_SIZE, value=args.batch_size)
    LOGGER.log(key=tags.INPUT_ORDER)  # we shuffle later with randperm

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid copy to host

    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.opt_level == "O2":
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.opt_level,
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(admm_utils.count_parameters(model)))

    LOGGER.log(key=tags.OPT_LR, value=args.learning_rate)
    LOGGER.log(key=tags.OPT_NAME, value="Adam")
    LOGGER.log(key=tags.OPT_HP_ADAM_BETA1, value=args.beta1)
    LOGGER.log(key=tags.OPT_HP_ADAM_BETA2, value=args.beta2)
    LOGGER.log(key=tags.OPT_HP_ADAM_EPSILON, value=args.eps)
    LOGGER.log(key=tags.MODEL_HP_LOSS_FN, value=tags.VALUE_BCE)

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        LOGGER.log(key=tags.EVAL_START, value=0)
        start = time.time()
        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
            K=args.topk, hit_rate=hr, ndcg=ndcg))
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": 0, "value": hr})
        LOGGER.log(key=tags.EVAL_STOP, value=0)
        LOGGER.log(key='best_eval_throughput', value=eval_throughput)
        return

    success = False
    max_hr = 0
    train_throughputs, eval_throughputs = [], []
    LOGGER.log(key=tags.TRAIN_LOOP)

    for epoch in range(args.epochs):
        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)
        LOGGER.log(key=tags.INPUT_HP_NUM_NEG, value=args.negative_samples)
        LOGGER.log(key=tags.INPUT_STEP_TRAIN_NEG_GEN)

        begin = time.time()

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.opt_level == "O2":
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            optimizer.step()

            for p in model.parameters():
                p.grad = None

        del epoch_users, epoch_items, epoch_label
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)
        LOGGER.log(key='train_throughput', value=train_throughput)
        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
        LOGGER.log(key=tags.EVAL_START, value=epoch)

        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users, epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                  epoch=epoch, K=args.topk, hit_rate=hr, ndcg=ndcg,
                  train_time=train_time, val_time=val_time))

        LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": epoch, "value": hr})
        LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)
        LOGGER.log(key='eval_throughput', value=eval_throughput)

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
            print("New best hr! Saving the model to: ", save_checkpoint_path)
            torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                success = True
                break

    if args.local_rank == 0:
        LOGGER.log(key='best_train_throughput', value=max(train_throughputs))
        LOGGER.log(key='best_eval_throughput', value=max(eval_throughputs))
        LOGGER.log(key='best_accuracy', value=max_hr)
        LOGGER.log(key='time_to_target', value=time.time() - main_start_time)
        LOGGER.log(key='time_to_best_model', value=best_model_timestamp - main_start_time)

        LOGGER.log(key=tags.RUN_STOP, value={"success": success})
        LOGGER.log(key=tags.RUN_FINAL)
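# All variants below report "{} parameters" through a count_parameters() helper
# (utils.count_parameters / admm_utils.count_parameters) whose source is not shown
# in this file. A minimal sketch, assuming the helper simply sums the element counts
# of trainable tensors; the name and body below are illustrative, not the repo's code.
def count_parameters_sketch(model):
    """Return the number of trainable parameters in a torch.nn.Module."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)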
def main():
    args = parse_args()
    print("init distributed")
    init_distributed(args)

    if args.rank == 0:
        wandb.init()
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    torch.manual_seed(1)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()

    main_start_time = time.time()

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cuda:{}'.format(args.local_rank)))

    valid_negative = test_negs.shape[1]

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout).cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 betas=(args.beta1, args.beta2), eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(reduction='none').cuda()  # use torch.mean() with dim later to avoid copy to host

    # Move model and loss to GPU
    if args.distributed:
        model = DDP(model, device_ids=[args.local_rank])

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={'best_eval_throughput': eval_throughput,
                           'hr@10': hr})
        return

    max_hr = 0
    best_epoch = 0
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):
        begin = time.time()

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                loss.backward()

            optimizer.step()

            if args.rank == 0:
                wandb.log({"Test loss": loss})

            for p in model.parameters():
                p.grad = None

        del epoch_users, epoch_items, epoch_label
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users, epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch,),
                     data={'train_throughput': train_throughput,
                           'hr@10': hr,
                           'train_epoch_time': train_time,
                           'validation_epoch_time': val_time,
                           'eval_throughput': eval_throughput})

        if args.rank == 0:
            wandb.log({"Test hit rate": hr})
            wandb.log({"Test train epoch time": train_time})
            wandb.log({"Test train throughput": train_throughput})

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            # save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
            print("New best hr!")
            # torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

    if args.local_rank == 0:
        dllogger.log(data={'best_train_throughput': max(train_throughputs),
                           'best_eval_throughput': max(eval_throughputs),
                           'mean_train_throughput': np.mean(train_throughputs),
                           'mean_eval_throughput': np.mean(eval_throughputs),
                           'best_accuracy': max_hr,
                           'best_epoch': best_epoch,
                           'time_to_target': time.time() - main_start_time,
                           'time_to_best_model': best_model_timestamp - main_start_time},
                     step=tuple())
def main():
    from grace_dl.dist.helper import timer, volume, tensor_bits

    args = parse_args()
    init_distributed(args)

    if args.weak_scaling:
        args.batch_size *= args.world_size

    init_wandb(args)
    init_grace(args)

    if args.local_rank == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir:
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()

    main_start_time = time.time()

    if args.seed is not None:
        torch.manual_seed(args.seed)

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:0'))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:0'))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cuda:0'))

    valid_negative = test_negs.shape[1]

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid copy to host

    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    # if args.distributed:
    #     model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    if args.local_rank == 0:
        print(model)
        print("{} parameters".format(utils.count_parameters(model)))
        # [print(parameter) for parameter in model.parameters()]

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={'best_eval_throughput': eval_throughput,
                           'hr@10': hr})
        return

    max_hr = 0
    best_epoch = 0
    train_throughputs, eval_throughputs = [], []

    # broadcast model states from rank0 to other nodes !!! This is important!
    [torch.distributed.broadcast(p.data, src=0) for p in model.parameters()]

    # if args.local_rank == 0:
    #     save_initial_state_path = os.path.join(args.checkpoint_dir, 'model_init.pth')
    #     print("Saving the model to: ", save_initial_state_path)
    #     torch.save(model.state_dict(), save_initial_state_path)

    for epoch in range(args.epochs):
        begin = time.time()
        train_time = 0

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        for i in range(num_batches // args.grads_accumulated):
            batch_start = time.time()
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            # check grad sparsity
            if args.sparsity_check:
                total_nonzero = 0
                total_numel = 0
                for index, (name, p) in enumerate(model.named_parameters()):
                    sparsity = 1.0 - torch.sum(p.grad.data.abs() > 0).float() / p.grad.data.numel()
                    total_nonzero += torch.sum(p.grad.data.abs() > 0).float()
                    total_numel += p.grad.data.numel()
                    if args.local_rank == 0:
                        wandb.log({f"{name}(sparsity)(numel={p.grad.data.numel()})": sparsity},
                                  commit=False)
                if args.local_rank == 0:
                    wandb.log({f"total_sparsity(numel={total_numel})": 1 - total_nonzero / total_numel},
                              commit=True)

            # add grace just before optimizer.step()
            torch.cuda.synchronize()
            comm_start = time.time()
            for index, (name, p) in enumerate(model.named_parameters()):
                new_grad = args.grc.step(p.grad.data, name)
                p.grad.data = new_grad
            torch.cuda.synchronize()
            timer['comm'] = time.time() - comm_start

            # [torch.distributed.all_reduce(p.grad.data) for p in model.parameters()]
            # for param in model.parameters():
            #     dist.all_reduce(param.grad.data)
            #     param.grad.data /= float(args.world_size)

            optimizer.step()

            for p in model.parameters():
                p.grad = None

            if args.throughput:
                torch.cuda.synchronize()

            if args.log_time and args.local_rank == 0:
                timer['batch_time'] = time.time() - batch_start
                timer['computation'] = timer['batch_time'] - timer['comm']
                print("Timer:", timer, '\n')
                timer['en/decoding'] = 0
                timer['batch_time'] = 0
                timer['computation'] = 0
                timer['comm'] = 0

            if args.log_volume and args.local_rank == 0:
                ratio = volume['compress'] / volume['nocompress']
                volume['ratio_acc'].append(ratio)
                avg_ratio = sum(volume['ratio_acc']) / len(volume['ratio_acc'])
                print(f"Data volume:: compress {volume['compress']} no_compress {volume['nocompress']} "
                      f"ratio {ratio:.4f} avg_ratio {avg_ratio:.4f}")
                volume['compress'] = 0
                volume['nocompress'] = 0

            batch_throughput = args.batch_size / (time.time() - batch_start)  # global throughput
            train_time += time.time() - batch_start

            if (args.throughput or args.eval_at_every_batch) and args.local_rank == 0:
                print(f"Train :: Epoch [{epoch}/{args.epochs}] \t Batch [{i}/{num_batches}] \t "
                      f"Time {time.time()-batch_start:.5f} \t Throughput {batch_throughput:.2f}")
            if args.throughput and i == 3:
                break

            if args.local_rank == 0:
                print(f"Train :: Epoch [{epoch}/{args.epochs}] \t Batch [{i}/{num_batches}] \t "
                      f"Time {time.time()-batch_start:.5f} \t Throughput {batch_throughput:.2f}")

            if args.eval_at_every_batch:
                hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                                     real_indices, args.topk,
                                     samples_per_user=valid_negative + 1,
                                     num_user=all_test_users, epoch=epoch,
                                     distributed=args.distributed)
                if args.local_rank == 0:
                    wandb.log({"eval/hr@10": hr})

        del epoch_users, epoch_items, epoch_label
        # train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        if args.throughput:
            train_throughput = batch_throughput
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users, epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch,),
                     data={'train_throughput': train_throughput,
                           'hr@10': hr,
                           'train_epoch_time': train_time,
                           'validation_epoch_time': val_time,
                           'eval_throughput': eval_throughput})

        if args.local_rank == 0:
            wandb.log({"train_epoch_time": train_time,
                       'validation_epoch_time': val_time,
                       'eval_throughput': eval_throughput,
                       'train_throughput': train_throughput},
                      commit=False)
            if not args.eval_at_every_batch:
                wandb.log({"eval/hr@10": hr}, commit=False)
            wandb.log({"epoch": epoch})

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

        if args.throughput:
            break

    if args.local_rank == 0:
        dllogger.log(data={'best_train_throughput': max(train_throughputs),
                           'best_eval_throughput': max(eval_throughputs),
                           'mean_train_throughput': np.mean(train_throughputs),
                           'mean_eval_throughput': np.mean(eval_throughputs),
                           'best_accuracy': max_hr,
                           'best_epoch': best_epoch,
                           'time_to_target': time.time() - main_start_time,
                           'time_to_best_model': best_model_timestamp - main_start_time},
                     step=tuple())
        wandb.log({'best_train_throughput': max(train_throughputs),
                   'best_eval_throughput': max(eval_throughputs),
                   'mean_train_throughput': np.mean(train_throughputs),
                   'mean_eval_throughput': np.mean(eval_throughputs),
                   'best_accuracy': max_hr,
                   'best_epoch': best_epoch,
                   'time_to_target': time.time() - main_start_time,
                   'time_to_best_model': best_model_timestamp - main_start_time})
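# The compression variant above replaces DDP's all-reduce with a per-tensor call to
# args.grc.step(p.grad.data, name) just before optimizer.step(); init_grace() and the
# grace_dl compressor it builds are not shown here. The stand-in below is a minimal
# sketch that only mirrors that .step(tensor, name) interface with a plain all-reduce
# average, so the training loop can run without grace_dl. It is an illustrative
# assumption, not the grace_dl API or the authors' compressor.
import torch.distributed as dist

class AllReduceGrc:
    """Gradient 'compressor' stand-in: averages gradients across workers, no compression."""

    def __init__(self, world_size):
        self.world_size = world_size

    def step(self, grad, name):
        # `name` is unused here; real compressors may keep per-tensor state (e.g. error feedback).
        dist.all_reduce(grad)
        return grad / self.world_size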
def main():
    args = parse_args()
    init_distributed(args)
    print(args.desc)

    mlflow.start_run(run_name=args.desc)
    mlflow.log_param('batch_size', args.batch_size)
    mlflow.log_param('num_threads', args.threads)
    mlflow.log_param('num_of_epochs', args.epochs)

    torch.set_num_threads(args.threads)

    print(f"{vars(args)} step='PARAMETER'")

    if args.seed is not None:
        torch.manual_seed(args.seed)

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir:
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        # torch.cuda.synchronize()

    main_start_time = time.time()

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cpu'))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cpu'))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cpu'))

    valid_negative = test_negs.shape[1]

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = Adam(model.parameters(), lr=args.learning_rate,
                     betas=(args.beta1, args.beta2), eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid copy to host

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        log_data = {'best_eval_throughput': eval_throughput,
                    'test_hr_at_10': hr}
        print(log_data)
        return

    max_hr = 0
    best_epoch = 0
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):
        begin = time.time()

        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
            train_ratings, nb_items, args)
        num_batches = len(epoch_users)
        for i in range(num_batches // args.grads_accumulated):
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                user = epoch_users[batch_idx]
                item = epoch_items[batch_idx]
                label = epoch_label[batch_idx].view(-1, 1)

                outputs = model(user, item)
                loss = traced_criterion(outputs, label).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            optimizer.step()

            for p in model.parameters():
                p.grad = None

        del epoch_users, epoch_items, epoch_label
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users, epoch=epoch,
                             distributed=args.distributed)

        val_time = time.time() - begin

        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        log_data = {'train_throughput': train_throughput,
                    'hr_at_10': hr,
                    'train_epoch_time': train_time,
                    'validation_epoch_time': val_time,
                    'eval_throughput': eval_throughput}
        print(log_data)
        mlflow.log_metrics(log_data, epoch)

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

    if args.local_rank == 0:
        log_data = {'best_train_throughput': max(train_throughputs),
                    'best_eval_throughput': max(eval_throughputs),
                    'mean_train_throughput': np.mean(train_throughputs),
                    'mean_eval_throughput': np.mean(eval_throughputs),
                    'best_accuracy': max_hr,
                    'best_epoch': best_epoch,
                    'time_to_target': time.time() - main_start_time}
        print(log_data)
        mlflow.log_metrics(log_data)

    mlflow.end_run()
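# Every variant scores epochs with val_epoch() (not shown), which ranks each user's
# held-out positive against `valid_negative` sampled negatives and reports HR@K and
# NDCG@K. A minimal per-user sketch of the standard leave-one-out definitions these
# scripts appear to follow; the helper name is illustrative, not part of the repo.
import math

def hr_ndcg_for_user(rank, k):
    """HR@K is 1 if the positive is ranked in the top-K; NDCG@K is its discounted gain.

    `rank` is the 1-based position of the positive item among the scored samples.
    """
    hit = 1.0 if rank <= k else 0.0
    ndcg = 1.0 / math.log2(rank + 1) if rank <= k else 0.0
    return hit, ndcg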
def main():
    args = parse_args()
    init_distributed(args)

    if args.local_rank == 0:
        dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                           filename=args.log_path),
                                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
    else:
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
        torch.manual_seed(args.seed)

    print("Saving results to {}".format(args.checkpoint_dir))
    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()

    main_start_time = time.time()

    train_ratings = torch.load(args.data + '/train_ratings.pt',
                               map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_ratings = torch.load(args.data + '/test_ratings.pt',
                              map_location=torch.device('cuda:{}'.format(args.local_rank)))
    test_negs = torch.load(args.data + '/test_negatives.pt',
                           map_location=torch.device('cuda:{}'.format(args.local_rank)))

    valid_negative = test_negs.shape[1]

    nb_maxs = torch.max(train_ratings, 0)[0]
    nb_users = nb_maxs[0].item() + 1
    nb_items = nb_maxs[1].item() + 1

    all_test_users = test_ratings.shape[0]

    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(
        test_ratings, test_negs, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
                          betas=(args.beta1, args.beta2), eps=args.eps)

    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid copy to host

    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    local_batch = args.batch_size // args.world_size
    # traced_criterion = torch.jit.trace(criterion.forward,
    #                                    (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))
    traced_criterion = criterion

    pyprof.init()
    # import importlib
    # pyprof.wrap(importlib.import_module(__name__), "traced_criterion")
    # pyprof.wrap(traced_criterion, "__call__")

    if args.opt_level == "O2":
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.opt_level,
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                             real_indices, args.topk,
                             samples_per_user=valid_negative + 1,
                             num_user=all_test_users,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = all_test_users * (valid_negative + 1)
        eval_throughput = eval_size / val_time

        dllogger.log(step=tuple(),
                     data={'best_eval_throughput': eval_throughput,
                           'hr@10': hr})
        return

    max_hr = 0
    best_epoch = 0
    train_throughputs, eval_throughputs = [], []

    with torch.autograd.profiler.emit_nvtx():
        for epoch in range(args.epochs):
            begin = time.time()

            epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(
                train_ratings, nb_items, args)
            num_batches = len(epoch_users)
            for i in range(num_batches // args.grads_accumulated):
                if i == 10:
                    profiler.start()
                for j in range(args.grads_accumulated):
                    batch_idx = (args.grads_accumulated * i) + j
                    user = epoch_users[batch_idx]
                    item = epoch_items[batch_idx]
                    label = epoch_label[batch_idx].view(-1, 1)

                    outputs = model(user, item)

                    nvtx.range_push("layer:Loss")
                    loss = traced_criterion(outputs, label).float()
                    nvtx.range_pop()

                    nvtx.range_push("layer:Mean")
                    loss = torch.mean(loss.view(-1), 0)
                    nvtx.range_pop()

                    if args.opt_level == "O2":
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()

                nvtx.range_push("layer:Adam")
                optimizer.step()
                nvtx.range_pop()

                if i == 10:
                    profiler.stop()

                for p in model.parameters():
                    p.grad = None

            del epoch_users, epoch_items, epoch_label
            train_time = time.time() - begin
            begin = time.time()

            epoch_samples = len(train_ratings) * (args.negative_samples + 1)
            train_throughput = epoch_samples / train_time
            train_throughputs.append(train_throughput)

            hr, ndcg = val_epoch(model, test_users, test_items, dup_mask,
                                 real_indices, args.topk,
                                 samples_per_user=valid_negative + 1,
                                 num_user=all_test_users, epoch=epoch,
                                 distributed=args.distributed)

            val_time = time.time() - begin

            eval_size = all_test_users * (valid_negative + 1)
            eval_throughput = eval_size / val_time
            eval_throughputs.append(eval_throughput)

            dllogger.log(step=(epoch,),
                         data={'train_throughput': train_throughput,
                               'hr@10': hr,
                               'train_epoch_time': train_time,
                               'validation_epoch_time': val_time,
                               'eval_throughput': eval_throughput})

            if hr > max_hr and args.local_rank == 0:
                max_hr = hr
                best_epoch = epoch
                save_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
                print("New best hr! Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
                best_model_timestamp = time.time()

            if args.threshold is not None:
                if hr >= args.threshold:
                    print("Hit threshold of {}".format(args.threshold))
                    break

    if args.local_rank == 0:
        dllogger.log(data={'best_train_throughput': max(train_throughputs),
                           'best_eval_throughput': max(eval_throughputs),
                           'mean_train_throughput': np.mean(train_throughputs),
                           'mean_eval_throughput': np.mean(eval_throughputs),
                           'best_accuracy': max_hr,
                           'best_epoch': best_epoch,
                           'time_to_target': time.time() - main_start_time,
                           'time_to_best_model': best_model_timestamp - main_start_time},
                     step=tuple())
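# Most variants above call an init_distributed(args) helper that is expected to set
# args.distributed, args.world_size, args.rank and args.local_rank and to initialise
# the NCCL process group before any broadcast. A minimal sketch under those
# assumptions; the repo's actual helper may differ (e.g. in which launcher
# environment variables it reads), and the name below is illustrative.
import os
import torch
import torch.distributed as dist

def init_distributed_sketch(args):
    """Initialise torch.distributed from torchrun/launch-style environment variables."""
    args.world_size = int(os.environ.get('WORLD_SIZE', 1))
    args.distributed = args.world_size > 1
    if args.distributed:
        args.rank = int(os.environ.get('RANK', 0))
        args.local_rank = int(os.environ.get('LOCAL_RANK', 0))
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')
    else:
        args.rank = 0
        args.local_rank = 0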